Use of org.apache.spark.sql.Row in project incubator-systemml by apache: class MLContextTest, method testInputBinaryBlockMatrixDML.
@Test
public void testInputBinaryBlockMatrixDML() {
    System.out.println("MLContextTest - input BinaryBlockMatrix DML");
    List<String> list = new ArrayList<String>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    BinaryBlockMatrix binaryBlockMatrix = new BinaryBlockMatrix(dataFrame);
    Script script = dml("avg = avg(M);").in("M", binaryBlockMatrix).out("avg");
    double avg = ml.execute(script).getDouble("avg");
    Assert.assertEquals(50.0, avg, 0.0);
}
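This test (and the PYDML variant below) relies on a CommaSeparatedValueStringToRow mapper that is not shown here. A minimal sketch of such a function, assuming it simply splits each line on commas into string fields (the actual SystemML helper may differ):

import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Hypothetical reimplementation: converts "10,20,30" into a Row of three string fields.
public class CommaSeparatedValueStringToRow implements Function<String, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(String line) throws Exception {
        // split on commas and wrap the resulting string fields in a Row
        return RowFactory.create((Object[]) line.split(","));
    }
}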
Use of org.apache.spark.sql.Row in project incubator-systemml by apache: class MLContextTest, method testInputBinaryBlockMatrixPYDML. This is the same scenario as above, executed through a PYDML script instead of DML.
@Test
public void testInputBinaryBlockMatrixPYDML() {
    System.out.println("MLContextTest - input BinaryBlockMatrix PYDML");
    List<String> list = new ArrayList<String>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    BinaryBlockMatrix binaryBlockMatrix = new BinaryBlockMatrix(dataFrame);
    Script script = pydml("avg = avg(M)").in("M", binaryBlockMatrix).out("avg");
    double avg = ml.execute(script).getDouble("avg");
    Assert.assertEquals(50.0, avg, 0.0);
}
Use of org.apache.spark.sql.Row in project incubator-systemml by apache: class MLResults, method getDataFrameDoubleNoIDColumn.
/**
* Obtain an output as a {@code DataFrame} of doubles with no ID column.
* <p>
* The following matrix in DML:
* </p>
* <code>M = full('1 2 3 4', rows=2, cols=2);
* </code>
* <p>
* is equivalent to the following {@code DataFrame} of doubles:
* </p>
* <code>[1.0,2.0]
* <br>[3.0,4.0]
* </code>
*
* @param outputName
* the name of the output
* @return the output as a {@code DataFrame} of doubles with no ID column
*/
public Dataset<Row> getDataFrameDoubleNoIDColumn(String outputName) {
    if (isFrameObject(outputName)) {
        throw new MLContextException("This method currently supports only matrices");
    }
    MatrixObject mo = getMatrixObject(outputName);
    Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, false);
    return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
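A brief usage sketch, assuming an MLContext instance named ml as in the tests above; matrix() is used here as the DML matrix constructor (full() in the Javadoc is the PYDML equivalent):

// Sketch: execute a script and read the output back as a DataFrame of doubles.
Script script = dml("M = matrix(\"1 2 3 4\", rows=2, cols=2);").out("M");
MLResults results = ml.execute(script);
Dataset<Row> df = results.getDataFrameDoubleNoIDColumn("M");
df.show(); // two rows, [1.0,2.0] and [3.0,4.0], with the ID column dropped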
Use of org.apache.spark.sql.Row in project incubator-systemml by apache: class MLResults, method getDataFrameVectorNoIDColumn.
/**
* Obtain an output as a {@code DataFrame} of vectors with no ID column.
* <p>
* The following matrix in DML:
* </p>
* <code>M = full('1 2 3 4', rows=2, cols=2);
* </code>
* <p>
* is equivalent to the following {@code DataFrame} of vectors:
* </p>
* <code>[[1.0,2.0]]
* <br>[[3.0,4.0]]
* </code>
*
* @param outputName
* the name of the output
* @return the output as a {@code DataFrame} of vectors with no ID column
*/
public Dataset<Row> getDataFrameVectorNoIDColumn(String outputName) {
    if (isFrameObject(outputName)) {
        throw new MLContextException("This method currently supports only matrices");
    }
    MatrixObject mo = getMatrixObject(outputName);
    Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, true);
    return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
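The vector variant differs from the doubles variant only in the shape of the result: each output row holds a single vector column rather than one double column per matrix column (note the toVector flag passed to matrixObjectToDataFrame). A sketch under the same assumptions as the previous example:

Script script = dml("M = matrix(\"1 2 3 4\", rows=2, cols=2);").out("M");
Dataset<Row> df = ml.execute(script).getDataFrameVectorNoIDColumn("M");
df.show(); // two rows, [[1.0,2.0]] and [[3.0,4.0]]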
Use of org.apache.spark.sql.Row in project incubator-systemml by apache: class RemoteDPParForSpark, method getPartitionedInput.
@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle()
            .getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        // construct or reuse row ids: extract the existing ID column, or zip with the row index
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID() ?
            in.javaRDD().mapToPair(new DataFrameExtractIDFunction(
                in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN))) :
            in.javaRDD().zipWithIndex();
        // convert each row into matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(
            mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    }
    // binary block input rdd without grouping
    else if (!requiresGrouping(dpf, mo)) {
        // get input rdd and apply data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
    // default: binary block input rdd with grouping
    else {
        // get input rdd; avoid unnecessary caching if the input is a checkpoint and not cached yet,
        // to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // data partitioning of input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
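The dataset branch of getPartitionedInput hinges on the row-id step: either an existing ID column is extracted, or zipWithIndex() assigns 0-based indexes. A self-contained sketch of the zipWithIndex path using plain Spark, not SystemML code (class name RowIdSketch is made up for illustration):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class RowIdSketch {
    public static void main(String[] args) {
        SparkConf conf = new SparkConf().setAppName("RowIdSketch").setMaster("local[*]");
        try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
            JavaRDD<String> rows = jsc.parallelize(Arrays.asList("10,20,30", "40,50,60", "70,80,90"));
            // zipWithIndex pairs each element with a stable 0-based row index,
            // mirroring the no-ID-column branch above
            JavaPairRDD<String, Long> withIds = rows.zipWithIndex();
            withIds.collect().forEach(t -> System.out.println(t._2() + " -> " + t._1()));
        }
    }
}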