
Example 6 with Row

Use of org.apache.spark.sql.Row in project incubator-systemml by apache.

Class MLContextTest, method testInputBinaryBlockMatrixDML.

@Test
public void testInputBinaryBlockMatrixDML() {
    System.out.println("MLContextTest - input BinaryBlockMatrix DML");
    List<String> list = new ArrayList<String>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    BinaryBlockMatrix binaryBlockMatrix = new BinaryBlockMatrix(dataFrame);
    Script script = dml("avg = avg(M);").in("M", binaryBlockMatrix).out("avg");
    double avg = ml.execute(script).getDouble("avg");
    Assert.assertEquals(50.0, avg, 0.0);
}
Also used: Script (org.apache.sysml.api.mlcontext.Script), StructType (org.apache.spark.sql.types.StructType), ArrayList (java.util.ArrayList), BinaryBlockMatrix (org.apache.sysml.api.mlcontext.BinaryBlockMatrix), StructField (org.apache.spark.sql.types.StructField), Row (org.apache.spark.sql.Row), Test (org.junit.Test)
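
Both BinaryBlockMatrix tests rely on fields and helpers defined elsewhere in MLContextTest: sc, spark, ml, and the CommaSeparatedValueStringToRow mapper. A minimal sketch of what that fixture presumably looks like (the actual test setup may differ):

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.sysml.api.mlcontext.MLContext;

// Shared fixture assumed by the tests in this listing (a sketch, not the real setup).
SparkSession spark = SparkSession.builder().appName("MLContextTest").master("local[*]").getOrCreate();
JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
MLContext ml = new MLContext(spark);

// Plausible shape of the CSV-to-Row mapper: split each line on commas and wrap
// the resulting string fields in a Spark SQL Row.
class CommaSeparatedValueStringToRow implements Function<String, Row> {
    private static final long serialVersionUID = 1L;
    @Override
    public Row call(String line) {
        return RowFactory.create((Object[]) line.split(","));
    }
}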

Example 7 with Row

Use of org.apache.spark.sql.Row in project incubator-systemml by apache.

Class MLContextTest, method testInputBinaryBlockMatrixPYDML.

@Test
public void testInputBinaryBlockMatrixPYDML() {
    System.out.println("MLContextTest - input BinaryBlockMatrix PYDML");
    List<String> list = new ArrayList<String>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    BinaryBlockMatrix binaryBlockMatrix = new BinaryBlockMatrix(dataFrame);
    Script script = pydml("avg = avg(M)").in("M", binaryBlockMatrix).out("avg");
    double avg = ml.execute(script).getDouble("avg");
    Assert.assertEquals(50.0, avg, 0.0);
}
Also used: Script (org.apache.sysml.api.mlcontext.Script), StructType (org.apache.spark.sql.types.StructType), ArrayList (java.util.ArrayList), BinaryBlockMatrix (org.apache.sysml.api.mlcontext.BinaryBlockMatrix), StructField (org.apache.spark.sql.types.StructField), Row (org.apache.spark.sql.Row), Test (org.junit.Test)
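
Both tests let SystemML infer the matrix characteristics by scanning the DataFrame. When the shape is known up front, the MLContext API also accepts matrix metadata; a hedged sketch of that variant (constructor signatures may vary across SystemML versions):

import org.apache.sysml.api.mlcontext.BinaryBlockMatrix;
import org.apache.sysml.api.mlcontext.MatrixMetadata;

// Hypothetical variant of the test body above: declare the 3x3 shape explicitly
// so the dimensions do not have to be determined from the data.
MatrixMetadata md = new MatrixMetadata(3, 3);
BinaryBlockMatrix binaryBlockMatrix = new BinaryBlockMatrix(dataFrame, md);
Script script = dml("avg = avg(M);").in("M", binaryBlockMatrix).out("avg");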

Example 8 with Row

Use of org.apache.spark.sql.Row in project incubator-systemml by apache.

Class MLResults, method getDataFrameDoubleNoIDColumn.

/**
 * Obtain an output as a {@code DataFrame} of doubles with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of doubles:
 * </p>
 * <code>[1.0,2.0]
 * <br>[3.0,4.0]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of doubles with no ID column
 */
public Dataset<Row> getDataFrameDoubleNoIDColumn(String outputName) {
    if (isFrameObject(outputName)) {
        throw new MLContextException("This method currently supports only matrices");
    }
    MatrixObject mo = getMatrixObject(outputName);
    Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, false);
    return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Also used: MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), Row (org.apache.spark.sql.Row)
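
A usage sketch for this accessor, assuming an MLContext named ml as in the tests above; the 2x2 matrix from the javadoc is built here with a DML matrix() literal:

// Produce the 2x2 matrix as an output, then read it back as a DataFrame of doubles.
Script script = dml("M = matrix(\"1 2 3 4\", rows=2, cols=2);").out("M");
MLResults results = ml.execute(script);
Dataset<Row> df = results.getDataFrameDoubleNoIDColumn("M");
// expected rows: [1.0,2.0] and [3.0,4.0]
df.show();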

Example 9 with Row

Use of org.apache.spark.sql.Row in project incubator-systemml by apache.

Class MLResults, method getDataFrameVectorNoIDColumn.

/**
 * Obtain an output as a {@code DataFrame} of vectors with no ID column.
 * <p>
 * The following matrix in DML:
 * </p>
 * <code>M = full('1 2 3 4', rows=2, cols=2);
 * </code>
 * <p>
 * is equivalent to the following {@code DataFrame} of vectors:
 * </p>
 * <code>[[1.0,2.0]]
 * <br>[[3.0,4.0]]
 * </code>
 *
 * @param outputName
 *            the name of the output
 * @return the output as a {@code DataFrame} of vectors with no ID column
 */
public Dataset<Row> getDataFrameVectorNoIDColumn(String outputName) {
    if (isFrameObject(outputName)) {
        throw new MLContextException("This method currently supports only matrices");
    }
    MatrixObject mo = getMatrixObject(outputName);
    Dataset<Row> df = MLContextConversionUtil.matrixObjectToDataFrame(mo, sparkExecutionContext, true);
    return df.drop(RDDConverterUtils.DF_ID_COLUMN);
}
Also used: MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), Row (org.apache.spark.sql.Row)
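
The vector variant packs each matrix row into a single org.apache.spark.ml.linalg.Vector column. A sketch of reading those values back on the driver (same ml assumption as above):

import java.util.List;
import org.apache.spark.ml.linalg.Vector;

Script script = dml("M = matrix(\"1 2 3 4\", rows=2, cols=2);").out("M");
Dataset<Row> df = ml.execute(script).getDataFrameVectorNoIDColumn("M");
List<Row> rows = df.collectAsList();
// each Row holds one Vector, e.g. [1.0,2.0] for the first matrix row
Vector first = (Vector) rows.get(0).get(0);
System.out.println(first.apply(1)); // prints 2.0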

Example 10 with Row

Use of org.apache.spark.sql.Row in project incubator-systemml by apache.

Class RemoteDPParForSpark, method getPartitionedInput.

@SuppressWarnings("unchecked")
private static JavaPairRDD<Long, Writable> getPartitionedInput(SparkExecutionContext sec, String matrixvar, OutputInfo oi, PartitionFormat dpf) {
    InputInfo ii = InputInfo.BinaryBlockInputInfo;
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    // NOTE: there will always be a checkpoint rdd on top of the input rdd and the dataset
    if (hasInputDataSet(dpf, mo)) {
        DatasetObject dsObj = (DatasetObject) mo.getRDDHandle().getLineageChilds().get(0).getLineageChilds().get(0);
        Dataset<Row> in = dsObj.getDataset();
        // construct row ids from an existing ID column, or zip a new row index
        JavaPairRDD<Row, Long> prepinput = dsObj.containsID()
            ? in.javaRDD().mapToPair(new DataFrameExtractIDFunction(
                in.schema().fieldIndex(RDDConverterUtils.DF_ID_COLUMN)))
            : in.javaRDD().zipWithIndex();
        // convert each row into matrix block format
        return prepinput.mapToPair(new DataFrameToRowBinaryBlockFunction(
            mc.getCols(), dsObj.isVectorBased(), dsObj.containsID()));
    }
    else if (!requiresGrouping(dpf, mo)) {
        // binary block input rdd without grouping: get input rdd and apply data partitioning
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
    else {
        // default case: binary block input rdd with grouping
        // get input rdd; avoid unnecessary caching if the input is a checkpoint that is not
        // cached yet, to reduce memory pressure for the shuffle and subsequent operations
        JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
        if (mo.getRDDHandle().isCheckpointRDD() && !sec.isRDDCached(in.id()))
            in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();
        // data partitioning of input rdd
        DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, dpf._dpf, dpf._N);
        return in.flatMapToPair(dpfun);
    }
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), DatasetObject (org.apache.sysml.runtime.instructions.spark.data.DatasetObject), DataFrameExtractIDFunction (org.apache.sysml.runtime.instructions.spark.utils.RDDConverterUtils.DataFrameExtractIDFunction), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject), Row (org.apache.spark.sql.Row)
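
The interesting part above is the row-id handling: when the DataFrame lacks SystemML's internal ID column, zipWithIndex supplies a stable per-row index instead. A standalone sketch of that branch in plain Spark (class name and data are made up for illustration):

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class RowIdSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("RowIdSketch").master("local[*]").getOrCreate();
        StructType schema = DataTypes.createStructType(Arrays.asList(
                DataTypes.createStructField("C1", DataTypes.DoubleType, true)));
        Dataset<Row> df = spark.createDataFrame(
                Arrays.asList(RowFactory.create(1.0), RowFactory.create(2.0)), schema);
        // No ID column: pair every row with a synthetic, stable long index,
        // mirroring the zipWithIndex branch of getPartitionedInput.
        JavaPairRDD<Row, Long> prepinput = df.javaRDD().zipWithIndex();
        prepinput.collect().forEach(t -> System.out.println(t._2() + " -> " + t._1()));
        spark.stop();
    }
}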

Aggregations

Row (org.apache.spark.sql.Row): 129
Test (org.junit.Test): 60
Script (org.apache.sysml.api.mlcontext.Script): 53
StructType (org.apache.spark.sql.types.StructType): 50
ArrayList (java.util.ArrayList): 48
StructField (org.apache.spark.sql.types.StructField): 46
SparkSession (org.apache.spark.sql.SparkSession): 43
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 19
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 19
MLResults (org.apache.sysml.api.mlcontext.MLResults): 18
DenseVector (org.apache.spark.ml.linalg.DenseVector): 16
Vector (org.apache.spark.ml.linalg.Vector): 16
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 15
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 12
SQLContext (org.apache.spark.sql.SQLContext): 12
User (uk.gov.gchq.gaffer.user.User): 12
HashSet (java.util.HashSet): 10
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 9
Tuple2 (scala.Tuple2): 9
GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements): 9