Search in sources:

Example 26 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

The class MLContextTest defines the method testInputBinaryBlockMatrixDML.

@Test
public void testInputBinaryBlockMatrixDML() {
    System.out.println("MLContextTest - input BinaryBlockMatrix DML");
    // Build a 3x3 CSV-style input: rows of "10,20,30" .. "70,80,90".
    List<String> csvRows = new ArrayList<String>();
    csvRows.add("10,20,30");
    csvRows.add("40,50,60");
    csvRows.add("70,80,90");
    JavaRDD<Row> rowRdd = sc.parallelize(csvRows).map(new CommaSeparatedValueStringToRow());
    // Three nullable string columns C1..C3.
    List<StructField> schemaFields = new ArrayList<StructField>();
    for (String colName : new String[] { "C1", "C2", "C3" }) {
        schemaFields.add(DataTypes.createStructField(colName, DataTypes.StringType, true));
    }
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));
    // Feed the DataFrame through a BinaryBlockMatrix input and check the mean.
    BinaryBlockMatrix bbm = new BinaryBlockMatrix(df);
    Script avgScript = dml("avg = avg(M);").in("M", bbm).out("avg");
    double result = ml.execute(avgScript).getDouble("avg");
    Assert.assertEquals(50.0, result, 0.0);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) BinaryBlockMatrix(org.apache.sysml.api.mlcontext.BinaryBlockMatrix) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) Test(org.junit.Test)

Example 27 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

The class MLContextTest defines the method testDataFrameSumDMLDoublesWithNoIDColumn.

@Test
public void testDataFrameSumDMLDoublesWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column");
    // 3x3 double matrix serialized as comma-separated rows; total = 450.
    List<String> csvRows = new ArrayList<String>();
    csvRows.add("10,20,30");
    csvRows.add("40,50,60");
    csvRows.add("70,80,90");
    JavaRDD<Row> rowRdd = sc.parallelize(csvRows).map(new CommaSeparatedValueStringToDoubleArrayRow());
    // Three nullable double columns C1..C3 — no ID column present.
    List<StructField> schemaFields = new ArrayList<StructField>();
    for (String colName : new String[] { "C1", "C2", "C3" }) {
        schemaFields.add(DataTypes.createStructField(colName, DataTypes.DoubleType, true));
    }
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));
    // Explicit metadata tells the converter the frame is plain doubles (no index column).
    MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_DOUBLES);
    Script sumScript = dml("print('sum: ' + sum(M));").in("M", df, meta);
    setExpectedStdOut("sum: 450.0");
    ml.execute(sumScript);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Test(org.junit.Test)

Example 28 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

The class MLContextTest defines the method testDataFrameSumPYDMLVectorWithNoIDColumn.

@Test
public void testDataFrameSumPYDMLVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column");
    // One dense ML vector per row; the nine entries sum to 45.
    List<Vector> vectors = new ArrayList<Vector>();
    vectors.add(Vectors.dense(1.0, 2.0, 3.0));
    vectors.add(Vectors.dense(4.0, 5.0, 6.0));
    vectors.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Row> rowRdd = sc.parallelize(vectors).map(new VectorRow());
    // Single nullable vector-typed column; no ID column present.
    List<StructField> schemaFields = new ArrayList<StructField>();
    schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));
    // Explicit metadata marks the frame as vector-formatted (no index column).
    MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script sumScript = pydml("print('sum: ' + sum(M))").in("M", df, meta);
    setExpectedStdOut("sum: 45.0");
    ml.execute(sumScript);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) Test(org.junit.Test)

Example 29 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

The class FrameRDDConverterUtils defines the method binaryBlockToDataFrame.

/**
 * Converts a binary-block frame RDD into a Spark {@code Dataset<Row>}.
 *
 * @param sparkSession the Spark session used to create the DataFrame
 * @param in           pair RDD of (row index, frame block) in binary-block format
 * @param mc           matrix characteristics; the column count must be known
 * @param schema       per-column value types; if {@code null}, all columns default to STRING
 * @return the resulting DataFrame
 * @throws DMLRuntimeException if the number of columns is unknown in {@code mc}
 */
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession, JavaPairRDD<Long, FrameBlock> in, MatrixCharacteristics mc, ValueType[] schema) {
    if (!mc.colsKnown())
        // Use the project's DMLRuntimeException (extends RuntimeException) rather than a
        // generic RuntimeException, consistent with the rest of the runtime converters.
        throw new DMLRuntimeException("Number of columns needed to convert binary block to data frame.");
    //convert binary block to rows rdd 
    JavaRDD<Row> rowRDD = in.flatMap(new BinaryBlockToDataFrameFunction());
    //create data frame schema; default every column to STRING when unspecified
    if (schema == null)
        schema = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
    StructType dfSchema = convertFrameSchemaToDFSchema(schema, true);
    //rdd to data frame conversion
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Also used : DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) StructType(org.apache.spark.sql.types.StructType) Row(org.apache.spark.sql.Row)

Example 30 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

The class MLContextConversionUtil defines the method determineFrameFormatIfNeeded.

/**
	 * If the FrameFormat of the DataFrame has not been explicitly specified,
	 * attempt to determine the proper FrameFormat.
	 *
	 * @param dataFrame
	 *            the Spark {@code DataFrame}
	 * @param frameMetadata
	 *            the frame metadata, if available
	 */
public static void determineFrameFormatIfNeeded(Dataset<Row> dataFrame, FrameMetadata frameMetadata) {
    // The Javadoc contract says metadata is supplied "if available" — guard against null
    // instead of throwing a NullPointerException.
    if (frameMetadata == null) {
        return;
    }
    FrameFormat frameFormat = frameMetadata.getFrameFormat();
    if (frameFormat != null) {
        // Format already explicitly specified; nothing to determine.
        return;
    }
    StructType schema = dataFrame.schema();
    boolean hasID = false;
    try {
        // fieldIndex throws IllegalArgumentException when the column is absent;
        // we use that as the presence test for the ID column.
        schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
        hasID = true;
    } catch (IllegalArgumentException ignored) {
        // Intentionally ignored: absence of the ID column simply means hasID stays false.
    }
    FrameFormat ff = hasID ? FrameFormat.DF_WITH_INDEX : FrameFormat.DF;
    frameMetadata.setFrameFormat(ff);
}
Also used : StructType(org.apache.spark.sql.types.StructType)

Aggregations

StructType (org.apache.spark.sql.types.StructType)44 StructField (org.apache.spark.sql.types.StructField)40 Row (org.apache.spark.sql.Row)38 ArrayList (java.util.ArrayList)37 Test (org.junit.Test)32 Script (org.apache.sysml.api.mlcontext.Script)31 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)17 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)15 Vector (org.apache.spark.ml.linalg.Vector)12 Tuple2 (scala.Tuple2)6 FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata)5 MLResults (org.apache.sysml.api.mlcontext.MLResults)5 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)5 SparkSession (org.apache.spark.sql.SparkSession)4 DataType (org.apache.spark.sql.types.DataType)4 ValueType (org.apache.sysml.parser.Expression.ValueType)4 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)3 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)3 CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow)3 HashMap (java.util.HashMap)2