Search in sources:

Example 91 with Row

use of org.apache.spark.sql.Row in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumPYDMLMllibVectorWithNoIDColumn.

/**
 * Passes a DataFrame whose only column ("C1") holds mllib vectors, with no ID
 * column, to a PYDML script that prints the sum of the matrix, and registers
 * "sum: 45.0" as the expected standard output.
 */
@Test
public void testDataFrameSumPYDMLMllibVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, mllib vector with no ID column");

    // Schema: a single nullable vector column named "C1".
    List<StructField> columns = new ArrayList<>();
    columns.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType vectorSchema = DataTypes.createStructType(columns);

    // Three dense vectors holding the values 1.0 through 9.0.
    List<org.apache.spark.mllib.linalg.Vector> vectors = new ArrayList<>();
    vectors.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
    vectors.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
    vectors.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));

    JavaRDD<Row> vectorRows = sc.parallelize(vectors).map(new MllibVectorRow());
    Dataset<Row> vectorFrame = spark.createDataFrame(vectorRows, vectorSchema);

    setExpectedStdOut("sum: 45.0");
    Script sumScript = pydml("print('sum: ' + sum(M))")
            .in("M", vectorFrame, new MatrixMetadata(MatrixFormat.DF_VECTOR));
    ml.execute(sumScript);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)

Example 92 with Row

use of org.apache.spark.sql.Row in project incubator-systemml by apache.

the class MLContextTest method testOutputDataFramePYDMLDoublesWithIDColumn.

/**
 * Builds a 2x2 matrix in PYDML, retrieves it as a DataFrame of doubles with an
 * ID column, and checks every cell of the two collected rows. Position 0 of
 * each row is the ID value; positions 1 and 2 are the matrix cells.
 */
@Test
public void testOutputDataFramePYDMLDoublesWithIDColumn() {
    System.out.println("MLContextTest - output DataFrame PYDML, doubles with ID column");

    MLResults results = ml.execute(pydml("M = full('1 2 3 4', rows=2, cols=2)").out("M"));
    List<Row> rows = results.getDataFrameDoubleWithIDColumn("M").collectAsList();

    // Expected content per row: { id, first cell, second cell }.
    double[][] expected = { { 1.0, 1.0, 2.0 }, { 2.0, 3.0, 4.0 } };
    for (int r = 0; r < expected.length; r++) {
        Row actual = rows.get(r);
        for (int c = 0; c < expected[r].length; c++) {
            Assert.assertEquals(expected[r][c], actual.getDouble(c), 0.0);
        }
    }
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) MLResults(org.apache.sysml.api.mlcontext.MLResults) Row(org.apache.spark.sql.Row) Test(org.junit.Test)

Example 93 with Row

use of org.apache.spark.sql.Row in project incubator-systemml by apache.

the class MLContextTest method testDataFrameGoodMetadataPYDML.

/**
 * Supplies a DataFrame with three double columns together with matrix metadata
 * constructed as {@code new MatrixMetadata(3, 3, 9)} to a PYDML script that
 * prints the sum, and registers "sum: 450.0" as the expected standard output.
 */
@Test
public void testDataFrameGoodMetadataPYDML() {
    System.out.println("MLContextTest - DataFrame good metadata PYDML");

    // Schema: nullable double columns C1, C2 and C3.
    List<StructField> columns = new ArrayList<>();
    for (String columnName : new String[] { "C1", "C2", "C3" }) {
        columns.add(DataTypes.createStructField(columnName, DataTypes.DoubleType, true));
    }
    StructType doubleSchema = DataTypes.createStructType(columns);

    // One comma-separated line per row.
    List<String> csvLines = new ArrayList<>();
    csvLines.add("10,20,30");
    csvLines.add("40,50,60");
    csvLines.add("70,80,90");

    JavaRDD<Row> doubleRows = sc.parallelize(csvLines).map(new CommaSeparatedValueStringToDoubleArrayRow());
    Dataset<Row> doubleFrame = spark.createDataFrame(doubleRows, doubleSchema);

    setExpectedStdOut("sum: 450.0");
    Script sumScript = pydml("print('sum: ' + sum(M))")
            .in("M", doubleFrame, new MatrixMetadata(3, 3, 9));
    ml.execute(sumScript);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Test(org.junit.Test)

Example 94 with Row

use of org.apache.spark.sql.Row in project incubator-systemml by apache.

the class MLContextFrameTest method testFrame.

/**
 * Shared driver for the frame tests: feeds two frames A (3 rows x 4 columns)
 * and B (2 rows x 3 columns) to a script that overwrites a region of A with B
 * and extracts a sub-frame C, then validates the output schema and the contents
 * of A and C in the requested output representation.
 *
 * @param format
 *            CSV or IJV; selects the in-memory input lines and metadata, or the
 *            file extension when reading from files
 * @param script_type
 *            DML or PYDML; selects which script text is built
 * @param inputType
 *            how A and B are supplied (file, DataFrame, JavaRDD of strings or
 *            RDD of strings)
 * @param outputType
 *            how A and C are retrieved for validation; any value not handled by
 *            an explicit branch falls back to the 2D string array check
 */
public void testFrame(FrameFormat format, SCRIPT_TYPE script_type, IO_TYPE inputType, IO_TYPE outputType) {
    System.out.println("MLContextTest - Frame JavaRDD<String> for format: " + format + " Script: " + script_type);
    List<String> listA = new ArrayList<String>();
    List<String> listB = new ArrayList<String>();
    FrameMetadata fmA = null, fmB = null;
    // NOTE(review): script stays null if no branch below matches the given
    // arguments, and is then passed to ml.execute as null.
    Script script = null;
    // Column value types of A (4 columns) and B (3 columns).
    ValueType[] schemaA = { ValueType.INT, ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaA = Arrays.asList(schemaA);
    FrameSchema fschemaA = new FrameSchema(lschemaA);
    ValueType[] schemaB = { ValueType.STRING, ValueType.DOUBLE, ValueType.BOOLEAN };
    List<ValueType> lschemaB = Arrays.asList(schemaB);
    FrameSchema fschemaB = new FrameSchema(lschemaB);
    if (inputType != IO_TYPE.FILE) {
        // In-memory input: build the text lines for A and B in the requested format.
        if (format == FrameFormat.CSV) {
            listA.add("1,Str2,3.0,true");
            listA.add("4,Str5,6.0,false");
            listA.add("7,Str8,9.0,true");
            listB.add("Str12,13.0,true");
            listB.add("Str25,26.0,false");
            fmA = new FrameMetadata(FrameFormat.CSV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.CSV, fschemaB, 2, 3);
        } else if (format == FrameFormat.IJV) {
            // One "row column value" triple per line.
            listA.add("1 1 1");
            listA.add("1 2 Str2");
            listA.add("1 3 3.0");
            listA.add("1 4 true");
            listA.add("2 1 4");
            listA.add("2 2 Str5");
            listA.add("2 3 6.0");
            listA.add("2 4 false");
            listA.add("3 1 7");
            listA.add("3 2 Str8");
            listA.add("3 3 9.0");
            listA.add("3 4 true");
            listB.add("1 1 Str12");
            listB.add("1 2 13.0");
            listB.add("1 3 true");
            listB.add("2 1 Str25");
            listB.add("2 2 26.0");
            listB.add("2 3 false");
            fmA = new FrameMetadata(FrameFormat.IJV, fschemaA, 3, 4);
            fmB = new FrameMetadata(FrameFormat.IJV, fschemaB, 2, 3);
        }
        JavaRDD<String> javaRDDA = sc.parallelize(listA);
        JavaRDD<String> javaRDDB = sc.parallelize(listB);
        if (inputType == IO_TYPE.DATAFRAME) {
            // NOTE(review): the lines are always converted with csvToRowRDD here,
            // regardless of format; confirm callers only combine DATAFRAME input
            // with CSV.
            JavaRDD<Row> javaRddRowA = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDA, CSV_DELIM, schemaA);
            JavaRDD<Row> javaRddRowB = FrameRDDConverterUtils.csvToRowRDD(sc, javaRDDB, CSV_DELIM, schemaB);
            // Create DataFrame
            StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaA, false);
            Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, dfSchemaA);
            StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
            Dataset<Row> dataFrameB = spark.createDataFrame(javaRddRowB, dfSchemaB);
            if (script_type == SCRIPT_TYPE.DML)
                script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).out("A").out("C");
            else if (script_type == SCRIPT_TYPE.PYDML)
                // DO NOT USE ; at the end of any statement, it throws NPE
                script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", dataFrameA, fmA).in("B", dataFrameB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
        } else {
            if (inputType == IO_TYPE.JAVA_RDD_STR_CSV || inputType == IO_TYPE.JAVA_RDD_STR_IJV) {
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ; at the end of any statement, it throws
                    // NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", javaRDDA, fmA).in("B", javaRDDB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            } else if (inputType == IO_TYPE.RDD_STR_CSV || inputType == IO_TYPE.RDD_STR_IJV) {
                RDD<String> rddA = JavaRDD.toRDD(javaRDDA);
                RDD<String> rddB = JavaRDD.toRDD(javaRDDB);
                if (script_type == SCRIPT_TYPE.DML)
                    script = dml("A[2:3,2:4]=B;C=A[2:3,2:3]").in("A", rddA, fmA).in("B", rddB, fmB).out("A").out("C");
                else if (script_type == SCRIPT_TYPE.PYDML)
                    // DO NOT USE ; at the end of any statement, it throws
                    // NPE
                    script = pydml("A[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("A", rddA, fmA).in("B", rddB, fmB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
            }
        }
    } else {
        // Input type is file
        String fileA = null, fileB = null;
        if (format == FrameFormat.CSV) {
            fileA = baseDirectory + File.separator + "FrameA.csv";
            fileB = baseDirectory + File.separator + "FrameB.csv";
        } else if (format == FrameFormat.IJV) {
            fileA = baseDirectory + File.separator + "FrameA.ijv";
            fileB = baseDirectory + File.separator + "FrameB.ijv";
        }
        // NOTE(review): fmA and fmB are only assigned on the non-file path, so
        // the DML variant below passes null metadata; the PYDML variant passes
        // no metadata at all. The DML script also ends with A[1,1]=234, which
        // the in-memory scripts do not.
        if (script_type == SCRIPT_TYPE.DML)
            script = dml("A=read($A); B=read($B);A[2:3,2:4]=B;C=A[2:3,2:3];A[1,1]=234").in("$A", fileA, fmA).in("$B", fileB, fmB).out("A").out("C");
        else if (script_type == SCRIPT_TYPE.PYDML)
            // DO NOT USE ; at the end of any statement, it throws NPE
            script = pydml("A=load($A)\nB=load($B)\nA[$X:$Y,$X:$Z]=B\nC=A[$X:$Y,$X:$Y]").in("$A", fileA).in("$B", fileB).in("$X", 1).in("$Y", 3).in("$Z", 4).out("A").out("C");
    }
    MLResults mlResults = ml.execute(script);
    // Validate output schema
    List<ValueType> lschemaOutA = Arrays.asList(mlResults.getFrameObject("A").getSchema());
    List<ValueType> lschemaOutC = Arrays.asList(mlResults.getFrameObject("C").getSchema());
    Assert.assertEquals(ValueType.INT, lschemaOutA.get(0));
    Assert.assertEquals(ValueType.STRING, lschemaOutA.get(1));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutA.get(2));
    Assert.assertEquals(ValueType.BOOLEAN, lschemaOutA.get(3));
    Assert.assertEquals(ValueType.STRING, lschemaOutC.get(0));
    Assert.assertEquals(ValueType.DOUBLE, lschemaOutC.get(1));
    // Validate output contents in the requested representation.
    if (outputType == IO_TYPE.JAVA_RDD_STR_CSV) {
        JavaRDD<String> javaRDDStringCSVA = mlResults.getJavaRDDStringCSV("A");
        List<String> linesA = javaRDDStringCSVA.collect();
        Assert.assertEquals("1,Str2,3.0,true", linesA.get(0));
        Assert.assertEquals("4,Str12,13.0,true", linesA.get(1));
        Assert.assertEquals("7,Str25,26.0,false", linesA.get(2));
        JavaRDD<String> javaRDDStringCSVC = mlResults.getJavaRDDStringCSV("C");
        List<String> linesC = javaRDDStringCSVC.collect();
        Assert.assertEquals("Str12,13.0", linesC.get(0));
        Assert.assertEquals("Str25,26.0", linesC.get(1));
    } else if (outputType == IO_TYPE.JAVA_RDD_STR_IJV) {
        // NOTE(review): only the first 8 IJV lines of A (rows 1 and 2) are
        // checked here, while the RDD_STR_IJV branch checks all 12.
        JavaRDD<String> javaRDDStringIJVA = mlResults.getJavaRDDStringIJV("A");
        List<String> linesA = javaRDDStringIJVA.collect();
        Assert.assertEquals("1 1 1", linesA.get(0));
        Assert.assertEquals("1 2 Str2", linesA.get(1));
        Assert.assertEquals("1 3 3.0", linesA.get(2));
        Assert.assertEquals("1 4 true", linesA.get(3));
        Assert.assertEquals("2 1 4", linesA.get(4));
        Assert.assertEquals("2 2 Str12", linesA.get(5));
        Assert.assertEquals("2 3 13.0", linesA.get(6));
        Assert.assertEquals("2 4 true", linesA.get(7));
        JavaRDD<String> javaRDDStringIJVC = mlResults.getJavaRDDStringIJV("C");
        List<String> linesC = javaRDDStringIJVC.collect();
        Assert.assertEquals("1 1 Str12", linesC.get(0));
        Assert.assertEquals("1 2 13.0", linesC.get(1));
        Assert.assertEquals("2 1 Str25", linesC.get(2));
        Assert.assertEquals("2 2 26.0", linesC.get(3));
    } else if (outputType == IO_TYPE.RDD_STR_CSV) {
        RDD<String> rddStringCSVA = mlResults.getRDDStringCSV("A");
        Iterator<String> iteratorA = rddStringCSVA.toLocalIterator();
        Assert.assertEquals("1,Str2,3.0,true", iteratorA.next());
        Assert.assertEquals("4,Str12,13.0,true", iteratorA.next());
        Assert.assertEquals("7,Str25,26.0,false", iteratorA.next());
        RDD<String> rddStringCSVC = mlResults.getRDDStringCSV("C");
        Iterator<String> iteratorC = rddStringCSVC.toLocalIterator();
        Assert.assertEquals("Str12,13.0", iteratorC.next());
        Assert.assertEquals("Str25,26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.RDD_STR_IJV) {
        RDD<String> rddStringIJVA = mlResults.getRDDStringIJV("A");
        Iterator<String> iteratorA = rddStringIJVA.toLocalIterator();
        Assert.assertEquals("1 1 1", iteratorA.next());
        Assert.assertEquals("1 2 Str2", iteratorA.next());
        Assert.assertEquals("1 3 3.0", iteratorA.next());
        Assert.assertEquals("1 4 true", iteratorA.next());
        Assert.assertEquals("2 1 4", iteratorA.next());
        Assert.assertEquals("2 2 Str12", iteratorA.next());
        Assert.assertEquals("2 3 13.0", iteratorA.next());
        Assert.assertEquals("2 4 true", iteratorA.next());
        Assert.assertEquals("3 1 7", iteratorA.next());
        Assert.assertEquals("3 2 Str25", iteratorA.next());
        Assert.assertEquals("3 3 26.0", iteratorA.next());
        Assert.assertEquals("3 4 false", iteratorA.next());
        RDD<String> rddStringIJVC = mlResults.getRDDStringIJV("C");
        Iterator<String> iteratorC = rddStringIJVC.toLocalIterator();
        Assert.assertEquals("1 1 Str12", iteratorC.next());
        Assert.assertEquals("1 2 13.0", iteratorC.next());
        Assert.assertEquals("2 1 Str25", iteratorC.next());
        Assert.assertEquals("2 2 26.0", iteratorC.next());
    } else if (outputType == IO_TYPE.DATAFRAME) {
        // The ID column is dropped so positions 0..3 line up with A's four columns.
        Dataset<Row> dataFrameA = mlResults.getDataFrame("A").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaA = dataFrameA.schema();
        StructField structTypeA = dfschemaA.apply(0);
        Assert.assertEquals(DataTypes.LongType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(1);
        Assert.assertEquals(DataTypes.StringType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(2);
        Assert.assertEquals(DataTypes.DoubleType, structTypeA.dataType());
        structTypeA = dfschemaA.apply(3);
        Assert.assertEquals(DataTypes.BooleanType, structTypeA.dataType());
        // Only the first two rows of A are checked in this branch.
        List<Row> listAOut = dataFrameA.collectAsList();
        Row row1 = listAOut.get(0);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(1), row1.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str2", row1.get(1));
        Assert.assertEquals("Mismatch with expected value", 3.0, row1.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row1.get(3));
        Row row2 = listAOut.get(1);
        Assert.assertEquals("Mismatch with expected value", Long.valueOf(4), row2.get(0));
        Assert.assertEquals("Mismatch with expected value", "Str12", row2.get(1));
        Assert.assertEquals("Mismatch with expected value", 13.0, row2.get(2));
        Assert.assertEquals("Mismatch with expected value", true, row2.get(3));
        Dataset<Row> dataFrameC = mlResults.getDataFrame("C").drop(RDDConverterUtils.DF_ID_COLUMN);
        StructType dfschemaC = dataFrameC.schema();
        StructField structTypeC = dfschemaC.apply(0);
        Assert.assertEquals(DataTypes.StringType, structTypeC.dataType());
        structTypeC = dfschemaC.apply(1);
        Assert.assertEquals(DataTypes.DoubleType, structTypeC.dataType());
        List<Row> listCOut = dataFrameC.collectAsList();
        Row row3 = listCOut.get(0);
        Assert.assertEquals("Mismatch with expected value", "Str12", row3.get(0));
        Assert.assertEquals("Mismatch with expected value", 13.0, row3.get(1));
        Row row4 = listCOut.get(1);
        Assert.assertEquals("Mismatch with expected value", "Str25", row4.get(0));
        Assert.assertEquals("Mismatch with expected value", 26.0, row4.get(1));
    } else {
        // Fallback for every other output type: spot-check selected cells of
        // the 2D string array representation.
        String[][] frameA = mlResults.getFrameAs2DStringArray("A");
        Assert.assertEquals("Str2", frameA[0][1]);
        Assert.assertEquals("3.0", frameA[0][2]);
        Assert.assertEquals("13.0", frameA[1][2]);
        Assert.assertEquals("true", frameA[1][3]);
        Assert.assertEquals("Str25", frameA[2][1]);
        String[][] frameC = mlResults.getFrameAs2DStringArray("C");
        Assert.assertEquals("Str12", frameC[0][0]);
        Assert.assertEquals("Str25", frameC[1][0]);
        Assert.assertEquals("13.0", frameC[0][1]);
        Assert.assertEquals("26.0", frameC[1][1]);
    }
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) FrameSchema(org.apache.sysml.api.mlcontext.FrameSchema) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) RDD(org.apache.spark.rdd.RDD) StructField(org.apache.spark.sql.types.StructField) Iterator(scala.collection.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata)

Example 95 with Row

use of org.apache.spark.sql.Row in project incubator-systemml by apache.

the class FrameRDDConverterUtils method binaryBlockToDataFrame.

/**
 * Converts a binary-block frame RDD into a Spark DataFrame.
 *
 * @param sparkSession
 *            session used to create the DataFrame
 * @param in
 *            pair RDD of frame blocks to convert
 * @param mc
 *            characteristics of the frame; the column count must be known
 * @param schema
 *            value type per column, or {@code null} to treat every column as
 *            {@code ValueType.STRING}
 * @return the DataFrame built from the converted rows and the derived schema
 * @throws RuntimeException
 *             if {@code mc} does not know the number of columns
 */
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession, JavaPairRDD<Long, FrameBlock> in, MatrixCharacteristics mc, ValueType[] schema) {
    if (!mc.colsKnown()) {
        throw new RuntimeException("Number of columns needed to convert binary block to data frame.");
    }

    // Turn the binary blocks into an RDD of rows.
    JavaRDD<Row> rows = in.flatMap(new BinaryBlockToDataFrameFunction());

    // Without a caller-supplied schema, fall back to one STRING entry per column.
    ValueType[] columnTypes = schema;
    if (columnTypes == null) {
        columnTypes = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
    }
    StructType frameSchema = convertFrameSchemaToDFSchema(columnTypes, true);

    // Combine rows and schema into the resulting DataFrame.
    return sparkSession.createDataFrame(rows, frameSchema);
}
Also used : DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) StructType(org.apache.spark.sql.types.StructType) Row(org.apache.spark.sql.Row)

Aggregations

Row (org.apache.spark.sql.Row)129 Test (org.junit.Test)60 Script (org.apache.sysml.api.mlcontext.Script)53 StructType (org.apache.spark.sql.types.StructType)50 ArrayList (java.util.ArrayList)48 StructField (org.apache.spark.sql.types.StructField)46 SparkSession (org.apache.spark.sql.SparkSession)43 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)19 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)19 MLResults (org.apache.sysml.api.mlcontext.MLResults)18 DenseVector (org.apache.spark.ml.linalg.DenseVector)16 Vector (org.apache.spark.ml.linalg.Vector)16 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)15 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)12 SQLContext (org.apache.spark.sql.SQLContext)12 User (uk.gov.gchq.gaffer.user.User)12 HashSet (java.util.HashSet)10 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)9 Tuple2 (scala.Tuple2)9 GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements)9