Search in sources :

Example 36 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

In the class MLContextUtil, method doesDataFrameLookLikeMatrix.

/**
 * Decide, from the schema alone, whether a DataFrame can be treated as a
 * matrix. Every column must be of type double, integer, or long, or be an
 * ml/mllib vector column; a single column of any other type means the
 * DataFrame is not considered a matrix.
 *
 * @param df
 *            the DataFrame whose schema is inspected
 * @return {@code true} if every column has a matrix-compatible type (or
 *         the schema exposes no field array), {@code false} otherwise
 */
public static boolean doesDataFrameLookLikeMatrix(Dataset<Row> df) {
    StructField[] columns = df.schema().fields();
    if (columns == null) {
        return true;
    }
    for (StructField column : columns) {
        DataType columnType = column.dataType();
        // scalar numeric column types accepted for matrices
        boolean numericColumn = columnType == DataTypes.DoubleType
                || columnType == DataTypes.IntegerType
                || columnType == DataTypes.LongType;
        // vector columns from either the ml or the mllib package
        boolean vectorColumn = columnType instanceof org.apache.spark.ml.linalg.VectorUDT
                || columnType instanceof org.apache.spark.mllib.linalg.VectorUDT;
        if (!numericColumn && !vectorColumn) {
            return false;
        }
    }
    return true;
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) DataType(org.apache.spark.sql.types.DataType)

Example 37 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

In the class FrameTest, method testFrameGeneral.

/**
 * Runs the frame test DML script through MLContext and verifies its two
 * outputs ("AB" and "C") against the output of the matching R script.
 * <p>
 * Inputs "A" and "B" are generated randomly, written with metadata, and
 * handed to the script either as DataFrames or as text RDDs with frame
 * metadata. Outputs "A" and "C" are fetched from the results, saved, read
 * back, and compared with the R results via {@code verifyFrameData}.
 * <p>
 * The static {@code DMLScript} settings changed by this method are restored
 * in a {@code finally} block that covers the whole test body, so a failure
 * during input generation does not leak the settings.
 *
 * @param iinfo
 *            input info used to pick the format of the saved outputs and to
 *            read them back for verification
 * @param oinfo
 *            output info used to write the input frames and to pick the
 *            frame format of the text RDD inputs
 * @param bFromDataFrame
 *            if {@code true}, pass inputs "A" and "B" as DataFrames;
 *            otherwise pass them as text RDDs with frame metadata
 * @param bToDataFrame
 *            if {@code true}, fetch the outputs as DataFrames and save them
 *            as binary blocks; otherwise save them as text
 * @throws IOException
 *             declared for the file-system helpers used by the test
 * @throws DMLException
 *             if an existing output file cannot be deleted or a conversion
 *             fails
 * @throws ParseException
 *             declared for the script/frame helpers used by the test
 */
private void testFrameGeneral(InputInfo iinfo, OutputInfo oinfo, boolean bFromDataFrame, boolean bToDataFrame) throws IOException, DMLException, ParseException {
    boolean oldConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    RUNTIME_PLATFORM oldRT = DMLScript.rtplatform;
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    DMLScript.rtplatform = RUNTIME_PLATFORM.HYBRID_SPARK;
    // Everything after the static settings are modified runs inside the try,
    // so the finally block restores them even if the test setup fails.
    try {
        int rowstart = 234, rowend = 1478, colstart = 125, colend = 568;
        int bRows = rowend - rowstart + 1, bCols = colend - colstart + 1;
        int rowstartC = 124, rowendC = 1178, colstartC = 143, colendC = 368;
        int cRows = rowendC - rowstartC + 1, cCols = colendC - colstartC + 1;
        HashMap<String, ValueType[]> outputSchema = new HashMap<String, ValueType[]>();
        HashMap<String, MatrixCharacteristics> outputMC = new HashMap<String, MatrixCharacteristics>();
        TestConfiguration config = getTestConfiguration(TEST_NAME);
        loadTestConfiguration(config);

        // positional arguments ($1, $2, ...) handed to the DML script
        List<String> proArgs = new ArrayList<String>();
        proArgs.add(input("A"));
        proArgs.add(Integer.toString(rows));
        proArgs.add(Integer.toString(cols));
        proArgs.add(input("B"));
        proArgs.add(Integer.toString(bRows));
        proArgs.add(Integer.toString(bCols));
        proArgs.add(Integer.toString(rowstart));
        proArgs.add(Integer.toString(rowend));
        proArgs.add(Integer.toString(colstart));
        proArgs.add(Integer.toString(colend));
        proArgs.add(output("A"));
        proArgs.add(Integer.toString(rowstartC));
        proArgs.add(Integer.toString(rowendC));
        proArgs.add(Integer.toString(colstartC));
        proArgs.add(Integer.toString(colendC));
        proArgs.add(output("C"));

        fullDMLScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".dml";
        fullRScriptName = SCRIPT_DIR + TEST_DIR + TEST_NAME + ".R";
        rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + rowstart + " " + rowend + " " + colstart + " " + colend + " " + expectedDir() + " " + rowstartC + " " + rowendC + " " + colstartC + " " + colendC;

        // initialize the frame data.
        ValueType[] schema = schemaMixedLarge;
        double[][] A = getRandomMatrix(rows, cols, min, max, sparsity1, 1111);
        writeInputFrameWithMTD("A", A, true, schema, oinfo);

        double[][] B = getRandomMatrix(bRows, bCols, min, max, sparsity2, 2345);
        // B takes the schema of the column range it is written into
        ValueType[] schemaB = new ValueType[bCols];
        for (int i = 0; i < bCols; ++i) {
            schemaB[i] = schema[colstart - 1 + i];
        }
        writeInputFrameWithMTD("B", B, true, schemaB, oinfo);

        // C takes the schema of the column range it is read from
        ValueType[] schemaC = new ValueType[cCols];
        for (int i = 0; i < cCols; ++i) {
            schemaC[i] = schema[colstartC - 1 + i];
        }

        Dataset<Row> dfA = null, dfB = null;
        if (bFromDataFrame) {
            // Create DataFrame for input A
            StructType dfSchemaA = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schema, false);
            JavaRDD<Row> rowRDDA = FrameRDDConverterUtils.csvToRowRDD(sc, input("A"), DataExpression.DEFAULT_DELIM_DELIMITER, schema);
            dfA = spark.createDataFrame(rowRDDA, dfSchemaA);
            // Create DataFrame for input B
            StructType dfSchemaB = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(schemaB, false);
            JavaRDD<Row> rowRDDB = FrameRDDConverterUtils.csvToRowRDD(sc, input("B"), DataExpression.DEFAULT_DELIM_DELIMITER, schemaB);
            dfB = spark.createDataFrame(rowRDDB, dfSchemaB);
        }

        Script script = ScriptFactory.dmlFromFile(fullDMLScriptName);
        String format = (oinfo == OutputInfo.TextCellOutputInfo) ? "text" : "csv";
        if (bFromDataFrame) {
            script.in("A", dfA);
            script.in("B", dfB);
        } else {
            FrameFormat ff = format.equals("text") ? FrameFormat.IJV : FrameFormat.CSV;
            JavaRDD<String> aIn = sc.textFile(input("A"));
            FrameMetadata fmA = new FrameMetadata(ff, new FrameSchema(Arrays.asList(schema)), rows, cols);
            script.in("A", aIn, fmA);
            JavaRDD<String> bIn = sc.textFile(input("B"));
            FrameMetadata fmB = new FrameMetadata(ff, new FrameSchema(Arrays.asList(schemaB)), bRows, bCols);
            script.in("B", bIn, fmB);
        }
        // Output one frame to HDFS and get one as RDD
        // TODO HDFS input/output
        script.out("A", "C");
        // set positional argument values
        for (int argNum = 1; argNum <= proArgs.size(); argNum++) {
            script.in("$" + argNum, proArgs.get(argNum - 1));
        }
        MLResults results = ml.execute(script);

        // script output "A" is saved as file "AB", output "C" as file "C"
        format = (iinfo == InputInfo.TextCellInputInfo) ? "text" : "csv";
        saveFrameOutput(results, "A", output("AB"), format, rows, cols, bFromDataFrame, bToDataFrame);
        saveFrameOutput(results, "C", output("C"), format, cRows, cCols, bFromDataFrame, bToDataFrame);

        runRScript(true);
        outputSchema.put("AB", schema);
        outputMC.put("AB", new MatrixCharacteristics(rows, cols, -1, -1));
        outputSchema.put("C", schemaC);
        outputMC.put("C", new MatrixCharacteristics(cRows, cCols, -1, -1));
        for (String file : config.getOutputFiles()) {
            MatrixCharacteristics md = outputMC.get(file);
            FrameBlock frameBlock = readDMLFrameFromHDFS(file, iinfo, md);
            FrameBlock frameRBlock = readRFrameFromHDFS(file + ".csv", InputInfo.CSVInputInfo, md);
            ValueType[] schemaOut = outputSchema.get(file);
            verifyFrameData(frameBlock, frameRBlock, schemaOut);
            System.out.println("File " + file + " processed successfully.");
        }
        System.out.println("Frame MLContext test completed successfully.");
    } finally {
        DMLScript.rtplatform = oldRT;
        DMLScript.USE_LOCAL_SPARK_CONFIG = oldConfig;
    }
}

/**
 * Saves one frame output of an executed script to {@code fName}, deleting
 * any file already present at that path first.
 *
 * @param results
 *            results of the executed script
 * @param name
 *            name of the script output variable to fetch
 * @param fName
 *            path the output is written to
 * @param format
 *            {@code "text"} to save IJV strings, anything else to save CSV
 *            strings (only used when {@code bToDataFrame} is {@code false})
 * @param numRows
 *            number of rows of the output frame
 * @param numCols
 *            number of columns of the output frame
 * @param bFromDataFrame
 *            flag forwarded to the DataFrame to binary-block conversion
 * @param bToDataFrame
 *            if {@code true}, fetch the output as DataFrame and save it as
 *            binary blocks; otherwise save it as text
 * @throws IOException
 *             declared for the file-system helpers used while saving
 * @throws DMLException
 *             if the existing file cannot be deleted or the conversion
 *             fails
 */
private void saveFrameOutput(String name, MLResults results, String fName, String format, long numRows, long numCols, boolean bFromDataFrame, boolean bToDataFrame) throws IOException, DMLException {
    saveFrameOutput(results, name, fName, format, numRows, numCols, bFromDataFrame, bToDataFrame);
}

/**
 * Same as the overload above with {@code results} as the first argument;
 * this is the variant called by {@code testFrameGeneral}.
 */
private void saveFrameOutput(MLResults results, String name, String fName, String format, long numRows, long numCols, boolean bFromDataFrame, boolean bToDataFrame) throws IOException, DMLException {
    try {
        MapReduceTool.deleteFileIfExistOnHDFS(fName);
    } catch (IOException e) {
        // keep the original exception as the cause so the failure is diagnosable
        throw new DMLRuntimeException("Error: While deleting file on HDFS", e);
    }
    if (!bToDataFrame) {
        JavaRDD<String> lines = format.equals("text") ? results.getJavaRDDStringIJV(name) : results.getJavaRDDStringCSV(name);
        lines.saveAsTextFile(fName);
    } else {
        Dataset<Row> df = results.getDataFrame(name);
        // Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
        MatrixCharacteristics mc = new MatrixCharacteristics(numRows, numCols, -1, -1, -1);
        JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, bFromDataFrame).mapToPair(new LongFrameToLongWritableFrameFunction());
        rddOut.saveAsHadoopFile(fName, LongWritable.class, FrameBlock.class, OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
    }
}
Also used : FrameFormat(org.apache.sysml.api.mlcontext.FrameFormat) StructType(org.apache.spark.sql.types.StructType) HashMap(java.util.HashMap) MLResults(org.apache.sysml.api.mlcontext.MLResults) TestConfiguration(org.apache.sysml.test.integration.TestConfiguration) ArrayList(java.util.ArrayList) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) Script(org.apache.sysml.api.mlcontext.Script) DMLScript(org.apache.sysml.api.DMLScript) ValueType(org.apache.sysml.parser.Expression.ValueType) FrameSchema(org.apache.sysml.api.mlcontext.FrameSchema) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) Row(org.apache.spark.sql.Row) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata)

Example 38 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

In the class MLContextTest, method testDataFrameSumPYDMLDoublesWithNoIDColumn.

/**
 * Sums a 3x3 DataFrame of double columns (no ID column) in a PYDML script
 * and expects the script to print {@code sum: 450.0}.
 */
@Test
public void testDataFrameSumPYDMLDoublesWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, doubles with no ID column");

    // three comma-separated rows, converted to Rows by the mapping function
    List<String> csvLines = new ArrayList<String>();
    for (String csvLine : new String[] { "10,20,30", "40,50,60", "70,80,90" }) {
        csvLines.add(csvLine);
    }
    JavaRDD<Row> rowRdd = sc.parallelize(csvLines).map(new CommaSeparatedValueStringToDoubleArrayRow());

    // schema: three nullable double columns
    List<StructField> columns = new ArrayList<StructField>();
    for (String columnName : new String[] { "C1", "C2", "C3" }) {
        columns.add(DataTypes.createStructField(columnName, DataTypes.DoubleType, true));
    }
    StructType doubleSchema = DataTypes.createStructType(columns);
    Dataset<Row> matrixFrame = spark.createDataFrame(rowRdd, doubleSchema);

    MatrixMetadata metadata = new MatrixMetadata(MatrixFormat.DF_DOUBLES);
    Script sumScript = pydml("print('sum: ' + sum(M))").in("M", matrixFrame, metadata);
    setExpectedStdOut("sum: 450.0");
    ml.execute(sumScript);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Test(org.junit.Test)

Example 39 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

In the class MLContextTest, method testDataFrameSumPYDMLMllibVectorWithIDColumn.

/**
 * Sums a DataFrame consisting of a double ID column and an mllib vector
 * column in a PYDML script and expects the script to print
 * {@code sum: 45.0}.
 */
@Test
public void testDataFrameSumPYDMLMllibVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, mllib vector with ID column");

    // row i (0-based) gets ID i + 1 and the i-th dense vector below
    double[][] vectorValues = { { 1.0, 2.0, 3.0 }, { 4.0, 5.0, 6.0 }, { 7.0, 8.0, 9.0 } };
    List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> idVectorPairs = new ArrayList<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>>();
    for (int i = 0; i < vectorValues.length; i++) {
        Double rowId = (double) (i + 1);
        org.apache.spark.mllib.linalg.Vector rowVector = org.apache.spark.mllib.linalg.Vectors.dense(vectorValues[i]);
        idVectorPairs.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(rowId, rowVector));
    }
    JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> pairRdd = sc.parallelize(idVectorPairs);
    JavaRDD<Row> rowRdd = pairRdd.map(new DoubleMllibVectorRow());

    // schema: ID column followed by a single mllib vector column
    StructField idColumn = DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true);
    StructField vectorColumn = DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true);
    List<StructField> columns = new ArrayList<StructField>();
    columns.add(idColumn);
    columns.add(vectorColumn);
    StructType vectorSchema = DataTypes.createStructType(columns);
    Dataset<Row> matrixFrame = spark.createDataFrame(rowRdd, vectorSchema);

    MatrixMetadata metadata = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script sumScript = pydml("print('sum: ' + sum(M))").in("M", matrixFrame, metadata);
    setExpectedStdOut("sum: 45.0");
    ml.execute(sumScript);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Tuple2(scala.Tuple2) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)

Example 40 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

In the class MLContextTest, method testInputMatrixBlockPYDML.

/**
 * Converts a 3x3 DataFrame of string columns into a {@code MatrixBlock},
 * passes it to a PYDML script as input, and checks that the computed
 * average equals 50.0.
 */
@Test
public void testInputMatrixBlockPYDML() {
    System.out.println("MLContextTest - input MatrixBlock PYDML");

    // three comma-separated rows, converted to Rows by the mapping function
    List<String> csvLines = new ArrayList<String>();
    for (String csvLine : new String[] { "10,20,30", "40,50,60", "70,80,90" }) {
        csvLines.add(csvLine);
    }
    JavaRDD<Row> rowRdd = sc.parallelize(csvLines).map(new CommaSeparatedValueStringToRow());

    // schema: three nullable string columns
    List<StructField> columns = new ArrayList<StructField>();
    for (String columnName : new String[] { "C1", "C2", "C3" }) {
        columns.add(DataTypes.createStructField(columnName, DataTypes.StringType, true));
    }
    StructType stringSchema = DataTypes.createStructType(columns);
    Dataset<Row> sourceFrame = spark.createDataFrame(rowRdd, stringSchema);

    // DataFrame -> Matrix -> MatrixBlock, which is then used as script input
    MatrixBlock inputBlock = new Matrix(sourceFrame).toMatrixBlock();
    Script avgScript = pydml("avg = avg(M)").in("M", inputBlock).out("avg");
    double computedAvg = ml.execute(avgScript).getDouble("avg");
    Assert.assertEquals(50.0, computedAvg, 0.0);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Matrix(org.apache.sysml.api.mlcontext.Matrix) Row(org.apache.spark.sql.Row) Test(org.junit.Test)

Aggregations

StructType (org.apache.spark.sql.types.StructType)56 Row (org.apache.spark.sql.Row)49 StructField (org.apache.spark.sql.types.StructField)48 ArrayList (java.util.ArrayList)43 Test (org.junit.Test)37 Script (org.apache.sysml.api.mlcontext.Script)35 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)17 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)17 DenseVector (org.apache.spark.ml.linalg.DenseVector)15 Vector (org.apache.spark.ml.linalg.Vector)14 SparkSession (org.apache.spark.sql.SparkSession)11 MLResults (org.apache.sysml.api.mlcontext.MLResults)6 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)6 Tuple2 (scala.Tuple2)6 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)5 FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata)5 DataType (org.apache.spark.sql.types.DataType)4 JavaRDD (org.apache.spark.api.java.JavaRDD)3 ValueType (org.apache.sysml.parser.Expression.ValueType)3 CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow)3