
Example 26 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

From class MLContextTest, method testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified:

@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");
    // build three dense vectors as the input data
    List<Vector> list = new ArrayList<Vector>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    // wrap each vector as a one-column Row and define a matching vector schema
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    // bind the DataFrame to M; with no format specified, SystemML infers it
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used: Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)
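
The VectorRow mapper used above is defined elsewhere in MLContextTest and is not shown on this page. A minimal sketch of what such a mapper plausibly looks like, assuming it does nothing more than wrap each vector as the single column of a Row (the body below is illustrative, not the test's actual source):

import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Illustrative stand-in for the VectorRow mapper referenced above: each
// input Vector becomes a one-column Row, matching the single C1 vector column.
public class VectorRow implements Function<Vector, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(Vector v) throws Exception {
        return RowFactory.create(v);
    }
}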

Example 27 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

From class DataFrameVectorFrameConversionTest, method createDataFrame:

@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // create in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            // 1-based row index stored as a double in the leading ID column
            row[0] = (double) i + 1;
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                // an OBJECT column carries a dense vector that spans colsVector matrix columns
                double[] tmp = DataConverter.convertToDoubleVector(mb.slice(i, i, j, j + colsVector - 1, new MatrixBlock()), false);
                row[j2 + off] = new DenseVector(tmp);
                // skip the matrix columns consumed by this vector
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }
    // create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch(schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    // create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Also used: MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)
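
For context, a hedged sketch of how a helper like this might be invoked. colsVector is a class field (the width of the embedded vector column) that does not appear on this page; the value 3 assumed below, the MatrixBlock contents, and the variable names are illustrative, not taken from the test:

// Hypothetical usage (not from the test source): a 4 x 5 dense MatrixBlock
// becomes a DataFrame with an ID column, two double columns, and one
// vector column, assuming colsVector = 3.
MatrixBlock mb = new MatrixBlock(4, 5, false);
for (int i = 0; i < 4; i++)
    for (int j = 0; j < 5; j++)
        mb.quickSetValue(i, j, i * 5 + j);
ValueType[] schema = new ValueType[] { ValueType.DOUBLE, ValueType.DOUBLE, ValueType.OBJECT };
Dataset<Row> df = createDataFrame(sparkSession, mb, true, schema);
// expected columns: the RDDConverterUtils.DF_ID_COLUMN id column, then C1, C2 (doubles) and C3 (vector)
df.printSchema();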

Example 28 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

From class DataFrameVectorScriptTest, method createDataFrame:

@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // create in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1;
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                double[] tmp = DataConverter.convertToDoubleVector(mb.slice(i, i, j, j + colsVector - 1, new MatrixBlock()), false);
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }
    // create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch(schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    // create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Also used: MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)
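
DataFrameVectorScriptTest then feeds the converted DataFrame into a DML script. A minimal sketch of that pattern, reusing the MLContext API already shown in Example 26 (the script body and variable names here are illustrative):

// Hypothetical follow-up (not from the test source): bind the converted
// DataFrame to a DML variable and run a trivial script against it.
Dataset<Row> df = createDataFrame(sparkSession, mb, true, schema);
Script script = dml("print('dims: ' + nrow(M) + ' x ' + ncol(M));").in("M", df);
ml.execute(script);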

Example 29 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

From class MLContextFrameTest, method testInputFrameAndMatrixOutputMatrixAndFrame:

@Test
public void testInputFrameAndMatrixOutputMatrixAndFrame() {
    System.out.println("MLContextFrameTest - input frame and matrix, output matrix and frame");
    // three (id, feature, value) triples in long format
    Row[] rowsA = { RowFactory.create("Doc1", "Feat1", 10), RowFactory.create("Doc1", "Feat2", 20), RowFactory.create("Doc2", "Feat1", 31) };
    JavaRDD<Row> javaRddRowA = sc.parallelize(Arrays.asList(rowsA));
    List<StructField> fieldsA = new ArrayList<StructField>();
    fieldsA.add(DataTypes.createStructField("myID", DataTypes.StringType, true));
    fieldsA.add(DataTypes.createStructField("FeatureName", DataTypes.StringType, true));
    fieldsA.add(DataTypes.createStructField("FeatureValue", DataTypes.IntegerType, true));
    StructType schemaA = DataTypes.createStructType(fieldsA);
    Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, schemaA);
    // recode the two string columns; tA is the encoded matrix, tAM the recode-map frame
    String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: false ,recode: [ myID, FeatureName ]}\");";
    Script script = dml(dmlString).in("A", dataFrameA, new FrameMetadata(FrameFormat.CSV, dataFrameA.count(), (long) dataFrameA.columns().length)).out("tA").out("tAM");
    MLResults results = ml.execute(script);
    double[][] matrixtA = results.getMatrixAs2DDoubleArray("tA");
    Assert.assertEquals(10.0, matrixtA[0][2], 0.0);
    Assert.assertEquals(20.0, matrixtA[1][2], 0.0);
    Assert.assertEquals(31.0, matrixtA[2][2], 0.0);
    Dataset<Row> dataFrame_tA = results.getMatrix("tA").toDF();
    System.out.println("Number of matrix tA rows = " + dataFrame_tA.count());
    dataFrame_tA.printSchema();
    dataFrame_tA.show();
    Dataset<Row> dataFrame_tAM = results.getFrame("tAM").toDF();
    System.out.println("Number of frame tAM rows = " + dataFrame_tAM.count());
    dataFrame_tAM.printSchema();
    dataFrame_tAM.show();
}
Also used: Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata) Test(org.junit.Test)
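
The tAM frame returned by transformencode is the recode map, i.e. the string-to-code dictionary built for the myID and FeatureName columns. A small hedged addition (not part of the original test) showing one way to inspect it beyond show():

// Hypothetical inspection (not from the test source): collect the recode
// map locally and print each token-to-code mapping row.
for (Row r : dataFrame_tAM.collectAsList()) {
    System.out.println("recode entry: " + r);
}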

Example 30 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

From class MLContextFrameTest, method testTransform:

@Test
public void testTransform() {
    System.out.println("MLContextFrameTest - transform");
    Row[] rowsA = { RowFactory.create("\"`@(\"(!&", 2, "20news-bydate-train/comp.os.ms-windows.misc/9979"), RowFactory.create("\"`@(\"\"(!&\"", 3, "20news-bydate-train/comp.os.ms-windows.misc/9979") };
    JavaRDD<Row> javaRddRowA = sc.parallelize(Arrays.asList(rowsA));
    List<StructField> fieldsA = new ArrayList<StructField>();
    fieldsA.add(DataTypes.createStructField("featureName", DataTypes.StringType, true));
    fieldsA.add(DataTypes.createStructField("featureValue", DataTypes.IntegerType, true));
    fieldsA.add(DataTypes.createStructField("id", DataTypes.StringType, true));
    StructType schemaA = DataTypes.createStructType(fieldsA);
    Dataset<Row> dataFrameA = spark.createDataFrame(javaRddRowA, schemaA);
    // recode featureName and id; tA is the encoded matrix, tAM the recode-map frame
    String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: false ,recode: [ featureName, id ]}\");";
    Script script = dml(dmlString).in("A", dataFrameA, new FrameMetadata(FrameFormat.CSV, dataFrameA.count(), (long) dataFrameA.columns().length)).out("tA").out("tAM");
    // print SystemML explain output (recompiled HOP plans) during execution
    ml.setExplain(true);
    ml.setExplainLevel(ExplainLevel.RECOMPILE_HOPS);
    MLResults results = ml.execute(script);
    double[][] matrixtA = results.getMatrixAs2DDoubleArray("tA");
    Assert.assertEquals(1.0, matrixtA[0][2], 0.0);
    Dataset<Row> dataFrame_tA = results.getMatrix("tA").toDF();
    System.out.println("Number of matrix tA rows = " + dataFrame_tA.count());
    dataFrame_tA.printSchema();
    dataFrame_tA.show();
    Dataset<Row> dataFrame_tAM = results.getFrame("tAM").toDF();
    System.out.println("Number of frame tAM rows = " + dataFrame_tAM.count());
    dataFrame_tAM.printSchema();
    dataFrame_tAM.show();
}
Also used: Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata) Test(org.junit.Test)
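
This test also enables SystemML's explain output before executing. A hedged sketch of the toggle pattern; RECOMPILE_HOPS is confirmed by the test above, while the other level names mentioned in the comment are recalled from the MLContext API and should be verified against the SystemML version in use:

// Enable explain output for one run, then switch it back off.
// ExplainLevel.RECOMPILE_HOPS appears in the test; levels such as HOPS or
// RUNTIME are assumptions to check against your SystemML version.
ml.setExplain(true);
ml.setExplainLevel(ExplainLevel.RECOMPILE_HOPS);
ml.execute(script);
ml.setExplain(false);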

Aggregations

StructField (org.apache.spark.sql.types.StructField)52 StructType (org.apache.spark.sql.types.StructType)48 Row (org.apache.spark.sql.Row)45 ArrayList (java.util.ArrayList)43 Test (org.junit.Test)37 Script (org.apache.sysml.api.mlcontext.Script)34 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)20 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)17 DenseVector (org.apache.spark.ml.linalg.DenseVector)15 Vector (org.apache.spark.ml.linalg.Vector)15 Tuple2 (scala.Tuple2)7 SparkSession (org.apache.spark.sql.SparkSession)6 DataType (org.apache.spark.sql.types.DataType)5 MLResults (org.apache.sysml.api.mlcontext.MLResults)5 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)5 FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata)4 CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow)4 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)3 JavaRDD (org.apache.spark.api.java.JavaRDD)2 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)2