Search in sources :

Example 6 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextConversionUtil method determineMatrixFormatIfNeeded.

/**
 * If the MatrixFormat of the DataFrame has not been explicitly specified,
 * attempt to determine the proper MatrixFormat from the DataFrame schema.
 * <p>
 * When the metadata already carries a format, nothing is changed. Otherwise
 * the schema is probed for the {@code RDDConverterUtils.DF_ID_COLUMN}
 * column, and the first data column is checked for being a
 * {@code VectorUDT}; the resulting format is stored in
 * {@code matrixMetadata}.
 * <p>
 * NOTE(review): when an ID column is present, the column at index 1 is
 * inspected, i.e. the ID column is assumed to be the first column of the
 * schema -- confirm against callers.
 *
 * @param dataFrame
 *            the Spark {@code DataFrame}
 * @param matrixMetadata
 *            the matrix metadata; must not be {@code null}
 * @throws MLContextException
 *             if the schema has too few columns to determine a format
 */
public static void determineMatrixFormatIfNeeded(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    if (matrixMetadata.getMatrixFormat() != null) {
        // format was specified explicitly; leave it untouched
        return;
    }
    StructType schema = dataFrame.schema();
    boolean hasID = false;
    try {
        schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
        hasID = true;
    } catch (IllegalArgumentException ignored) {
        // fieldIndex reports a missing column by throwing, so reaching this
        // point simply means the DataFrame has no ID column
    }
    StructField[] fields = schema.fields();
    // with an ID column the first data column is at index 1, otherwise at index 0
    int firstDataColumn = hasID ? 1 : 0;
    if (fields.length <= firstDataColumn) {
        // previously surfaced as an ArrayIndexOutOfBoundsException
        throw new MLContextException("DataFrame format not recognized as an accepted SystemML MatrixFormat");
    }
    boolean isVector = fields[firstDataColumn].dataType() instanceof VectorUDT;
    MatrixFormat mf;
    if (hasID) {
        mf = isVector ? MatrixFormat.DF_VECTOR_WITH_INDEX : MatrixFormat.DF_DOUBLES_WITH_INDEX;
    } else {
        mf = isVector ? MatrixFormat.DF_VECTOR : MatrixFormat.DF_DOUBLES;
    }
    matrixMetadata.setMatrixFormat(mf);
}
Also used : VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType)

Example 7 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class RDDConverterUtilsExt method addIDToDataFrame.

/**
 * Builds a new DataFrame that carries every column of the input plus one
 * additional, non-nullable {@code double} column appended at the end.
 * <p>
 * The rows are paired with their position via {@code zipWithIndex()} and
 * then mapped through {@code AddRowID} (presumably the step that writes the
 * index value into the new column -- defined elsewhere, confirm there).
 *
 * @param df input data frame
 * @param sparkSession the Spark Session used to create the result
 * @param nameOfCol name of the appended index column
 * @return new data frame with the extended schema
 */
public static Dataset<Row> addIDToDataFrame(Dataset<Row> df, SparkSession sparkSession, String nameOfCol) {
    StructField[] existingFields = df.schema().fields();
    int idPosition = existingFields.length;

    // extended schema = all existing fields followed by the index field
    StructField[] extendedFields = new StructField[idPosition + 1];
    System.arraycopy(existingFields, 0, extendedFields, 0, idPosition);
    extendedFields[idPosition] = DataTypes.createStructField(nameOfCol, DataTypes.DoubleType, false);

    JavaRDD<Row> indexedRows = df.rdd().toJavaRDD().zipWithIndex().map(new AddRowID());
    return sparkSession.createDataFrame(indexedRows, new StructType(extendedFields));
}
Also used : StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) Row(org.apache.spark.sql.Row)

Example 8 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextFrameTest method testInputFrameAndMatrixOutputMatrixAndFrame.

/**
 * Feeds a three-column DataFrame (two string columns, one integer column)
 * into {@code transformencode} as a frame, then checks the third column of
 * the encoded matrix and prints both outputs as DataFrames.
 */
@Test
public void testInputFrameAndMatrixOutputMatrixAndFrame() {
    System.out.println("MLContextFrameTest - input frame and matrix, output matrix and frame");

    // input rows: (myID, FeatureName, FeatureValue)
    List<Row> inputRows = Arrays.asList(
        RowFactory.create("Doc1", "Feat1", 10),
        RowFactory.create("Doc1", "Feat2", 20),
        RowFactory.create("Doc2", "Feat1", 31));
    JavaRDD<Row> inputRdd = sc.parallelize(inputRows);

    List<StructField> inputFields = Arrays.asList(
        DataTypes.createStructField("myID", DataTypes.StringType, true),
        DataTypes.createStructField("FeatureName", DataTypes.StringType, true),
        DataTypes.createStructField("FeatureValue", DataTypes.IntegerType, true));
    StructType inputSchema = DataTypes.createStructType(inputFields);
    Dataset<Row> inputFrame = spark.createDataFrame(inputRdd, inputSchema);

    String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: false ,recode: [ myID, FeatureName ]}\");";
    Script emptyScript = dml(dmlString);
    FrameMetadata inputMeta = new FrameMetadata(FrameFormat.CSV, inputFrame.count(), (long) inputFrame.columns().length);
    Script script = emptyScript.in("A", inputFrame, inputMeta).out("tA").out("tAM");
    MLResults results = ml.execute(script);

    // the FeatureValue column is not recoded, so it must come through unchanged
    double[][] encoded = results.getMatrixAs2DDoubleArray("tA");
    double[] expectedValues = { 10.0, 20.0, 31.0 };
    for (int row = 0; row < expectedValues.length; row++) {
        Assert.assertEquals(expectedValues[row], encoded[row][2], 0.0);
    }

    Dataset<Row> encodedDF = results.getMatrix("tA").toDF();
    System.out.println("Number of matrix tA rows = " + encodedDF.count());
    encodedDF.printSchema();
    encodedDF.show();

    Dataset<Row> metaDF = results.getFrame("tAM").toDF();
    System.out.println("Number of frame tAM rows = " + metaDF.count());
    metaDF.printSchema();
    metaDF.show();
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata) Test(org.junit.Test)

Example 9 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextFrameTest method testTransform.

/**
 * Runs {@code transformencode} on a frame whose string column contains
 * quotes and other special characters, with explain output enabled at
 * {@code RECOMPILE_HOPS} level, then checks one cell of the encoded matrix
 * and prints both outputs as DataFrames.
 */
@Test
public void testTransform() {
    System.out.println("MLContextFrameTest - transform");

    // input rows: (featureName, featureValue, id)
    List<Row> inputRows = Arrays.asList(
        RowFactory.create("\"`@(\"(!&", 2, "20news-bydate-train/comp.os.ms-windows.misc/9979"),
        RowFactory.create("\"`@(\"\"(!&\"", 3, "20news-bydate-train/comp.os.ms-windows.misc/9979"));
    JavaRDD<Row> inputRdd = sc.parallelize(inputRows);

    List<StructField> inputFields = Arrays.asList(
        DataTypes.createStructField("featureName", DataTypes.StringType, true),
        DataTypes.createStructField("featureValue", DataTypes.IntegerType, true),
        DataTypes.createStructField("id", DataTypes.StringType, true));
    StructType inputSchema = DataTypes.createStructType(inputFields);
    Dataset<Row> inputFrame = spark.createDataFrame(inputRdd, inputSchema);

    String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: false ,recode: [ featureName, id ]}\");";
    Script emptyScript = dml(dmlString);
    FrameMetadata inputMeta = new FrameMetadata(FrameFormat.CSV, inputFrame.count(), (long) inputFrame.columns().length);
    Script script = emptyScript.in("A", inputFrame, inputMeta).out("tA").out("tAM");

    ml.setExplain(true);
    ml.setExplainLevel(ExplainLevel.RECOMPILE_HOPS);
    MLResults results = ml.execute(script);

    double[][] encoded = results.getMatrixAs2DDoubleArray("tA");
    Assert.assertEquals(1.0, encoded[0][2], 0.0);

    Dataset<Row> encodedDF = results.getMatrix("tA").toDF();
    System.out.println("Number of matrix tA rows = " + encodedDF.count());
    encodedDF.printSchema();
    encodedDF.show();

    Dataset<Row> metaDF = results.getFrame("tAM").toDF();
    System.out.println("Number of frame tAM rows = " + metaDF.count());
    metaDF.printSchema();
    metaDF.show();
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata) Test(org.junit.Test)

Example 10 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextFrameTest method testInputFrameAndMatrixOutputMatrix.

/**
 * Passes a frame (A), a single-column matrix (B) and a scalar (s) into a
 * script that encodes A, multiplies the result by B, scales it by s, and
 * checks the three cells of the resulting matrix M.
 */
@Test
public void testInputFrameAndMatrixOutputMatrix() {
    System.out.println("MLContextFrameTest - input frame and matrix, output matrix");

    // frame input A as CSV lines: (string, double)
    List<String> csvLinesA = Arrays.asList("Test1,4.0", "Test2,5.0", "Test3,6.0");
    JavaRDD<String> stringRddA = sc.parallelize(csvLinesA);
    ValueType[] schema = { ValueType.STRING, ValueType.DOUBLE };

    // matrix input B as CSV lines: one double per row
    List<String> csvLinesB = Arrays.asList("1.0", "2.0");
    JavaRDD<String> stringRddB = sc.parallelize(csvLinesB);

    JavaRDD<Row> rowRddA = FrameRDDConverterUtils.csvToRowRDD(sc, stringRddA, CSV_DELIM, schema);
    JavaRDD<Row> rowRddB = stringRddB.map(new CommaSeparatedValueStringToDoubleArrayRow());

    List<StructField> fieldsOfA = Arrays.asList(
        DataTypes.createStructField("1", DataTypes.StringType, true),
        DataTypes.createStructField("2", DataTypes.DoubleType, true));
    Dataset<Row> frameA = spark.createDataFrame(rowRddA, DataTypes.createStructType(fieldsOfA));

    List<StructField> fieldsOfB = Arrays.asList(
        DataTypes.createStructField("1", DataTypes.DoubleType, true));
    Dataset<Row> frameB = spark.createDataFrame(rowRddB, DataTypes.createStructType(fieldsOfB));

    String dmlString = "[tA, tAM] = transformencode (target = A, spec = \"{ids: true ,recode: [ 1, 2 ]}\");\n"
        + "C = tA %*% B;\n"
        + "M = s * C;";

    // inputs are bound one at a time, in the same order as a single chained call
    Script script = dml(dmlString);
    FrameMetadata metaA = new FrameMetadata(FrameFormat.CSV, frameA.count(), (long) frameA.columns().length);
    script = script.in("A", frameA, metaA);
    MatrixMetadata metaB = new MatrixMetadata(MatrixFormat.CSV, frameB.count(), (long) frameB.columns().length);
    script = script.in("B", frameB, metaB);
    script = script.in("s", 2).out("M");

    MLResults results = ml.execute(script);
    double[][] product = results.getMatrixAs2DDoubleArray("M");
    double[] expectedColumn = { 6.0, 12.0, 18.0 };
    for (int row = 0; row < expectedColumn.length; row++) {
        Assert.assertEquals(expectedColumn[row], product[row][0], 0.0);
    }
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MLResults(org.apache.sysml.api.mlcontext.MLResults) ArrayList(java.util.ArrayList) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) CommaSeparatedValueStringToDoubleArrayRow(org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) FrameMetadata(org.apache.sysml.api.mlcontext.FrameMetadata) Test(org.junit.Test)

Aggregations

StructType (org.apache.spark.sql.types.StructType): 44, StructField (org.apache.spark.sql.types.StructField): 40, Row (org.apache.spark.sql.Row): 38, ArrayList (java.util.ArrayList): 37, Test (org.junit.Test): 32, Script (org.apache.sysml.api.mlcontext.Script): 31, MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 17, VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 15, Vector (org.apache.spark.ml.linalg.Vector): 12, Tuple2 (scala.Tuple2): 6, FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata): 5, MLResults (org.apache.sysml.api.mlcontext.MLResults): 5, DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 5, SparkSession (org.apache.spark.sql.SparkSession): 4, DataType (org.apache.spark.sql.types.DataType): 4, ValueType (org.apache.sysml.parser.Expression.ValueType): 4, JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 3, MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 3, CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow): 3, HashMap (java.util.HashMap): 2