Search in sources :

Example 41 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextTest method testDataFrameGoodMetadataPYDML.

/**
 * Passes a DataFrame of doubles, accompanied by an explicitly constructed
 * {@code MatrixMetadata(3, 3, 9)}, to a PYDML script and expects the sum of
 * all cells to be printed.
 */
@Test
public void testDataFrameGoodMetadataPYDML() {
    System.out.println("MLContextTest - DataFrame good metadata PYDML");

    // One comma-separated line per row of input data.
    List<String> csvLines = new ArrayList<>();
    for (String line : new String[] { "10,20,30", "40,50,60", "70,80,90" }) {
        csvLines.add(line);
    }
    JavaRDD<Row> rows = sc.parallelize(csvLines).map(new CommaSeparatedValueStringToDoubleArrayRow());

    // Schema: DoubleType columns C1, C2 and C3, in that order.
    List<StructField> columns = new ArrayList<>();
    for (String columnName : new String[] { "C1", "C2", "C3" }) {
        columns.add(DataTypes.createStructField(columnName, DataTypes.DoubleType, true));
    }
    Dataset<Row> dataFrame = spark.createDataFrame(rows, DataTypes.createStructType(columns));

    MatrixMetadata metadata = new MatrixMetadata(3, 3, 9);
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, metadata);

    // 10 + 20 + ... + 90 = 450
    setExpectedStdOut("sum: 450.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Test(org.junit.Test)

Example 42 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified.

/**
 * Passes a DataFrame consisting of an ID column and a vector column to a
 * PYDML script without supplying any matrix metadata, and expects the sum
 * of the vector values to be printed.
 */
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");

    // Each tuple pairs an ID value with a dense vector of row values.
    List<Tuple2<Double, Vector>> list = new ArrayList<>();
    list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());

    // Schema: the ID column first, followed by a single vector-typed column.
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);

    // Build the script with the PYDML factory so that this test actually
    // exercises PYDML, as its name and log message state.
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame);

    // 1 + 2 + ... + 9 = 45, i.e. the sum of the vector values only.
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Tuple2(scala.Tuple2) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector) Test(org.junit.Test)

Example 43 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumPYDMLDoublesWithIDColumnSortCheck.

/**
 * Passes a DataFrame whose rows are supplied with IDs in the order 3, 1, 2
 * to a PYDML script using {@code MatrixFormat.DF_DOUBLES_WITH_INDEX}, and
 * expects cell {@code M[0,0]} to print as 1.0.
 */
@Test
public void testDataFrameSumPYDMLDoublesWithIDColumnSortCheck() {
    System.out.println("MLContextTest - DataFrame sum PYDML ID, doubles with ID column sort check");

    // First value on each line is the ID; the lines are deliberately not in ID order.
    List<String> csvLines = new ArrayList<>();
    for (String line : new String[] { "3,7,8,9", "1,1,2,3", "2,4,5,6" }) {
        csvLines.add(line);
    }
    JavaRDD<Row> rows = sc.parallelize(csvLines).map(new CommaSeparatedValueStringToDoubleArrayRow());

    // Schema: the ID column first, then DoubleType columns C1, C2 and C3.
    List<StructField> columns = new ArrayList<>();
    columns.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (String columnName : new String[] { "C1", "C2", "C3" }) {
        columns.add(DataTypes.createStructField(columnName, DataTypes.DoubleType, true));
    }
    Dataset<Row> dataFrame = spark.createDataFrame(rows, DataTypes.createStructType(columns));

    MatrixMetadata metadata = new MatrixMetadata(MatrixFormat.DF_DOUBLES_WITH_INDEX);
    Script script = pydml("print('M[0,0]: ' + scalar(M[0,0]))").in("M", dataFrame, metadata);

    // The row with ID 1 carries 1 as its first data value.
    setExpectedStdOut("M[0,0]: 1.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Test(org.junit.Test)

Example 44 with StructType

use of org.apache.spark.sql.types.StructType in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumPYDMLDoublesWithIDColumnNoFormatSpecified.

/**
 * Passes a DataFrame consisting of an ID column and three double columns to
 * a PYDML script without supplying any matrix metadata, and expects the
 * printed sum to be 27.0.
 */
@Test
public void testDataFrameSumPYDMLDoublesWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, doubles with ID column, no format specified");

    // First value on each line is the ID, followed by three data values.
    List<String> csvLines = new ArrayList<>();
    for (String line : new String[] { "1,2,2,2", "2,3,3,3", "3,4,4,4" }) {
        csvLines.add(line);
    }
    JavaRDD<Row> rows = sc.parallelize(csvLines).map(new CommaSeparatedValueStringToDoubleArrayRow());

    // Schema: the ID column first, then DoubleType columns C1, C2 and C3.
    List<StructField> columns = new ArrayList<>();
    columns.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (String columnName : new String[] { "C1", "C2", "C3" }) {
        columns.add(DataTypes.createStructField(columnName, DataTypes.DoubleType, true));
    }
    Dataset<Row> dataFrame = spark.createDataFrame(rows, DataTypes.createStructType(columns));

    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame);

    // 3*2 + 3*3 + 3*4 = 27, i.e. the sum of the non-ID values.
    setExpectedStdOut("sum: 27.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) Row(org.apache.spark.sql.Row) Test(org.junit.Test)

Aggregations

StructType (org.apache.spark.sql.types.StructType): 44, StructField (org.apache.spark.sql.types.StructField): 40, Row (org.apache.spark.sql.Row): 38, ArrayList (java.util.ArrayList): 37, Test (org.junit.Test): 32, Script (org.apache.sysml.api.mlcontext.Script): 31, MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 17, VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 15, Vector (org.apache.spark.ml.linalg.Vector): 12, Tuple2 (scala.Tuple2): 6, FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata): 5, MLResults (org.apache.sysml.api.mlcontext.MLResults): 5, DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 5, SparkSession (org.apache.spark.sql.SparkSession): 4, DataType (org.apache.spark.sql.types.DataType): 4, ValueType (org.apache.sysml.parser.Expression.ValueType): 4, JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 3, MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 3, CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow): 3, HashMap (java.util.HashMap): 2