
Example 36 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

The class MLContextTest, method testDataFrameSumPYDMLMllibVectorWithIDColumn.

@Test
public void testDataFrameSumPYDMLMllibVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, mllib vector with ID column");
    List<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> list = new ArrayList<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>>();
    list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(1.0, org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(2.0, org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<Double, org.apache.spark.mllib.linalg.Vector>(3.0, org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleMllibVectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Tuple2(scala.Tuple2) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)
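
The DoubleMllibVectorRow mapper referenced above is defined elsewhere in MLContextTest and is not shown on this page. A minimal sketch of what such a mapper presumably looks like, assuming it simply unpacks each (ID, vector) tuple into a two-column Row matching the schema built above:

public static class DoubleMllibVectorRow implements org.apache.spark.api.java.function.Function<Tuple2<Double, org.apache.spark.mllib.linalg.Vector>, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(Tuple2<Double, org.apache.spark.mllib.linalg.Vector> tup) throws Exception {
        // column 1: the double ID, column 2: the mllib vector (DF_VECTOR_WITH_INDEX layout)
        return org.apache.spark.sql.RowFactory.create(tup._1(), tup._2());
    }
}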

Example 37 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

The class MLContextTest, method testInputMatrixBlockPYDML.

@Test
public void testInputMatrixBlockPYDML() {
    System.out.println("MLContextTest - input MatrixBlock PYDML");
    List<String> list = new ArrayList<String>();
    list.add("10,20,30");
    list.add("40,50,60");
    list.add("70,80,90");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.StringType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.StringType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    Matrix m = new Matrix(dataFrame);
    MatrixBlock matrixBlock = m.toMatrixBlock();
    Script script = pydml("avg = avg(M)").in("M", matrixBlock).out("avg");
    double avg = ml.execute(script).getDouble("avg");
    Assert.assertEquals(50.0, avg, 0.0);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Matrix(org.apache.sysml.api.mlcontext.Matrix) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
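
CommaSeparatedValueStringToRow is likewise a helper defined in MLContextTest whose body is not shown here. A minimal sketch, assuming it splits each comma-separated line into string columns that match the StringType schema above:

public static class CommaSeparatedValueStringToRow implements org.apache.spark.api.java.function.Function<String, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(String str) throws Exception {
        // "10,20,30" becomes a three-column Row of strings (C1, C2, C3)
        String[] fields = str.split(",");
        return org.apache.spark.sql.RowFactory.create((Object[]) fields);
    }
}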

Example 38 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

The class MLContextTest, method testDataFrameSumDMLVectorWithNoIDColumn.

@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");
    List<Vector> list = new ArrayList<Vector>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)
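
VectorRow is another MLContextTest helper not shown on this page. A minimal sketch, assuming it wraps each ml.linalg Vector in a single-column Row (the single VectorUDT column "C1" in the schema above):

public static class VectorRow implements org.apache.spark.api.java.function.Function<Vector, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(Vector v) throws Exception {
        // one VectorUDT column per row; no ID column, so DF_VECTOR is used as the format
        return org.apache.spark.sql.RowFactory.create(v);
    }
}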

Example 39 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

The class MLContextTest, method testDataFrameSumPYDMLDoublesWithIDColumnSortCheck.

@Test
public void testDataFrameSumPYDMLDoublesWithIDColumnSortCheck() {
    System.out.println("MLContextTest - DataFrame sum PYDML ID, doubles with ID column sort check");
    List<String> list = new ArrayList<String>();
    list.add("3,7,8,9");
    list.add("1,1,2,3");
    list.add("2,4,5,6");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_DOUBLES_WITH_INDEX);
    Script script = pydml("print('M[0,0]: ' + scalar(M[0,0]))").in("M", dataFrame, mm);
    setExpectedStdOut("M[0,0]: 1.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Test(org.junit.Test)
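
CommaSeparatedValueStringToDoubleArrayRow parses each line into doubles so the rows match the DoubleType schema; its body is not shown on this page, but a minimal sketch could look like this:

public static class CommaSeparatedValueStringToDoubleArrayRow implements org.apache.spark.api.java.function.Function<String, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(String str) throws Exception {
        // "3,7,8,9" becomes (3.0, 7.0, 8.0, 9.0); the first value is the ID used for sorting
        String[] parts = str.split(",");
        Object[] doubles = new Object[parts.length];
        for (int i = 0; i < parts.length; i++) {
            doubles[i] = Double.valueOf(parts[i]);
        }
        return org.apache.spark.sql.RowFactory.create(doubles);
    }
}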

Example 40 with StructField

Use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

The class MLContextTest, method testGetTuple1DML.

@Test
public void testGetTuple1DML() {
    System.out.println("MLContextTest - Get Tuple1<Matrix> DML");
    JavaRDD<String> javaRddString = sc.parallelize(Stream.of("1,2,3", "4,5,6", "7,8,9").collect(Collectors.toList()));
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> df = spark.createDataFrame(javaRddRow, schema);
    Script script = dml("N=M*2").in("M", df).out("N");
    Tuple1<Matrix> tuple = ml.execute(script).getTuple("N");
    double[][] n = tuple._1().to2DDoubleArray();
    Assert.assertEquals(2.0, n[0][0], 0);
    Assert.assertEquals(4.0, n[0][1], 0);
    Assert.assertEquals(6.0, n[0][2], 0);
    Assert.assertEquals(8.0, n[1][0], 0);
    Assert.assertEquals(10.0, n[1][1], 0);
    Assert.assertEquals(12.0, n[1][2], 0);
    Assert.assertEquals(14.0, n[2][0], 0);
    Assert.assertEquals(16.0, n[2][1], 0);
    Assert.assertEquals(18.0, n[2][2], 0);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Matrix(org.apache.sysml.api.mlcontext.Matrix) Row(org.apache.spark.sql.Row) Test(org.junit.Test)
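
As a usage note, the Tuple1 retrieval above can also be expressed through MLResults (org.apache.sysml.api.mlcontext.MLResults). The following is a hedged sketch, not part of this test, assuming the getMatrix accessor on MLResults:

Script script = dml("N=M*2").in("M", df).out("N");
MLResults results = ml.execute(script);
// retrieve the output matrix directly instead of through Tuple1
Matrix matrix = results.getMatrix("N");
// same 3x3 doubled values as the assertions above
double[][] n = matrix.to2DDoubleArray();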

Aggregations

StructField (org.apache.spark.sql.types.StructField) 52
StructType (org.apache.spark.sql.types.StructType) 48
Row (org.apache.spark.sql.Row) 45
ArrayList (java.util.ArrayList) 43
Test (org.junit.Test) 37
Script (org.apache.sysml.api.mlcontext.Script) 34
VectorUDT (org.apache.spark.ml.linalg.VectorUDT) 20
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata) 17
DenseVector (org.apache.spark.ml.linalg.DenseVector) 15
Vector (org.apache.spark.ml.linalg.Vector) 15
Tuple2 (scala.Tuple2) 7
SparkSession (org.apache.spark.sql.SparkSession) 6
DataType (org.apache.spark.sql.types.DataType) 5
MLResults (org.apache.sysml.api.mlcontext.MLResults) 5
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock) 5
FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata) 4
CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow) 4
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException) 3
JavaRDD (org.apache.spark.api.java.JavaRDD) 2
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 2