Search in sources :

Example 41 with StructField

use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumDMLVectorWithIDColumn.

@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");
    List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
    list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Tuple2(scala.Tuple2) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)

Example 42 with StructField

use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumDMLDoublesWithNoIDColumnNoFormatSpecified.

@Test
public void testDataFrameSumDMLDoublesWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, doubles with no ID column, no format specified");
    List<String> list = new ArrayList<String>();
    list.add("2,2,2");
    list.add("3,3,3");
    list.add("4,4,4");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame);
    setExpectedStdOut("sum: 27.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) Row(org.apache.spark.sql.Row) Test(org.junit.Test)

Example 43 with StructField

use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified.

@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");
    List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
    list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    Script script = dml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Tuple2(scala.Tuple2) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)

Example 44 with StructField

use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumPYDMLDoublesWithIDColumnNoFormatSpecified.

@Test
public void testDataFrameSumPYDMLDoublesWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, doubles with ID column, no format specified");
    List<String> list = new ArrayList<String>();
    list.add("1,2,2,2");
    list.add("2,3,3,3");
    list.add("3,4,4,4");
    JavaRDD<String> javaRddString = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddString.map(new CommaSeparatedValueStringToDoubleArrayRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C2", DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C3", DataTypes.DoubleType, true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 27.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) Row(org.apache.spark.sql.Row) Test(org.junit.Test)

Example 45 with StructField

use of org.apache.spark.sql.types.StructField in project incubator-systemml by apache.

the class MLContextTest method testDataFrameSumPYDMLMllibVectorWithNoIDColumn.

@Test
public void testDataFrameSumPYDMLMllibVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, mllib vector with no ID column");
    List<org.apache.spark.mllib.linalg.Vector> list = new ArrayList<org.apache.spark.mllib.linalg.Vector>();
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(1.0, 2.0, 3.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(4.0, 5.0, 6.0));
    list.add(org.apache.spark.mllib.linalg.Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<org.apache.spark.mllib.linalg.Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new MllibVectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", new org.apache.spark.mllib.linalg.VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)

Aggregations

StructField (org.apache.spark.sql.types.StructField)52 StructType (org.apache.spark.sql.types.StructType)48 Row (org.apache.spark.sql.Row)45 ArrayList (java.util.ArrayList)43 Test (org.junit.Test)37 Script (org.apache.sysml.api.mlcontext.Script)34 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)20 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)17 DenseVector (org.apache.spark.ml.linalg.DenseVector)15 Vector (org.apache.spark.ml.linalg.Vector)15 Tuple2 (scala.Tuple2)7 SparkSession (org.apache.spark.sql.SparkSession)6 DataType (org.apache.spark.sql.types.DataType)5 MLResults (org.apache.sysml.api.mlcontext.MLResults)5 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)5 FrameMetadata (org.apache.sysml.api.mlcontext.FrameMetadata)4 CommaSeparatedValueStringToDoubleArrayRow (org.apache.sysml.test.integration.mlcontext.MLContextTest.CommaSeparatedValueStringToDoubleArrayRow)4 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)3 JavaRDD (org.apache.spark.api.java.JavaRDD)2 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)2