
Example 11 with VectorUDT

Use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class MLContextTest, method testDataFrameSumDMLVectorWithNoIDColumn.

@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");
    List<Vector> list = new ArrayList<Vector>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used: Script (org.apache.sysml.api.mlcontext.Script), VectorUDT (org.apache.spark.ml.linalg.VectorUDT), StructType (org.apache.spark.sql.types.StructType), ArrayList (java.util.ArrayList), StructField (org.apache.spark.sql.types.StructField), Row (org.apache.spark.sql.Row), MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata), Vector (org.apache.spark.ml.linalg.Vector), DenseVector (org.apache.spark.ml.linalg.DenseVector), Test (org.junit.Test)
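
The mapper VectorRow referenced by the test is not shown on this page. A minimal sketch of what such a function likely looks like (the class body is an assumption; only the Vector-to-Row wrapping is implied by the test and its one-field schema):

import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;

// Hypothetical helper: wraps each Vector in a single-column Row,
// matching the ("C1", VectorUDT) schema built in the test.
public class VectorRow implements Function<Vector, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(Vector v) throws Exception {
        return RowFactory.create(v);
    }
}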

Example 12 with VectorUDT

Use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class MLContextTest, method testDataFrameSumDMLVectorWithIDColumn.

@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");
    List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
    list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    MatrixMetadata mm = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = dml("print('sum: ' + sum(M));").in("M", dataFrame, mm);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used: Script (org.apache.sysml.api.mlcontext.Script), VectorUDT (org.apache.spark.ml.linalg.VectorUDT), StructType (org.apache.spark.sql.types.StructType), ArrayList (java.util.ArrayList), StructField (org.apache.spark.sql.types.StructField), Tuple2 (scala.Tuple2), Row (org.apache.spark.sql.Row), MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata), Vector (org.apache.spark.ml.linalg.Vector), DenseVector (org.apache.spark.ml.linalg.DenseVector), Test (org.junit.Test)
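
DoubleVectorRow is likewise not shown here. A minimal sketch under the same assumptions, emitting each tuple as a two-column (id, vector) Row:

import org.apache.spark.api.java.function.Function;
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import scala.Tuple2;

// Hypothetical helper: emits (id, vector) as a Row matching the
// (DF_ID_COLUMN, "C1") schema built in the test.
public class DoubleVectorRow implements Function<Tuple2<Double, Vector>, Row> {
    private static final long serialVersionUID = 1L;

    @Override
    public Row call(Tuple2<Double, Vector> t) throws Exception {
        return RowFactory.create(t._1(), t._2());
    }
}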

Example 13 with VectorUDT

Use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class MLContextTest, method testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified.

@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");
    List<Tuple2<Double, Vector>> list = new ArrayList<Tuple2<Double, Vector>>();
    list.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    list.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    list.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> javaRddTuple = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddTuple.map(new DoubleVectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used: Script (org.apache.sysml.api.mlcontext.Script), VectorUDT (org.apache.spark.ml.linalg.VectorUDT), StructType (org.apache.spark.sql.types.StructType), ArrayList (java.util.ArrayList), StructField (org.apache.spark.sql.types.StructField), Tuple2 (scala.Tuple2), Row (org.apache.spark.sql.Row), Vector (org.apache.spark.ml.linalg.Vector), DenseVector (org.apache.spark.ml.linalg.DenseVector), Test (org.junit.Test)
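
With no MatrixMetadata supplied, MLContext must infer the format from the DataFrame schema itself. A sketch of the kind of check such inference relies on (illustrative only; the actual inference lives inside the MLContext conversion utilities):

// A vector-based frame is recognizable by a column whose DataType is
// VectorUDT, and an indexed frame by a column named DF_ID_COLUMN.
// (Assumes import java.util.Arrays.)
StructField vecField = dataFrame.schema().apply("C1");
boolean isVectorFormat = vecField.dataType() instanceof VectorUDT; // true here
boolean hasIndex = Arrays.asList(dataFrame.schema().fieldNames())
        .contains(RDDConverterUtils.DF_ID_COLUMN); // true here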

Example 14 with VectorUDT

Use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class FrameRDDConverterUtils, method convertDFSchemaToFrameSchema.

/**
 * NOTE: regarding the support of vector columns, we make the following
 * schema restriction: single vector column, which allows inference of
 * the vector length without data access and covers the common case.
 *
 * @param dfschema schema as StructType
 * @param colnames column names
 * @param fschema array of SystemML ValueTypes
 * @param containsID if true, contains ID column
 * @return 0-based column index of vector column, -1 if no vector.
 */
public static int convertDFSchemaToFrameSchema(StructType dfschema, String[] colnames, ValueType[] fschema, boolean containsID) {
    // basic meta data
    int off = containsID ? 1 : 0;
    boolean containsVect = false;
    // infer the vector length from schema sizes alone: every non-vector data
    // frame column maps to one frame column, the rest belong to the vector
    int lenVect = fschema.length - (dfschema.fields().length - off) + 1;
    int colVect = -1;
    // process individual columns
    for (int i = off, pos = 0; i < dfschema.fields().length; i++) {
        StructField field = dfschema.apply(i);
        colnames[pos] = field.name();
        if (field.dataType() == DataTypes.DoubleType || field.dataType() == DataTypes.FloatType)
            fschema[pos++] = ValueType.DOUBLE;
        else if (field.dataType() == DataTypes.LongType || field.dataType() == DataTypes.IntegerType)
            fschema[pos++] = ValueType.INT;
        else if (field.dataType() == DataTypes.BooleanType)
            fschema[pos++] = ValueType.BOOLEAN;
        else if (field.dataType() instanceof VectorUDT) {
            if (containsVect)
                throw new DMLRuntimeException("Found invalid second vector column.");
            // expand the single vector column into lenVect double columns
            // named <name>v0 .. <name>v(lenVect-1)
            String name = colnames[pos];
            colVect = pos;
            for (int j = 0; j < lenVect; j++) {
                colnames[pos] = name + "v" + j;
                fschema[pos++] = ValueType.DOUBLE;
            }
            containsVect = true;
        } else
            fschema[pos++] = ValueType.STRING;
    }
    return colVect;
}
Also used: VectorUDT (org.apache.spark.ml.linalg.VectorUDT), StructField (org.apache.spark.sql.types.StructField), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
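
A hypothetical call illustrating the expansion, assuming a three-element vector column followed by one double column and no ID column; note that the output arrays must already be sized for the expanded width (ValueType here is org.apache.sysml.parser.Expression.ValueType):

// dfschema: one VectorUDT column plus one scalar column, no ID column
StructType dfschema = DataTypes.createStructType(Arrays.asList(
    DataTypes.createStructField("C1", new VectorUDT(), true),
    DataTypes.createStructField("C2", DataTypes.DoubleType, true)));
// expanded frame width: 3 vector elements + 1 scalar = 4
String[] colnames = new String[4];
ValueType[] fschema = new ValueType[4];
int colVect = FrameRDDConverterUtils.convertDFSchemaToFrameSchema(dfschema, colnames, fschema, false);
// colVect == 0; colnames == {"C1v0", "C1v1", "C1v2", "C2"};
// fschema == {DOUBLE, DOUBLE, DOUBLE, DOUBLE}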

Example 15 with VectorUDT

Use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class RDDConverterUtils, method binaryBlockToDataFrame.

public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession, JavaPairRDD<MatrixIndexes, MatrixBlock> in, MatrixCharacteristics mc, boolean toVector) {
    if (!mc.colsKnown())
        throw new DMLRuntimeException("Number of columns needed to convert binary block to data frame.");
    // slice blocks into rows, align them, and convert into data frame rows
    JavaRDD<Row> rowsRDD = in
        .flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getRowsPerBlock()))
        .groupByKey()
        .map(new ConvertRowBlocksToRows((int) mc.getCols(), mc.getColsPerBlock(), toVector));
    // create data frame schema
    List<StructField> fields = new ArrayList<>();
    fields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
    if (toVector)
        fields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
    else {
        // one double column per matrix column
        for (int i = 1; i <= mc.getCols(); i++)
            fields.add(DataTypes.createStructField("C" + i, DataTypes.DoubleType, false));
    }
    // rdd to data frame conversion
    return sparkSession.createDataFrame(rowsRDD.rdd(), DataTypes.createStructType(fields));
}
Also used: VectorUDT (org.apache.spark.ml.linalg.VectorUDT), DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException), StructField (org.apache.spark.sql.types.StructField), ArrayList (java.util.ArrayList), Row (org.apache.spark.sql.Row), LabeledPoint (org.apache.spark.ml.feature.LabeledPoint)
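
A hedged end-to-end sketch, with setup values that are assumptions rather than from this page: one 2x2 matrix held in a single block, converted to the vector layout. `sc` and `spark` stand for the usual JavaSparkContext and SparkSession from the surrounding code.

// build a single 2x2 block at block index (1,1)
MatrixBlock mb = new MatrixBlock(2, 2, false);
mb.quickSetValue(0, 0, 1.0); mb.quickSetValue(0, 1, 2.0);
mb.quickSetValue(1, 0, 3.0); mb.quickSetValue(1, 1, 4.0);
JavaPairRDD<MatrixIndexes, MatrixBlock> in =
    sc.parallelizePairs(Arrays.asList(new Tuple2<>(new MatrixIndexes(1, 1), mb)));
MatrixCharacteristics mc = new MatrixCharacteristics(2, 2, 1000, 1000);

Dataset<Row> df = RDDConverterUtils.binaryBlockToDataFrame(spark, in, mc, true);
df.printSchema(); // the DF_ID_COLUMN index as double, plus "C1" as a vector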

Aggregations

VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 16
StructField (org.apache.spark.sql.types.StructField): 16
Row (org.apache.spark.sql.Row): 14
StructType (org.apache.spark.sql.types.StructType): 13
ArrayList (java.util.ArrayList): 12
DenseVector (org.apache.spark.ml.linalg.DenseVector): 11
Vector (org.apache.spark.ml.linalg.Vector): 11
Script (org.apache.sysml.api.mlcontext.Script): 9
Test (org.junit.Test): 9
Tuple2 (scala.Tuple2): 5
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 4
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 3
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2
DataType (org.apache.spark.sql.types.DataType): 2
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 2
IOException (java.io.IOException): 1
VectorBuilder (net.jgp.labs.spark.x.udf.VectorBuilder): 1
Function (org.apache.spark.api.java.function.Function): 1
PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction): 1
LabeledPoint (org.apache.spark.ml.feature.LabeledPoint): 1