Search in sources:

Example 1 with VectorUDT

use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class MLContextConversionUtil, the method determineMatrixFormatIfNeeded:

/**
	 * If the MatrixFormat of the DataFrame has not been explicitly specified,
	 * attempt to determine the proper MatrixFormat by inspecting the DataFrame
	 * schema: an optional SystemML ID column followed by either a single
	 * {@code VectorUDT} column or plain double columns.
	 * 
	 * @param dataFrame
	 *            the Spark {@code DataFrame}
	 * @param matrixMetadata
	 *            the matrix metadata, if available (no-op when {@code null})
	 * @throws MLContextException
	 *             if the DataFrame schema has no data columns to inspect
	 */
public static void determineMatrixFormatIfNeeded(Dataset<Row> dataFrame, MatrixMetadata matrixMetadata) {
    // Nothing to do without metadata to update (javadoc: "if available"),
    // or when a format has already been explicitly specified.
    if (matrixMetadata == null || matrixMetadata.getMatrixFormat() != null) {
        return;
    }
    StructType schema = dataFrame.schema();
    // Detect the SystemML row-index column; fieldIndex throws when the column is absent.
    boolean hasID = false;
    try {
        schema.fieldIndex(RDDConverterUtils.DF_ID_COLUMN);
        hasID = true;
    } catch (IllegalArgumentException ignored) {
        // no ID column present; treat the first column as data
    }
    StructField[] fields = schema.fields();
    // The data column is the first field after the optional ID column.
    int dataColIndex = hasID ? 1 : 0;
    if (fields.length <= dataColIndex) {
        throw new MLContextException("DataFrame format not recognized as an accepted SystemML MatrixFormat");
    }
    boolean isVector = fields[dataColIndex].dataType() instanceof VectorUDT;
    MatrixFormat mf;
    if (hasID) {
        mf = isVector ? MatrixFormat.DF_VECTOR_WITH_INDEX : MatrixFormat.DF_DOUBLES_WITH_INDEX;
    } else {
        mf = isVector ? MatrixFormat.DF_VECTOR : MatrixFormat.DF_DOUBLES;
    }
    matrixMetadata.setMatrixFormat(mf);
}
Also used : VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructField(org.apache.spark.sql.types.StructField) StructType(org.apache.spark.sql.types.StructType)

Example 2 with VectorUDT

use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class MLContextTest, the method testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified:

@Test
public void testDataFrameSumPYDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with no ID column, no format specified");
    // Build a 3x3 matrix of 1..9 as one dense vector per row (no ID column).
    List<Vector> list = new ArrayList<Vector>();
    list.add(Vectors.dense(1.0, 2.0, 3.0));
    list.add(Vectors.dense(4.0, 5.0, 6.0));
    list.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Vector> javaRddVector = sc.parallelize(list);
    JavaRDD<Row> javaRddRow = javaRddVector.map(new VectorRow());
    List<StructField> fields = new ArrayList<StructField>();
    fields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    StructType schema = DataTypes.createStructType(fields);
    Dataset<Row> dataFrame = spark.createDataFrame(javaRddRow, schema);
    // Fix: this is the PYDML variant of the test (see the DML twin), so the
    // script must be built with pydml(...), not dml(...).
    Script script = pydml("print('sum: ' + sum(M))").in("M", dataFrame);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector) Test(org.junit.Test)

Example 3 with VectorUDT

use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class MLContextTest, the method testDataFrameSumPYDMLVectorWithIDColumn:

@Test
public void testDataFrameSumPYDMLVectorWithIDColumn() {
    System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column");
    // Rows are (row-id, dense vector) pairs forming a 3x3 matrix of 1..9.
    List<Tuple2<Double, Vector>> rows = new ArrayList<Tuple2<Double, Vector>>();
    rows.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
    rows.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
    rows.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
    JavaRDD<Tuple2<Double, Vector>> tupleRdd = sc.parallelize(rows);
    JavaRDD<Row> rowRdd = tupleRdd.map(new DoubleVectorRow());
    // Schema: SystemML ID column followed by a single vector column.
    List<StructField> schemaFields = new ArrayList<StructField>();
    schemaFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));
    // Explicitly declare the vector-with-index format via metadata.
    MatrixMetadata metadata = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
    Script script = pydml("print('sum: ' + sum(M))").in("M", df, metadata);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Tuple2(scala.Tuple2) Row(org.apache.spark.sql.Row) MatrixMetadata(org.apache.sysml.api.mlcontext.MatrixMetadata) Vector(org.apache.spark.ml.linalg.Vector) Test(org.junit.Test)

Example 4 with VectorUDT

use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class MLContextTest, the method testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified:

@Test
public void testDataFrameSumDMLVectorWithNoIDColumnNoFormatSpecified() {
    System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column, no format specified");
    // One dense vector per row; no ID column, so the format must be inferred.
    List<Vector> rows = new ArrayList<Vector>();
    rows.add(Vectors.dense(1.0, 2.0, 3.0));
    rows.add(Vectors.dense(4.0, 5.0, 6.0));
    rows.add(Vectors.dense(7.0, 8.0, 9.0));
    JavaRDD<Row> rowRdd = sc.parallelize(rows).map(new VectorRow());
    // Single vector column named C1.
    List<StructField> schemaFields = new ArrayList<StructField>();
    schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
    Dataset<Row> df = spark.createDataFrame(rowRdd, DataTypes.createStructType(schemaFields));
    Script script = dml("print('sum: ' + sum(M));").in("M", df);
    setExpectedStdOut("sum: 45.0");
    ml.execute(script);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector) Test(org.junit.Test)

Example 5 with VectorUDT

use of org.apache.spark.ml.linalg.VectorUDT in project incubator-systemml by apache.

From the class DataFrameVectorFrameConversionTest, the method createDataFrame:

// Builds a Spark DataFrame from a MatrixBlock, optionally prepending a 1-based
// double ID column. Schema entries of ValueType.OBJECT become VectorUDT columns,
// each packing `colsVector` consecutive matrix columns into one DenseVector;
// other value types map to one scalar column per matrix column.
// NOTE(review): `colsVector` is a field of the enclosing test class — assumed to
// be the fixed width of each vector column; confirm against the class setup.
@SuppressWarnings("resource")
private Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) throws DMLRuntimeException {
    //create in-memory list of rows
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    // Row width: every OBJECT (vector) entry collapses colsVector matrix columns
    // into a single cell; this formula assumes exactly one vector column per row.
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            // 1-based row index, stored as double to match DF_ID_COLUMN's type
            row[0] = (double) i + 1;
        // j walks matrix columns, j2 walks schema/output cells; they diverge
        // when a vector column consumes colsVector matrix columns at once.
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                // scalar cell: convert the double to the schema's boxed type
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                // vector cell: slice colsVector columns out of this row and
                // wrap them in a DenseVector
                double[] tmp = DataConverter.convertToDoubleVector(mb.sliceOperations(i, i, j, j + colsVector - 1, new MatrixBlock()));
                row[j2 + off] = new DenseVector(tmp);
                // skip the matrix columns just consumed (loop header adds the +1)
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }
    //create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch(schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                // SystemML INT is 64-bit, hence Spark LongType
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                // OBJECT marks a packed vector column
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    //create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Aggregations

VectorUDT (org.apache.spark.ml.linalg.VectorUDT)14 StructField (org.apache.spark.sql.types.StructField)14 ArrayList (java.util.ArrayList)12 Row (org.apache.spark.sql.Row)12 StructType (org.apache.spark.sql.types.StructType)11 Vector (org.apache.spark.ml.linalg.Vector)9 Script (org.apache.sysml.api.mlcontext.Script)8 Test (org.junit.Test)8 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)5 Tuple2 (scala.Tuple2)5 MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata)4 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)2 DenseVector (org.apache.spark.ml.linalg.DenseVector)2 DataType (org.apache.spark.sql.types.DataType)2 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)2 IOException (java.io.IOException)1 Function (org.apache.spark.api.java.function.Function)1 PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction)1 LabeledPoint (org.apache.spark.ml.feature.LabeledPoint)1