Example usage of org.apache.spark.ml.linalg.VectorUDT in the Apache project incubator-systemml.
From class MLContextTest, method testDataFrameSumDMLVectorWithNoIDColumn:
/**
 * Verifies that a DataFrame holding a single VectorUDT column (and no ID
 * column) is accepted as a matrix input when the metadata explicitly
 * declares the DF_VECTOR format; summing the entries 1..9 must print 45.0.
 */
@Test
public void testDataFrameSumDMLVectorWithNoIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with no ID column");
	// Three dense rows whose nine entries sum to 45.
	List<Vector> vectors = new ArrayList<Vector>();
	vectors.add(Vectors.dense(1.0, 2.0, 3.0));
	vectors.add(Vectors.dense(4.0, 5.0, 6.0));
	vectors.add(Vectors.dense(7.0, 8.0, 9.0));
	JavaRDD<Row> rows = sc.parallelize(vectors).map(new VectorRow());
	// Schema: exactly one nullable vector column named C1.
	List<StructField> schemaFields = new ArrayList<StructField>();
	schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(schemaFields);
	Dataset<Row> df = spark.createDataFrame(rows, schema);
	// Tell SystemML the frame holds vectors without a row-index column.
	MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_VECTOR);
	Script script = dml("print('sum: ' + sum(M));").in("M", df, meta);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
Example usage of org.apache.spark.ml.linalg.VectorUDT in the Apache project incubator-systemml.
From class MLContextTest, method testDataFrameSumDMLVectorWithIDColumn:
/**
 * Verifies that a DataFrame with a leading row-index column followed by a
 * single VectorUDT column is accepted when the metadata declares the
 * DF_VECTOR_WITH_INDEX format; summing the entries 1..9 must print 45.0.
 */
@Test
public void testDataFrameSumDMLVectorWithIDColumn() {
	System.out.println("MLContextTest - DataFrame sum DML, vector with ID column");
	// (row-id, vector) pairs; the nine vector entries sum to 45.
	List<Tuple2<Double, Vector>> pairs = new ArrayList<Tuple2<Double, Vector>>();
	pairs.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	pairs.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	pairs.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Row> rows = sc.parallelize(pairs).map(new DoubleVectorRow());
	// Schema: the SystemML ID column, then one nullable vector column.
	List<StructField> schemaFields = new ArrayList<StructField>();
	schemaFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(schemaFields);
	Dataset<Row> df = spark.createDataFrame(rows, schema);
	// Declare the indexed-vector layout explicitly via metadata.
	MatrixMetadata meta = new MatrixMetadata(MatrixFormat.DF_VECTOR_WITH_INDEX);
	Script script = dml("print('sum: ' + sum(M));").in("M", df, meta);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
Example usage of org.apache.spark.ml.linalg.VectorUDT in the Apache project incubator-systemml.
From class MLContextTest, method testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified:
/**
 * Verifies that a DataFrame with an ID column plus one VectorUDT column is
 * handled correctly when NO MatrixMetadata is supplied, i.e. the vector
 * format must be inferred; summing the entries 1..9 must print 45.0.
 *
 * NOTE(review): the method name and printed label say PYDML, but the script
 * is built with dml() — confirm against the original MLContextTest whether
 * pydml() was intended here.
 */
@Test
public void testDataFrameSumPYDMLVectorWithIDColumnNoFormatSpecified() {
	System.out.println("MLContextTest - DataFrame sum PYDML, vector with ID column, no format specified");
	// (row-id, vector) pairs; the nine vector entries sum to 45.
	List<Tuple2<Double, Vector>> pairs = new ArrayList<Tuple2<Double, Vector>>();
	pairs.add(new Tuple2<Double, Vector>(1.0, Vectors.dense(1.0, 2.0, 3.0)));
	pairs.add(new Tuple2<Double, Vector>(2.0, Vectors.dense(4.0, 5.0, 6.0)));
	pairs.add(new Tuple2<Double, Vector>(3.0, Vectors.dense(7.0, 8.0, 9.0)));
	JavaRDD<Row> rows = sc.parallelize(pairs).map(new DoubleVectorRow());
	// Schema: the SystemML ID column, then one nullable vector column.
	List<StructField> schemaFields = new ArrayList<StructField>();
	schemaFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
	schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), true));
	StructType schema = DataTypes.createStructType(schemaFields);
	Dataset<Row> df = spark.createDataFrame(rows, schema);
	// No MatrixMetadata on purpose: the format must be auto-detected.
	Script script = dml("print('sum: ' + sum(M))").in("M", df);
	setExpectedStdOut("sum: 45.0");
	ml.execute(script);
}
Example usage of org.apache.spark.ml.linalg.VectorUDT in the Apache project incubator-systemml.
From class FrameRDDConverterUtils, method convertDFSchemaToFrameSchema:
/**
 * Derives a SystemML frame schema (column names and value types) from a
 * Spark DataFrame schema, filling the caller-provided output arrays.
 *
 * NOTE: regarding the support of vector columns, we make the following
 * schema restriction: at most a single vector column, which allows
 * inference of the vector length without data access and covers the
 * common case. The vector column is expanded into {@code lenVect}
 * DOUBLE columns named {@code <name>v0 ... <name>v(lenVect-1)}.
 *
 * @param dfschema schema as StructType
 * @param colnames output array of column names (length = fschema.length)
 * @param fschema output array of SystemML ValueTypes
 * @param containsID if true, the first DataFrame column is an ID column and is skipped
 * @return 0-based column index of the (expanded) vector column, -1 if no vector.
 */
public static int convertDFSchemaToFrameSchema(StructType dfschema, String[] colnames, ValueType[] fschema, boolean containsID) {
	// Basic meta data: skip a leading ID column if present.
	int off = containsID ? 1 : 0;
	// Length of the single allowed vector column, inferred from the size
	// difference between the SystemML schema and the DataFrame schema
	// (+1 accounts for the vector column's own slot in the DataFrame).
	int lenVect = fschema.length - (dfschema.fields().length - off) + 1;
	int colVect = -1;
	boolean foundVector = false;
	// Map each DataFrame field to one (or, for vectors, lenVect) output columns.
	for (int i = off, pos = 0; i < dfschema.fields().length; i++) {
		StructField field = dfschema.apply(i);
		colnames[pos] = field.name();
		if (field.dataType() == DataTypes.DoubleType || field.dataType() == DataTypes.FloatType)
			fschema[pos++] = ValueType.DOUBLE;
		else if (field.dataType() == DataTypes.LongType || field.dataType() == DataTypes.IntegerType)
			fschema[pos++] = ValueType.INT;
		else if (field.dataType() == DataTypes.BooleanType)
			fschema[pos++] = ValueType.BOOLEAN;
		else if (field.dataType() instanceof VectorUDT) {
			// Enforce the single-vector-column restriction documented above.
			if (foundVector)
				throw new RuntimeException("Found invalid second vector column.");
			String baseName = colnames[pos];
			colVect = pos;
			// Expand the vector into lenVect numbered DOUBLE columns.
			for (int j = 0; j < lenVect; j++) {
				colnames[pos] = baseName + "v" + j;
				fschema[pos++] = ValueType.DOUBLE;
			}
			foundVector = true;
		} else
			// Any other Spark type is carried over as a string column.
			fschema[pos++] = ValueType.STRING;
	}
	return colVect;
}
Example usage of org.apache.spark.ml.linalg.VectorUDT in the Apache project incubator-systemml.
From class RDDConverterUtils, method binaryBlockToDataFrame:
/**
 * Converts a binary-block matrix RDD into a Spark DataFrame, prepending a
 * double-typed row-ID column. Depending on {@code toVector} the matrix row
 * is emitted either as a single Vector column C1 or as one double column
 * per matrix column (C1..Cn).
 *
 * @param sparkSession session used to create the DataFrame
 * @param in binary-block matrix as (index, block) pairs
 * @param mc matrix characteristics; the column count must be known
 * @param toVector if true, emit one VectorUDT column instead of n doubles
 * @return DataFrame with an ID column followed by the row content
 */
public static Dataset<Row> binaryBlockToDataFrame(SparkSession sparkSession, JavaPairRDD<MatrixIndexes, MatrixBlock> in, MatrixCharacteristics mc, boolean toVector) {
	// The schema below is sized by the column count, so it must be known.
	if (!mc.colsKnown())
		throw new RuntimeException("Number of columns needed to convert binary block to data frame.");
	// Slice blocks into rows, align and convert into data frame rows.
	JavaRDD<Row> rows = in.flatMapToPair(new SliceBinaryBlockToRowsFunction(mc.getRowsPerBlock()))
			.groupByKey()
			.map(new ConvertRowBlocksToRows((int) mc.getCols(), mc.getColsPerBlock(), toVector));
	// Build the data frame schema: ID column first, then the row content.
	List<StructField> schemaFields = new ArrayList<>();
	schemaFields.add(DataTypes.createStructField(DF_ID_COLUMN, DataTypes.DoubleType, false));
	if (toVector) {
		// Entire matrix row packed into a single vector column.
		schemaFields.add(DataTypes.createStructField("C1", new VectorUDT(), false));
	} else {
		// One double column per matrix column: C1..Cn.
		for (int i = 1; i <= mc.getCols(); i++)
			schemaFields.add(DataTypes.createStructField("C" + i, DataTypes.DoubleType, false));
	}
	// RDD to data frame conversion.
	return sparkSession.createDataFrame(rows.rdd(), DataTypes.createStructType(schemaFields));
}
End of aggregated VectorUDT usage examples.