Example use of org.apache.spark.ml.linalg.DenseVector in the Apache incubator-systemml project, from class MLContextTest, method testOutputDataFrameOfVectorsDML.
/**
 * Verifies that a 2x2 DML matrix exported as a DataFrame of vectors has the
 * expected schema (double row-ID column followed by a vector column) and the
 * expected per-row contents.
 */
@Test
public void testOutputDataFrameOfVectorsDML() {
    System.out.println("MLContextTest - output DataFrame of vectors DML");
    String s = "m=matrix('1 2 3 4',rows=2,cols=2);";
    Script script = dml(s).out("m");
    MLResults results = ml.execute(script);
    Dataset<Row> df = results.getDataFrame("m", true);
    // sort by the ID column so row order is deterministic before asserting
    Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);
    // verify column types
    StructType schema = sortedDF.schema();
    StructField[] fields = schema.fields();
    StructField idColumn = fields[0];
    StructField vectorColumn = fields[1];
    Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
    Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);
    List<Row> list = sortedDF.collectAsList();
    Row row1 = list.get(0);
    Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
    // Cast to the Vector interface, not DenseVector: the VectorUDT column only
    // guarantees an ml.linalg Vector, and a sparse representation would make a
    // DenseVector cast throw ClassCastException.
    Vector v1 = (Vector) row1.get(1);
    double[] arr1 = v1.toArray();
    Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0);
    Row row2 = list.get(1);
    Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
    Vector v2 = (Vector) row2.get(1);
    double[] arr2 = v2.toArray();
    Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0);
}
Example use of org.apache.spark.ml.linalg.DenseVector in the Apache incubator-systemml project, from class DataFrameVectorFrameConversionTest, method createDataFrame.
/**
 * Converts a MatrixBlock into an in-memory DataFrame in which the run of
 * {@code colsVector} matrix columns whose schema entry is OBJECT is packed
 * into a single DenseVector column; all other columns are converted
 * cell-by-cell according to their value type.
 *
 * @param sparkSession session used to create the DataFrame
 * @param mb           source matrix
 * @param containsID   if true, prepend a 1-based double row-ID column
 * @param schema       per-output-column value types; OBJECT marks the vector column
 * @return the constructed DataFrame
 */
@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // create in-memory list of rows (presized: exactly one Row per matrix row)
    List<Row> list = new ArrayList<>(mb.getNumRows());
    int off = (containsID ? 1 : 0);
    // output width: colsVector matrix columns collapse into one vector column
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1; // 1-based row ID, stored as double
        // j walks matrix columns, j2 walks output columns; they diverge once
        // the vector column consumes colsVector matrix columns
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                // pack colsVector consecutive cells of this row into a dense vector
                double[] tmp = DataConverter.convertToDoubleVector(mb.slice(i, i, j, j + colsVector - 1, new MatrixBlock()), false);
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1; // skip the matrix columns consumed above
            }
        }
        list.add(RowFactory.create(row));
    }
    // create data frame schema
    List<StructField> fields = new ArrayList<>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch(schema[j]) {
        case STRING:
            dt = DataTypes.StringType;
            break;
        case DOUBLE:
            dt = DataTypes.DoubleType;
            break;
        case INT:
            dt = DataTypes.LongType;
            break;
        case OBJECT:
            dt = new VectorUDT();
            break;
        default:
            throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    // create rdd and data frame; the JavaSparkContext is only a thin wrapper
    // around the session's SparkContext, so it must not be closed here
    // (hence @SuppressWarnings("resource"))
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Example use of org.apache.spark.ml.linalg.DenseVector in the Apache incubator-systemml project, from class DataFrameVectorScriptTest, method createDataFrame.
/**
 * Converts a MatrixBlock into an in-memory DataFrame in which the run of
 * {@code colsVector} matrix columns whose schema entry is OBJECT is packed
 * into a single DenseVector column; all other columns are converted
 * cell-by-cell according to their value type.
 *
 * @param sparkSession session used to create the DataFrame
 * @param mb           source matrix
 * @param containsID   if true, prepend a 1-based double row-ID column
 * @param schema       per-output-column value types; OBJECT marks the vector column
 * @return the constructed DataFrame
 */
@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // create in-memory list of rows (presized: exactly one Row per matrix row)
    List<Row> list = new ArrayList<>(mb.getNumRows());
    int off = (containsID ? 1 : 0);
    // output width: colsVector matrix columns collapse into one vector column
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1; // 1-based row ID, stored as double
        // j walks matrix columns, j2 walks output columns; they diverge once
        // the vector column consumes colsVector matrix columns
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                // pack colsVector consecutive cells of this row into a dense vector
                double[] tmp = DataConverter.convertToDoubleVector(mb.slice(i, i, j, j + colsVector - 1, new MatrixBlock()), false);
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1; // skip the matrix columns consumed above
            }
        }
        list.add(RowFactory.create(row));
    }
    // create data frame schema
    List<StructField> fields = new ArrayList<>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch(schema[j]) {
        case STRING:
            dt = DataTypes.StringType;
            break;
        case DOUBLE:
            dt = DataTypes.DoubleType;
            break;
        case INT:
            dt = DataTypes.LongType;
            break;
        case OBJECT:
            dt = new VectorUDT();
            break;
        default:
            throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);
    // create rdd and data frame; the JavaSparkContext is only a thin wrapper
    // around the session's SparkContext, so it must not be closed here
    // (hence @SuppressWarnings("resource"))
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
Example use of org.apache.spark.ml.linalg.DenseVector in the Apache systemml project, from class MLContextTest, method testOutputDataFrameOfVectorsDML.
/**
 * Verifies that a 2x2 DML matrix exported as a DataFrame of vectors has the
 * expected schema (double row-ID column followed by a vector column) and the
 * expected per-row contents.
 */
@Test
public void testOutputDataFrameOfVectorsDML() {
    System.out.println("MLContextTest - output DataFrame of vectors DML");
    String s = "m=matrix('1 2 3 4',rows=2,cols=2);";
    Script script = dml(s).out("m");
    MLResults results = ml.execute(script);
    Dataset<Row> df = results.getDataFrame("m", true);
    // sort by the ID column so row order is deterministic before asserting
    Dataset<Row> sortedDF = df.sort(RDDConverterUtils.DF_ID_COLUMN);
    // verify column types
    StructType schema = sortedDF.schema();
    StructField[] fields = schema.fields();
    StructField idColumn = fields[0];
    StructField vectorColumn = fields[1];
    Assert.assertTrue(idColumn.dataType() instanceof DoubleType);
    Assert.assertTrue(vectorColumn.dataType() instanceof VectorUDT);
    List<Row> list = sortedDF.collectAsList();
    Row row1 = list.get(0);
    Assert.assertEquals(1.0, row1.getDouble(0), 0.0);
    // Cast to the Vector interface, not DenseVector: the VectorUDT column only
    // guarantees an ml.linalg Vector, and a sparse representation would make a
    // DenseVector cast throw ClassCastException.
    Vector v1 = (Vector) row1.get(1);
    double[] arr1 = v1.toArray();
    Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, arr1, 0.0);
    Row row2 = list.get(1);
    Assert.assertEquals(2.0, row2.getDouble(0), 0.0);
    Vector v2 = (Vector) row2.get(1);
    double[] arr2 = v2.toArray();
    Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, arr2, 0.0);
}
Example use of org.apache.spark.ml.linalg.DenseVector in the mmtf-spark project by sbl-sdsc, from class DatasetRegressor, method main.
/**
 * Trains and evaluates three Spark ML regressors (linear regression,
 * gradient-boosted trees, generalized linear regression) on a parquet
 * dataset with a "features" vector column, printing each model's fit
 * summary and the total wall-clock time.
 *
 * @param args args[0] path to parquet file, args[1] name of the prediction column
 * @throws IOException if the parquet file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.err.println("Usage: " + DatasetRegressor.class.getSimpleName() + " <parquet file> <prediction column name>");
        System.exit(1);
    }
    // name of the prediction column
    String label = args[1];
    long start = System.nanoTime();
    SparkSession spark = SparkSession.builder().master("local[*]").appName(DatasetRegressor.class.getSimpleName()).getOrCreate();
    Dataset<Row> data = spark.read().parquet(args[0]).cache();
    // Feature count is the vector's dimensionality, i.e. size(), not
    // numActives() (the two only coincide for dense vectors).
    // NOTE(review): this assumes the "features" column holds dense vectors;
    // a sparse row would fail this cast — consider casting to the Vector
    // interface instead once the import is available.
    int featureCount = ((DenseVector) data.first().getAs("features")).size();
    System.out.println("Feature count: " + featureCount);
    System.out.println("Dataset size : " + data.count());
    double testFraction = 0.3;
    long seed = 123;
    LinearRegression lr = new LinearRegression().setLabelCol(label).setFeaturesCol("features");
    SparkRegressor reg = new SparkRegressor(lr, label, testFraction, seed);
    System.out.println(reg.fit(data));
    GBTRegressor gbt = new GBTRegressor().setLabelCol(label).setFeaturesCol("features");
    reg = new SparkRegressor(gbt, label, testFraction, seed);
    System.out.println(reg.fit(data));
    GeneralizedLinearRegression glr = new GeneralizedLinearRegression().setLabelCol(label).setFeaturesCol("features").setFamily("gaussian").setLink("identity").setMaxIter(10).setRegParam(0.3);
    reg = new SparkRegressor(glr, label, testFraction, seed);
    System.out.println(reg.fit(data));
    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec");
}
Aggregations