Search in sources :

Example 1 with DenseVector

use of org.apache.spark.ml.linalg.DenseVector in project incubator-systemml by apache.

From the class MLContextTest, method testOutputDataFrameOfVectorsDML.

@Test
public void testOutputDataFrameOfVectorsDML() {
    System.out.println("MLContextTest - output DataFrame of vectors DML");
    // Build a 2x2 matrix in DML and fetch it back as a vector-typed DataFrame.
    String dmlSource = "m=matrix('1 2 3 4',rows=2,cols=2);";
    Script dmlScript = dml(dmlSource).out("m");
    MLResults execResults = ml.execute(dmlScript);
    Dataset<Row> vectorFrame = execResults.getDataFrame("m", true);
    Dataset<Row> ordered = vectorFrame.sort(RDDConverterUtils.DF_ID_COLUMN);
    // Schema check: column 0 is the double row id, column 1 holds ML vectors.
    StructType frameSchema = ordered.schema();
    StructField[] columns = frameSchema.fields();
    Assert.assertTrue(columns[0].dataType() instanceof DoubleType);
    Assert.assertTrue(columns[1].dataType() instanceof VectorUDT);
    // Content check: each row id pairs with the matching matrix row values.
    List<Row> rows = ordered.collectAsList();
    Row first = rows.get(0);
    Assert.assertEquals(1.0, first.getDouble(0), 0.0);
    Vector firstVec = (DenseVector) first.get(1);
    Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, firstVec.toArray(), 0.0);
    Row second = rows.get(1);
    Assert.assertEquals(2.0, second.getDouble(0), 0.0);
    Vector secondVec = (DenseVector) second.get(1);
    Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, secondVec.toArray(), 0.0);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) MLResults(org.apache.sysml.api.mlcontext.MLResults) StructField(org.apache.spark.sql.types.StructField) DoubleType(org.apache.spark.sql.types.DoubleType) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)

Example 2 with DenseVector

use of org.apache.spark.ml.linalg.DenseVector in project incubator-systemml by apache.

From the class DataFrameVectorFrameConversionTest, method createDataFrame.

@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // Materialize the matrix block into local rows. Each OBJECT-typed schema
    // entry consumes colsVector consecutive matrix columns and becomes a
    // single DenseVector cell; every other entry maps one matrix cell.
    int off = containsID ? 1 : 0;
    int numCols = mb.getNumColumns() + off - colsVector + 1;
    List<Row> rows = new ArrayList<Row>();
    for (int r = 0; r < mb.getNumRows(); r++) {
        Object[] cells = new Object[numCols];
        if (containsID) {
            // row ids are 1-based doubles, per the converter convention
            cells[0] = (double) r + 1;
        }
        // c walks matrix columns, field walks output fields; c jumps ahead
        // by colsVector-1 extra positions whenever a vector is packed
        for (int c = 0, field = 0; c < mb.getNumColumns(); c++, field++) {
            if (schema[field] == ValueType.OBJECT) {
                MatrixBlock slice = mb.slice(r, r, c, c + colsVector - 1, new MatrixBlock());
                cells[field + off] = new DenseVector(DataConverter.convertToDoubleVector(slice, false));
                c += colsVector - 1;
            } else {
                cells[field + off] = UtilFunctions.doubleToObject(schema[field], mb.quickGetValue(r, c));
            }
        }
        rows.add(RowFactory.create(cells));
    }
    // Build the matching DataFrame schema (all columns nullable).
    List<StructField> dfFields = new ArrayList<StructField>();
    if (containsID)
        dfFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int s = 0; s < schema.length; s++) {
        DataType colType;
        if (schema[s] == ValueType.STRING)
            colType = DataTypes.StringType;
        else if (schema[s] == ValueType.DOUBLE)
            colType = DataTypes.DoubleType;
        else if (schema[s] == ValueType.INT)
            colType = DataTypes.LongType;
        else if (schema[s] == ValueType.OBJECT)
            colType = new VectorUDT();
        else
            throw new RuntimeException("Unsupported value type.");
        dfFields.add(DataTypes.createStructField("C" + (s + 1), colType, true));
    }
    StructType dfSchema = DataTypes.createStructType(dfFields);
    // Parallelize the local rows and attach the explicit schema.
    JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRdd = jsc.parallelize(rows);
    return sparkSession.createDataFrame(rowRdd, dfSchema);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Example 3 with DenseVector

use of org.apache.spark.ml.linalg.DenseVector in project incubator-systemml by apache.

From the class DataFrameVectorScriptTest, method createDataFrame.

@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // Step 1: convert the matrix block into an in-memory list of Rows.
    // An OBJECT schema entry bundles colsVector matrix columns into one
    // DenseVector field; all other entries are copied cell-by-cell.
    int off = containsID ? 1 : 0;
    int rowWidth = mb.getNumColumns() + off - colsVector + 1;
    List<Row> rowList = new ArrayList<Row>();
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] values = new Object[rowWidth];
        if (containsID) {
            // 1-based double row id, matching the RDD converter convention
            values[0] = (double) i + 1;
        }
        int outPos = 0;
        for (int col = 0; col < mb.getNumColumns(); col++, outPos++) {
            if (schema[outPos] != ValueType.OBJECT) {
                values[outPos + off] = UtilFunctions.doubleToObject(schema[outPos], mb.quickGetValue(i, col));
            } else {
                // pack the next colsVector matrix values into a dense vector
                double[] packed = DataConverter.convertToDoubleVector(mb.slice(i, i, col, col + colsVector - 1, new MatrixBlock()), false);
                values[outPos + off] = new DenseVector(packed);
                col += colsVector - 1;
            }
        }
        rowList.add(RowFactory.create(values));
    }
    // Step 2: derive the DataFrame schema (nullable columns throughout).
    List<StructField> structFields = new ArrayList<StructField>();
    if (containsID)
        structFields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int k = 0; k < schema.length; k++) {
        DataType sparkType;
        switch(schema[k]) {
            case STRING:
                sparkType = DataTypes.StringType;
                break;
            case DOUBLE:
                sparkType = DataTypes.DoubleType;
                break;
            case INT:
                sparkType = DataTypes.LongType;
                break;
            case OBJECT:
                sparkType = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        structFields.add(DataTypes.createStructField("C" + (k + 1), sparkType, true));
    }
    // Step 3: parallelize and assemble the DataFrame.
    JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rdd = jsc.parallelize(rowList);
    return sparkSession.createDataFrame(rdd, DataTypes.createStructType(structFields));
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Example 4 with DenseVector

use of org.apache.spark.ml.linalg.DenseVector in project systemml by apache.

From the class MLContextTest, method testOutputDataFrameOfVectorsDML.

@Test
public void testOutputDataFrameOfVectorsDML() {
    System.out.println("MLContextTest - output DataFrame of vectors DML");
    // Execute a tiny DML script producing a 2x2 matrix, exported as vectors.
    Script script = dml("m=matrix('1 2 3 4',rows=2,cols=2);").out("m");
    MLResults results = ml.execute(script);
    Dataset<Row> sorted = results.getDataFrame("m", true).sort(RDDConverterUtils.DF_ID_COLUMN);
    // The resulting schema must be [double id column, vector column].
    StructType schema = sorted.schema();
    StructField[] fields = schema.fields();
    Assert.assertTrue(fields[0].dataType() instanceof DoubleType);
    Assert.assertTrue(fields[1].dataType() instanceof VectorUDT);
    // Verify both rows carry the expected id and matrix-row contents.
    List<Row> collected = sorted.collectAsList();
    Row rowOne = collected.get(0);
    Assert.assertEquals(1.0, rowOne.getDouble(0), 0.0);
    Vector vecOne = (DenseVector) rowOne.get(1);
    double[] dataOne = vecOne.toArray();
    Assert.assertArrayEquals(new double[] { 1.0, 2.0 }, dataOne, 0.0);
    Row rowTwo = collected.get(1);
    Assert.assertEquals(2.0, rowTwo.getDouble(0), 0.0);
    Vector vecTwo = (DenseVector) rowTwo.get(1);
    double[] dataTwo = vecTwo.toArray();
    Assert.assertArrayEquals(new double[] { 3.0, 4.0 }, dataTwo, 0.0);
}
Also used : Script(org.apache.sysml.api.mlcontext.Script) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) MLResults(org.apache.sysml.api.mlcontext.MLResults) StructField(org.apache.spark.sql.types.StructField) DoubleType(org.apache.spark.sql.types.DoubleType) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector) DenseVector(org.apache.spark.ml.linalg.DenseVector) Test(org.junit.Test)

Example 5 with DenseVector

use of org.apache.spark.ml.linalg.DenseVector in project mmtf-spark by sbl-sdsc.

From the class DatasetRegressor, method main.

/**
 * Trains and prints evaluation results for three Spark ML regressors
 * (linear, gradient-boosted trees, generalized linear) on a dataset of
 * feature vectors read from a parquet file.
 *
 * @param args args[0] path to parquet file, args[1] name of the prediction column
 * @throws IOException if the dataset cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.err.println("Usage: " + DatasetRegressor.class.getSimpleName() + " <parquet file> <prediction column name>");
        System.exit(1);
    }
    // name of the prediction column
    String label = args[1];
    long start = System.nanoTime();
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName(DatasetRegressor.class.getSimpleName())
        .getOrCreate();
    Dataset<Row> data = spark.read().parquet(args[0]).cache();
    // infer the feature dimensionality from the first row's vector column
    int featureCount = ((DenseVector) data.first().getAs("features")).numActives();
    System.out.println("Feature count: " + featureCount);
    System.out.println("Dataset size : " + data.count());
    double testFraction = 0.3;
    long seed = 123;
    // 1) ordinary linear regression
    LinearRegression lr = new LinearRegression().setLabelCol(label).setFeaturesCol("features");
    SparkRegressor reg = new SparkRegressor(lr, label, testFraction, seed);
    System.out.println(reg.fit(data));
    // 2) gradient-boosted tree regression
    GBTRegressor gbt = new GBTRegressor().setLabelCol(label).setFeaturesCol("features");
    reg = new SparkRegressor(gbt, label, testFraction, seed);
    System.out.println(reg.fit(data));
    // 3) generalized linear regression (gaussian family, identity link)
    GeneralizedLinearRegression glr = new GeneralizedLinearRegression()
        .setLabelCol(label)
        .setFeaturesCol("features")
        .setFamily("gaussian")
        .setLink("identity")
        .setMaxIter(10)
        .setRegParam(0.3);
    reg = new SparkRegressor(glr, label, testFraction, seed);
    System.out.println(reg.fit(data));
    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec");
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) GeneralizedLinearRegression(org.apache.spark.ml.regression.GeneralizedLinearRegression) GBTRegressor(org.apache.spark.ml.regression.GBTRegressor) Row(org.apache.spark.sql.Row) LinearRegression(org.apache.spark.ml.regression.LinearRegression) GeneralizedLinearRegression(org.apache.spark.ml.regression.GeneralizedLinearRegression) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Aggregations

DenseVector (org.apache.spark.ml.linalg.DenseVector)9 Row (org.apache.spark.sql.Row)8 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)6 StructField (org.apache.spark.sql.types.StructField)6 StructType (org.apache.spark.sql.types.StructType)6 ArrayList (java.util.ArrayList)4 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)4 DataType (org.apache.spark.sql.types.DataType)4 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)4 Vector (org.apache.spark.ml.linalg.Vector)2 SparkSession (org.apache.spark.sql.SparkSession)2 DoubleType (org.apache.spark.sql.types.DoubleType)2 MLResults (org.apache.sysml.api.mlcontext.MLResults)2 Script (org.apache.sysml.api.mlcontext.Script)2 Test (org.junit.Test)2 DecisionTreeClassifier (org.apache.spark.ml.classification.DecisionTreeClassifier)1 LogisticRegression (org.apache.spark.ml.classification.LogisticRegression)1 MultilayerPerceptronClassifier (org.apache.spark.ml.classification.MultilayerPerceptronClassifier)1 RandomForestClassifier (org.apache.spark.ml.classification.RandomForestClassifier)1 SparseVector (org.apache.spark.ml.linalg.SparseVector)1