Search in sources:

Example 6 with DenseVector

Use of org.apache.spark.ml.linalg.DenseVector in project mmtf-spark by sbl-sdsc:

the main method of the DatasetClassifier class.

/**
 * Fits and evaluates several multi-class classifiers (decision tree, random
 * forest, logistic regression, multilayer perceptron) on a feature dataset
 * read from a parquet file, printing metrics for each model.
 *
 * @param args args[0] path to parquet file, args[1] name of the classification column
 * @throws IOException if the parquet file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.err.println("Usage: " + DatasetClassifier.class.getSimpleName() + " <parquet file> <classification column name>");
        System.exit(1);
    }
    // name of the class label column
    String label = args[1];
    long start = System.nanoTime();
    SparkSession spark = SparkSession.builder().master("local[*]").appName(DatasetClassifier.class.getSimpleName()).getOrCreate();
    try {
        Dataset<Row> data = spark.read().parquet(args[0]).cache();
        // determine the feature vector dimensionality from the first row;
        // the "features" column may hold either a dense or a sparse vector
        int featureCount = 0;
        Object vector = data.first().getAs("features");
        if (vector instanceof DenseVector) {
            featureCount = ((DenseVector) vector).numActives();
        } else if (vector instanceof SparseVector) {
            featureCount = ((SparseVector) vector).numActives();
        }
        System.out.println("Feature count            : " + featureCount);
        int classCount = (int) data.select(label).distinct().count();
        System.out.println("Class count              : " + classCount);
        System.out.println("Dataset size (unbalanced): " + data.count());
        data.groupBy(label).count().show(classCount);
        // downsample so every class has roughly the same number of examples
        data = DatasetBalancer.downsample(data, label, 1);
        System.out.println("Dataset size (balanced)  : " + data.count());
        data.groupBy(label).count().show(classCount);
        double testFraction = 0.3;
        long seed = 123;
        SparkMultiClassClassifier mcc;
        Map<String, String> metrics;
        DecisionTreeClassifier dtc = new DecisionTreeClassifier();
        mcc = new SparkMultiClassClassifier(dtc, label, testFraction, seed);
        metrics = mcc.fit(data);
        System.out.println(metrics);
        RandomForestClassifier rfc = new RandomForestClassifier();
        mcc = new SparkMultiClassClassifier(rfc, label, testFraction, seed);
        metrics = mcc.fit(data);
        System.out.println(metrics);
        LogisticRegression lr = new LogisticRegression();
        mcc = new SparkMultiClassClassifier(lr, label, testFraction, seed);
        metrics = mcc.fit(data);
        System.out.println(metrics);
        // specify layers for the neural network
        // input layer: dimension of feature vector
        // output layer: number of classes
        int[] layers = new int[] { featureCount, 10, classCount };
        MultilayerPerceptronClassifier mpc = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(128).setSeed(1234L).setMaxIter(200);
        mcc = new SparkMultiClassClassifier(mpc, label, testFraction, seed);
        metrics = mcc.fit(data);
        System.out.println(metrics);
    } finally {
        // release the local Spark context even if a classifier fails
        spark.stop();
    }
    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec");
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) SparseVector(org.apache.spark.ml.linalg.SparseVector) RandomForestClassifier(org.apache.spark.ml.classification.RandomForestClassifier) MultilayerPerceptronClassifier(org.apache.spark.ml.classification.MultilayerPerceptronClassifier) DecisionTreeClassifier(org.apache.spark.ml.classification.DecisionTreeClassifier) Row(org.apache.spark.sql.Row) LogisticRegression(org.apache.spark.ml.classification.LogisticRegression) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Example 7 with DenseVector

Use of org.apache.spark.ml.linalg.DenseVector in project systemml by apache:

the createDataFrame method of the DataFrameVectorFrameConversionTest class.

/**
 * Converts a MatrixBlock into a Spark DataFrame in which one group of
 * matrix columns (value type OBJECT in the schema) is packed into a single
 * ml.linalg DenseVector column.
 *
 * @param sparkSession session used to create the data frame
 * @param mb source matrix
 * @param containsID true to prepend a 1-based double row-id column
 * @param schema per-output-column value types; the OBJECT entry marks the
 *               position of the vector column (colsVector matrix columns wide)
 * @return data frame with the optional id column followed by the schema columns
 */
@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // materialize every output row in memory first
    List<Row> rows = new ArrayList<>();
    int idOffset = containsID ? 1 : 0;
    // the vector column collapses colsVector matrix columns into one cell
    int rowLength = mb.getNumColumns() + idOffset - colsVector + 1;
    for (int r = 0; r < mb.getNumRows(); r++) {
        Object[] cells = new Object[rowLength];
        if (containsID) {
            cells[0] = (double) r + 1;
        }
        // mbCol walks the matrix columns, outCol walks the output schema positions
        for (int mbCol = 0, outCol = 0; mbCol < mb.getNumColumns(); mbCol++, outCol++) {
            if (schema[outCol] == ValueType.OBJECT) {
                // pack the next colsVector matrix columns into one dense vector
                double[] vals = DataConverter.convertToDoubleVector(mb.slice(r, r, mbCol, mbCol + colsVector - 1, new MatrixBlock()), false);
                cells[outCol + idOffset] = new DenseVector(vals);
                mbCol += colsVector - 1;
            } else {
                cells[outCol + idOffset] = UtilFunctions.doubleToObject(schema[outCol], mb.quickGetValue(r, mbCol));
            }
        }
        rows.add(RowFactory.create(cells));
    }
    // build the data frame schema: optional id field, then one field per schema entry
    List<StructField> fieldList = new ArrayList<>();
    if (containsID) {
        fieldList.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    }
    for (int c = 0; c < schema.length; c++) {
        DataType colType;
        switch(schema[c]) {
            case STRING:
                colType = DataTypes.StringType;
                break;
            case DOUBLE:
                colType = DataTypes.DoubleType;
                break;
            case INT:
                // SystemML INT maps to a Spark SQL long column
                colType = DataTypes.LongType;
                break;
            case OBJECT:
                colType = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fieldList.add(DataTypes.createStructField("C" + (c + 1), colType, true));
    }
    StructType dfSchema = DataTypes.createStructType(fieldList);
    // distribute the rows and attach the schema
    JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rdd = jsc.parallelize(rows);
    return sparkSession.createDataFrame(rdd, dfSchema);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Example 8 with DenseVector

Use of org.apache.spark.ml.linalg.DenseVector in project systemml by apache:

the createDataFrame method of the DataFrameVectorScriptTest class.

/**
 * Converts a MatrixBlock into a Spark DataFrame in which one group of
 * matrix columns (value type OBJECT in the schema) is packed into a single
 * ml.linalg DenseVector column.
 *
 * @param sparkSession session used to create the data frame
 * @param mb source matrix
 * @param containsID true to prepend a 1-based double row-id column
 * @param schema per-output-column value types; the OBJECT entry marks the
 *               position of the vector column (colsVector matrix columns wide)
 * @return data frame with the optional id column followed by the schema columns
 */
@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // materialize every output row in memory first
    List<Row> rows = new ArrayList<>();
    int idOffset = containsID ? 1 : 0;
    // the vector column collapses colsVector matrix columns into one cell
    int rowLength = mb.getNumColumns() + idOffset - colsVector + 1;
    for (int r = 0; r < mb.getNumRows(); r++) {
        Object[] cells = new Object[rowLength];
        if (containsID) {
            cells[0] = (double) r + 1;
        }
        // mbCol walks the matrix columns, outCol walks the output schema positions
        for (int mbCol = 0, outCol = 0; mbCol < mb.getNumColumns(); mbCol++, outCol++) {
            if (schema[outCol] == ValueType.OBJECT) {
                // pack the next colsVector matrix columns into one dense vector
                double[] vals = DataConverter.convertToDoubleVector(mb.slice(r, r, mbCol, mbCol + colsVector - 1, new MatrixBlock()), false);
                cells[outCol + idOffset] = new DenseVector(vals);
                mbCol += colsVector - 1;
            } else {
                cells[outCol + idOffset] = UtilFunctions.doubleToObject(schema[outCol], mb.quickGetValue(r, mbCol));
            }
        }
        rows.add(RowFactory.create(cells));
    }
    // build the data frame schema: optional id field, then one field per schema entry
    List<StructField> fieldList = new ArrayList<>();
    if (containsID) {
        fieldList.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    }
    for (int c = 0; c < schema.length; c++) {
        DataType colType;
        switch(schema[c]) {
            case STRING:
                colType = DataTypes.StringType;
                break;
            case DOUBLE:
                colType = DataTypes.DoubleType;
                break;
            case INT:
                // SystemML INT maps to a Spark SQL long column
                colType = DataTypes.LongType;
                break;
            case OBJECT:
                colType = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fieldList.add(DataTypes.createStructField("C" + (c + 1), colType, true));
    }
    StructType dfSchema = DataTypes.createStructType(fieldList);
    // distribute the rows and attach the schema
    JavaSparkContext jsc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rdd = jsc.parallelize(rows);
    return sparkSession.createDataFrame(rdd, dfSchema);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) VectorUDT(org.apache.spark.ml.linalg.VectorUDT) StructType(org.apache.spark.sql.types.StructType) ArrayList(java.util.ArrayList) StructField(org.apache.spark.sql.types.StructField) DataType(org.apache.spark.sql.types.DataType) Row(org.apache.spark.sql.Row) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Example 9 with DenseVector

Use of org.apache.spark.ml.linalg.DenseVector in project jpmml-sparkml by jpmml:

the toList method of the VectorUtil class.

/**
 * Exposes the entries of an ml.linalg vector as a boxed list.
 * Sparse vectors are densified first, so the result always has one element
 * per dimension of the input vector.
 *
 * @param vector vector to convert
 * @return the vector's values as a {@code List<Double>}
 */
public static List<Double> toList(Vector vector) {
    double[] entries = vector.toDense().values();
    return Doubles.asList(entries);
}
Also used : DenseVector(org.apache.spark.ml.linalg.DenseVector)

Aggregations

DenseVector (org.apache.spark.ml.linalg.DenseVector)9 Row (org.apache.spark.sql.Row)8 VectorUDT (org.apache.spark.ml.linalg.VectorUDT)6 StructField (org.apache.spark.sql.types.StructField)6 StructType (org.apache.spark.sql.types.StructType)6 ArrayList (java.util.ArrayList)4 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)4 DataType (org.apache.spark.sql.types.DataType)4 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)4 Vector (org.apache.spark.ml.linalg.Vector)2 SparkSession (org.apache.spark.sql.SparkSession)2 DoubleType (org.apache.spark.sql.types.DoubleType)2 MLResults (org.apache.sysml.api.mlcontext.MLResults)2 Script (org.apache.sysml.api.mlcontext.Script)2 Test (org.junit.Test)2 DecisionTreeClassifier (org.apache.spark.ml.classification.DecisionTreeClassifier)1 LogisticRegression (org.apache.spark.ml.classification.LogisticRegression)1 MultilayerPerceptronClassifier (org.apache.spark.ml.classification.MultilayerPerceptronClassifier)1 RandomForestClassifier (org.apache.spark.ml.classification.RandomForestClassifier)1 SparseVector (org.apache.spark.ml.linalg.SparseVector)1