Use of org.apache.spark.ml.linalg.DenseVector in project mmtf-spark by sbl-sdsc:
the class DatasetClassifier, method main.
/**
 * @param args args[0] path to parquet file, args[1] name of the classification column
 * @throws IOException if the parquet file cannot be read
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.err.println("Usage: " + DatasetClassifier.class.getSimpleName()
                + " <parquet file> <classification column name>");
        System.exit(1);
    }

    // name of the class label column
    String label = args[1];

    long start = System.nanoTime();

    SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .appName(DatasetClassifier.class.getSimpleName())
            .getOrCreate();

    Dataset<Row> data = spark.read().parquet(args[0]).cache();

    // determine the feature vector dimension from the first row;
    // size() is used rather than numActives(), because numActives()
    // counts only the stored (non-zero) entries of a SparseVector
    int featureCount = 0;
    Object vector = data.first().getAs("features");
    if (vector instanceof DenseVector) {
        featureCount = ((DenseVector) vector).size();
    } else if (vector instanceof SparseVector) {
        featureCount = ((SparseVector) vector).size();
    }
    System.out.println("Feature count            : " + featureCount);

    int classCount = (int) data.select(label).distinct().count();
    System.out.println("Class count              : " + classCount);

    System.out.println("Dataset size (unbalanced): " + data.count());
    data.groupBy(label).count().show(classCount);

    // downsample the majority classes so that all classes have similar counts
    data = DatasetBalancer.downsample(data, label, 1);
    System.out.println("Dataset size (balanced)  : " + data.count());
    data.groupBy(label).count().show(classCount);

    double testFraction = 0.3;
    long seed = 123;

    SparkMultiClassClassifier mcc;
    Map<String, String> metrics;

    DecisionTreeClassifier dtc = new DecisionTreeClassifier();
    mcc = new SparkMultiClassClassifier(dtc, label, testFraction, seed);
    metrics = mcc.fit(data);
    System.out.println(metrics);

    RandomForestClassifier rfc = new RandomForestClassifier();
    mcc = new SparkMultiClassClassifier(rfc, label, testFraction, seed);
    metrics = mcc.fit(data);
    System.out.println(metrics);

    LogisticRegression lr = new LogisticRegression();
    mcc = new SparkMultiClassClassifier(lr, label, testFraction, seed);
    metrics = mcc.fit(data);
    System.out.println(metrics);

    // specify layers for the neural network:
    // input layer size = dimension of the feature vector,
    // one hidden layer of 10 neurons,
    // output layer size = number of classes
    int[] layers = new int[] { featureCount, 10, classCount };
    MultilayerPerceptronClassifier mpc = new MultilayerPerceptronClassifier()
            .setLayers(layers)
            .setBlockSize(128)
            .setSeed(1234L)
            .setMaxIter(200);
    mcc = new SparkMultiClassClassifier(mpc, label, testFraction, seed);
    metrics = mcc.fit(data);
    System.out.println(metrics);

    long end = System.nanoTime();
    System.out.println((end - start) / 1E9 + " sec");
}
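
Since DenseVector and SparseVector both implement org.apache.spark.ml.linalg.Vector, and Vector.size() returns the full dimension for either, the instanceof dispatch above can be collapsed. A minimal sketch (the helper name and the "features" column name are assumptions carried over from the snippet above):

import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Sketch: read the feature dimension from the first row of a dataset
// whose vector column is named "features".
static int featureCount(Dataset<Row> data) {
    Vector v = data.first().getAs("features");
    return v.size();
}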
Use of org.apache.spark.ml.linalg.DenseVector in project systemml by apache:
the class DataFrameVectorFrameConversionTest, method createDataFrame.
@SuppressWarnings("resource")
private static Dataset<Row> createDataFrame(SparkSession sparkSession, MatrixBlock mb, boolean containsID, ValueType[] schema) {
    // create in-memory list of rows
    // (colsVector is a field of the enclosing test class: the number of
    // consecutive matrix columns that are packed into one vector column)
    List<Row> list = new ArrayList<Row>();
    int off = (containsID ? 1 : 0);
    int clen = mb.getNumColumns() + off - colsVector + 1;
    for (int i = 0; i < mb.getNumRows(); i++) {
        Object[] row = new Object[clen];
        if (containsID)
            row[0] = (double) i + 1;
        for (int j = 0, j2 = 0; j < mb.getNumColumns(); j++, j2++) {
            if (schema[j2] != ValueType.OBJECT) {
                row[j2 + off] = UtilFunctions.doubleToObject(schema[j2], mb.quickGetValue(i, j));
            } else {
                // slice colsVector consecutive columns and wrap them in a DenseVector
                double[] tmp = DataConverter.convertToDoubleVector(
                        mb.slice(i, i, j, j + colsVector - 1, new MatrixBlock()), false);
                row[j2 + off] = new DenseVector(tmp);
                j += colsVector - 1;
            }
        }
        list.add(RowFactory.create(row));
    }

    // create data frame schema
    List<StructField> fields = new ArrayList<StructField>();
    if (containsID)
        fields.add(DataTypes.createStructField(RDDConverterUtils.DF_ID_COLUMN, DataTypes.DoubleType, true));
    for (int j = 0; j < schema.length; j++) {
        DataType dt = null;
        switch (schema[j]) {
            case STRING:
                dt = DataTypes.StringType;
                break;
            case DOUBLE:
                dt = DataTypes.DoubleType;
                break;
            case INT:
                dt = DataTypes.LongType;
                break;
            case OBJECT:
                // vector columns use Spark ML's VectorUDT
                dt = new VectorUDT();
                break;
            default:
                throw new RuntimeException("Unsupported value type.");
        }
        fields.add(DataTypes.createStructField("C" + (j + 1), dt, true));
    }
    StructType dfSchema = DataTypes.createStructType(fields);

    // create rdd and data frame
    JavaSparkContext sc = new JavaSparkContext(sparkSession.sparkContext());
    JavaRDD<Row> rowRDD = sc.parallelize(list);
    return sparkSession.createDataFrame(rowRDD, dfSchema);
}
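
Stripped of the SystemML matrix plumbing, the core pattern is: rows that carry DenseVector values, paired with a schema that types those columns as VectorUDT. A minimal self-contained sketch, assuming an existing SparkSession named spark (the column names and values here are illustrative):

import java.util.Arrays;
import java.util.List;
import org.apache.spark.ml.linalg.DenseVector;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Sketch: a frame with a double ID column and a vector column.
List<Row> rows = Arrays.asList(
        RowFactory.create(1.0, new DenseVector(new double[] { 1.0, 2.0, 3.0 })),
        RowFactory.create(2.0, new DenseVector(new double[] { 4.0, 5.0, 6.0 })));
StructType dfSchema = DataTypes.createStructType(Arrays.asList(
        DataTypes.createStructField("id", DataTypes.DoubleType, false),
        DataTypes.createStructField("C1", new VectorUDT(), true)));
Dataset<Row> df = spark.createDataFrame(rows, dfSchema);

Note that SparkSession.createDataFrame also accepts a java.util.List<Row> directly, so the JavaSparkContext/JavaRDD detour in the test above is only necessary when an RDD is wanted for other reasons.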
Use of org.apache.spark.ml.linalg.DenseVector in project systemml by apache:
the class DataFrameVectorScriptTest, method createDataFrame.
The implementation is identical, line for line, to DataFrameVectorFrameConversionTest.createDataFrame shown above.
Use of org.apache.spark.ml.linalg.DenseVector in project jpmml-sparkml by jpmml:
the class VectorUtil, method toList.
public static List<Double> toList(Vector vector) {
    // toDense() expands a SparseVector to its full length, so zeros become
    // explicit entries; Doubles.asList is Guava's fixed-size List<Double>
    // view over the backing double[] array
    DenseVector denseVector = vector.toDense();
    double[] values = denseVector.values();
    return Doubles.asList(values);
}
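
A quick usage sketch (the inputs are illustrative):

import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;

Vector dense = Vectors.dense(1.0, 0.0, 3.0);
Vector sparse = Vectors.sparse(3, new int[] { 0, 2 }, new double[] { 1.0, 3.0 });

VectorUtil.toList(dense);   // [1.0, 0.0, 3.0]
VectorUtil.toList(sparse);  // [1.0, 0.0, 3.0] as well, since toDense() fills in the zeros

Because Doubles.asList returns a view backed by the underlying array rather than a copy, the result is fixed-size: writes through List.set reach the array, but add and remove are unsupported.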