Use of org.apache.spark.ml.feature.StringIndexerModel in project mmtf-spark by sbl-sdsc: class SparkMultiClassClassifier, method fit.
/**
 * Fits the classifier. The dataset must contain at least the following two columns:
 *   label: the class labels
 *   features: the feature vector
 *
 * @param data input dataset with label and features columns
 * @return map of metric names to values
 */
public Map<String, String> fit(Dataset<Row> data) {
    int classCount = (int) data.select(label).distinct().count();

    StringIndexerModel labelIndexer = new StringIndexer()
            .setInputCol(label)
            .setOutputCol("indexedLabel")
            .fit(data);

    // Split the data into training and test sets (testFraction held out for testing)
    Dataset<Row>[] splits = data.randomSplit(new double[] { 1.0 - testFraction, testFraction }, seed);
    Dataset<Row> trainingData = splits[0];
    Dataset<Row> testData = splits[1];

    // Print the per-class sample counts in the training and test sets
    String[] labels = labelIndexer.labels();
    System.out.println();
    System.out.println("Class\tTrain\tTest");
    for (String l : labels) {
        System.out.println(l + "\t"
                + trainingData.select(label).filter(label + " = '" + l + "'").count() + "\t"
                + testData.select(label).filter(label + " = '" + l + "'").count());
    }

    // Set input columns
    predictor.setLabelCol("indexedLabel").setFeaturesCol("features");

    // Convert indexed labels back to original labels
    IndexToString labelConverter = new IndexToString()
            .setInputCol("prediction")
            .setOutputCol("predictedLabel")
            .setLabels(labelIndexer.labels());

    // Chain label indexer, predictor, and label converter in a Pipeline
    Pipeline pipeline = new Pipeline()
            .setStages(new PipelineStage[] { labelIndexer, predictor, labelConverter });

    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);

    // Make predictions
    Dataset<Row> predictions = model.transform(testData).cache();

    // Display some sample predictions
    System.out.println();
    System.out.println("Sample predictions: " + predictor.getClass().getSimpleName());
    predictions.sample(false, 0.1, seed).show(25);

    // Use the indexed (numeric) label column for the metrics calculations below
    predictions = predictions.withColumnRenamed(label, "stringLabel");
    predictions = predictions.withColumnRenamed("indexedLabel", label);

    // Collect metrics
    Dataset<Row> pred = predictions.select("prediction", label);
    Map<String, String> metrics = new LinkedHashMap<>();
    metrics.put("Method", predictor.getClass().getSimpleName());

    if (classCount == 2) {
        BinaryClassificationMetrics b = new BinaryClassificationMetrics(pred);
        metrics.put("AUC", Float.toString((float) b.areaUnderROC()));
    }

    MulticlassMetrics m = new MulticlassMetrics(pred);
    metrics.put("F", Float.toString((float) m.weightedFMeasure()));
    metrics.put("Accuracy", Float.toString((float) m.accuracy()));
    metrics.put("Precision", Float.toString((float) m.weightedPrecision()));
    metrics.put("Recall", Float.toString((float) m.weightedRecall()));
    metrics.put("False Positive Rate", Float.toString((float) m.weightedFalsePositiveRate()));
    metrics.put("True Positive Rate", Float.toString((float) m.weightedTruePositiveRate()));
    metrics.put("", "\nConfusion Matrix\n" + Arrays.toString(labels) + "\n" + m.confusionMatrix().toString());

    return metrics;
}
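For context, here is a minimal, hypothetical driver showing how this fit method might be invoked. The SparkMultiClassClassifier constructor signature (predictor, label column name, test fraction, seed) is only assumed from the fields referenced above and may differ from the actual mmtf-spark API; the toy dataset and the LogisticRegression predictor are purely illustrative.

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.spark.ml.classification.LogisticRegression;
import org.apache.spark.ml.linalg.VectorUDT;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// import of SparkMultiClassClassifier omitted; its package depends on the mmtf-spark layout

public class FitUsageSketch {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("FitUsageSketch")
                .getOrCreate();

        // Toy dataset with the two required columns: "label" (String) and "features" (Vector)
        List<Row> rows = Arrays.asList(
                RowFactory.create("alpha", Vectors.dense(1.0, 0.1)),
                RowFactory.create("alpha", Vectors.dense(0.9, 0.2)),
                RowFactory.create("alpha", Vectors.dense(1.1, 0.0)),
                RowFactory.create("alpha", Vectors.dense(0.8, 0.3)),
                RowFactory.create("beta", Vectors.dense(0.1, 1.0)),
                RowFactory.create("beta", Vectors.dense(0.2, 0.9)),
                RowFactory.create("beta", Vectors.dense(0.0, 1.1)),
                RowFactory.create("beta", Vectors.dense(0.3, 0.8)));

        StructType schema = new StructType(new StructField[] {
                new StructField("label", DataTypes.StringType, false, Metadata.empty()),
                new StructField("features", new VectorUDT(), false, Metadata.empty()) });

        Dataset<Row> data = spark.createDataFrame(rows, schema);

        // Assumed constructor: (predictor, label column, test fraction, seed);
        // check the mmtf-spark source for the actual signature.
        SparkMultiClassClassifier classifier =
                new SparkMultiClassClassifier(new LogisticRegression(), "label", 0.3, 123L);

        Map<String, String> metrics = classifier.fit(data);
        metrics.forEach((k, v) -> System.out.println(k + "\t" + v));

        spark.stop();
    }
}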
Use of org.apache.spark.ml.feature.StringIndexerModel in project jpmml-sparkml by jpmml: class StringIndexerModelConverter, method encodeFeatures.
@Override
public List<Feature> encodeFeatures(SparkMLEncoder encoder) {
    StringIndexerModel transformer = getTransformer();

    Feature feature = encoder.getOnlyFeature(transformer.getInputCol());

    List<String> categories = new ArrayList<>();
    categories.addAll(Arrays.asList(transformer.labels()));

    String handleInvalid = transformer.getHandleInvalid();

    Field<?> field = encoder.toCategorical(feature.getName(), categories);
    if (field instanceof DataField) {
        DataField dataField = (DataField) field;

        // Translate Spark's handleInvalid setting into the PMML invalid value treatment
        InvalidValueTreatmentMethod invalidValueTreatmentMethod;
        switch (handleInvalid) {
            case "keep":
                invalidValueTreatmentMethod = InvalidValueTreatmentMethod.AS_IS;
                break;
            case "error":
                invalidValueTreatmentMethod = InvalidValueTreatmentMethod.RETURN_INVALID;
                break;
            default:
                throw new IllegalArgumentException(handleInvalid);
        }

        InvalidValueDecorator invalidValueDecorator = new InvalidValueDecorator()
                .setInvalidValueTreatment(invalidValueTreatmentMethod);

        encoder.addDecorator(dataField.getName(), invalidValueDecorator);
    } else if (field instanceof DerivedField) {
        // Ignored
    } else {
        throw new IllegalArgumentException();
    }

    switch (handleInvalid) {
        case "keep":
            // Map categories unseen at training time to a synthetic "unknown" label:
            // if(isIn(feature, categories)) then feature else LABEL_UNKNOWN
            Apply setApply = PMMLUtil.createApply("isIn", feature.ref());
            for (String category : categories) {
                setApply.addExpressions(PMMLUtil.createConstant(category, feature.getDataType()));
            }

            categories.add(StringIndexerModelConverter.LABEL_UNKNOWN);

            Apply apply = PMMLUtil.createApply("if", setApply, feature.ref(),
                    PMMLUtil.createConstant(StringIndexerModelConverter.LABEL_UNKNOWN, DataType.STRING));

            field = encoder.createDerivedField(FeatureUtil.createName("handleInvalid", feature), OpType.CATEGORICAL, feature.getDataType(), apply);
            break;
        default:
            break;
    }

    return Collections.<Feature>singletonList(new CategoricalFeature(encoder, field, categories));
}
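Similarly, here is a minimal, hypothetical sketch of a Spark pipeline whose StringIndexer stages the converter above would encode, followed by conversion to PMML. The PMMLBuilder entry point is assumed to be available (recent jpmml-sparkml releases expose it; older ones used ConverterUtil instead), and the toy schema, column names, and DecisionTreeClassifier stage are invented for illustration.

import java.util.Arrays;
import java.util.List;

import org.apache.spark.ml.Pipeline;
import org.apache.spark.ml.PipelineModel;
import org.apache.spark.ml.PipelineStage;
import org.apache.spark.ml.classification.DecisionTreeClassifier;
import org.apache.spark.ml.feature.StringIndexer;
import org.apache.spark.ml.feature.VectorAssembler;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.Metadata;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.dmg.pmml.PMML;
import org.jpmml.sparkml.PMMLBuilder;

public class StringIndexerPmmlSketch {

    public static void main(String[] args) throws Exception {
        SparkSession spark = SparkSession.builder()
                .master("local[*]")
                .appName("StringIndexerPmmlSketch")
                .getOrCreate();

        StructType schema = new StructType(new StructField[] {
                new StructField("color", DataTypes.StringType, false, Metadata.empty()),
                new StructField("x", DataTypes.DoubleType, false, Metadata.empty()),
                new StructField("species", DataTypes.StringType, false, Metadata.empty()) });

        List<Row> rows = Arrays.asList(
                RowFactory.create("red", 1.0, "yes"),
                RowFactory.create("green", 2.0, "no"),
                RowFactory.create("blue", 3.0, "no"),
                RowFactory.create("red", 4.0, "yes"));

        Dataset<Row> data = spark.createDataFrame(rows, schema);

        // handleInvalid = "keep" maps categories unseen at fit time to an extra index;
        // the converter above turns this into the synthetic "unknown" label in PMML.
        StringIndexer colorIndexer = new StringIndexer()
                .setInputCol("color")
                .setOutputCol("colorIndex")
                .setHandleInvalid("keep");

        StringIndexer labelIndexer = new StringIndexer()
                .setInputCol("species")
                .setOutputCol("label");

        VectorAssembler assembler = new VectorAssembler()
                .setInputCols(new String[] { "colorIndex", "x" })
                .setOutputCol("features");

        DecisionTreeClassifier classifier = new DecisionTreeClassifier()
                .setLabelCol("label")
                .setFeaturesCol("features");

        PipelineModel pipelineModel = new Pipeline()
                .setStages(new PipelineStage[] { colorIndexer, labelIndexer, assembler, classifier })
                .fit(data);

        // Assumed jpmml-sparkml entry point; older releases used ConverterUtil.toPMML(schema, pipelineModel)
        PMML pmml = new PMMLBuilder(schema, pipelineModel).build();
        System.out.println(pmml.getDataDictionary());

        spark.stop();
    }
}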