Use of org.apache.spark.ml.evaluation.RegressionEvaluator in project mmtf-spark by sbl-sdsc.
Class SparkRegressor, method fit.
/**
 * Fits the configured predictor on a random portion of the data and reports
 * the root-mean-square error (RMSE) measured on the held-out remainder.
 *
 * <p>The dataset must contain at least the following two columns:
 * <ul>
 * <li>the column named by the {@code label} field: the values to predict</li>
 * <li>{@code features}: the feature vector</li>
 * </ul>
 * A sample of the predictions is printed to standard output; the first
 * column of the prediction dataset is displayed next to the label and the
 * prediction.
 *
 * @param data dataset to be split into a training and a test portion
 * @return insertion-ordered map holding the predictor's simple class name
 *         under {@code "Method"} and the test-set RMSE under {@code "rmse"}
 */
public Map<String, String> fit(Dataset<Row> data) {
    String methodName = predictor.getClass().getSimpleName();

    // Hold out testFraction of the rows for evaluation; the seed is passed to the split.
    double[] weights = { 1.0 - testFraction, testFraction };
    Dataset<Row>[] parts = data.randomSplit(weights, seed);
    Dataset<Row> trainingPart = parts[0];
    Dataset<Row> heldOutPart = parts[1];

    // Point the predictor at the label and feature columns.
    predictor.setLabelCol(label).setFeaturesCol("features");

    // The predictor is the only stage of the pipeline; fit it on the training portion.
    PipelineStage[] stages = { predictor };
    PipelineModel fittedModel = new Pipeline().setStages(stages).fit(trainingPart);

    // Apply the fitted model to the held-out rows.
    Dataset<Row> scored = fittedModel.transform(heldOutPart);

    // Print a ~10% sample (at most 50 rows) of the predictions for a quick visual check.
    System.out.println("Sample predictions: " + methodName);
    String firstColumn = scored.columns()[0];
    scored.select(firstColumn, label, "prediction")
            .sample(false, 0.1, seed)
            .show(50);

    // Compare predictions against the true labels using RMSE.
    RegressionEvaluator rmseEvaluator = new RegressionEvaluator()
            .setLabelCol(label)
            .setPredictionCol("prediction")
            .setMetricName("rmse");
    double rmse = rmseEvaluator.evaluate(scored);

    Map<String, String> result = new LinkedHashMap<>();
    result.put("Method", methodName);
    result.put("rmse", Double.toString(rmse));
    return result;
}
Aggregations