use of org.apache.spark.ml.feature.VectorIndexer in project net.jgp.labs.spark by jgperrin.
the class RandomForestRegressorInPipelineApp method main.
/**
 * Trains a random-forest regressor through an ML pipeline and reports its error.
 *
 * <p>Loads the libsvm training file, fits a {@code VectorIndexer} on it, chains the
 * fitted indexer with a {@code RandomForestRegressor} in a {@code Pipeline}, evaluates
 * the fitted pipeline with RMSE against a separate libsvm test file, prints the
 * learned forest, and finally prints predictions for two hand-picked feature values.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RandomForestRegressorApp")
        .master("local[*]")
        .getOrCreate();
    try {
        // Load and parse the data file, converting it to a DataFrame.
        Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
        df.show(20, false);

        // Automatically identify categorical features, and index them.
        // Set maxCategories so features with > 4 distinct values are treated as continuous.
        VectorIndexerModel featureIndexer = new VectorIndexer()
            .setInputCol("features")
            .setOutputCol("indexedFeatures")
            .setMaxCategories(4)
            .fit(df);

        // Train on the complete data set; the test rows come from a separate file
        // rather than from a random split of df.
        Dataset<Row> trainingData = df;
        Dataset<Row> testData =
            spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");

        // Configure the RandomForest estimator to read the indexed feature column.
        RandomForestRegressor rf = new RandomForestRegressor()
            .setLabelCol("label")
            .setFeaturesCol("indexedFeatures");

        // Chain indexer and forest in a Pipeline.
        Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });

        // Train model. This also runs the indexer.
        PipelineModel model = pipeline.fit(trainingData);

        // Make predictions.
        Dataset<Row> predictions = model.transform(testData);

        // Select example rows to display.
        predictions.select("prediction", "label", "features").show(5);

        // Select (prediction, true label) and compute test error.
        RegressionEvaluator evaluator = new RegressionEvaluator()
            .setLabelCol("label")
            .setPredictionCol("prediction")
            .setMetricName("rmse");
        double rmse = evaluator.evaluate(predictions);
        System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);

        // Stage 1 of the fitted pipeline is the trained forest (stage 0 is the indexer).
        RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
        System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());

        // Spot-check the forest directly on single-feature vectors.
        for (double feature : new double[] { 2.0, 11.0 }) {
            Vector features = Vectors.dense(feature);
            double p = rfModel.predict(features);
            System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
        }
    } finally {
        // Always release the session, even when a stage above throws.
        spark.stop();
    }
}
use of org.apache.spark.ml.feature.VectorIndexer in project net.jgp.labs.spark by jgperrin.
the class RandomForestRegressorApp method main.
/**
 * Trains a random-forest regressor on the sample data and reports its error.
 *
 * <p>Reads the libsvm training file, fits a {@code VectorIndexer} on it, builds a
 * two-stage {@code Pipeline} (indexer, then {@code RandomForestRegressor}), scores a
 * separate libsvm test file, prints the RMSE and the learned forest, and then
 * prints the forest's predictions for two individual feature values.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RandomForestRegressorApp")
        .master("local[*]")
        .getOrCreate();
    try {
        // Load and parse the data file, converting it to a DataFrame.
        Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
        df.show(20, false);

        // Automatically identify categorical features, and index them.
        // Set maxCategories so features with > 4 distinct values are treated as continuous.
        VectorIndexerModel featureIndexer = new VectorIndexer()
            .setInputCol("features")
            .setOutputCol("indexedFeatures")
            .setMaxCategories(4)
            .fit(df);

        // The whole data set is used for training; test rows are loaded from their
        // own file instead of being split off from df.
        Dataset<Row> trainingData = df;
        Dataset<Row> testData =
            spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");

        // Configure the RandomForest estimator to read the indexed feature column.
        RandomForestRegressor rf = new RandomForestRegressor()
            .setLabelCol("label")
            .setFeaturesCol("indexedFeatures");

        // Chain indexer and forest in a Pipeline.
        Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });

        // Train model. This also runs the indexer.
        PipelineModel model = pipeline.fit(trainingData);

        // Make predictions.
        Dataset<Row> predictions = model.transform(testData);

        // Select example rows to display.
        predictions.select("prediction", "label", "features").show(5);

        // Select (prediction, true label) and compute test error.
        RegressionEvaluator evaluator = new RegressionEvaluator()
            .setLabelCol("label")
            .setPredictionCol("prediction")
            .setMetricName("rmse");
        double rmse = evaluator.evaluate(predictions);
        System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);

        // The trained forest is the second stage of the fitted pipeline.
        RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
        System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());

        // Query the forest directly with single-feature vectors.
        for (double feature : new double[] { 2.0, 11.0 }) {
            Vector features = Vectors.dense(feature);
            double p = rfModel.predict(features);
            System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
        }
    } finally {
        // Stop the session on every exit path so a failure does not leave it running.
        spark.stop();
    }
}
use of org.apache.spark.ml.feature.VectorIndexer in project net.jgp.labs.spark by jgperrin.
the class GradientBoostedTreeRegressorApp method main.
/**
 * Trains a gradient-boosted-tree regressor on the sample data and reports its error.
 *
 * <p>Loads the libsvm data file, fits a {@code GBTRegressor} (100 iterations) on the
 * raw {@code features} column, scores the same DataFrame, prints the RMSE and the
 * learned model, and then prints predictions for two individual feature values.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("GradientBoostedTreeRegressorApp")
        .master("local[*]")
        .getOrCreate();
    try {
        // Load and parse the data file, converting it to a DataFrame.
        Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");

        // Configure a GBT estimator that reads the raw "features" column directly,
        // so no feature indexer is needed.
        GBTRegressor gbt = new GBTRegressor()
            .setLabelCol("label")
            .setFeaturesCol("features")
            .setMaxIter(100);

        // Train model.
        GBTRegressionModel model = gbt.fit(df);

        // Make predictions. Note: this scores the same DataFrame the model was trained on.
        Dataset<Row> predictions = model.transform(df);

        // Display example rows.
        predictions.show(20, false);

        // Select (prediction, true label) and compute the error.
        RegressionEvaluator evaluator = new RegressionEvaluator()
            .setLabelCol("label")
            .setPredictionCol("prediction")
            .setMetricName("rmse");
        double rmse = evaluator.evaluate(predictions);
        System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
        System.out.println("Learned regression GBT model:\n" + model.toDebugString());

        // Spot-check the model on single-feature vectors.
        for (double feature : new double[] { 2.0, 11.0 }) {
            Vector features = Vectors.dense(feature);
            double p = model.predict(features);
            System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
        }
    } finally {
        // Always release the session, even when training or evaluation throws.
        spark.stop();
    }
}
Aggregations