Search in sources :

Example 1 with RandomForestRegressor

use of org.apache.spark.ml.regression.RandomForestRegressor in project net.jgp.labs.spark by jgperrin.

The `main` method of the class RandomForestRegressorInPipelineApp:

/**
 * Trains a random-forest regression model inside a Spark ML Pipeline
 * (VectorIndexer -> RandomForestRegressor), evaluates it with RMSE on a
 * separate test file, and prints two single-point predictions.
 *
 * @param args unused command-line arguments
 */
public static void main(String[] args) {
    // appName fixed to match this class; it was copy-pasted as "RandomForestRegressorApp".
    SparkSession spark = SparkSession.builder()
        .appName("RandomForestRegressorInPipelineApp")
        .master("local[*]")
        .getOrCreate();
    // $example on$
    // Load and parse the data file, converting it to a DataFrame.
    Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
    df.show(20, false);
    // Automatically identify categorical features, and index them.
    // Set maxCategories so features with > 4 distinct values are treated as continuous.
    VectorIndexerModel featureIndexer = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexedFeatures")
        .setMaxCategories(4)
        .fit(df);
    // Train on the full dataset; the test set comes from a separate file rather
    // than a randomSplit, so no split is performed here.
    Dataset<Row> trainingData = df;
    Dataset<Row> testData = spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");
    // Train a RandomForest model. It consumes the indexed feature column
    // produced by the VectorIndexer stage.
    RandomForestRegressor rf = new RandomForestRegressor()
        .setLabelCol("label")
        .setFeaturesCol("indexedFeatures");
    // Chain indexer and forest in a Pipeline.
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });
    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);
    // Make predictions on the held-out test file.
    Dataset<Row> predictions = model.transform(testData);
    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5);
    // Select (prediction, true label) and compute test error.
    RegressionEvaluator evaluator = new RegressionEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("rmse");
    double rmse = evaluator.evaluate(predictions);
    System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
    // Stage 1 of the pipeline is the fitted forest (stage 0 is the indexer).
    RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
    System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
    // $example off$
    // Predict directly on raw feature vectors to sanity-check the model.
    Double feature = 2.0;
    Vector features = Vectors.dense(feature);
    double p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    feature = 11.0;
    features = Vectors.dense(feature);
    p = rfModel.predict(features);
    // NOTE(review): "(expected: 2)" for feature 11.0 looks copy-pasted from the
    // previous case — confirm the expected value against simplegauss_test data.
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    spark.stop();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Dataset(org.apache.spark.sql.Dataset) VectorIndexer(org.apache.spark.ml.feature.VectorIndexer) Pipeline(org.apache.spark.ml.Pipeline) PipelineModel(org.apache.spark.ml.PipelineModel) RandomForestRegressionModel(org.apache.spark.ml.regression.RandomForestRegressionModel) RandomForestRegressor(org.apache.spark.ml.regression.RandomForestRegressor) VectorIndexerModel(org.apache.spark.ml.feature.VectorIndexerModel) Row(org.apache.spark.sql.Row) RegressionEvaluator(org.apache.spark.ml.evaluation.RegressionEvaluator) Vector(org.apache.spark.ml.linalg.Vector)

Example 2 with RandomForestRegressor

use of org.apache.spark.ml.regression.RandomForestRegressor in project net.jgp.labs.spark by jgperrin.

The `main` method of the class RandomForestRegressorApp:

/**
 * Trains a random-forest regression model through a Spark ML Pipeline
 * (VectorIndexer -> RandomForestRegressor), reports RMSE on a separate test
 * file, and prints two single-point predictions.
 *
 * @param args unused command-line arguments
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("RandomForestRegressorApp")
        .master("local[*]")
        .getOrCreate();
    // Load and parse the data file, converting it to a DataFrame.
    Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");
    df.show(20, false);
    // Automatically identify categorical features, and index them.
    // Set maxCategories so features with > 4 distinct values are treated as continuous.
    VectorIndexerModel featureIndexer = new VectorIndexer()
        .setInputCol("features")
        .setOutputCol("indexedFeatures")
        .setMaxCategories(4)
        .fit(df);
    // Train on the full dataset; the test set comes from a separate file rather
    // than a randomSplit, so no split is performed here.
    Dataset<Row> trainingData = df;
    Dataset<Row> testData = spark.read().format("libsvm").load("data/sample-ml/simplegauss_test.txt");
    // Train a RandomForest model. It consumes the indexed feature column
    // produced by the VectorIndexer stage.
    RandomForestRegressor rf = new RandomForestRegressor()
        .setLabelCol("label")
        .setFeaturesCol("indexedFeatures");
    // Chain indexer and forest in a Pipeline.
    Pipeline pipeline = new Pipeline().setStages(new PipelineStage[] { featureIndexer, rf });
    // Train model. This also runs the indexer.
    PipelineModel model = pipeline.fit(trainingData);
    // Make predictions on the held-out test file.
    Dataset<Row> predictions = model.transform(testData);
    // Select example rows to display.
    predictions.select("prediction", "label", "features").show(5);
    // Select (prediction, true label) and compute test error.
    RegressionEvaluator evaluator = new RegressionEvaluator()
        .setLabelCol("label")
        .setPredictionCol("prediction")
        .setMetricName("rmse");
    double rmse = evaluator.evaluate(predictions);
    System.out.println("Root Mean Squared Error (RMSE) on test data = " + rmse);
    // Stage 1 of the pipeline is the fitted forest (stage 0 is the indexer).
    RandomForestRegressionModel rfModel = (RandomForestRegressionModel) (model.stages()[1]);
    System.out.println("Learned regression forest model:\n" + rfModel.toDebugString());
    // $example off$
    // Predict directly on raw feature vectors to sanity-check the model.
    Double feature = 2.0;
    Vector features = Vectors.dense(feature);
    double p = rfModel.predict(features);
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    feature = 11.0;
    features = Vectors.dense(feature);
    p = rfModel.predict(features);
    // NOTE(review): "(expected: 2)" for feature 11.0 looks copy-pasted from the
    // previous case — confirm the expected value against simplegauss_test data.
    System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    spark.stop();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Dataset(org.apache.spark.sql.Dataset) VectorIndexer(org.apache.spark.ml.feature.VectorIndexer) Pipeline(org.apache.spark.ml.Pipeline) PipelineModel(org.apache.spark.ml.PipelineModel) RandomForestRegressionModel(org.apache.spark.ml.regression.RandomForestRegressionModel) RandomForestRegressor(org.apache.spark.ml.regression.RandomForestRegressor) VectorIndexerModel(org.apache.spark.ml.feature.VectorIndexerModel) Row(org.apache.spark.sql.Row) RegressionEvaluator(org.apache.spark.ml.evaluation.RegressionEvaluator) Vector(org.apache.spark.ml.linalg.Vector)

Aggregations

Pipeline (org.apache.spark.ml.Pipeline)2 PipelineModel (org.apache.spark.ml.PipelineModel)2 RegressionEvaluator (org.apache.spark.ml.evaluation.RegressionEvaluator)2 VectorIndexer (org.apache.spark.ml.feature.VectorIndexer)2 VectorIndexerModel (org.apache.spark.ml.feature.VectorIndexerModel)2 Vector (org.apache.spark.ml.linalg.Vector)2 RandomForestRegressionModel (org.apache.spark.ml.regression.RandomForestRegressionModel)2 RandomForestRegressor (org.apache.spark.ml.regression.RandomForestRegressor)2 Dataset (org.apache.spark.sql.Dataset)2 Row (org.apache.spark.sql.Row)2 SparkSession (org.apache.spark.sql.SparkSession)2