Search in sources :

Example 1 with GBTRegressor

use of org.apache.spark.ml.regression.GBTRegressor in project mmtf-spark by sbl-sdsc.

The method main of the class DatasetRegressor.

/**
 * Fits three Spark ML regressors (linear regression, gradient-boosted trees and a
 * generalized linear model) to a dataset read from a Parquet file, printing the value
 * returned by each {@code SparkRegressor.fit} call followed by the elapsed wall-clock time.
 *
 * <p>The dataset must contain a {@code features} column; the first row's value is cast to
 * {@link DenseVector} to report the feature count, so a sparse vector in that column would
 * fail with a {@code ClassCastException}.
 *
 * @param args args[0] path to parquet file, args[1] name of the label column to predict
 * @throws IOException declared by the signature; presumably raised by
 *         {@code SparkRegressor.fit} (not visible here, confirm against that class)
 */
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.err.println("Usage: " + DatasetRegressor.class.getSimpleName() + " <parquet file> <prediction column name>");
        System.exit(1);
    }
    // name of the label column the regressors are trained to predict
    String label = args[1];
    long start = System.nanoTime();
    SparkSession spark = SparkSession.builder().master("local[*]").appName(DatasetRegressor.class.getSimpleName()).getOrCreate();
    try {
        // cached because the dataset is scanned once for the count and once per regressor
        Dataset<Row> data = spark.read().parquet(args[0]).cache();
        int featureCount = ((DenseVector) data.first().getAs("features")).numActives();
        System.out.println("Feature count: " + featureCount);
        System.out.println("Dataset size : " + data.count());

        // identical split fraction and seed are handed to every SparkRegressor
        double testFraction = 0.3;
        long seed = 123;

        LinearRegression lr = new LinearRegression().setLabelCol(label).setFeaturesCol("features");
        SparkRegressor reg = new SparkRegressor(lr, label, testFraction, seed);
        System.out.println(reg.fit(data));

        GBTRegressor gbt = new GBTRegressor().setLabelCol(label).setFeaturesCol("features");
        reg = new SparkRegressor(gbt, label, testFraction, seed);
        System.out.println(reg.fit(data));

        GeneralizedLinearRegression glr = new GeneralizedLinearRegression().setLabelCol(label).setFeaturesCol("features").setFamily("gaussian").setLink("identity").setMaxIter(10).setRegParam(0.3);
        reg = new SparkRegressor(glr, label, testFraction, seed);
        System.out.println(reg.fit(data));

        long end = System.nanoTime();
        System.out.println((end - start) / 1E9 + " sec");
    } finally {
        // release the local Spark context even when reading or fitting fails
        spark.stop();
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) GeneralizedLinearRegression(org.apache.spark.ml.regression.GeneralizedLinearRegression) GBTRegressor(org.apache.spark.ml.regression.GBTRegressor) Row(org.apache.spark.sql.Row) LinearRegression(org.apache.spark.ml.regression.LinearRegression) DenseVector(org.apache.spark.ml.linalg.DenseVector)

Example 2 with GBTRegressor

use of org.apache.spark.ml.regression.GBTRegressor in project net.jgp.labs.spark by jgperrin.

The method main of the class GradientBoostedTreeRegressorApp.

/**
 * Trains a gradient-boosted-tree regressor on a libsvm-formatted sample file, prints the
 * predictions and the RMSE computed on that same data, dumps the learned model, and then
 * predicts two single-feature inputs.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("GradientBoostedTreeRegressorApp").master("local[*]").getOrCreate();
    try {
        // Load and parse the data file, converting it to a DataFrame.
        Dataset<Row> df = spark.read().format("libsvm").load("data/sample-ml/simplegauss.txt");

        // Train a GBT model directly on the raw "features" column.
        GBTRegressor gbt = new GBTRegressor().setLabelCol("label").setFeaturesCol("features").setMaxIter(100);
        GBTRegressionModel model = gbt.fit(df);

        // Make predictions on the data the model was trained on (there is no held-out split).
        Dataset<Row> predictions = model.transform(df);

        // Select example rows to display
        predictions.show(20, false);

        // Select (prediction, true label) and compute the error.
        RegressionEvaluator evaluator = new RegressionEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("rmse");
        double rmse = evaluator.evaluate(predictions);
        // Reported as training error because predictions come from the training DataFrame.
        System.out.println("Root Mean Squared Error (RMSE) on training data = " + rmse);
        System.out.println("Learned regression GBT model:\n" + model.toDebugString());

        // NOTE(review): "(expected: 2)" is printed for both inputs below; confirm the
        // intended expectation for 11.0 against the data file.
        double feature = 2.0;
        Vector features = Vectors.dense(feature);
        double p = model.predict(features);
        System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");

        feature = 11.0;
        features = Vectors.dense(feature);
        p = model.predict(features);
        System.out.println("Prediction for feature " + feature + " is " + p + " (expected: 2)");
    } finally {
        // always shut the session down, including when loading or training throws
        spark.stop();
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) GBTRegressor(org.apache.spark.ml.regression.GBTRegressor) VectorIndexerModel(org.apache.spark.ml.feature.VectorIndexerModel) GBTRegressionModel(org.apache.spark.ml.regression.GBTRegressionModel) Row(org.apache.spark.sql.Row) RegressionEvaluator(org.apache.spark.ml.evaluation.RegressionEvaluator) VectorIndexer(org.apache.spark.ml.feature.VectorIndexer) Vector(org.apache.spark.ml.linalg.Vector)

Aggregations

GBTRegressor (org.apache.spark.ml.regression.GBTRegressor)2 Row (org.apache.spark.sql.Row)2 SparkSession (org.apache.spark.sql.SparkSession)2 RegressionEvaluator (org.apache.spark.ml.evaluation.RegressionEvaluator)1 VectorIndexer (org.apache.spark.ml.feature.VectorIndexer)1 VectorIndexerModel (org.apache.spark.ml.feature.VectorIndexerModel)1 DenseVector (org.apache.spark.ml.linalg.DenseVector)1 Vector (org.apache.spark.ml.linalg.Vector)1 GBTRegressionModel (org.apache.spark.ml.regression.GBTRegressionModel)1 GeneralizedLinearRegression (org.apache.spark.ml.regression.GeneralizedLinearRegression)1 LinearRegression (org.apache.spark.ml.regression.LinearRegression)1