Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class SimplePredictionFromTextFile, method start.
private void start() {
SparkSession spark = SparkSession.builder().appName("Simple prediction from Text File").master("local").getOrCreate();
spark.udf().register("vectorBuilder", new VectorBuilder(), new VectorUDT());
String filename = "data/tuple-data-file.csv";
StructType schema = new StructType(new StructField[] {
    new StructField("_c0", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("_c1", DataTypes.DoubleType, false, Metadata.empty()),
    new StructField("features", new VectorUDT(), true, Metadata.empty()) });
Dataset<Row> df = spark.read().format("csv").schema(schema).option("header", "false").load(filename);
df = df.withColumn("valuefeatures", df.col("_c0")).drop("_c0");
df = df.withColumn("label", df.col("_c1")).drop("_c1");
df.printSchema();
df = df.withColumn("features", callUDF("vectorBuilder", df.col("valuefeatures")));
df.printSchema();
df.show();
// Regularization could be enabled with .setRegParam(1).setElasticNetParam(1)
LinearRegression lr = new LinearRegression().setMaxIter(20);
// Fit the model to the data.
LinearRegressionModel model = lr.fit(df);
// Given a dataset, predict each point's label, and show the results.
model.transform(df).show();
LinearRegressionTrainingSummary trainingSummary = model.summary();
System.out.println("numIterations: " + trainingSummary.totalIterations());
System.out.println("objectiveHistory: " + Vectors.dense(trainingSummary.objectiveHistory()));
trainingSummary.residuals().show();
System.out.println("RMSE: " + trainingSummary.rootMeanSquaredError());
System.out.println("r2: " + trainingSummary.r2());
double intercept = model.intercept();
System.out.println("Interesection: " + intercept);
double regParam = model.getRegParam();
System.out.println("Regression parameter: " + regParam);
double tol = model.getTol();
System.out.println("Tol: " + tol);
Double feature = 7.0;
Vector features = Vectors.dense(feature);
double p = model.predict(features);
System.out.println("Prediction for feature " + feature + " is " + p);
// Manual prediction for feature 8.0: the slope is the model's coefficient,
// not the regularization parameter
double slope = model.coefficients().toArray()[0];
System.out.println("Manual prediction for feature 8.0: " + (8 * slope + intercept));
}
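The VectorBuilder class registered at the top of start() is not shown in this excerpt. A minimal sketch of a compatible implementation, assuming it simply wraps the single double value in a dense vector (inferred from the VectorUDT return type declared at registration, not confirmed by the excerpt):
import org.apache.spark.ml.linalg.Vector;
import org.apache.spark.ml.linalg.Vectors;
import org.apache.spark.sql.api.java.UDF1;

// Sketch of a UDF1 that converts one double into a dense ML vector,
// matching the VectorUDT return type declared at registration.
public class VectorBuilder implements UDF1<Double, Vector> {
    private static final long serialVersionUID = 1L;

    @Override
    public Vector call(Double value) throws Exception {
        return Vectors.dense(value);
    }
}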
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class AuthorsAndBooksCountBooksApp, method start.
private void start() {
SparkSession spark = SparkSession.builder().appName("Authors and Books").master("local").getOrCreate();
String filename = "data/authors.csv";
Dataset<Row> authorsDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true").load(filename);
authorsDf.show();
filename = "data/books.csv";
Dataset<Row> booksDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true").load(filename);
booksDf.show();
Dataset<Row> libraryDf = authorsDf
    .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "left")
    .withColumn("bookId", booksDf.col("id"))
    .drop(booksDf.col("id"))
    .groupBy(authorsDf.col("id"), authorsDf.col("name"), authorsDf.col("link"))
    .count();
libraryDf.show();
libraryDf.printSchema();
}
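Note that groupBy(...).count() counts rows, so an author with no books still appears with count 1: the left join keeps one row for that author, with null book columns. If the intent is the number of books per author, a hedged alternative (a sketch, not code from the project; the name booksPerAuthorDf and the static import of functions.count are introduced here) counts only non-null book ids:
import static org.apache.spark.sql.functions.count;

// count(column) ignores nulls, so authors without books report 0 instead of 1.
Dataset<Row> booksPerAuthorDf = authorsDf
    .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "left")
    .groupBy(authorsDf.col("id"), authorsDf.col("name"), authorsDf.col("link"))
    .agg(count(booksDf.col("id")).as("count"));
booksPerAuthorDf.show();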
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class BookUrlBuilderApp, method start.
private void start() {
SparkSession spark = SparkSession.builder().appName("Book URL Builder").master("local").getOrCreate();
String filename = "data/books.csv";
Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true").option("header", "true").load(filename);
df.show();
Dataset<String> ds = df.map(new BookUrlBuilder(), Encoders.STRING());
ds.printSchema();
ds.show(20, 80);
}
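BookUrlBuilder is referenced but not shown. Since df.map(..., Encoders.STRING()) requires a MapFunction<Row, String>, a minimal sketch could look like the following (the "link" column name and the URL format are illustrative assumptions, not taken from the project):
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

// Sketch: build one URL string per book row.
public class BookUrlBuilder implements MapFunction<Row, String> {
    private static final long serialVersionUID = 1L;

    @Override
    public String call(Row row) throws Exception {
        String link = row.getAs("link"); // assumed column name
        return "http://example.com/book/" + link;
    }
}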
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class CsvToDatasetBookAsJson, method start.
private void start() {
SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book> as JSON").master("local").getOrCreate();
String filename = "data/books.csv";
Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true").option("header", "true").load(filename);
df.show();
Dataset<String> bookDf = df.map(new BookMapper(), Encoders.STRING());
bookDf.show(20, 132);
Dataset<Row> bookAsJsonDf = spark.read().json(bookDf);
bookAsJsonDf.show();
}
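Here each row is mapped to a JSON string, and spark.read().json(Dataset<String>), available since Spark 2.2, parses those strings back into a DataFrame. A sketch of a compatible BookMapper, assuming id and title columns in books.csv (column names assumed for illustration):
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Row;

// Sketch: emit one JSON document per row; a real implementation should
// escape quotes in the title.
public class BookMapper implements MapFunction<Row, String> {
    private static final long serialVersionUID = 1L;

    @Override
    public String call(Row row) throws Exception {
        int id = row.getInt(row.fieldIndex("id"));
        String title = row.getString(row.fieldIndex("title"));
        return "{\"id\": " + id + ", \"title\": \"" + title + "\"}";
    }
}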
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class BuildDataFrameFromScratch2, method start.
private void start() {
SparkSession spark = SparkSession.builder().appName("Build a DataFrame from Scratch").master("local[*]").getOrCreate();
List<String[]> stringAsList = new ArrayList<>();
stringAsList.add(new String[] { "bar1.1", "bar2.1" });
stringAsList.add(new String[] { "bar1.2", "bar2.2" });
JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
// RowFactory.create is varargs, so each String[] becomes the values of one Row
JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList).map((String[] row) -> RowFactory.create(row));
// Creates schema
StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("foe1", DataTypes.StringType, false),
    DataTypes.createStructField("foe2", DataTypes.StringType, false) });
Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();
log.debug("** Schema: ");
df.printSchema();
log.debug("** Data: ");
df.show();
sparkContext.close();
}
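Since Spark 2.0 the detour through JavaSparkContext and an RDD is optional for small local data: SparkSession.createDataFrame accepts a List<Row> plus a schema directly. A minimal sketch of that variant, reusing the same data and schema as above:
import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

// Build Rows directly and hand them to the session; no RDD involved.
List<Row> rows = Arrays.asList(
    RowFactory.create("bar1.1", "bar2.1"),
    RowFactory.create("bar1.2", "bar2.2"));
StructType schema = DataTypes.createStructType(new StructField[] {
    DataTypes.createStructField("foe1", DataTypes.StringType, false),
    DataTypes.createStructField("foe2", DataTypes.StringType, false) });
Dataset<Row> df = spark.createDataFrame(rows, schema);
df.show();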