Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class ReducerApp, method start().
private void start() {
  SparkSession spark = SparkSession.builder().master("local").getOrCreate();

  // Builds a Dataset<Integer> from a plain Java list
  List<Integer> data = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
  Dataset<Integer> df = spark.createDataset(data, Encoders.INT());
  df.show();
  df.printSchema();

  // Reduces the whole dataset pairwise to a single value
  Integer sumByReduce = df.reduce(new SumByReduce());
  System.out.println("Sum should be 55 and it is... " + sumByReduce);
}
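SumByReduce itself is not shown in this snippet. Since Dataset.reduce() expects an org.apache.spark.api.java.function.ReduceFunction, a minimal sketch of what it could look like (an assumption, not the project's actual source):

// Hypothetical reconstruction of SumByReduce: a ReduceFunction that
// folds two elements at a time into their sum.
public class SumByReduce implements ReduceFunction<Integer> {
  private static final long serialVersionUID = 1L;

  @Override
  public Integer call(Integer x, Integer y) throws Exception {
    return x + y;
  }
}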
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class BuildDataFrameFromScratch, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Build a DataFrame from Scratch")
      .master("local[*]")
      .getOrCreate();

  List<String> stringAsList = new ArrayList<>();
  stringAsList.add("bar");

  JavaSparkContext sparkContext = new JavaSparkContext(spark.sparkContext());
  JavaRDD<Row> rowRDD = sparkContext.parallelize(stringAsList)
      .map((String row) -> RowFactory.create(row));

  // Creates the schema: a single, non-nullable string column named "foe"
  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("foe", DataTypes.StringType, false) });

  Dataset<Row> df = spark.sqlContext().createDataFrame(rowRDD, schema).toDF();

  log.debug("** Schema: ");
  df.printSchema();
  log.debug("** Data: ");
  df.show();

  sparkContext.close();
}
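As a side note, the sqlContext() detour is not required on Spark 2.x and later; SparkSession exposes createDataFrame() directly. An equivalent one-liner, reusing the rowRDD and schema from above:

// Same DataFrame, built straight from the session
Dataset<Row> df = spark.createDataFrame(rowRDD, schema);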
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class IngestionJoinSave, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Authors and Books").master("local").getOrCreate();

  String filename = "data/authors.csv";
  // @formatter:off
  // MM is the month in Spark date patterns (lowercase mm means minutes)
  Dataset<Row> authorsDf = spark.read().format("csv")
      .option("inferSchema", "true").option("header", "true")
      .option("dateFormat", "MM/dd/yy").load(filename);
  // @formatter:on
  authorsDf.show();

  filename = "data/books.csv";
  // @formatter:off
  Dataset<Row> booksDf = spark.read().format("csv")
      .option("inferSchema", "true").option("header", "true").load(filename);
  // @formatter:on
  booksDf.show();

  // Full outer join keeps unmatched rows on both sides; the books' id
  // column is renamed to bookId to avoid clashing with the authors' id
  // @formatter:off
  Dataset<Row> libraryDf = authorsDf
      .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
      .withColumn("bookId", booksDf.col("id"))
      .drop(booksDf.col("id"));
  // @formatter:on
  libraryDf.show();
  libraryDf.printSchema();
  libraryDf.write().json("data/library.json");
}
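Note that write().json("data/library.json") produces a directory of that name containing one part file per partition, not a single JSON file. For toy data like this, one could coalesce to a single partition first, a minimal sketch:

// One partition, one part file; mode("overwrite") also replaces any
// previous run (avoid coalesce(1) on large datasets: it serializes the write)
libraryDf.coalesce(1).write().mode("overwrite").json("data/library.json");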
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class AuthorsWithNoBooks, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Authors and Books").master("local").getOrCreate();

  String filename = "data/authors.csv";
  // @formatter:off
  Dataset<Row> authorsDf = spark.read().format("csv")
      .option("inferSchema", "true").option("header", "true").load(filename);
  // @formatter:on

  filename = "data/books.csv";
  // @formatter:off
  Dataset<Row> booksDf = spark.read().format("csv")
      .option("inferSchema", "true").option("header", "true").load(filename);
  // @formatter:on

  // Left anti join: keeps only the authors with no matching book
  Dataset<Row> libraryDf = authorsDf.join(booksDf,
      authorsDf.col("id").equalTo(booksDf.col("authorId")), "left_anti");
  libraryDf.show();
  libraryDf.printSchema();
}
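A left anti join returns the left-side rows with no match on the right, here the authors with no books; it behaves like a NOT EXISTS subquery. The same result expressed in Spark SQL, assuming the DataFrames are registered as temp views (the view names are illustrative):

// Register illustrative view names, then express the anti join in SQL
authorsDf.createOrReplaceTempView("authors");
booksDf.createOrReplaceTempView("books");
Dataset<Row> noBooks = spark.sql(
    "SELECT a.* FROM authors a LEFT ANTI JOIN books b ON a.id = b.authorId");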
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class ClaimProcessApp, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("For Each Claim").master("local").getOrCreate();

  String filename = "data/claims.csv";
  Dataset<Row> claimsDf = spark.read().format("csv")
      .option("inferSchema", "true").option("header", "true").load(filename);
  claimsDf.show();

  // Processes every row of the dataset on the executors
  claimsDf.foreach(new ClaimPrepAndProcess());
}
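ClaimPrepAndProcess is not shown here. Since Dataset.foreach() takes an org.apache.spark.api.java.function.ForeachFunction<Row> that runs on the executors, a minimal sketch of its possible shape (the column name "claimId" and the body are assumptions, not the project's code):

// Hypothetical sketch; the real ClaimPrepAndProcess may differ entirely.
public class ClaimPrepAndProcess implements ForeachFunction<Row> {
  private static final long serialVersionUID = 1L;

  @Override
  public void call(Row row) throws Exception {
    // "claimId" is an assumed column name, for illustration only
    System.out.println("Processing claim " + row.getAs("claimId"));
  }
}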