Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class DataframeCheckpoint, method start().
private void start() {
  SparkConf conf = new SparkConf().setAppName("Checkpoint").setMaster("local[*]");
  SparkContext sparkContext = new SparkContext(conf);

  // Tell Spark where to save checkpoint files; this can also be an HDFS location.
  sparkContext.setCheckpointDir("/tmp");

  SparkSession spark = SparkSession.builder().appName("Checkpoint").master("local[*]").getOrCreate();

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df1 = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);

  System.out.println("DF #1 - step #1: simple dump of the dataframe");
  df1.show();

  System.out.println("DF #2 - step #1: same as DF #1 - step #1");
  // checkpoint(false) is lazy: the data is materialized on the next action
  Dataset<Row> df2 = df1.checkpoint(false);
  df2.show();

  df1 = df1.withColumn("x", df1.col("_c0"));
  System.out.println("DF #1 - step #2: new column x, a copy of _c0");
  df1.show();

  System.out.println("DF #2 - step #2: no operation was done on df2");
  df2.show();
}
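For comparison, checkpoint() with no argument (equivalent to checkpoint(true)) is eager: Spark computes and saves the data as soon as the call is made rather than on the next action. A minimal sketch, reusing the spark session and input file from above; df3 is an illustrative name:

// Eager checkpoint: materialized immediately, not deferred to an action.
Dataset<Row> df3 = spark.read().format("csv")
    .option("inferSchema", "true").option("header", "false")
    .load("data/tuple-data-file.csv")
    .checkpoint(); // equivalent to checkpoint(true)
df3.show();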
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class BasicUdfFromTextFile, method start().
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset").master("local").getOrCreate();

  // Registers a new UDF with the session; callUDF() below is statically
  // imported from org.apache.spark.sql.functions.
  spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() {
    private static final long serialVersionUID = -5372447039252716846L;

    @Override
    public Integer call(Integer x) {
      return x * 2;
    }
  }, DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");
  df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
  df.show();
}
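On Java 8 and later, the anonymous UDF1 can be replaced by a lambda; the cast is needed because register() is overloaded for every UDF arity. A minimal equivalent sketch:

// Same UDF as above, written as a lambda (Java 8+).
spark.udf().register("x2Multiplier", (UDF1<Integer, Integer>) x -> x * 2, DataTypes.IntegerType);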
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class ReadLinesFromFileStream, method start().
private void start() {
  log.debug("-> start()");

  SparkSession spark = SparkSession.builder().appName("Read lines over a file stream").master("local").getOrCreate();

  Dataset<Row> df = spark.readStream().format("text").load(StreamingUtils.getInputDirectory());

  StreamingQuery query = df.writeStream().outputMode(OutputMode.Update()).format("console").start();
  try {
    query.awaitTermination();
  } catch (StreamingQueryException e) {
    log.error("Exception while waiting for query to end: {}.", e.getMessage(), e);
  }

  // Never executed: awaitTermination() blocks until the query stops or fails.
  df.show();
  df.printSchema();
}
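awaitTermination() blocks until the query stops or fails, which is why the trailing calls are never reached (and show() on a streaming Dataset would throw anyway, since streaming queries must go through writeStream().start()). A hedged sketch of a bounded wait, using the timed overload with an arbitrary 60-second timeout:

// Wait at most 60 s instead of blocking forever; the timeout value is illustrative.
try {
  boolean ended = query.awaitTermination(60000); // true if the query stopped in time
  log.debug("Query ended within timeout: {}", ended);
} catch (StreamingQueryException e) {
  log.error("Exception while waiting for query to end: {}.", e.getMessage(), e);
}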
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class AuthorsAndBooksWithDates, method start().
private void start() {
  SparkSession spark = SparkSession.builder().appName("Authors and Books").master("local").getOrCreate();

  String filename = "data/authors.csv";
  // The date pattern must use MM for months; lowercase mm means minutes.
  Dataset<Row> authorsDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true").option("dateFormat", "MM/dd/yy").load(filename);
  authorsDf.show();

  filename = "data/books.csv";
  Dataset<Row> booksDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true").load(filename);
  booksDf.show();

  Dataset<Row> libraryDf = authorsDf
      .join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer")
      .withColumn("bookId", booksDf.col("id"))
      .drop(booksDf.col("id"));
  libraryDf.show();
  libraryDf.printSchema();
}
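Both CSV files have an id column, which is why the join above has to disambiguate with booksDf.col("id"). An alternative is to rename the books' id before joining; a minimal sketch with illustrative variable names:

// Rename up front so the joined result has no ambiguous id column.
Dataset<Row> booksRenamedDf = booksDf.withColumnRenamed("id", "bookId");
Dataset<Row> libraryDf2 = authorsDf.join(booksRenamedDf,
    authorsDf.col("id").equalTo(booksRenamedDf.col("authorId")), "full_outer");
libraryDf2.show();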
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class CsvToDatasetBook, method start().
private void start() {
  SparkSession spark = SparkSession.builder().appName("CSV to Dataset<Book>").master("local").getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true").option("header", "true").load(filename);
  df.show();

  // Maps each Row to a Book bean; Encoders.bean() derives the schema from the bean.
  Dataset<Book> bookDf = df.map(new BookMapper(), Encoders.bean(Book.class));
  bookDf.show();
}
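Book and BookMapper are not part of this excerpt. A hedged sketch of what they could look like, assuming books.csv exposes id and title columns (field names, and imports such as java.io.Serializable, org.apache.spark.api.java.function.MapFunction, and org.apache.spark.sql.Row, are assumptions of this sketch):

// Minimal bean: Encoders.bean() needs a public class with getters and setters.
public static class Book implements Serializable {
  private int id;
  private String title;

  public int getId() { return id; }
  public void setId(int id) { this.id = id; }
  public String getTitle() { return title; }
  public void setTitle(String title) { this.title = title; }
}

// Maps one Row of the CSV dataframe to a Book bean; column names are assumed.
static class BookMapper implements MapFunction<Row, Book> {
  private static final long serialVersionUID = 1L;

  @Override
  public Book call(Row value) throws Exception {
    Book b = new Book();
    b.setId(value.<Integer>getAs("id"));
    b.setTitle(value.<String>getAs("title"));
    return b;
  }
}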