use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.
the class StreamingIngestionFileSystemTextFileToDataframeApp method start.
/**
 * Runs a local Spark Streaming job that reads a text-file stream from the
 * directory returned by {@code StreamingUtils.getInputDirectory()}, prints
 * each batch, and shows each batch as a single-column dataframe.
 *
 * Blocks until the streaming context terminates. If the waiting thread is
 * interrupted, the interrupt flag is restored before returning.
 */
private void start() {
    // Create a local StreamingContext with two worker threads and a batch
    // interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();

    // For every batch: turn the RDD of lines into an RDD of rows, then into a
    // dataframe
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {
        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            // Wrap each line in a one-field Row
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    return RowFactory.create(msg);
                }
            });

            // Schema: a single nullable string column named "Message"
            StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });

            // Get the Spark session matching the configuration of this RDD's context
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });

    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so that code further up the call stack
        // can still observe that this thread was asked to stop.
        Thread.currentThread().interrupt();
        e.printStackTrace();
    }
}
use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.
the class ReadLinesFromMultipleFileStreams method start.
/**
 * Starts a structured-streaming query that reads text files from the
 * directory returned by {@code StreamingUtils.getInputDirectory()} and writes
 * them to the console sink in update mode, then blocks until the query
 * terminates.
 *
 * The schema of the streaming dataframe is printed before the query starts.
 */
private void start() {
    log.debug("-> start()");
    SparkSession spark = SparkSession.builder().appName("Read lines over a file stream").master("local").getOrCreate();
    // @formatter:off
    Dataset<Row> df = spark.readStream().format("text").load(StreamingUtils.getInputDirectory());
    // @formatter:on

    // In this case everything is a string. printSchema() is allowed on a
    // streaming dataframe, so it is done before the blocking wait below.
    df.printSchema();

    StreamingQuery query = df.writeStream().outputMode(OutputMode.Update()).format("console").start();
    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }
    // df.show() is deliberately not called here: show() is not supported on a
    // streaming dataframe (it raises an AnalysisException); the console sink
    // above is what displays the rows.
}
use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.
the class DataframeCheckpoint method start.
/**
 * Demonstrates dataframe checkpointing: loads a header-less CSV file,
 * checkpoints the resulting dataframe into a second one, adds a column to the
 * first dataframe and dumps both at each step to show that the checkpointed
 * copy is unaffected.
 */
private void start() {
    // A single builder configures both the session and its underlying context
    SparkSession spark = SparkSession.builder().appName("Checkpoint").master("local[*]").getOrCreate();
    // We need to specify where Spark will save the checkpoint file. It can be an HDFS location.
    spark.sparkContext().setCheckpointDir("/tmp");

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df1 = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);
    System.out.println("DF #1 - step #1: simple dump of the dataframe");
    df1.show();

    // Step #1 for DF #2: checkpoint of DF #1 (the boolean is the "eager" flag)
    System.out.println("DF #2 - step #1: same as DF #1 - step #1");
    Dataset<Row> df2 = df1.checkpoint(false);
    df2.show();

    // Step #2: only DF #1 is transformed
    df1 = df1.withColumn("x", df1.col("_c0"));
    System.out.println("DF #1 - step #2: new column x, which is the same as _c0");
    df1.show();
    System.out.println("DF #2 - step #2: no operation was done on df2");
    df2.show();
}
use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.
the class BasicUdfFromTextFile method start.
/**
 * Registers the {@code x2Multiplier} UDF (doubles an integer), loads a
 * header-less CSV file, renames its first two columns to {@code label} and
 * {@code value}, adds an {@code x2} column computed by the UDF and shows the
 * result.
 */
private void start() {
    SparkSession spark = SparkSession.builder().appName("CSV to Dataset").master("local").getOrCreate();

    // registers a new internal UDF
    spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() {
        private static final long serialVersionUID = -5372447039252716846L;

        @Override
        public Integer call(Integer x) {
            // The cast to IntegerType below produces null for missing or
            // non-numeric cells; propagate null instead of failing on unboxing.
            if (x == null) {
                return null;
            }
            return x * 2;
        }
    }, DataTypes.IntegerType);

    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);
    // Copy each default-named column under its new name, then drop the original
    df = df.withColumn("label", df.col("_c0")).drop("_c0");
    df = df.withColumn("value", df.col("_c1")).drop("_c1");
    df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
    df.show();
}
use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.
the class ReadLinesFromFileStream method start.
/**
 * Starts a structured-streaming query that reads text files from the
 * directory returned by {@code StreamingUtils.getInputDirectory()} and writes
 * them to the console sink in update mode, then blocks until the query
 * terminates.
 *
 * The schema of the streaming dataframe is printed before the query starts.
 */
private void start() {
    log.debug("-> start()");
    SparkSession spark = SparkSession.builder().appName("Read lines over a file stream").master("local").getOrCreate();
    Dataset<Row> df = spark.readStream().format("text").load(StreamingUtils.getInputDirectory());

    // printSchema() is allowed on a streaming dataframe; it is done up front
    // because the wait below blocks for as long as the query is running.
    df.printSchema();

    StreamingQuery query = df.writeStream().outputMode(OutputMode.Update()).format("console").start();
    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }
    // df.show() is deliberately not called here: show() is not supported on a
    // streaming dataframe (it raises an AnalysisException); the console sink
    // above is what displays the rows.
}
Aggregations