Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class RowProcessor, method call().
@Override
public void call(JavaRDD<String> rdd) throws Exception {
  JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {
    private static final long serialVersionUID = 5167089361335095997L;

    @Override
    public Row call(String msg) {
      Row row = RowFactory.create(msg);
      return row;
    }
  });

  // Create the schema: a single nullable string column named "Message"
  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("Message", DataTypes.StringType, true) });

  // Get the Spark 2.0 session from the singleton helper
  SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
  Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
  msgDataFrame.show();
}
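The JavaSparkSessionSingleton helper is referenced but not shown in this snippet. A minimal sketch, assuming it follows the lazy-singleton pattern commonly used in Spark streaming examples so that every micro-batch reuses one SparkSession:

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

class JavaSparkSessionSingleton {
  private static transient SparkSession instance = null;

  // Lazily create a single SparkSession from the streaming job's SparkConf
  static SparkSession getInstance(SparkConf sparkConf) {
    if (instance == null) {
      instance = SparkSession.builder()
          .config(sparkConf)
          .getOrCreate();
    }
    return instance;
  }
}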
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class ConnectRemotely, method main().
public static void main(String[] args) {
  SparkSession spark = SparkSession.builder()
      .appName("myApp")
      .master("spark://10.0.100.120:7077")
      .getOrCreate();
  System.out.println("Hello, Spark v." + spark.version());
}
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class BasicExternalUdfFromTextFile, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("CSV to Dataset")
      .master("local")
      .getOrCreate();

  // Register the external UDF under the name "x2Multiplier"
  spark.udf().register("x2Multiplier", new Multiplier2(), DataTypes.IntegerType);

  String filename = "data/tuple-data-file.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "false")
      .load(filename);

  // Rename the inferred columns _c0 and _c1 to label and value
  df = df.withColumn("label", df.col("_c0")).drop("_c0");
  df = df.withColumn("value", df.col("_c1")).drop("_c1");

  // Call the UDF to compute the x2 column
  df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
  df.show();
}
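The Multiplier2 class is defined elsewhere in the project. A minimal sketch consistent with the registration above, assuming a UDF1 that doubles its integer argument (matching the x2Multiplier name and the declared IntegerType return type):

import org.apache.spark.sql.api.java.UDF1;

public class Multiplier2 implements UDF1<Integer, Integer> {
  private static final long serialVersionUID = 1L;

  @Override
  public Integer call(Integer value) throws Exception {
    // Double the input, matching the IntegerType declared at registration
    return value * 2;
  }
}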
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class AuthorsAndBooks, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("Authors and Books")
      .master("local")
      .getOrCreate();

  String filename = "data/authors.csv";
  // @formatter:off
  Dataset<Row> authorsDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  // @formatter:on
  authorsDf.show();

  filename = "data/books.csv";
  // @formatter:off
  Dataset<Row> booksDf = spark.read()
      .format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  // @formatter:on
  booksDf.show();

  // Full outer join keeps authors with no books and books with no matching author
  Dataset<Row> libraryDf = authorsDf.join(
      booksDf,
      authorsDf.col("id").equalTo(booksDf.col("authorId")),
      "full_outer");
  libraryDf.show();
  libraryDf.printSchema();
}
Use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.
The class ForEachBookApp, method start().
private void start() {
  SparkSession spark = SparkSession.builder()
      .appName("For Each Book")
      .master("local")
      .getOrCreate();

  String filename = "data/books.csv";
  Dataset<Row> df = spark.read().format("csv")
      .option("inferSchema", "true")
      .option("header", "true")
      .load(filename);
  df.show();

  // Apply BookPrinter to every row of the Dataset, on the executors
  df.foreach(new BookPrinter());
}
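BookPrinter is defined elsewhere in the project. Since the Java Dataset.foreach takes a ForeachFunction, a minimal sketch could look like the following; the "title" column name is an assumption about books.csv:

import org.apache.spark.api.java.function.ForeachFunction;
import org.apache.spark.sql.Row;

public class BookPrinter implements ForeachFunction<Row> {
  private static final long serialVersionUID = 1L;

  @Override
  public void call(Row row) throws Exception {
    // "title" is an assumed column name in books.csv
    String title = row.getAs("title");
    System.out.println("Book: " + title);
  }
}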