Search in sources :

Example 16 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class DataframeCheckpoint method start.

/**
 * Demonstrates dataframe checkpointing.
 *
 * Loads a header-less CSV file into a dataframe, takes a non-eager checkpoint
 * of it, then adds a column to the original dataframe and dumps both
 * dataframes so the two can be compared on the console.
 */
private void start() {
    SparkConf conf = new SparkConf().setAppName("Checkpoint").setMaster("local[*]");
    SparkContext sparkContext = new SparkContext(conf);
    // We need to specify where Spark will save the checkpoint file. It can be an HDFS location.
    sparkContext.setCheckpointDir("/tmp");
    // NOTE(review): getOrCreate() is expected to reuse the context configured above
    // rather than create a second one - confirm against the Spark version in use.
    SparkSession spark = SparkSession.builder().appName("Checkpoint").master("local[*]").getOrCreate();
    String filename = "data/tuple-data-file.csv";
    // No header row, so the columns get the default names _c0, _c1, ...
    Dataset<Row> df1 = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);
    System.out.println("DF #1 - step #1: simple dump of the dataframe");
    df1.show();
    // This is the first dump of df2, hence step #1 (step #2 is printed further down).
    System.out.println("DF #2 - step #1: same as DF #1 - step #1");
    // false = non-eager checkpoint
    Dataset<Row> df2 = df1.checkpoint(false);
    df2.show();
    // Only df1 is reassigned here; df2 keeps referring to the checkpointed dataframe.
    df1 = df1.withColumn("x", df1.col("_c0"));
    System.out.println("DF #1 - step #2: new column x, which is the same as _c0");
    df1.show();
    System.out.println("DF #2 - step #2: no operation was done on df2");
    df2.show();
}
Also used : SparkContext(org.apache.spark.SparkContext) SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)

Example 17 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class BasicUdfFromTextFile method start.

/**
 * Registers a "multiply by two" UDF, loads a header-less CSV file, renames
 * its two columns to {@code label} and {@code value}, and adds an
 * {@code x2} column computed by the UDF before dumping the result.
 */
private void start() {
    SparkSession spark = SparkSession.builder().appName("CSV to Dataset").master("local").getOrCreate();
    // registers a new internal UDF
    spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() {

        private static final long serialVersionUID = -5372447039252716846L;

        /**
         * Doubles the given value.
         *
         * @param x the value to double; may be null
         * @return {@code x * 2}, or null when {@code x} is null
         */
        @Override
        public Integer call(Integer x) {
            // The column is cast to IntegerType before the call, which may produce
            // null for cells that are empty or not integers; unboxing null would
            // throw a NullPointerException, so null is propagated instead.
            if (x == null) {
                return null;
            }
            return x * 2;
        }
    }, DataTypes.IntegerType);
    String filename = "data/tuple-data-file.csv";
    // No header row, so the columns get the default names _c0, _c1, ...
    Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);
    // Copy each default-named column under a meaningful name, then drop the original.
    df = df.withColumn("label", df.col("_c0")).drop("_c0");
    df = df.withColumn("value", df.col("_c1")).drop("_c1");
    df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
    df.show();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 18 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class ReadLinesFromFileStream method start.

/**
 * Reads text files appearing in the streaming input directory as a stream of
 * lines and writes them to the console sink, blocking until the streaming
 * query terminates.
 */
private void start() {
    log.debug("-> start()");
    SparkSession spark = SparkSession.builder().appName("Read lines over a file stream").master("local").getOrCreate();
    Dataset<Row> df = spark.readStream().format("text").load(StreamingUtils.getInputDirectory());
    StreamingQuery query = df.writeStream().outputMode(OutputMode.Update()).format("console").start();
    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }
    // Reached only once the query has stopped or failed. show() is not supported
    // on a streaming Dataset (it raises an AnalysisException), so it is not called
    // here; the rows themselves are already emitted by the console sink above.
    df.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StreamingQuery(org.apache.spark.sql.streaming.StreamingQuery) Row(org.apache.spark.sql.Row) StreamingQueryException(org.apache.spark.sql.streaming.StreamingQueryException)

Example 19 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class AuthorsAndBooksWithDates method start.

/**
 * Loads the authors and books CSV files, dumps each of them, then builds and
 * dumps a full outer join of the two (authors without books and books without
 * authors are both kept), together with the schema of the joined dataframe.
 */
private void start() {
    SparkSession spark = SparkSession.builder().appName("Authors and Books").master("local").getOrCreate();
    String filename = "data/authors.csv";
    // "MM" is month-of-year; the lower-case "mm" would be read as minute-of-hour.
    // NOTE(review): assumes the dates in the file are month/day/two-digit-year - confirm against the data.
    Dataset<Row> authorsDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true").option("dateFormat", "MM/dd/yy").load(filename);
    authorsDf.show();
    filename = "data/books.csv";
    Dataset<Row> booksDf = spark.read().format("csv").option("inferSchema", "true").option("header", "true").load(filename);
    booksDf.show();
    // Both inputs have an "id" column: copy the books' one to "bookId" and drop
    // the original so the joined dataframe keeps a single, unambiguous "id".
    Dataset<Row> libraryDf = authorsDf.join(booksDf, authorsDf.col("id").equalTo(booksDf.col("authorId")), "full_outer").withColumn("bookId", booksDf.col("id")).drop(booksDf.col("id"));
    libraryDf.show();
    libraryDf.printSchema();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)

Example 20 with SparkSession

use of org.apache.spark.sql.SparkSession in project net.jgp.labs.spark by jgperrin.

the class CsvToDatasetBook method start.

/**
 * Reads the books CSV file into an untyped dataframe, dumps it, then maps
 * every row to a {@code Book} bean through {@code BookMapper} and dumps the
 * resulting typed dataset.
 */
private void start() {
    SparkSession session = SparkSession.builder()
            .appName("CSV to Dataset<Book>")
            .master("local")
            .getOrCreate();

    // Untyped view of the file: header row used for column names, types inferred.
    Dataset<Row> rawBooks = session.read()
            .format("csv")
            .option("inferSchema", "true")
            .option("header", "true")
            .load("data/books.csv");
    rawBooks.show();

    // Typed view: one Book bean per row.
    Dataset<Book> books = rawBooks.map(new BookMapper(), Encoders.bean(Book.class));
    books.show();
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) Book(net.jgp.labs.spark.x.model.Book) Row(org.apache.spark.sql.Row)

Aggregations

SparkSession (org.apache.spark.sql.SparkSession)53 Row (org.apache.spark.sql.Row)43 StructType (org.apache.spark.sql.types.StructType)11 ArrayList (java.util.ArrayList)6 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)6 StructField (org.apache.spark.sql.types.StructField)6 SparkConf (org.apache.spark.SparkConf)4 JavaRDD (org.apache.spark.api.java.JavaRDD)3 Script (org.apache.sysml.api.mlcontext.Script)3 Test (org.junit.Test)3 Dataset (org.apache.spark.sql.Dataset)2 StreamingQuery (org.apache.spark.sql.streaming.StreamingQuery)2 StreamingQueryException (org.apache.spark.sql.streaming.StreamingQueryException)2 DMLScript (org.apache.sysml.api.DMLScript)2 RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM)2 MLContext (org.apache.sysml.api.mlcontext.MLContext)2 Matrix (org.apache.sysml.api.mlcontext.Matrix)2 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)2 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)2 File (java.io.File)1