
Example 56 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

In class StreamingIngestionFileSystemTextFileToDataframeApp, the start method:

private void start() {
    // Create a local StreamingContext with two working threads and a batch
    // interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    // Convert each micro-batch of lines into a JavaRDD<Row> and show it as a DataFrame
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {

        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {

                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    Row row = RowFactory.create(msg);
                    return row;
                }
            });
            // Create Schema
            StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // The wait for termination was interrupted; surface the trace
        e.printStackTrace();
    }
}
Also used: SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
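
Since VoidFunction and Function are single-method interfaces, the foreachRDD body above collapses to lambdas on Java 8+. A minimal sketch reusing the same classes and schema (untested against this project's build):

msgDataStream.foreachRDD((JavaRDD<String> rdd) -> {
    // Wrap each incoming line in a single-column Row
    JavaRDD<Row> rowRDD = rdd.map((String msg) -> RowFactory.create(msg));
    StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
    SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
    spark.createDataFrame(rowRDD, schema).show();
});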

Example 57 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

In class ReadLinesFromMultipleFileStreams, the start method:

private void start() {
    log.debug("-> start()");
    SparkSession spark = SparkSession.builder().appName("Read lines over a file stream").master("local").getOrCreate();
    // @formatter:off
    Dataset<Row> df = spark.readStream().format("text").load(StreamingUtils.getInputDirectory());
    // @formatter:on
    StreamingQuery query = df.writeStream().outputMode(OutputMode.Update()).format("console").start();
    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }
    // Only reached once the query terminates; with the text source every column is a string.
    // Note: show() is not supported on a streaming Dataset and throws an AnalysisException.
    df.show();
    df.printSchema();
}
Also used: SparkSession(org.apache.spark.sql.SparkSession) StreamingQuery(org.apache.spark.sql.streaming.StreamingQuery) Row(org.apache.spark.sql.Row) StreamingQueryException(org.apache.spark.sql.streaming.StreamingQueryException)
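
awaitTermination() blocks until the query stops, which is why the trailing show() and printSchema() calls never normally run. If code after the wait is meant to execute, the timeout overload is one option; a minimal sketch, assuming the Spark 2.x StreamingQuery API and an illustrative 60-second cap:

try {
    // awaitTermination(timeoutMs) returns true if the query terminated in time
    if (!query.awaitTermination(60000)) {
        // Still running after the timeout: stop it so the code below can proceed
        query.stop();
    }
} catch (StreamingQueryException e) {
    log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
}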

Example 58 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

In class DataframeCheckpoint, the start method:

private void start() {
    SparkConf conf = new SparkConf().setAppName("Checkpoint").setMaster("local[*]");
    SparkContext sparkContext = new SparkContext(conf);
    // We need to specify where Spark will save the checkpoint file. It can be an HDFS location.
    sparkContext.setCheckpointDir("/tmp");
    SparkSession spark = SparkSession.builder().appName("Checkpoint").master("local[*]").getOrCreate();
    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df1 = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);
    System.out.println("DF #1 - step #1: simple dump of the dataframe");
    df1.show();
    System.out.println("DF #2 - step #2: same as DF #1 - step #1");
    Dataset<Row> df2 = df1.checkpoint(false);
    df2.show();
    df1 = df1.withColumn("x", df1.col("_c0"));
    System.out.println("DF #1 - step #2: new column x, which is the same as _c0");
    df1.show();
    System.out.println("DF #2 - step #2: no operation was done on df2");
    df2.show();
}
Also used: SparkContext(org.apache.spark.SparkContext) SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
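
checkpoint(false) requests a lazy checkpoint: the plan is marked for checkpointing, but nothing is written until df2 is first evaluated (here, by df2.show()). An eager checkpoint materializes the data immediately; a minimal sketch of the variants, assuming the Spark 2.1+ Dataset API:

// Eager: the dataset is written to the checkpoint directory as soon as this returns
Dataset<Row> eager = df1.checkpoint(true);
// The no-argument form also defaults to an eager checkpoint
Dataset<Row> eagerToo = df1.checkpoint();
eager.show();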

Example 59 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

In class BasicUdfFromTextFile, the start method:

private void start() {
    SparkSession spark = SparkSession.builder().appName("CSV to Dataset").master("local").getOrCreate();
    // Register a UDF named x2Multiplier that doubles its integer argument
    spark.udf().register("x2Multiplier", new UDF1<Integer, Integer>() {

        private static final long serialVersionUID = -5372447039252716846L;

        @Override
        public Integer call(Integer x) {
            return x * 2;
        }
    }, DataTypes.IntegerType);
    String filename = "data/tuple-data-file.csv";
    Dataset<Row> df = spark.read().format("csv").option("inferSchema", "true").option("header", "false").load(filename);
    df = df.withColumn("label", df.col("_c0")).drop("_c0");
    df = df.withColumn("value", df.col("_c1")).drop("_c1");
    df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));
    df.show();
}
Also used: SparkSession(org.apache.spark.sql.SparkSession) Row(org.apache.spark.sql.Row)
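
On Java 8+, the registration above collapses to a lambda, since UDF1 is a single-method interface; the cast keeps overload resolution unambiguous. A minimal sketch:

// Same UDF, registered as a lambda; the return type must still be given explicitly
spark.udf().register("x2Multiplier", (UDF1<Integer, Integer>) x -> x * 2, DataTypes.IntegerType);
df = df.withColumn("x2", callUDF("x2Multiplier", df.col("value").cast(DataTypes.IntegerType)));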

Example 60 with Row

Use of org.apache.spark.sql.Row in project net.jgp.labs.spark by jgperrin.

In class ReadLinesFromFileStream, the start method:

private void start() {
    log.debug("-> start()");
    SparkSession spark = SparkSession.builder().appName("Read lines over a file stream").master("local").getOrCreate();
    Dataset<Row> df = spark.readStream().format("text").load(StreamingUtils.getInputDirectory());
    StreamingQuery query = df.writeStream().outputMode(OutputMode.Update()).format("console").start();
    try {
        query.awaitTermination();
    } catch (StreamingQueryException e) {
        log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
    }
    // Never executed: awaitTermination() blocks until the query terminates
    df.show();
    df.printSchema();
}
Also used: SparkSession(org.apache.spark.sql.SparkSession) StreamingQuery(org.apache.spark.sql.streaming.StreamingQuery) Row(org.apache.spark.sql.Row) StreamingQueryException(org.apache.spark.sql.streaming.StreamingQueryException)
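
Unlike show(), printSchema() is safe to call on a streaming Dataset before the query starts, since it only inspects the plan. A minimal reordering sketch that surfaces the schema up front:

Dataset<Row> df = spark.readStream().format("text").load(StreamingUtils.getInputDirectory());
// Inspecting the schema does not trigger execution, so this is safe pre-start
df.printSchema();
StreamingQuery query = df.writeStream().outputMode(OutputMode.Update()).format("console").start();
try {
    query.awaitTermination();
} catch (StreamingQueryException e) {
    log.error("Exception while waiting for query to end {}.", e.getMessage(), e);
}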

Aggregations

Row (org.apache.spark.sql.Row): 129
Test (org.junit.Test): 60
Script (org.apache.sysml.api.mlcontext.Script): 53
StructType (org.apache.spark.sql.types.StructType): 50
ArrayList (java.util.ArrayList): 48
StructField (org.apache.spark.sql.types.StructField): 46
SparkSession (org.apache.spark.sql.SparkSession): 43
VectorUDT (org.apache.spark.ml.linalg.VectorUDT): 19
MatrixMetadata (org.apache.sysml.api.mlcontext.MatrixMetadata): 19
MLResults (org.apache.sysml.api.mlcontext.MLResults): 18
DenseVector (org.apache.spark.ml.linalg.DenseVector): 16
Vector (org.apache.spark.ml.linalg.Vector): 16
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 15
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 12
SQLContext (org.apache.spark.sql.SQLContext): 12
User (uk.gov.gchq.gaffer.user.User): 12
HashSet (java.util.HashSet): 10
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 9
Tuple2 (scala.Tuple2): 9
GetDataFrameOfElements (uk.gov.gchq.gaffer.spark.operation.dataframe.GetDataFrameOfElements): 9