
Example 6 with WindowedValue

Use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class TransformTranslator, method writeAvro. This evaluator translates an AvroIO.Write into a Hadoop-format write: it strips the WindowedValue wrappers from the input RDD, pairs each element with an AvroKey, and hands the result to writeHadoopFile together with the shard-template information.

private static <T> TransformEvaluator<AvroIO.Write.Bound<T>> writeAvro() {
    return new TransformEvaluator<AvroIO.Write.Bound<T>>() {

        @Override
        public void evaluate(AvroIO.Write.Bound<T> transform, EvaluationContext context) {
            Job job;
            try {
                job = Job.getInstance();
            } catch (IOException e) {
                throw new IllegalStateException(e);
            }
            AvroJob.setOutputKeySchema(job, transform.getSchema());
            @SuppressWarnings("unchecked") JavaPairRDD<AvroKey<T>, NullWritable> last = ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform)).map(WindowingHelpers.<T>unwindowFunction()).mapToPair(new PairFunction<T, AvroKey<T>, NullWritable>() {

                @Override
                public Tuple2<AvroKey<T>, NullWritable> call(T t) throws Exception {
                    return new Tuple2<>(new AvroKey<>(t), NullWritable.get());
                }
            });
            ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(
                transform.getNumShards(), transform.getShardTemplate(),
                transform.getFilenamePrefix(), transform.getFilenameSuffix());
            writeHadoopFile(last, job.getConfiguration(), shardTemplateInfo,
                AvroKey.class, NullWritable.class, TemplatedAvroKeyOutputFormat.class);
        }
    };
}
Also used: AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO), AvroKey (org.apache.avro.mapred.AvroKey), IOException (java.io.IOException), NullWritable (org.apache.hadoop.io.NullWritable), CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException), WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue), Tuple2 (scala.Tuple2), Job (org.apache.hadoop.mapreduce.Job), AvroJob (org.apache.avro.mapreduce.AvroJob)
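
The map steps here and in the read path below rely on WindowingHelpers to move between raw values and their WindowedValue wrappers. A minimal sketch of what such a helper pair can look like (an assumption about the helper's internals, not the project's actual source; batch values are placed in the global window):

import org.apache.spark.api.java.function.Function;
import com.google.cloud.dataflow.sdk.util.WindowedValue;

// Sketch of wrap/unwrap helpers in the spirit of WindowingHelpers.
public final class WindowingHelpersSketch {

    // Strips the windowing metadata, yielding the bare element.
    public static <T> Function<WindowedValue<T>, T> unwindowFunction() {
        return new Function<WindowedValue<T>, T>() {
            @Override
            public T call(WindowedValue<T> windowedValue) {
                return windowedValue.getValue();
            }
        };
    }

    // Re-wraps a bare element in the global window (batch semantics).
    public static <T> Function<T, WindowedValue<T>> windowFunction() {
        return new Function<T, WindowedValue<T>>() {
            @Override
            public WindowedValue<T> call(T t) {
                return WindowedValue.valueInGlobalWindow(t);
            }
        };
    }
}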

Example 7 with WindowedValue

Use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class TransformTranslator, method readAvro. This evaluator translates an AvroIO.Read: it reads the file pattern through Avro's Hadoop input format, extracts the datum from each AvroKey, and wraps the results as WindowedValues for downstream transforms.

private static <T> TransformEvaluator<AvroIO.Read.Bound<T>> readAvro() {
    return new TransformEvaluator<AvroIO.Read.Bound<T>>() {

        @Override
        public void evaluate(AvroIO.Read.Bound<T> transform, EvaluationContext context) {
            String pattern = transform.getFilepattern();
            JavaSparkContext jsc = context.getSparkContext();
            @SuppressWarnings("unchecked") JavaRDD<AvroKey<T>> avroFile = (JavaRDD<AvroKey<T>>) (JavaRDD<?>) jsc.newAPIHadoopFile(pattern, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, new Configuration()).keys();
            JavaRDD<WindowedValue<T>> rdd = avroFile.map(new Function<AvroKey<T>, T>() {

                @Override
                public T call(AvroKey<T> key) {
                    return key.datum();
                }
            }).map(WindowingHelpers.<T>windowFunction());
            context.setOutputRDD(transform, rdd);
        }
    };
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO), AvroKey (org.apache.avro.mapred.AvroKey), NullWritable (org.apache.hadoop.io.NullWritable), JavaRDD (org.apache.spark.api.java.JavaRDD), Function (org.apache.spark.api.java.function.Function), PairFunction (org.apache.spark.api.java.function.PairFunction), WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), AvroKeyInputFormat (org.apache.avro.mapreduce.AvroKeyInputFormat)
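
For comparison, the same read expressed directly against the Spark API, minus the windowing step (a sketch; GenericRecord stands in for the element type, and the path comes from the caller):

import org.apache.avro.generic.GenericRecord;
import org.apache.avro.mapred.AvroKey;
import org.apache.avro.mapreduce.AvroKeyInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.NullWritable;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public final class DirectAvroReadSketch {

    public static JavaRDD<GenericRecord> read(JavaSparkContext jsc, String pattern) {
        // keys() drops the NullWritable values; the double cast mirrors the
        // translator's way of pinning down AvroKey's type parameter.
        @SuppressWarnings("unchecked")
        JavaRDD<AvroKey<GenericRecord>> keys = (JavaRDD<AvroKey<GenericRecord>>) (JavaRDD<?>)
            jsc.newAPIHadoopFile(pattern, AvroKeyInputFormat.class,
                AvroKey.class, NullWritable.class, new Configuration()).keys();
        return keys.map(new Function<AvroKey<GenericRecord>, GenericRecord>() {
            @Override
            public GenericRecord call(AvroKey<GenericRecord> key) {
                return key.datum();
            }
        });
    }
}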

Example 8 with WindowedValue

Use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class StreamingTransformTranslator, method window. This evaluator maps Dataflow's FixedWindows and SlidingWindows onto Spark's DStream windowing, then assigns windows to the individual elements with an AssignWindowsDoFn.

private static <T, W extends BoundedWindow> TransformEvaluator<Window.Bound<T>> window() {
    return new TransformEvaluator<Window.Bound<T>>() {

        @Override
        public void evaluate(Window.Bound<T> transform, EvaluationContext context) {
            StreamingEvaluationContext sec = (StreamingEvaluationContext) context;
            //--- first we apply windowing to the stream
            WindowFn<? super T, W> windowFn = WINDOW_FG.get("windowFn", transform);
            @SuppressWarnings("unchecked") JavaDStream<WindowedValue<T>> dStream = (JavaDStream<WindowedValue<T>>) sec.getStream(transform);
            if (windowFn instanceof FixedWindows) {
                Duration windowDuration = Durations.milliseconds(((FixedWindows) windowFn).getSize().getMillis());
                sec.setStream(transform, dStream.window(windowDuration));
            } else if (windowFn instanceof SlidingWindows) {
                Duration windowDuration = Durations.milliseconds(((SlidingWindows) windowFn).getSize().getMillis());
                Duration slideDuration = Durations.milliseconds(((SlidingWindows) windowFn).getPeriod().getMillis());
                sec.setStream(transform, dStream.window(windowDuration, slideDuration));
            }
            //--- then we apply windowing to the elements
            DoFn<T, T> addWindowsDoFn = new AssignWindowsDoFn<>(windowFn);
            DoFnFunction<T, T> dofn =
                new DoFnFunction<>(addWindowsDoFn, sec.getRuntimeContext(), null);
            @SuppressWarnings("unchecked")
            JavaDStreamLike<WindowedValue<T>, ?, JavaRDD<WindowedValue<T>>> dstream =
                (JavaDStreamLike<WindowedValue<T>, ?, JavaRDD<WindowedValue<T>>>)
                    sec.getStream(transform);
            sec.setStream(transform, dstream.mapPartitions(dofn));
        }
    };
}
Also used: BoundedWindow (com.google.cloud.dataflow.sdk.transforms.windowing.BoundedWindow), Window (com.google.cloud.dataflow.sdk.transforms.windowing.Window), FixedWindows (com.google.cloud.dataflow.sdk.transforms.windowing.FixedWindows), Duration (org.apache.spark.streaming.Duration), AssignWindowsDoFn (com.google.cloud.dataflow.sdk.util.AssignWindowsDoFn), JavaDStream (org.apache.spark.streaming.api.java.JavaDStream), TransformEvaluator (com.cloudera.dataflow.spark.TransformEvaluator), JavaRDD (org.apache.spark.api.java.JavaRDD), DoFnFunction (com.cloudera.dataflow.spark.DoFnFunction), WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue), JavaDStreamLike (org.apache.spark.streaming.api.java.JavaDStreamLike), EvaluationContext (com.cloudera.dataflow.spark.EvaluationContext), SlidingWindows (com.google.cloud.dataflow.sdk.transforms.windowing.SlidingWindows)
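
The branch on the window function above boils down to converting joda-time durations (which FixedWindows and SlidingWindows expose via getSize()/getPeriod()) into Spark streaming Durations. A minimal sketch of that conversion, with a hypothetical helper name:

import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.Durations;

public final class DurationConversionSketch {

    // Hypothetical helper: joda-time Duration -> Spark streaming Duration.
    // Both APIs count milliseconds, so the conversion is lossless.
    public static Duration toSparkDuration(org.joda.time.Duration jodaDuration) {
        return Durations.milliseconds(jodaDuration.getMillis());
    }

    public static void main(String[] args) {
        org.joda.time.Duration tenSeconds = org.joda.time.Duration.standardSeconds(10);
        // Prints the 10000 ms Spark Duration that a ten-second FixedWindows
        // would produce in the evaluator above.
        System.out.println(toSparkDuration(tenSeconds));
    }
}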

Example 9 with WindowedValue

Use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class TransformTranslator, method combineGlobally. This evaluator runs a global Combine: it encodes the unwindowed input to byte arrays with coders so it can be shuffled, aggregates with the CombineFn's createAccumulator/addInput/mergeAccumulators, and extracts and re-windows the single output.

private static <I, A, O> TransformEvaluator<Combine.Globally<I, O>> combineGlobally() {
    return new TransformEvaluator<Combine.Globally<I, O>>() {

        @Override
        public void evaluate(Combine.Globally<I, O> transform, EvaluationContext context) {
            final Combine.CombineFn<I, A, O> globally = COMBINE_GLOBALLY_FG.get("fn", transform);
            @SuppressWarnings("unchecked") JavaRDDLike<WindowedValue<I>, ?> inRdd = (JavaRDDLike<WindowedValue<I>, ?>) context.getInputRDD(transform);
            final Coder<I> iCoder = context.getInput(transform).getCoder();
            final Coder<A> aCoder;
            try {
                aCoder = globally.getAccumulatorCoder(context.getPipeline().getCoderRegistry(), iCoder);
            } catch (CannotProvideCoderException e) {
                throw new IllegalStateException("Could not determine coder for accumulator", e);
            }
            // Use coders to convert objects in the PCollection to byte arrays, so they
            // can be transferred over the network for the shuffle.
            JavaRDD<byte[]> inRddBytes = inRdd
                .map(WindowingHelpers.<I>unwindowFunction())
                .map(CoderHelpers.toByteFunction(iCoder));
            /*A*/ byte[] acc = inRddBytes.aggregate(
                CoderHelpers.toByteArray(globally.createAccumulator(), aCoder),
                new Function2<byte[], byte[], byte[]>() {

                @Override
                public byte[] call(/*A*/ byte[] ab, /*I*/ byte[] ib) throws Exception {
                    A a = CoderHelpers.fromByteArray(ab, aCoder);
                    I i = CoderHelpers.fromByteArray(ib, iCoder);
                    return CoderHelpers.toByteArray(globally.addInput(a, i), aCoder);
                }
            }, new Function2<byte[], byte[], byte[]>() {

                @Override
                public byte[] call(/*A*/ byte[] a1b, /*A*/ byte[] a2b) throws Exception {
                    A a1 = CoderHelpers.fromByteArray(a1b, aCoder);
                    A a2 = CoderHelpers.fromByteArray(a2b, aCoder);
                    // don't use Guava's ImmutableList.of as values may be null
                    List<A> accumulators = Collections.unmodifiableList(Arrays.asList(a1, a2));
                    A merged = globally.mergeAccumulators(accumulators);
                    return CoderHelpers.toByteArray(merged, aCoder);
                }
            });
            O output = globally.extractOutput(CoderHelpers.fromByteArray(acc, aCoder));
            Coder<O> coder = context.getOutput(transform).getCoder();
            JavaRDD<byte[]> outRdd = context.getSparkContext().parallelize(
                // don't use Guava's ImmutableList.of as output may be null
                CoderHelpers.toByteArrays(Collections.singleton(output), coder));
            context.setOutputRDD(transform, outRdd
                .map(CoderHelpers.fromByteFunction(coder))
                .map(WindowingHelpers.<O>windowFunction()));
        }
    };
}
Also used: Combine (com.google.cloud.dataflow.sdk.transforms.Combine), CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException), IOException (java.io.IOException), TextIO (com.google.cloud.dataflow.sdk.io.TextIO), AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO), HadoopIO (com.cloudera.dataflow.hadoop.HadoopIO), JavaRDDLike (org.apache.spark.api.java.JavaRDDLike), WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue), List (java.util.List), PCollectionList (com.google.cloud.dataflow.sdk.values.PCollectionList)
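
The aggregate above never ships live objects, only coder-encoded byte arrays. A minimal sketch of that round trip using the SDK's StringUtf8Coder (the helper names are illustrative, in the spirit of CoderHelpers):

import com.google.cloud.dataflow.sdk.coders.Coder;
import com.google.cloud.dataflow.sdk.coders.StringUtf8Coder;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;

public final class CoderRoundTripSketch {

    // Encode a value to bytes with its coder; OUTER context because the
    // encoded value stands alone rather than being nested in a larger record.
    static <T> byte[] toByteArray(T value, Coder<T> coder) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        coder.encode(value, baos, Coder.Context.OUTER);
        return baos.toByteArray();
    }

    // Decode bytes back into a value with the same coder.
    static <T> T fromByteArray(byte[] bytes, Coder<T> coder) throws IOException {
        return coder.decode(new ByteArrayInputStream(bytes), Coder.Context.OUTER);
    }

    public static void main(String[] args) throws IOException {
        Coder<String> coder = StringUtf8Coder.of();
        byte[] onTheWire = toByteArray("hello", coder); // safe to shuffle
        System.out.println(fromByteArray(onTheWire, coder)); // prints "hello"
    }
}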

Aggregations

WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue): 9 uses
Tuple2 (scala.Tuple2): 5 uses
HadoopIO (com.cloudera.dataflow.hadoop.HadoopIO): 4 uses
CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException): 4 uses
AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO): 4 uses
IOException (java.io.IOException): 4 uses
Configuration (org.apache.hadoop.conf.Configuration): 4 uses
TextIO (com.google.cloud.dataflow.sdk.io.TextIO): 3 uses
KV (com.google.cloud.dataflow.sdk.values.KV): 3 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 3 uses
Function (org.apache.spark.api.java.function.Function): 3 uses
DoFnFunction (com.cloudera.dataflow.spark.DoFnFunction): 2 uses
EvaluationContext (com.cloudera.dataflow.spark.EvaluationContext): 2 uses
TransformEvaluator (com.cloudera.dataflow.spark.TransformEvaluator): 2 uses
ImmutableMap (com.google.common.collect.ImmutableMap): 2 uses
Map (java.util.Map): 2 uses
AvroKey (org.apache.avro.mapred.AvroKey): 2 uses
NullWritable (org.apache.hadoop.io.NullWritable): 2 uses
JavaRDDLike (org.apache.spark.api.java.JavaRDDLike): 2 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2 uses