Search in sources:

Example 1 with WindowedValue

use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class TransformTranslator, method readHadoop.

/**
 * Builds the evaluator that translates a {@code HadoopIO.Read.Bound} transform into a
 * Spark RDD of windowed {@code KV} pairs and registers it as the transform's output.
 */
private static <K, V> TransformEvaluator<HadoopIO.Read.Bound<K, V>> readHadoop() {
    return new TransformEvaluator<HadoopIO.Read.Bound<K, V>>() {

        @Override
        public void evaluate(HadoopIO.Read.Bound<K, V> transform, EvaluationContext context) {
            JavaSparkContext sparkContext = context.getSparkContext();
            // Read the file pattern through the new Hadoop API as a (key, value) pair RDD.
            @SuppressWarnings("unchecked")
            JavaPairRDD<K, V> hadoopPairs = sparkContext.newAPIHadoopFile(
                    transform.getFilepattern(),
                    transform.getFormatClass(),
                    transform.getKeyClass(),
                    transform.getValueClass(),
                    new Configuration());
            // Convert each Spark Tuple2 into a Dataflow KV, then wrap it in a WindowedValue.
            JavaRDD<KV<K, V>> kvs = hadoopPairs.map(new Function<Tuple2<K, V>, KV<K, V>>() {

                @Override
                public KV<K, V> call(Tuple2<K, V> pair) throws Exception {
                    return KV.of(pair._1(), pair._2());
                }
            });
            JavaRDD<WindowedValue<KV<K, V>>> windowed =
                    kvs.map(WindowingHelpers.<KV<K, V>>windowFunction());
            context.setOutputRDD(transform, windowed);
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) KV(com.google.cloud.dataflow.sdk.values.KV) Tuple2(scala.Tuple2) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)

Example 2 with WindowedValue

use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class TransformTranslator, method writeHadoop.

/**
 * Builds the evaluator that translates a {@code HadoopIO.Write.Bound} transform: it
 * unwindows the input RDD, converts KVs back to Spark pairs, and writes them out as a
 * Hadoop file using the transform's format, key, and value classes.
 */
private static <K, V> TransformEvaluator<HadoopIO.Write.Bound<K, V>> writeHadoop() {
    return new TransformEvaluator<HadoopIO.Write.Bound<K, V>>() {

        @Override
        public void evaluate(HadoopIO.Write.Bound<K, V> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked")
            JavaRDDLike<WindowedValue<KV<K, V>>, ?> input =
                    (JavaRDDLike<WindowedValue<KV<K, V>>, ?>) context.getInputRDD(transform);
            // Strip the windowing, then turn each Dataflow KV back into a Spark Tuple2.
            JavaPairRDD<K, V> pairs = input
                    .map(WindowingHelpers.<KV<K, V>>unwindowFunction())
                    .mapToPair(new PairFunction<KV<K, V>, K, V>() {

                        @Override
                        public Tuple2<K, V> call(KV<K, V> kv) throws Exception {
                            return new Tuple2<>(kv.getKey(), kv.getValue());
                        }
                    });
            ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(
                    transform.getNumShards(), transform.getShardTemplate(),
                    transform.getFilenamePrefix(), transform.getFilenameSuffix());
            // Copy any user-supplied Hadoop configuration properties onto a fresh Configuration.
            Configuration conf = new Configuration();
            for (Map.Entry<String, String> property
                    : transform.getConfigurationProperties().entrySet()) {
                conf.set(property.getKey(), property.getValue());
            }
            writeHadoopFile(pairs, conf, shardTemplateInfo,
                    transform.getKeyClass(), transform.getValueClass(), transform.getFormatClass());
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) KV(com.google.cloud.dataflow.sdk.values.KV) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) CannotProvideCoderException(com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException) IOException(java.io.IOException) KV(com.google.cloud.dataflow.sdk.values.KV) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) Tuple2(scala.Tuple2) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 3 with WindowedValue

use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class TransformTranslator, method multiDo.

/**
 * Builds the evaluator for a multi-output {@code ParDo.BoundMulti}: the DoFn runs once
 * over the input, its tagged output is cached, and each destination PCollection receives
 * the subset of records matching its tag.
 */
private static <I, O> TransformEvaluator<ParDo.BoundMulti<I, O>> multiDo() {
    return new TransformEvaluator<ParDo.BoundMulti<I, O>>() {

        @Override
        public void evaluate(ParDo.BoundMulti<I, O> transform, EvaluationContext context) {
            TupleTag<O> mainOutputTag = MULTIDO_FG.get("mainOutputTag", transform);
            MultiDoFnFunction<I, O> doFnFunction = new MultiDoFnFunction<>(
                    transform.getFn(), context.getRuntimeContext(), mainOutputTag,
                    getSideInputs(transform.getSideInputs(), context));
            @SuppressWarnings("unchecked")
            JavaRDDLike<WindowedValue<I>, ?> input =
                    (JavaRDDLike<WindowedValue<I>, ?>) context.getInputRDD(transform);
            // Cache so the multi-output run is evaluated once, not once per output tag.
            JavaPairRDD<TupleTag<?>, WindowedValue<?>> tagged =
                    input.mapPartitionsToPair(doFnFunction).cache();
            PCollectionTuple outputs = context.getOutput(transform);
            for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.getAll().entrySet()) {
                @SuppressWarnings("unchecked")
                JavaPairRDD<TupleTag<?>, WindowedValue<?>> matching =
                        tagged.filter(new TupleTagFilter(output.getKey()));
                // Object is the best we can do since different outputs can have different tags.
                @SuppressWarnings("unchecked")
                JavaRDD<WindowedValue<Object>> values =
                        (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) matching.values();
                context.setRDD(output.getValue(), values);
            }
        }
    };
}
Also used : TupleTag(com.google.cloud.dataflow.sdk.values.TupleTag) TextIO(com.google.cloud.dataflow.sdk.io.TextIO) AvroIO(com.google.cloud.dataflow.sdk.io.AvroIO) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaRDDLike(org.apache.spark.api.java.JavaRDDLike) PCollection(com.google.cloud.dataflow.sdk.values.PCollection) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) ParDo(com.google.cloud.dataflow.sdk.transforms.ParDo) PCollectionTuple(com.google.cloud.dataflow.sdk.values.PCollectionTuple) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 4 with WindowedValue

use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class TransformTranslator, method writeText.

/**
 * Builds the evaluator for {@code TextIO.Write.Bound}: the input is unwindowed, each
 * element is paired with a null Void value, and the result is written as a sharded text
 * file via the templated text output format.
 */
private static <T> TransformEvaluator<TextIO.Write.Bound<T>> writeText() {
    return new TransformEvaluator<TextIO.Write.Bound<T>>() {

        @Override
        public void evaluate(TextIO.Write.Bound<T> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked")
            JavaRDDLike<WindowedValue<T>, ?> input =
                    (JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform);
            // Pair each unwindowed element with a null Void so it fits the Hadoop writer.
            JavaPairRDD<T, Void> pairs = input
                    .map(WindowingHelpers.<T>unwindowFunction())
                    .mapToPair(new PairFunction<T, T, Void>() {

                        @Override
                        public Tuple2<T, Void> call(T element) throws Exception {
                            return new Tuple2<>(element, null);
                        }
                    });
            ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(
                    transform.getNumShards(), transform.getShardTemplate(),
                    transform.getFilenamePrefix(), transform.getFilenameSuffix());
            writeHadoopFile(pairs, new Configuration(), shardTemplateInfo,
                    Text.class, NullWritable.class, TemplatedTextOutputFormat.class);
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) TextIO(com.google.cloud.dataflow.sdk.io.TextIO) CannotProvideCoderException(com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException) IOException(java.io.IOException) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) Tuple2(scala.Tuple2)

Example 5 with WindowedValue

use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.

From the class StreamingTransformTranslator, method kafka.

/**
 * Builds the evaluator for {@code KafkaIO.Read.Unbound}: it opens a direct Kafka stream
 * with the transform's key/value classes and decoders, converts each record to a
 * windowed {@code KV}, and registers the resulting DStream with the streaming context.
 */
private static <K, V> TransformEvaluator<KafkaIO.Read.Unbound<K, V>> kafka() {
    return new TransformEvaluator<KafkaIO.Read.Unbound<K, V>>() {

        @Override
        public void evaluate(KafkaIO.Read.Unbound<K, V> transform, EvaluationContext context) {
            StreamingEvaluationContext streamingContext = (StreamingEvaluationContext) context;
            JavaStreamingContext jssc = streamingContext.getStreamingContext();
            // Pull all connection parameters off the transform before opening the stream.
            Class<K> keyClass = transform.getKeyClass();
            Class<V> valueClass = transform.getValueClass();
            Class<? extends Decoder<K>> keyDecoder = transform.getKeyDecoderClass();
            Class<? extends Decoder<V>> valueDecoder = transform.getValueDecoderClass();
            Map<String, String> kafkaParams = transform.getKafkaParams();
            Set<String> topics = transform.getTopics();
            JavaPairInputDStream<K, V> rawStream = KafkaUtils.createDirectStream(
                    jssc, keyClass, valueClass, keyDecoder, valueDecoder, kafkaParams, topics);
            // Convert each Spark Tuple2 into a Dataflow KV, then wrap it in a WindowedValue.
            JavaDStream<WindowedValue<KV<K, V>>> windowedStream = rawStream
                    .map(new Function<Tuple2<K, V>, KV<K, V>>() {

                        @Override
                        public KV<K, V> call(Tuple2<K, V> record) throws Exception {
                            return KV.of(record._1(), record._2());
                        }
                    })
                    .map(WindowingHelpers.<KV<K, V>>windowFunction());
            streamingContext.setStream(transform, windowedStream);
        }
    };
}
Also used : KafkaIO(com.cloudera.dataflow.io.KafkaIO) TransformEvaluator(com.cloudera.dataflow.spark.TransformEvaluator) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) DoFnFunction(com.cloudera.dataflow.spark.DoFnFunction) Function(org.apache.spark.api.java.function.Function) KV(com.google.cloud.dataflow.sdk.values.KV) Tuple2(scala.Tuple2) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) EvaluationContext(com.cloudera.dataflow.spark.EvaluationContext)

Aggregations

WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue)9 Tuple2 (scala.Tuple2)5 HadoopIO (com.cloudera.dataflow.hadoop.HadoopIO)4 CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException)4 AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO)4 IOException (java.io.IOException)4 Configuration (org.apache.hadoop.conf.Configuration)4 TextIO (com.google.cloud.dataflow.sdk.io.TextIO)3 KV (com.google.cloud.dataflow.sdk.values.KV)3 JavaRDD (org.apache.spark.api.java.JavaRDD)3 Function (org.apache.spark.api.java.function.Function)3 DoFnFunction (com.cloudera.dataflow.spark.DoFnFunction)2 EvaluationContext (com.cloudera.dataflow.spark.EvaluationContext)2 TransformEvaluator (com.cloudera.dataflow.spark.TransformEvaluator)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 Map (java.util.Map)2 AvroKey (org.apache.avro.mapred.AvroKey)2 NullWritable (org.apache.hadoop.io.NullWritable)2 JavaRDDLike (org.apache.spark.api.java.JavaRDDLike)2 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)2