use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.
the class TransformTranslator method readHadoop.
private static <K, V> TransformEvaluator<HadoopIO.Read.Bound<K, V>> readHadoop() {
return new TransformEvaluator<HadoopIO.Read.Bound<K, V>>() {
@Override
public void evaluate(HadoopIO.Read.Bound<K, V> transform, EvaluationContext context) {
String pattern = transform.getFilepattern();
JavaSparkContext jsc = context.getSparkContext();
@SuppressWarnings("unchecked") JavaPairRDD<K, V> file = jsc.newAPIHadoopFile(pattern, transform.getFormatClass(), transform.getKeyClass(), transform.getValueClass(), new Configuration());
JavaRDD<WindowedValue<KV<K, V>>> rdd = file.map(new Function<Tuple2<K, V>, KV<K, V>>() {
@Override
public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
return KV.of(t2._1(), t2._2());
}
}).map(WindowingHelpers.<KV<K, V>>windowFunction());
context.setOutputRDD(transform, rdd);
}
};
}
use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.
the class TransformTranslator method writeHadoop.
private static <K, V> TransformEvaluator<HadoopIO.Write.Bound<K, V>> writeHadoop() {
return new TransformEvaluator<HadoopIO.Write.Bound<K, V>>() {
@Override
public void evaluate(HadoopIO.Write.Bound<K, V> transform, EvaluationContext context) {
@SuppressWarnings("unchecked") JavaPairRDD<K, V> last = ((JavaRDDLike<WindowedValue<KV<K, V>>, ?>) context.getInputRDD(transform)).map(WindowingHelpers.<KV<K, V>>unwindowFunction()).mapToPair(new PairFunction<KV<K, V>, K, V>() {
@Override
public Tuple2<K, V> call(KV<K, V> t) throws Exception {
return new Tuple2<>(t.getKey(), t.getValue());
}
});
ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(transform.getNumShards(), transform.getShardTemplate(), transform.getFilenamePrefix(), transform.getFilenameSuffix());
Configuration conf = new Configuration();
for (Map.Entry<String, String> e : transform.getConfigurationProperties().entrySet()) {
conf.set(e.getKey(), e.getValue());
}
writeHadoopFile(last, conf, shardTemplateInfo, transform.getKeyClass(), transform.getValueClass(), transform.getFormatClass());
}
};
}
use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.
the class TransformTranslator method multiDo.
private static <I, O> TransformEvaluator<ParDo.BoundMulti<I, O>> multiDo() {
return new TransformEvaluator<ParDo.BoundMulti<I, O>>() {
@Override
public void evaluate(ParDo.BoundMulti<I, O> transform, EvaluationContext context) {
TupleTag<O> mainOutputTag = MULTIDO_FG.get("mainOutputTag", transform);
MultiDoFnFunction<I, O> multifn = new MultiDoFnFunction<>(transform.getFn(), context.getRuntimeContext(), mainOutputTag, getSideInputs(transform.getSideInputs(), context));
@SuppressWarnings("unchecked") JavaRDDLike<WindowedValue<I>, ?> inRDD = (JavaRDDLike<WindowedValue<I>, ?>) context.getInputRDD(transform);
JavaPairRDD<TupleTag<?>, WindowedValue<?>> all = inRDD.mapPartitionsToPair(multifn).cache();
PCollectionTuple pct = context.getOutput(transform);
for (Map.Entry<TupleTag<?>, PCollection<?>> e : pct.getAll().entrySet()) {
@SuppressWarnings("unchecked") JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TupleTagFilter(e.getKey()));
@SuppressWarnings("unchecked") JavaRDD<WindowedValue<Object>> // Object is the best we can do since different outputs can have different tags
values = (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
context.setRDD(e.getValue(), values);
}
}
};
}
use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.
the class TransformTranslator method writeText.
private static <T> TransformEvaluator<TextIO.Write.Bound<T>> writeText() {
return new TransformEvaluator<TextIO.Write.Bound<T>>() {
@Override
public void evaluate(TextIO.Write.Bound<T> transform, EvaluationContext context) {
@SuppressWarnings("unchecked") JavaPairRDD<T, Void> last = ((JavaRDDLike<WindowedValue<T>, ?>) context.getInputRDD(transform)).map(WindowingHelpers.<T>unwindowFunction()).mapToPair(new PairFunction<T, T, Void>() {
@Override
public Tuple2<T, Void> call(T t) throws Exception {
return new Tuple2<>(t, null);
}
});
ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(transform.getNumShards(), transform.getShardTemplate(), transform.getFilenamePrefix(), transform.getFilenameSuffix());
writeHadoopFile(last, new Configuration(), shardTemplateInfo, Text.class, NullWritable.class, TemplatedTextOutputFormat.class);
}
};
}
use of com.google.cloud.dataflow.sdk.util.WindowedValue in project spark-dataflow by cloudera.
the class StreamingTransformTranslator method kafka.
private static <K, V> TransformEvaluator<KafkaIO.Read.Unbound<K, V>> kafka() {
return new TransformEvaluator<KafkaIO.Read.Unbound<K, V>>() {
@Override
public void evaluate(KafkaIO.Read.Unbound<K, V> transform, EvaluationContext context) {
StreamingEvaluationContext sec = (StreamingEvaluationContext) context;
JavaStreamingContext jssc = sec.getStreamingContext();
Class<K> keyClazz = transform.getKeyClass();
Class<V> valueClazz = transform.getValueClass();
Class<? extends Decoder<K>> keyDecoderClazz = transform.getKeyDecoderClass();
Class<? extends Decoder<V>> valueDecoderClazz = transform.getValueDecoderClass();
Map<String, String> kafkaParams = transform.getKafkaParams();
Set<String> topics = transform.getTopics();
JavaPairInputDStream<K, V> inputPairStream = KafkaUtils.createDirectStream(jssc, keyClazz, valueClazz, keyDecoderClazz, valueDecoderClazz, kafkaParams, topics);
JavaDStream<WindowedValue<KV<K, V>>> inputStream = inputPairStream.map(new Function<Tuple2<K, V>, KV<K, V>>() {
@Override
public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
return KV.of(t2._1(), t2._2());
}
}).map(WindowingHelpers.<KV<K, V>>windowFunction());
sec.setStream(transform, inputStream);
}
};
}
Aggregations