Search in sources :

Example 1 with CannotProvideCoderException

Use of com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException in the spark-dataflow project by Cloudera.

From the class TransformTranslator, method combineGlobally:

/**
 * Builds the evaluator that runs a {@code Combine.Globally} transform on an RDD.
 *
 * <p>The evaluator encodes every input element to a byte array, folds those byte arrays
 * into a single encoded accumulator with the transform's {@code CombineFn} (via the RDD's
 * {@code aggregate} call), extracts the final output from that accumulator, and registers
 * a one-element RDD holding the output as the transform's result.
 *
 * @param <I> element type of the input collection
 * @param <A> accumulator type used by the combine function
 * @param <O> type of the combined output
 * @return an evaluator for {@code Combine.Globally<I, O>}
 * @throws IllegalStateException (from {@code evaluate}) when no accumulator coder can be provided
 */
private static <I, A, O> TransformEvaluator<Combine.Globally<I, O>> combineGlobally() {
    return new TransformEvaluator<Combine.Globally<I, O>>() {

        @Override
        public void evaluate(Combine.Globally<I, O> transform, EvaluationContext context) {
            // NOTE(review): presumably reads the transform's "fn" field reflectively -- confirm
            // against the definition of COMBINE_GLOBALLY_FG.
            final Combine.CombineFn<I, A, O> combineFn = COMBINE_GLOBALLY_FG.get("fn", transform);

            @SuppressWarnings("unchecked")
            JavaRDDLike<WindowedValue<I>, ?> windowedInput =
                    (JavaRDDLike<WindowedValue<I>, ?>) context.getInputRDD(transform);

            final Coder<I> inputCoder = context.getInput(transform).getCoder();
            final Coder<A> accumCoder;
            try {
                accumCoder = combineFn.getAccumulatorCoder(
                        context.getPipeline().getCoderRegistry(), inputCoder);
            } catch (CannotProvideCoderException e) {
                throw new IllegalStateException("Could not determine coder for accumulator", e);
            }

            // Elements travel through the shuffle as coder-encoded byte arrays, so strip the
            // window wrapper and encode each value before aggregating.
            JavaRDD<byte[]> encodedInput = windowedInput
                    .map(WindowingHelpers.<I>unwindowFunction())
                    .map(CoderHelpers.toByteFunction(inputCoder));

            // Encoded empty accumulator: the zero value of the aggregation.
            byte[] encodedZero = CoderHelpers.toByteArray(combineFn.createAccumulator(), accumCoder);

            // (encoded accumulator, encoded input) -> encoded accumulator with the input added.
            final Function2<byte[], byte[], byte[]> addEncodedInput =
                    new Function2<byte[], byte[], byte[]>() {

                        @Override
                        public byte[] call(byte[] accumBytes, byte[] inputBytes) throws Exception {
                            A accum = CoderHelpers.fromByteArray(accumBytes, accumCoder);
                            I input = CoderHelpers.fromByteArray(inputBytes, inputCoder);
                            A updated = combineFn.addInput(accum, input);
                            return CoderHelpers.toByteArray(updated, accumCoder);
                        }
                    };

            // (encoded accumulator, encoded accumulator) -> encoded merged accumulator.
            final Function2<byte[], byte[], byte[]> mergeEncodedAccumulators =
                    new Function2<byte[], byte[], byte[]>() {

                        @Override
                        public byte[] call(byte[] leftBytes, byte[] rightBytes) throws Exception {
                            A left = CoderHelpers.fromByteArray(leftBytes, accumCoder);
                            A right = CoderHelpers.fromByteArray(rightBytes, accumCoder);
                            // Accumulators may be null, which rules out Guava's ImmutableList.of.
                            List<A> pair = Collections.unmodifiableList(Arrays.asList(left, right));
                            return CoderHelpers.toByteArray(combineFn.mergeAccumulators(pair), accumCoder);
                        }
                    };

            byte[] encodedResult =
                    encodedInput.aggregate(encodedZero, addEncodedInput, mergeEncodedAccumulators);

            A finalAccum = CoderHelpers.fromByteArray(encodedResult, accumCoder);
            O output = combineFn.extractOutput(finalAccum);

            Coder<O> outputCoder = context.getOutput(transform).getCoder();
            // The output may be null, so Collections.singleton is used instead of Guava's
            // ImmutableList.of.
            JavaRDD<byte[]> encodedOutput = context.getSparkContext().parallelize(
                    CoderHelpers.toByteArrays(Collections.singleton(output), outputCoder));

            // Decode the single result and re-wrap it in a window before publishing it.
            context.setOutputRDD(
                    transform,
                    encodedOutput
                            .map(CoderHelpers.fromByteFunction(outputCoder))
                            .map(WindowingHelpers.<O>windowFunction()));
        }
    };
}
Also used : Combine(com.google.cloud.dataflow.sdk.transforms.Combine) CannotProvideCoderException(com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException) IOException(java.io.IOException) TextIO(com.google.cloud.dataflow.sdk.io.TextIO) AvroIO(com.google.cloud.dataflow.sdk.io.AvroIO) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) JavaRDDLike(org.apache.spark.api.java.JavaRDDLike) CannotProvideCoderException(com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) List(java.util.List) PCollectionList(com.google.cloud.dataflow.sdk.values.PCollectionList)

Aggregations

HadoopIO (com.cloudera.dataflow.hadoop.HadoopIO)1 CannotProvideCoderException (com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException)1 AvroIO (com.google.cloud.dataflow.sdk.io.AvroIO)1 TextIO (com.google.cloud.dataflow.sdk.io.TextIO)1 Combine (com.google.cloud.dataflow.sdk.transforms.Combine)1 WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue)1 PCollectionList (com.google.cloud.dataflow.sdk.values.PCollectionList)1 IOException (java.io.IOException)1 List (java.util.List)1 JavaRDDLike (org.apache.spark.api.java.JavaRDDLike)1