Search in sources :

Example 1 with GroupCombineOperator

Use of org.apache.flink.api.java.operators.GroupCombineOperator in the Apache Beam project.

The method translateGroupByKey of the class FlinkBatchPortablePipelineTranslator.

/**
 * Translates a GroupByKey transform into a Flink group-combine step followed by a group-reduce
 * step, and registers the resulting data set in the translation context.
 *
 * <p>The input data set is looked up under the transform's single input id, grouped by key,
 * partially combined with a {@code Concatenate} combine function (accumulator type
 * {@code List<V>}), regrouped by key and then fully reduced. The result is stored under the
 * transform's single output id.
 *
 * @param transform the GroupByKey transform node; must have exactly one input and one output
 * @param pipeline the pipeline proto supplying components (PCollections, windowing strategies)
 * @param context the batch translation context holding data sets and pipeline options
 * @throws IllegalStateException if the windowing strategy proto cannot be rehydrated
 * @throws RuntimeException wrapping the {@link IOException} raised while instantiating the
 *     runner wire coder for the input collection
 */
private static <K, V> void translateGroupByKey(PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
    RunnerApi.Components components = pipeline.getComponents();

    // Resolve the single input PCollection and the data set already translated for it.
    String inputId = Iterables.getOnlyElement(transform.getTransform().getInputsMap().values());
    PCollectionNode inputNode = PipelineNode.pCollection(inputId, components.getPcollectionsOrThrow(inputId));
    DataSet<WindowedValue<KV<K, V>>> input = context.getDataSetOrThrow(inputId);

    // Rehydrate the windowing strategy attached to the input PCollection.
    String windowingStrategyId = components.getPcollectionsOrThrow(inputId).getWindowingStrategyId();
    RunnerApi.WindowingStrategy strategyProto = components.getWindowingStrategiesOrThrow(windowingStrategyId);
    RehydratedComponents rehydrated = RehydratedComponents.forComponents(components);
    WindowingStrategy<Object, BoundedWindow> strategy;
    try {
        strategy = (WindowingStrategy<Object, BoundedWindow>) WindowingStrategyTranslation.fromProto(strategyProto, rehydrated);
    } catch (InvalidProtocolBufferException e) {
        throw new IllegalStateException(String.format("Unable to hydrate GroupByKey windowing strategy %s.", strategyProto), e);
    }

    // Obtain the wire coder of the input; the raw cast is kept because the factory's declared
    // return type is not visible from here.
    WindowedValueCoder<KV<K, V>> windowedInputCoder;
    try {
        windowedInputCoder = (WindowedValueCoder) WireCoders.instantiateRunnerWireCoder(inputNode, components);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    KvCoder<K, V> kvCoder = (KvCoder<K, V>) windowedInputCoder.getValueCoder();
    Coder<K> keyCoder = kvCoder.getKeyCoder();

    // Derive the coder / type information for the KV<K, List<V>> elements produced by combining.
    Concatenate<V> concatenate = new Concatenate<>();
    Coder<List<V>> listCoder = concatenate.getAccumulatorCoder(CoderRegistry.createDefault(), kvCoder.getValueCoder());
    Coder<WindowedValue<KV<K, List<V>>>> groupedCoder =
        WindowedValue.getFullCoder(KvCoder.of(keyCoder, listCoder), strategy.getWindowFn().windowCoder());
    TypeInformation<WindowedValue<KV<K, List<V>>>> groupedTypeInfo =
        new CoderTypeInformation<>(groupedCoder, context.getPipelineOptions());

    Grouping<WindowedValue<KV<K, V>>> groupedInput = input.groupBy(new KvKeySelector<>(keyCoder));

    FlinkPartialReduceFunction<K, V, List<V>, ?> partialFn =
        new FlinkPartialReduceFunction<>(concatenate, strategy, Collections.emptyMap(), context.getPipelineOptions());
    FlinkReduceFunction<K, List<V>, List<V>, ?> finalFn =
        new FlinkReduceFunction<>(concatenate, strategy, Collections.emptyMap(), context.getPipelineOptions());

    String transformName = transform.getTransform().getUniqueName();

    // First stage: combine the values of each key group into the accumulator form.
    GroupCombineOperator<WindowedValue<KV<K, V>>, WindowedValue<KV<K, List<V>>>> combined =
        new GroupCombineOperator<>(groupedInput, groupedTypeInfo, partialFn, "GroupCombine: " + transformName);

    // Second stage: regroup by key and reduce the partial results into the final output.
    Grouping<WindowedValue<KV<K, List<V>>>> regrouped = combined.groupBy(new KvKeySelector<>(keyCoder));
    GroupReduceOperator<WindowedValue<KV<K, List<V>>>, WindowedValue<KV<K, List<V>>>> result =
        new GroupReduceOperator<>(regrouped, groupedTypeInfo, finalFn, transformName);

    String outputId = Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values());
    context.addDataSet(outputId, result);
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) FlinkReduceFunction(org.apache.beam.runners.flink.translation.functions.FlinkReduceFunction) List(java.util.List) GroupCombineOperator(org.apache.flink.api.java.operators.GroupCombineOperator) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) FlinkPartialReduceFunction(org.apache.beam.runners.flink.translation.functions.FlinkPartialReduceFunction) KvCoder(org.apache.beam.sdk.coders.KvCoder) KV(org.apache.beam.sdk.values.KV) IOException(java.io.IOException) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) GroupReduceOperator(org.apache.flink.api.java.operators.GroupReduceOperator) Concatenate(org.apache.beam.runners.core.Concatenate) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents)

Aggregations

IOException (java.io.IOException)1 List (java.util.List)1 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)1 Concatenate (org.apache.beam.runners.core.Concatenate)1 RehydratedComponents (org.apache.beam.runners.core.construction.RehydratedComponents)1 PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)1 FlinkPartialReduceFunction (org.apache.beam.runners.flink.translation.functions.FlinkPartialReduceFunction)1 FlinkReduceFunction (org.apache.beam.runners.flink.translation.functions.FlinkReduceFunction)1 CoderTypeInformation (org.apache.beam.runners.flink.translation.types.CoderTypeInformation)1 KvCoder (org.apache.beam.sdk.coders.KvCoder)1 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)1 WindowedValue (org.apache.beam.sdk.util.WindowedValue)1 KV (org.apache.beam.sdk.values.KV)1 InvalidProtocolBufferException (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException)1 GroupCombineOperator (org.apache.flink.api.java.operators.GroupCombineOperator)1 GroupReduceOperator (org.apache.flink.api.java.operators.GroupReduceOperator)1