Search in sources :

Example 16 with KvCoder

use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.

the class GroupIntoBatches method expand.

/**
 * Applies the batching {@code DoFn} to a keyed input collection.
 *
 * <p>The input's coder must be a {@link KvCoder}. Its key and value component coders are passed to
 * {@code GroupIntoBatchesDoFn} together with {@code batchSize} and the allowed lateness read from
 * the input's windowing strategy.
 *
 * @param input the keyed collection to batch
 * @return the collection produced by applying the batching {@code ParDo} to {@code input}
 */
@Override
public PCollection<KV<K, Iterable<InputT>>> expand(PCollection<KV<K, InputT>> input) {
    Duration allowedLateness = input.getWindowingStrategy().getAllowedLateness();
    // Fail with a descriptive message instead of a bare ClassCastException on the cast below.
    checkArgument(input.getCoder() instanceof KvCoder, "coder specified in the input PCollection is not a KvCoder");
    // A parameterized KvCoder exposes typed accessors, so no raw type, no positional lookup into
    // getCoderArguments() and no unchecked casts of the component coders are needed.
    KvCoder<K, InputT> inputCoder = (KvCoder<K, InputT>) input.getCoder();
    Coder<K> keyCoder = inputCoder.getKeyCoder();
    Coder<InputT> valueCoder = inputCoder.getValueCoder();
    return input.apply(ParDo.of(new GroupIntoBatchesDoFn<>(batchSize, allowedLateness, keyCoder, valueCoder)));
}
Also used : KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) KvCoder(org.apache.beam.sdk.coders.KvCoder) Duration(org.joda.time.Duration)

Example 17 with KvCoder

use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.

the class ApexParDoOperator method processElementInReadyWindows.

/**
 * Processes one element through {@code pushbackDoFnRunner}, wrapping the call in its own
 * start/finish bundle pair.
 *
 * <p>When keyed state is in use ({@code currentKeyStateInternals} is non-null), the element's key
 * and key coder are resolved first and installed on the state and timer internals. The key comes
 * from the element itself if it is a {@link KeyedWorkItem}, otherwise the element is treated as a
 * {@code KV}.
 *
 * @param elem the windowed element to process
 * @return the elements returned by the runner's {@code processElementInReadyWindows} call
 * @throws UserCodeException rethrown unchanged; an {@link AssertionError} cause is additionally
 *     recorded in {@code ApexRunner.ASSERTION_ERROR}
 */
private Iterable<WindowedValue<InputT>> processElementInReadyWindows(WindowedValue<InputT> elem) {
    try {
        pushbackDoFnRunner.startBundle();
        if (currentKeyStateInternals != null) {
            InputT payload = elem.getValue();
            @SuppressWarnings({ "rawtypes", "unchecked" }) WindowedValueCoder<InputT> windowedCoder = (WindowedValueCoder) inputCoder;
            final Object currentKey;
            final Coder<Object> currentKeyCoder;
            if (payload instanceof KeyedWorkItem) {
                // Work items carry their own key; the matching coder is a KeyedWorkItemCoder.
                currentKey = ((KeyedWorkItem) payload).key();
                @SuppressWarnings({ "rawtypes", "unchecked" }) KeyedWorkItemCoder<Object, ?> workItemCoder = (KeyedWorkItemCoder) windowedCoder.getValueCoder();
                currentKeyCoder = workItemCoder.getKeyCoder();
            } else {
                // Anything else is treated as a KV encoded with a KvCoder.
                currentKey = ((KV) payload).getKey();
                @SuppressWarnings({ "rawtypes", "unchecked" }) KvCoder<Object, ?> kvCoder = (KvCoder) windowedCoder.getValueCoder();
                currentKeyCoder = kvCoder.getKeyCoder();
            }
            ((StateInternalsProxy) currentKeyStateInternals).setKey(currentKey);
            Instant inputWatermark = new Instant(this.currentInputWatermark);
            Instant outputWatermark = new Instant(this.currentOutputWatermark);
            currentKeyTimerInternals.setContext(currentKey, currentKeyCoder, inputWatermark, outputWatermark);
        }
        Iterable<WindowedValue<InputT>> notReady = pushbackDoFnRunner.processElementInReadyWindows(elem);
        pushbackDoFnRunner.finishBundle();
        return notReady;
    } catch (UserCodeException ue) {
        // Record assertion failures raised in user code before propagating the exception.
        Throwable cause = ue.getCause();
        if (cause instanceof AssertionError) {
            ApexRunner.ASSERTION_ERROR.set((AssertionError) cause);
        }
        throw ue;
    }
}
Also used : StateInternalsProxy(org.apache.beam.runners.apex.translation.utils.StateInternalsProxy) KeyedWorkItemCoder(org.apache.beam.runners.core.KeyedWorkItemCoder) Instant(org.joda.time.Instant) KvCoder(org.apache.beam.sdk.coders.KvCoder) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) UserCodeException(org.apache.beam.sdk.util.UserCodeException) WindowedValue(org.apache.beam.sdk.util.WindowedValue) OutputWindowedValue(org.apache.beam.runners.core.OutputWindowedValue)

Example 18 with KvCoder

use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.

the class RegisterAndProcessBundleOperation method handleMultimapSideInput.

/**
 * Serves a {@code GET} state request against a multimap side input.
 *
 * <p>The request's state key names a PTransform, a side input, an encoded window and an encoded
 * user key. After validating that the side input is known, uses the multimap materialization and
 * is encoded with a {@link KvCoder}, the window and key are decoded, the values for that key are
 * read through the side input reader, and the values are re-encoded into the response.
 *
 * @param stateRequest the state request; must be a {@code GET}
 * @return an already-completed stage holding the response builder with the encoded values
 * @throws IllegalArgumentException if the window or key cannot be decoded, or the values cannot be
 *     encoded
 */
private CompletionStage<BeamFnApi.StateResponse.Builder> handleMultimapSideInput(StateRequest stateRequest) {
    checkState(stateRequest.getRequestCase() == RequestCase.GET, String.format("MultimapSideInput state requests only support '%s' requests, received '%s'", RequestCase.GET, stateRequest.getRequestCase()));

    StateKey.MultimapSideInput sideInputKey = stateRequest.getStateKey().getMultimapSideInput();
    String transformId = sideInputKey.getTransformId();
    String sideInputId = sideInputKey.getSideInputId();

    SideInputReader sideInputReader = ptransformIdToSideInputReader.get(transformId);
    checkState(sideInputReader != null, String.format("Unknown PTransform '%s'", transformId));

    PCollectionView<Materializations.MultimapView<Object, Object>> view = (PCollectionView<Materializations.MultimapView<Object, Object>>) ptransformIdToSideInputIdToPCollectionView.get(transformId, sideInputId);
    checkState(view != null, String.format("Unknown side input '%s' on PTransform '%s'", sideInputId, transformId));

    String materializationUrn = view.getViewFn().getMaterialization().getUrn();
    checkState(Materializations.MULTIMAP_MATERIALIZATION_URN.equals(materializationUrn), String.format("Unknown materialization for side input '%s' on PTransform '%s' with urn '%s'", sideInputId, transformId, materializationUrn));

    Coder<?> viewCoder = view.getCoderInternal();
    checkState(viewCoder instanceof KvCoder, String.format("Materialization of side input '%s' on PTransform '%s' expects %s but received %s.", sideInputId, transformId, KvCoder.class.getSimpleName(), viewCoder.getClass().getSimpleName()));
    // Guarded by the instanceof check above; only the type arguments are unchecked.
    @SuppressWarnings("unchecked") KvCoder<Object, Object> kvCoder = (KvCoder<Object, Object>) viewCoder;
    Coder<Object> keyCoder = kvCoder.getKeyCoder();
    Coder<Object> valueCoder = kvCoder.getValueCoder();

    BoundedWindow window;
    try {
        // TODO: Use EncodedWindow instead of decoding the window.
        window = view.getWindowingStrategyInternal().getWindowFn().windowCoder().decode(sideInputKey.getWindow().newInput());
    } catch (IOException e) {
        throw new IllegalArgumentException(String.format("Unable to decode window for side input '%s' on PTransform '%s'.", sideInputId, transformId), e);
    }

    Object userKey;
    try {
        // TODO: Use the encoded representation of the key.
        userKey = keyCoder.decode(sideInputKey.getKey().newInput());
    } catch (IOException e) {
        throw new IllegalArgumentException(String.format("Unable to decode user key for side input '%s' on PTransform '%s'.", sideInputId, transformId), e);
    }

    Iterable<Object> values = sideInputReader.get(view, window).get(userKey);
    try {
        // TODO: Use the raw value so we don't go through a decode/encode cycle for no reason.
        StateGetResponse.Builder getResponse = StateGetResponse.newBuilder().setData(encodeAndConcat(values, valueCoder));
        return CompletableFuture.completedFuture(StateResponse.newBuilder().setGet(getResponse));
    } catch (IOException e) {
        throw new IllegalArgumentException(String.format("Unable to encode values for side input '%s' on PTransform '%s'.", sideInputId, transformId), e);
    }
}
Also used : StateKey(org.apache.beam.model.fnexecution.v1.BeamFnApi.StateKey) KvCoder(org.apache.beam.sdk.coders.KvCoder) SideInputReader(org.apache.beam.runners.core.SideInputReader) Materializations(org.apache.beam.sdk.transforms.Materializations) IOException(java.io.IOException) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow)

Example 19 with KvCoder

use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.

the class FlinkBatchPortablePipelineTranslator method translateExecutableStage.

/**
 * Translates an executable stage transform into a Flink batch operator and registers its outputs.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Build a union coder over the coders of all output PCollections, ordered by union tag.
 *   <li>Parse the {@code ExecutableStagePayload} from the transform spec.
 *   <li>Create the {@code FlinkExecutableStageFunction} for the stage.
 *   <li>If the stage declares user states or timers, group the input by key (optionally sorting
 *       each group by timestamp) and run the function in a {@code GroupReduceOperator}; otherwise
 *       run it in a {@code MapPartitionOperator}.
 *   <li>Attach every side input as a broadcast set, then call {@code pruneOutput} once per output.
 *   <li>If the stage has no outputs, attach a discarding sink.
 * </ol>
 *
 * @param transform the executable stage transform node
 * @param pipeline the pipeline proto providing the components (PCollections, coders, ...)
 * @param context translation context used to look up input data sets and pipeline options
 * @throws RuntimeException if an output coder cannot be instantiated or the payload cannot be parsed
 * @throws IllegalStateException if the stage is stateful but its input element coder is not a
 *     {@code KvCoder}
 */
private static <InputT> void translateExecutableStage(PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
    // TODO: Fail on splittable DoFns.
    // TODO: Special-case single outputs to avoid multiplexing PCollections.
    RunnerApi.Components components = pipeline.getComponents();
    Map<String, String> outputs = transform.getTransform().getOutputsMap();
    // Mapping from PCollection id to coder tag id.
    BiMap<String, Integer> outputMap = createOutputMap(outputs.values());
    // Collect all output Coders and create a UnionCoder for our tagged outputs.
    List<Coder<?>> unionCoders = Lists.newArrayList();
    // Enforce tuple tag sorting by union tag index.
    Map<String, Coder<WindowedValue<?>>> outputCoders = Maps.newHashMap();
    // The TreeMap copy of the inverse (tag index -> collection id) map yields ids in ascending tag order,
    // so unionCoders ends up in tag order.
    for (String collectionId : new TreeMap<>(outputMap.inverse()).values()) {
        PCollectionNode collectionNode = PipelineNode.pCollection(collectionId, components.getPcollectionsOrThrow(collectionId));
        Coder<WindowedValue<?>> coder;
        try {
            // Raw cast: the wire coder's static type does not match Coder<WindowedValue<?>>.
            coder = (Coder) WireCoders.instantiateRunnerWireCoder(collectionNode, components);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        outputCoders.put(collectionId, coder);
        unionCoders.add(coder);
    }
    UnionCoder unionCoder = UnionCoder.of(unionCoders);
    TypeInformation<RawUnionValue> typeInformation = new CoderTypeInformation<>(unionCoder, context.getPipelineOptions());
    RunnerApi.ExecutableStagePayload stagePayload;
    try {
        stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transform.getTransform().getSpec().getPayload());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    String inputPCollectionId = stagePayload.getInput();
    Coder<WindowedValue<InputT>> windowedInputCoder = instantiateCoder(inputPCollectionId, components);
    DataSet<WindowedValue<InputT>> inputDataSet = context.getDataSetOrThrow(inputPCollectionId);
    final FlinkExecutableStageFunction<InputT> function = new FlinkExecutableStageFunction<>(transform.getTransform().getUniqueName(), context.getPipelineOptions(), stagePayload, context.getJobInfo(), outputMap, FlinkExecutableStageContextFactory.getInstance(), getWindowingStrategy(inputPCollectionId, components).getWindowFn().windowCoder(), windowedInputCoder);
    final String operatorName = generateNameFromStagePayload(stagePayload);
    final SingleInputUdfOperator taggedDataset;
    if (stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0) {
        // NOTE(review): unguarded cast; this throws ClassCastException if instantiateCoder ever
        // returns something other than a FullWindowedValueCoder. Confirm that cannot happen.
        Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
        // Stateful stages are only allowed of KV input to be able to group on the key
        if (!(valueCoder instanceof KvCoder)) {
            // NOTE(review): the message labels the value a "stateful DoFn" but formats the input
            // PCollection id. Confirm which identifier is intended.
            throw new IllegalStateException(String.format(Locale.ENGLISH, "The element coder for stateful DoFn '%s' must be KvCoder but is: %s", inputPCollectionId, valueCoder.getClass().getSimpleName()));
        }
        Coder keyCoder = ((KvCoder) valueCoder).getKeyCoder();
        Grouping<WindowedValue<InputT>> groupedInput = inputDataSet.groupBy(new KvKeySelector<>(keyCoder));
        boolean requiresTimeSortedInput = requiresTimeSortedInput(stagePayload, false);
        if (requiresTimeSortedInput) {
            // Within each key group, order elements by ascending timestamp.
            groupedInput = ((UnsortedGrouping<WindowedValue<InputT>>) groupedInput).sortGroup(WindowedValue::getTimestamp, Order.ASCENDING);
        }
        taggedDataset = new GroupReduceOperator<>(groupedInput, typeInformation, function, operatorName);
    } else {
        // Stateless stage: no grouping needed, process the input per partition.
        taggedDataset = new MapPartitionOperator<>(inputDataSet, typeInformation, function, operatorName);
    }
    for (SideInputId sideInputId : stagePayload.getSideInputsList()) {
        String collectionId = stagePayload.getComponents().getTransformsOrThrow(sideInputId.getTransformId()).getInputsOrThrow(sideInputId.getLocalName());
        // Register under the global PCollection name. Only ExecutableStageFunction needs to know the
        // mapping from local name to global name and how to translate the broadcast data to a state
        // API view.
        taggedDataset.withBroadcastSet(context.getDataSetOrThrow(collectionId), collectionId);
    }
    // Hand each output's union tag and coder to pruneOutput (defined elsewhere in this class).
    for (String collectionId : outputs.values()) {
        pruneOutput(taggedDataset, context, outputMap.get(collectionId), outputCoders.get(collectionId), collectionId);
    }
    if (outputs.isEmpty()) {
        // NOTE: After pipeline translation, we traverse the set of unconsumed PCollections and add a
        // no-op sink to each to make sure they are materialized by Flink. However, some SDK-executed
        // stages have no runner-visible output after fusion. We handle this case by adding a sink
        // here.
        taggedDataset.output(new DiscardingOutputFormat<>()).name("DiscardingOutput");
    }
}
Also used : DiscardingOutputFormat(org.apache.flink.api.java.io.DiscardingOutputFormat) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) FlinkExecutableStageFunction(org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageFunction) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SideInputId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.SideInputId) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) UnionCoder(org.apache.beam.sdk.transforms.join.UnionCoder) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) Coder(org.apache.beam.sdk.coders.Coder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) UnionCoder(org.apache.beam.sdk.transforms.join.UnionCoder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) SingleInputUdfOperator(org.apache.flink.api.java.operators.SingleInputUdfOperator) KvCoder(org.apache.beam.sdk.coders.KvCoder) IOException(java.io.IOException) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode)

Example 20 with KvCoder

use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.

the class FlinkBatchPortablePipelineTranslator method translateGroupByKey.

/**
 * Translates a GroupByKey transform into a two-stage Flink grouping and registers the result.
 *
 * <p>The input data set is grouped by key and partially combined with a {@code Concatenate}
 * combine function (values are gathered into a {@code List}); the partial results are then
 * grouped by key again and fully reduced. The final data set is registered in {@code context}
 * under the transform's single output PCollection id.
 *
 * @param transform the GroupByKey transform node; must have exactly one input and one output
 * @param pipeline the pipeline proto providing PCollections, coders and windowing strategies
 * @param context translation context used to look up the input and register the output
 * @throws IllegalStateException if the input's windowing strategy cannot be rebuilt from its proto
 * @throws RuntimeException if the input wire coder cannot be instantiated
 */
private static <K, V> void translateGroupByKey(PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
    RunnerApi.Components components = pipeline.getComponents();
    String transformName = transform.getTransform().getUniqueName();

    // Resolve the single input PCollection and its data set.
    String inputId = Iterables.getOnlyElement(transform.getTransform().getInputsMap().values());
    PCollectionNode inputNode = PipelineNode.pCollection(inputId, components.getPcollectionsOrThrow(inputId));
    DataSet<WindowedValue<KV<K, V>>> inputDataSet = context.getDataSetOrThrow(inputId);

    // Rebuild the input's windowing strategy from its proto form.
    String windowingStrategyId = components.getPcollectionsOrThrow(inputId).getWindowingStrategyId();
    RunnerApi.WindowingStrategy windowingStrategyProto = components.getWindowingStrategiesOrThrow(windowingStrategyId);
    RehydratedComponents rehydratedComponents = RehydratedComponents.forComponents(components);
    WindowingStrategy<Object, BoundedWindow> windowingStrategy;
    try {
        windowingStrategy = (WindowingStrategy<Object, BoundedWindow>) WindowingStrategyTranslation.fromProto(windowingStrategyProto, rehydratedComponents);
    } catch (InvalidProtocolBufferException e) {
        throw new IllegalStateException(String.format("Unable to hydrate GroupByKey windowing strategy %s.", windowingStrategyProto), e);
    }

    // Derive the key/value coders from the input's wire coder.
    WindowedValueCoder<KV<K, V>> windowedInputCoder;
    try {
        windowedInputCoder = (WindowedValueCoder) WireCoders.instantiateRunnerWireCoder(inputNode, components);
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    KvCoder<K, V> kvCoder = (KvCoder<K, V>) windowedInputCoder.getValueCoder();
    Coder<K> keyCoder = kvCoder.getKeyCoder();

    // The accumulator (List<V>) coder doubles as the value coder of both stages' output.
    Concatenate<V> concatenate = new Concatenate<>();
    Coder<List<V>> listCoder = concatenate.getAccumulatorCoder(CoderRegistry.createDefault(), kvCoder.getValueCoder());
    Coder<WindowedValue<KV<K, List<V>>>> groupedCoder = WindowedValue.getFullCoder(KvCoder.of(keyCoder, listCoder), windowingStrategy.getWindowFn().windowCoder());
    TypeInformation<WindowedValue<KV<K, List<V>>>> groupedTypeInfo = new CoderTypeInformation<>(groupedCoder, context.getPipelineOptions());

    Grouping<WindowedValue<KV<K, V>>> byKey = inputDataSet.groupBy(new KvKeySelector<>(keyCoder));
    FlinkPartialReduceFunction<K, V, List<V>, ?> partialReduce = new FlinkPartialReduceFunction<>(concatenate, windowingStrategy, Collections.emptyMap(), context.getPipelineOptions());
    FlinkReduceFunction<K, List<V>, List<V>, ?> finalReduce = new FlinkReduceFunction<>(concatenate, windowingStrategy, Collections.emptyMap(), context.getPipelineOptions());

    // Stage 1: partially combine the values of each key into lists.
    GroupCombineOperator<WindowedValue<KV<K, V>>, WindowedValue<KV<K, List<V>>>> partiallyCombined = new GroupCombineOperator<>(byKey, groupedTypeInfo, partialReduce, "GroupCombine: " + transformName);
    Grouping<WindowedValue<KV<K, List<V>>>> partialsByKey = partiallyCombined.groupBy(new KvKeySelector<>(keyCoder));

    // Stage 2: merge the partial lists of each key into the final output.
    GroupReduceOperator<WindowedValue<KV<K, List<V>>>, WindowedValue<KV<K, List<V>>>> grouped = new GroupReduceOperator<>(partialsByKey, groupedTypeInfo, finalReduce, transformName);

    String outputId = Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values());
    context.addDataSet(outputId, grouped);
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) FlinkReduceFunction(org.apache.beam.runners.flink.translation.functions.FlinkReduceFunction) List(java.util.List) GroupCombineOperator(org.apache.flink.api.java.operators.GroupCombineOperator) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) FlinkPartialReduceFunction(org.apache.beam.runners.flink.translation.functions.FlinkPartialReduceFunction) KvCoder(org.apache.beam.sdk.coders.KvCoder) KV(org.apache.beam.sdk.values.KV) IOException(java.io.IOException) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) GroupReduceOperator(org.apache.flink.api.java.operators.GroupReduceOperator) Concatenate(org.apache.beam.runners.core.Concatenate) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents)

Aggregations

KvCoder (org.apache.beam.sdk.coders.KvCoder)44 Coder (org.apache.beam.sdk.coders.Coder)26 WindowedValue (org.apache.beam.sdk.util.WindowedValue)25 KV (org.apache.beam.sdk.values.KV)21 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)20 Map (java.util.Map)17 List (java.util.List)16 ArrayList (java.util.ArrayList)15 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)15 IOException (java.io.IOException)14 HashMap (java.util.HashMap)14 WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy)13 StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder)11 IterableCoder (org.apache.beam.sdk.coders.IterableCoder)10 VoidCoder (org.apache.beam.sdk.coders.VoidCoder)10 PCollectionView (org.apache.beam.sdk.values.PCollectionView)10 Test (org.junit.Test)10 WindowedValueCoder (org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder)8 ViewFn (org.apache.beam.sdk.transforms.ViewFn)7 PCollection (org.apache.beam.sdk.values.PCollection)7