Use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.
The class GroupIntoBatches, method expand().
@Override
public PCollection<KV<K, Iterable<InputT>>> expand(PCollection<KV<K, InputT>> input) {
  Duration allowedLateness = input.getWindowingStrategy().getAllowedLateness();
  checkArgument(
      input.getCoder() instanceof KvCoder,
      "coder specified in the input PCollection is not a KvCoder");
  KvCoder inputCoder = (KvCoder) input.getCoder();
  Coder<K> keyCoder = (Coder<K>) inputCoder.getCoderArguments().get(0);
  Coder<InputT> valueCoder = (Coder<InputT>) inputCoder.getCoderArguments().get(1);
  return input.apply(
      ParDo.of(new GroupIntoBatchesDoFn<>(batchSize, allowedLateness, keyCoder, valueCoder)));
}
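For context, a minimal sketch of how this check plays out from the caller's side, assuming the Beam Java SDK: setting a KvCoder explicitly on the input PCollection is what lets expand() recover the key and value coders via getCoderArguments(). The class name, element types, batch size, and default runner here are illustrative, not taken from the snippet above.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupIntoBatches;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class GroupIntoBatchesSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();
    // Setting the KvCoder explicitly is what satisfies the checkArgument in
    // expand(): getCoderArguments() then yields the key and value coders.
    PCollection<KV<String, Integer>> input =
        p.apply(
            Create.of(KV.of("user", 1), KV.of("user", 2), KV.of("user", 3))
                .withCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of())));
    input.apply(GroupIntoBatches.ofSize(2));
    p.run().waitUntilFinish();
  }
}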
Use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.
The class ApexParDoOperator, method processElementInReadyWindows().
private Iterable<WindowedValue<InputT>> processElementInReadyWindows(WindowedValue<InputT> elem) {
  try {
    pushbackDoFnRunner.startBundle();
    if (currentKeyStateInternals != null) {
      InputT value = elem.getValue();
      final Object key;
      final Coder<Object> keyCoder;
      @SuppressWarnings({"rawtypes", "unchecked"})
      WindowedValueCoder<InputT> wvCoder = (WindowedValueCoder) inputCoder;
      if (value instanceof KeyedWorkItem) {
        key = ((KeyedWorkItem) value).key();
        @SuppressWarnings({"rawtypes", "unchecked"})
        KeyedWorkItemCoder<Object, ?> kwiCoder = (KeyedWorkItemCoder) wvCoder.getValueCoder();
        keyCoder = kwiCoder.getKeyCoder();
      } else {
        key = ((KV) value).getKey();
        @SuppressWarnings({"rawtypes", "unchecked"})
        KvCoder<Object, ?> kwiCoder = (KvCoder) wvCoder.getValueCoder();
        keyCoder = kwiCoder.getKeyCoder();
      }
      ((StateInternalsProxy) currentKeyStateInternals).setKey(key);
      currentKeyTimerInternals.setContext(
          key,
          keyCoder,
          new Instant(this.currentInputWatermark),
          new Instant(this.currentOutputWatermark));
    }
    Iterable<WindowedValue<InputT>> pushedBack =
        pushbackDoFnRunner.processElementInReadyWindows(elem);
    pushbackDoFnRunner.finishBundle();
    return pushedBack;
  } catch (UserCodeException ue) {
    if (ue.getCause() instanceof AssertionError) {
      ApexRunner.ASSERTION_ERROR.set((AssertionError) ue.getCause());
    }
    throw ue;
  }
}
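The coder nesting this method unwraps can be spelled out in isolation; a minimal sketch assuming Beam's standard coders (the class name and concrete element types are illustrative):

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder;
import org.apache.beam.sdk.values.KV;

public class CoderUnwrappingSketch {
  public static void main(String[] args) {
    // The operator's inputCoder is a WindowedValueCoder wrapping the element coder.
    KvCoder<String, Long> elementCoder = KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of());
    WindowedValueCoder<KV<String, Long>> wvCoder =
        WindowedValue.getFullCoder(elementCoder, GlobalWindow.Coder.INSTANCE);
    // Mirrors the else-branch above: unwrap the value coder, then take its key coder.
    Coder<String> keyCoder = ((KvCoder<String, Long>) wvCoder.getValueCoder()).getKeyCoder();
    System.out.println(keyCoder);
  }
}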
Use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.
The class RegisterAndProcessBundleOperation, method handleMultimapSideInput().
private CompletionStage<BeamFnApi.StateResponse.Builder> handleMultimapSideInput(StateRequest stateRequest) {
  checkState(
      stateRequest.getRequestCase() == RequestCase.GET,
      String.format(
          "MultimapSideInput state requests only support '%s' requests, received '%s'",
          RequestCase.GET, stateRequest.getRequestCase()));
  StateKey.MultimapSideInput multimapSideInputStateKey =
      stateRequest.getStateKey().getMultimapSideInput();
  SideInputReader sideInputReader =
      ptransformIdToSideInputReader.get(multimapSideInputStateKey.getTransformId());
  checkState(
      sideInputReader != null,
      String.format("Unknown PTransform '%s'", multimapSideInputStateKey.getTransformId()));
  PCollectionView<Materializations.MultimapView<Object, Object>> view =
      (PCollectionView<Materializations.MultimapView<Object, Object>>)
          ptransformIdToSideInputIdToPCollectionView.get(
              multimapSideInputStateKey.getTransformId(), multimapSideInputStateKey.getSideInputId());
  checkState(
      view != null,
      String.format(
          "Unknown side input '%s' on PTransform '%s'",
          multimapSideInputStateKey.getSideInputId(), multimapSideInputStateKey.getTransformId()));
  checkState(
      Materializations.MULTIMAP_MATERIALIZATION_URN.equals(view.getViewFn().getMaterialization().getUrn()),
      String.format(
          "Unknown materialization for side input '%s' on PTransform '%s' with urn '%s'",
          multimapSideInputStateKey.getSideInputId(), multimapSideInputStateKey.getTransformId(),
          view.getViewFn().getMaterialization().getUrn()));
  checkState(
      view.getCoderInternal() instanceof KvCoder,
      String.format(
          "Materialization of side input '%s' on PTransform '%s' expects %s but received %s.",
          multimapSideInputStateKey.getSideInputId(), multimapSideInputStateKey.getTransformId(),
          KvCoder.class.getSimpleName(), view.getCoderInternal().getClass().getSimpleName()));
  Coder<Object> keyCoder = ((KvCoder) view.getCoderInternal()).getKeyCoder();
  Coder<Object> valueCoder = ((KvCoder) view.getCoderInternal()).getValueCoder();
  BoundedWindow window;
  try {
    // TODO: Use EncodedWindow instead of decoding the window.
    window = view.getWindowingStrategyInternal().getWindowFn().windowCoder()
        .decode(multimapSideInputStateKey.getWindow().newInput());
  } catch (IOException e) {
    throw new IllegalArgumentException(
        String.format(
            "Unable to decode window for side input '%s' on PTransform '%s'.",
            multimapSideInputStateKey.getSideInputId(), multimapSideInputStateKey.getTransformId()),
        e);
  }
  Object userKey;
  try {
    // TODO: Use the encoded representation of the key.
    userKey = keyCoder.decode(multimapSideInputStateKey.getKey().newInput());
  } catch (IOException e) {
    throw new IllegalArgumentException(
        String.format(
            "Unable to decode user key for side input '%s' on PTransform '%s'.",
            multimapSideInputStateKey.getSideInputId(), multimapSideInputStateKey.getTransformId()),
        e);
  }
  Materializations.MultimapView<Object, Object> sideInput = sideInputReader.get(view, window);
  Iterable<Object> values = sideInput.get(userKey);
  try {
    // TODO: Use the raw value so we don't go through a decode/encode cycle for no reason.
    return CompletableFuture.completedFuture(
        StateResponse.newBuilder()
            .setGet(StateGetResponse.newBuilder().setData(encodeAndConcat(values, valueCoder))));
  } catch (IOException e) {
    throw new IllegalArgumentException(
        String.format(
            "Unable to encode values for side input '%s' on PTransform '%s'.",
            multimapSideInputStateKey.getSideInputId(), multimapSideInputStateKey.getTransformId()),
        e);
  }
}
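encodeAndConcat is a helper on this class. As a rough sketch of the idea only (an assumption, not the actual implementation; Beam vendors its protobuf classes, so the real ByteString import differs), each value is run through the value coder back-to-back into a single byte string, which is what the StateGetResponse carries:

import java.io.IOException;
import org.apache.beam.sdk.coders.Coder;
import com.google.protobuf.ByteString;

public class EncodeAndConcatSketch {
  // Hypothetical stand-in for the class's encodeAndConcat helper: encode each
  // element back-to-back so the Fn API state response carries one byte blob.
  static <T> ByteString encodeAndConcat(Iterable<T> values, Coder<T> valueCoder) throws IOException {
    ByteString.Output output = ByteString.newOutput();
    if (values != null) {
      for (T value : values) {
        // Elements must be self-delimiting for the SDK side to split them apart.
        valueCoder.encode(value, output);
      }
    }
    return output.toByteString();
  }
}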
Use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.
The class FlinkBatchPortablePipelineTranslator, method translateExecutableStage().
private static <InputT> void translateExecutableStage(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  // TODO: Fail on splittable DoFns.
  // TODO: Special-case single outputs to avoid multiplexing PCollections.
  RunnerApi.Components components = pipeline.getComponents();
  Map<String, String> outputs = transform.getTransform().getOutputsMap();
  // Mapping from PCollection id to coder tag id.
  BiMap<String, Integer> outputMap = createOutputMap(outputs.values());
  // Collect all output Coders and create a UnionCoder for our tagged outputs.
  List<Coder<?>> unionCoders = Lists.newArrayList();
  // Enforce tuple tag sorting by union tag index.
  Map<String, Coder<WindowedValue<?>>> outputCoders = Maps.newHashMap();
  for (String collectionId : new TreeMap<>(outputMap.inverse()).values()) {
    PCollectionNode collectionNode =
        PipelineNode.pCollection(collectionId, components.getPcollectionsOrThrow(collectionId));
    Coder<WindowedValue<?>> coder;
    try {
      coder = (Coder) WireCoders.instantiateRunnerWireCoder(collectionNode, components);
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    outputCoders.put(collectionId, coder);
    unionCoders.add(coder);
  }
  UnionCoder unionCoder = UnionCoder.of(unionCoders);
  TypeInformation<RawUnionValue> typeInformation =
      new CoderTypeInformation<>(unionCoder, context.getPipelineOptions());
  RunnerApi.ExecutableStagePayload stagePayload;
  try {
    stagePayload =
        RunnerApi.ExecutableStagePayload.parseFrom(transform.getTransform().getSpec().getPayload());
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  String inputPCollectionId = stagePayload.getInput();
  Coder<WindowedValue<InputT>> windowedInputCoder = instantiateCoder(inputPCollectionId, components);
  DataSet<WindowedValue<InputT>> inputDataSet = context.getDataSetOrThrow(inputPCollectionId);
  final FlinkExecutableStageFunction<InputT> function =
      new FlinkExecutableStageFunction<>(
          transform.getTransform().getUniqueName(),
          context.getPipelineOptions(),
          stagePayload,
          context.getJobInfo(),
          outputMap,
          FlinkExecutableStageContextFactory.getInstance(),
          getWindowingStrategy(inputPCollectionId, components).getWindowFn().windowCoder(),
          windowedInputCoder);
  final String operatorName = generateNameFromStagePayload(stagePayload);
  final SingleInputUdfOperator taggedDataset;
  if (stagePayload.getUserStatesCount() > 0 || stagePayload.getTimersCount() > 0) {
    Coder valueCoder = ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder();
    // Stateful stages are only allowed with KV input, so that we can group on the key.
    if (!(valueCoder instanceof KvCoder)) {
      throw new IllegalStateException(
          String.format(
              Locale.ENGLISH,
              "The element coder for stateful DoFn '%s' must be KvCoder but is: %s",
              inputPCollectionId, valueCoder.getClass().getSimpleName()));
    }
    Coder keyCoder = ((KvCoder) valueCoder).getKeyCoder();
    Grouping<WindowedValue<InputT>> groupedInput =
        inputDataSet.groupBy(new KvKeySelector<>(keyCoder));
    boolean requiresTimeSortedInput = requiresTimeSortedInput(stagePayload, false);
    if (requiresTimeSortedInput) {
      groupedInput =
          ((UnsortedGrouping<WindowedValue<InputT>>) groupedInput)
              .sortGroup(WindowedValue::getTimestamp, Order.ASCENDING);
    }
    taggedDataset = new GroupReduceOperator<>(groupedInput, typeInformation, function, operatorName);
  } else {
    taggedDataset = new MapPartitionOperator<>(inputDataSet, typeInformation, function, operatorName);
  }
  for (SideInputId sideInputId : stagePayload.getSideInputsList()) {
    String collectionId =
        stagePayload
            .getComponents()
            .getTransformsOrThrow(sideInputId.getTransformId())
            .getInputsOrThrow(sideInputId.getLocalName());
    // Register under the global PCollection name. Only ExecutableStageFunction needs to know the
    // mapping from local name to global name and how to translate the broadcast data to a state
    // API view.
    taggedDataset.withBroadcastSet(context.getDataSetOrThrow(collectionId), collectionId);
  }
  for (String collectionId : outputs.values()) {
    pruneOutput(
        taggedDataset, context, outputMap.get(collectionId), outputCoders.get(collectionId), collectionId);
  }
  if (outputs.isEmpty()) {
    // NOTE: After pipeline translation, we traverse the set of unconsumed PCollections and add a
    // no-op sink to each to make sure they are materialized by Flink. However, some SDK-executed
    // stages have no runner-visible output after fusion. We handle this case by adding a sink
    // here.
    taggedDataset.output(new DiscardingOutputFormat<>()).name("DiscardingOutput");
  }
}
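The KvKeySelector above is why this stage insists on a KvCoder: Flink groups by the key's encoded bytes rather than by Java equality, and that requires a key coder to exist. A minimal sketch of the underlying idea, assuming CoderUtils from the Beam SDK (the helper and its name are illustrative, not the runner's actual selector class):

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.CoderException;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;

public class EncodedKeySketch {
  // Hypothetical helper: deterministic grouping works on the key's encoded
  // form, so a KvCoder (and thus a key coder) has to be available.
  static <K, V> byte[] encodedKey(WindowedValue<KV<K, V>> element, Coder<K> keyCoder)
      throws CoderException {
    return CoderUtils.encodeToByteArray(keyCoder, element.getValue().getKey());
  }
}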
Use of org.apache.beam.sdk.coders.KvCoder in project beam by apache.
The class FlinkBatchPortablePipelineTranslator, method translateGroupByKey().
private static <K, V> void translateGroupByKey(
    PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
  RunnerApi.Components components = pipeline.getComponents();
  String inputPCollectionId =
      Iterables.getOnlyElement(transform.getTransform().getInputsMap().values());
  PCollectionNode inputCollection =
      PipelineNode.pCollection(inputPCollectionId, components.getPcollectionsOrThrow(inputPCollectionId));
  DataSet<WindowedValue<KV<K, V>>> inputDataSet = context.getDataSetOrThrow(inputPCollectionId);
  RunnerApi.WindowingStrategy windowingStrategyProto =
      pipeline
          .getComponents()
          .getWindowingStrategiesOrThrow(
              pipeline.getComponents().getPcollectionsOrThrow(inputPCollectionId).getWindowingStrategyId());
  RehydratedComponents rehydratedComponents =
      RehydratedComponents.forComponents(pipeline.getComponents());
  WindowingStrategy<Object, BoundedWindow> windowingStrategy;
  try {
    windowingStrategy =
        (WindowingStrategy<Object, BoundedWindow>)
            WindowingStrategyTranslation.fromProto(windowingStrategyProto, rehydratedComponents);
  } catch (InvalidProtocolBufferException e) {
    throw new IllegalStateException(
        String.format("Unable to hydrate GroupByKey windowing strategy %s.", windowingStrategyProto), e);
  }
  WindowedValueCoder<KV<K, V>> inputCoder;
  try {
    inputCoder =
        (WindowedValueCoder) WireCoders.instantiateRunnerWireCoder(inputCollection, pipeline.getComponents());
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  KvCoder<K, V> inputElementCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Concatenate<V> combineFn = new Concatenate<>();
  Coder<List<V>> accumulatorCoder =
      combineFn.getAccumulatorCoder(CoderRegistry.createDefault(), inputElementCoder.getValueCoder());
  Coder<WindowedValue<KV<K, List<V>>>> outputCoder =
      WindowedValue.getFullCoder(
          KvCoder.of(inputElementCoder.getKeyCoder(), accumulatorCoder),
          windowingStrategy.getWindowFn().windowCoder());
  TypeInformation<WindowedValue<KV<K, List<V>>>> partialReduceTypeInfo =
      new CoderTypeInformation<>(outputCoder, context.getPipelineOptions());
  Grouping<WindowedValue<KV<K, V>>> inputGrouping =
      inputDataSet.groupBy(new KvKeySelector<>(inputElementCoder.getKeyCoder()));
  FlinkPartialReduceFunction<K, V, List<V>, ?> partialReduceFunction =
      new FlinkPartialReduceFunction<>(
          combineFn, windowingStrategy, Collections.emptyMap(), context.getPipelineOptions());
  FlinkReduceFunction<K, List<V>, List<V>, ?> reduceFunction =
      new FlinkReduceFunction<>(
          combineFn, windowingStrategy, Collections.emptyMap(), context.getPipelineOptions());
  // Partially GroupReduce the values into the intermediate accumulator format (combine).
  GroupCombineOperator<WindowedValue<KV<K, V>>, WindowedValue<KV<K, List<V>>>> groupCombine =
      new GroupCombineOperator<>(
          inputGrouping,
          partialReduceTypeInfo,
          partialReduceFunction,
          "GroupCombine: " + transform.getTransform().getUniqueName());
  Grouping<WindowedValue<KV<K, List<V>>>> intermediateGrouping =
      groupCombine.groupBy(new KvKeySelector<>(inputElementCoder.getKeyCoder()));
  // Fully reduce the values and produce the output format.
  GroupReduceOperator<WindowedValue<KV<K, List<V>>>, WindowedValue<KV<K, List<V>>>> outputDataSet =
      new GroupReduceOperator<>(
          intermediateGrouping, partialReduceTypeInfo, reduceFunction, transform.getTransform().getUniqueName());
  context.addDataSet(
      Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values()), outputDataSet);
}
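Spelled out with concrete element types, the coder composition above looks as follows; a small sketch assuming String keys and Integer values, with the global window coder standing in for windowingStrategy.getWindowFn().windowCoder() (class name and types are illustrative):

import java.util.List;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.ListCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;

public class GbkCoderSketch {
  public static void main(String[] args) {
    // Input elements are KV<String, Integer>; Concatenate accumulates the
    // values into List<Integer>, so the accumulator coder is a ListCoder.
    KvCoder<String, Integer> inputElementCoder =
        KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of());
    Coder<List<Integer>> accumulatorCoder = ListCoder.of(inputElementCoder.getValueCoder());
    // Same shape as outputCoder in the method above.
    Coder<WindowedValue<KV<String, List<Integer>>>> outputCoder =
        WindowedValue.getFullCoder(
            KvCoder.of(inputElementCoder.getKeyCoder(), accumulatorCoder),
            GlobalWindow.Coder.INSTANCE);
    System.out.println(outputCoder);
  }
}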