Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
The class SparkPCollectionView, method createBroadcastHelper:
private SideInputBroadcast createBroadcastHelper(
    PCollectionView<?> view, JavaSparkContext context) {
  Tuple2<byte[], Coder<Iterable<WindowedValue<?>>>> tuple2 = pviews.get(view);
  SideInputBroadcast helper = SideInputBroadcast.create(tuple2._1, tuple2._2);
  String pCollectionName =
      view.getPCollection() != null ? view.getPCollection().getName() : "UNKNOWN";
  LOG.debug(
      "Broadcasting [size={}B] view {} from pCollection {}",
      helper.getBroadcastSizeEstimate(),
      view,
      pCollectionName);
  helper.broadcast(context);
  broadcastHelperMap.put(view, helper);
  return helper;
}
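The helper is built from a pre-encoded byte array plus the coder needed to decode it on the workers. Below is a minimal sketch of how such a (byte[], Coder) pair could be produced with Beam's CoderUtils; the values and coders are illustrative, not the runner's actual materialization logic:

  // Illustrative sketch only: encode an iterable of windowed values to bytes,
  // yielding the (byte[], Coder) pair that SideInputBroadcast.create consumes.
  static Tuple2<byte[], Coder<Iterable<WindowedValue<Integer>>>> encodeForBroadcast()
      throws CoderException {
    Coder<Iterable<WindowedValue<Integer>>> coder =
        IterableCoder.of(WindowedValue.getValueOnlyCoder(VarIntCoder.of()));
    Iterable<WindowedValue<Integer>> values =
        Collections.singletonList(WindowedValue.valueInGlobalWindow(42));
    // CoderUtils serializes the whole iterable into a byte array.
    byte[] bytes = CoderUtils.encodeToByteArray(coder, values);
    // bytes and coder play the roles of tuple2._1 and tuple2._2 above.
    return new Tuple2<>(bytes, coder);
  }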
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
The class SparkStreamingPortablePipelineTranslator, method translateExecutableStage:
private static <InputT, OutputT, SideInputT> void translateExecutableStage(
    PTransformNode transformNode,
    RunnerApi.Pipeline pipeline,
    SparkStreamingTranslationContext context) {
  RunnerApi.ExecutableStagePayload stagePayload;
  try {
    stagePayload =
        RunnerApi.ExecutableStagePayload.parseFrom(
            transformNode.getTransform().getSpec().getPayload());
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  String inputPCollectionId = stagePayload.getInput();
  UnboundedDataset<InputT> inputDataset =
      (UnboundedDataset<InputT>) context.popDataset(inputPCollectionId);
  List<Integer> streamSources = inputDataset.getStreamSources();
  JavaDStream<WindowedValue<InputT>> inputDStream = inputDataset.getDStream();
  Map<String, String> outputs = transformNode.getTransform().getOutputsMap();
  BiMap<String, Integer> outputMap = createOutputMap(outputs.values());
  RunnerApi.Components components = pipeline.getComponents();
  Coder windowCoder =
      getWindowingStrategy(inputPCollectionId, components).getWindowFn().windowCoder();
  // TODO (BEAM-10712): handle side inputs.
  if (stagePayload.getSideInputsCount() > 0) {
    throw new UnsupportedOperationException(
        "Side inputs to executable stage are currently unsupported.");
  }
  ImmutableMap<String, Tuple2<Broadcast<List<byte[]>>, WindowedValue.WindowedValueCoder<SideInputT>>>
      broadcastVariables = ImmutableMap.copyOf(new HashMap<>());
  SparkExecutableStageFunction<InputT, SideInputT> function =
      new SparkExecutableStageFunction<>(
          context.getSerializableOptions(),
          stagePayload,
          context.jobInfo,
          outputMap,
          SparkExecutableStageContextFactory.getInstance(),
          broadcastVariables,
          MetricsAccumulator.getInstance(),
          windowCoder);
  JavaDStream<RawUnionValue> staged = inputDStream.mapPartitions(function);
  String intermediateId = getExecutableStageIntermediateId(transformNode);
  context.pushDataset(
      intermediateId,
      new Dataset() {
        @Override
        public void cache(String storageLevel, Coder<?> coder) {
          StorageLevel level = StorageLevel.fromString(storageLevel);
          staged.persist(level);
        }

        @Override
        public void action() {
          // Empty function to force computation of RDD.
          staged.foreachRDD(TranslationUtils.emptyVoidFunction());
        }

        @Override
        public void setName(String name) {
          // ignore
        }
      });
  // Pop dataset to mark DStream as used.
  context.popDataset(intermediateId);
  for (String outputId : outputs.values()) {
    JavaDStream<WindowedValue<OutputT>> outStream =
        staged.flatMap(
            new SparkExecutableStageExtractionFunction<>(outputMap.get(outputId)));
    context.pushDataset(outputId, new UnboundedDataset<>(outStream, streamSources));
  }
  // Add a sink to ensure the stage is executed.
  if (outputs.isEmpty()) {
    JavaDStream<WindowedValue<OutputT>> outStream =
        staged.flatMap((rawUnionValue) -> Collections.emptyIterator());
    context.pushDataset(
        String.format("EmptyOutputSink_%d", context.nextSinkId()),
        new UnboundedDataset<>(outStream, streamSources));
  }
}
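Each RawUnionValue carries an integer union tag; outputMap assigns one tag per output PCollection id, and the extraction function keeps only the values whose tag matches. A hedged sketch of that filtering idea follows; extractForTag is a hypothetical helper, not the actual SparkExecutableStageExtractionFunction:

  // Hypothetical sketch of tag-based extraction from a union stream.
  static <T> Iterator<WindowedValue<T>> extractForTag(
      Iterator<RawUnionValue> union, int unionTag) {
    List<WindowedValue<T>> out = new ArrayList<>();
    while (union.hasNext()) {
      RawUnionValue v = union.next();
      if (v.getUnionTag() == unionTag) {
        // The stage emits already-windowed values; the unchecked cast
        // mirrors the casts used throughout the translator.
        out.add((WindowedValue<T>) v.getValue());
      }
    }
    return out.iterator();
  }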
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
The class ValueAndCoderLazySerializable, method getOrDecode:
public T getOrDecode(Coder<T> coder) {
  if (!(coderOrBytes instanceof Coder)) {
    ByteArrayInputStream bais = new ByteArrayInputStream((byte[]) this.coderOrBytes);
    try {
      value = coder.decode(bais);
    } catch (IOException e) {
      throw new IllegalStateException("Error decoding bytes for coder: " + coder, e);
    }
    this.coderOrBytes = coder;
  }
  return value;
}
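getOrDecode is lazy in one direction: once the bytes are decoded, the coderOrBytes field flips from byte[] to the Coder, so later calls return the cached value. The encoding direction is symmetric; a minimal sketch of what it might look like (a hypothetical method, assuming the coderOrBytes and value fields shaped as above):

  // Hypothetical counterpart: serialize the held value with its coder.
  void encode(OutputStream out) throws IOException {
    if (coderOrBytes instanceof Coder) {
      // Still holding the live value; encode it on demand.
      ((Coder<T>) coderOrBytes).encode(value, out);
    } else {
      // Already raw bytes; write them through unchanged.
      out.write((byte[]) coderOrBytes);
    }
  }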
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
The class StreamingTransformTranslator, method parDo:
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
  return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {
    @Override
    public void evaluate(
        final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
      final DoFn<InputT, OutputT> doFn = transform.getFn();
      checkArgument(
          !DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(),
          "Splittable DoFn not yet supported in streaming mode: %s",
          doFn);
      rejectStateAndTimers(doFn);
      final SerializablePipelineOptions options = context.getSerializableOptions();
      final SparkPCollectionView pviews = context.getPViews();
      final WindowingStrategy<?, ?> windowingStrategy =
          context.getInput(transform).getWindowingStrategy();
      Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
      Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
      @SuppressWarnings("unchecked")
      UnboundedDataset<InputT> unboundedDataset =
          (UnboundedDataset<InputT>) context.borrowDataset(transform);
      JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
      final DoFnSchemaInformation doFnSchemaInformation =
          ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
      final Map<String, PCollectionView<?>> sideInputMapping =
          ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
      final String stepName = context.getCurrentTransform().getFullName();
      JavaPairDStream<TupleTag<?>, WindowedValue<?>> all =
          dStream.transformToPair(
              rdd -> {
                final MetricsContainerStepMapAccumulator metricsAccum =
                    MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>>
                    sideInputs =
                        TranslationUtils.getSideInputs(
                            transform.getSideInputs().values(),
                            JavaSparkContext.fromSparkContext(rdd.context()),
                            pviews);
                return rdd.mapPartitionsToPair(
                    new MultiDoFnFunction<>(
                        metricsAccum,
                        stepName,
                        doFn,
                        options,
                        transform.getMainOutputTag(),
                        transform.getAdditionalOutputTags().getAll(),
                        inputCoder,
                        outputCoders,
                        sideInputs,
                        windowingStrategy,
                        false,
                        doFnSchemaInformation,
                        sideInputMapping));
              });
      Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
      if (outputs.size() > 1) {
        // Caching can cause serialization; we need to encode to bytes first.
        // More details in https://issues.apache.org/jira/browse/BEAM-2669
        Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap =
            TranslationUtils.getTupleTagCoders(outputs);
        all =
            all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap))
                .cache()
                .mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
      }
      for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
        @SuppressWarnings("unchecked")
        JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered =
            all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
        // Object is the best we can do since different outputs can have different tags.
        @SuppressWarnings("unchecked")
        JavaDStream<WindowedValue<Object>> values =
            (JavaDStream<WindowedValue<Object>>)
                (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
        context.putDataset(
            output.getValue(),
            new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
      }
    }

    @Override
    public String toNativeString() {
      return "mapPartitions(new <fn>())";
    }
  };
}
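The encode-then-cache-then-decode dance exists because Spark may serialize cached partitions, and a WindowedValue is not guaranteed to be Java-serializable, whereas bytes always are. A minimal round-trip sketch of what the pair functions do per element, using CoderUtils with an illustrative coder and value (not TranslationUtils' actual pair functions):

  // Illustrative round-trip: encode before .cache(), decode after.
  static WindowedValue<String> roundTrip() throws CoderException {
    Coder<WindowedValue<String>> coder =
        WindowedValue.getValueOnlyCoder(StringUtf8Coder.of());
    WindowedValue<String> original = WindowedValue.valueInGlobalWindow("hello");
    // This byte form is what Spark actually caches and serializes.
    byte[] cachedForm = CoderUtils.encodeToByteArray(coder, original);
    return CoderUtils.decodeFromByteArray(coder, cachedForm);
  }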
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
The class SparkSideInputReader, method get:
@Override
@Nullable
public <T> T get(PCollectionView<T> view, BoundedWindow window) {
  // --- validate sideInput.
  checkNotNull(view, "The PCollectionView passed to sideInput cannot be null");
  KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>> windowedBroadcastHelper =
      sideInputs.get(view.getTagInternal());
  checkNotNull(windowedBroadcastHelper, "SideInput for view " + view + " is not available.");
  // --- sideInput window.
  final BoundedWindow sideInputWindow = view.getWindowMappingFn().getSideInputWindow(window);
  // --- match the appropriate sideInput window.
  // A tag points to all matching sideInputs, that is, all windows.
  // Now that we've obtained the appropriate sideInputWindow, all that's left is to filter by it.
  Iterable<WindowedValue<?>> availableSideInputs =
      (Iterable<WindowedValue<?>>) windowedBroadcastHelper.getValue().getValue();
  Iterable<?> sideInputForWindow =
      StreamSupport.stream(availableSideInputs.spliterator(), false)
          .filter(
              sideInputCandidate -> {
                if (sideInputCandidate == null) {
                  return false;
                }
                return Iterables.contains(sideInputCandidate.getWindows(), sideInputWindow);
              })
          .map(WindowedValue::getValue)
          .collect(Collectors.toList());
  switch (view.getViewFn().getMaterialization().getUrn()) {
    case Materializations.ITERABLE_MATERIALIZATION_URN:
      {
        ViewFn<IterableView, T> viewFn = (ViewFn<IterableView, T>) view.getViewFn();
        return viewFn.apply(() -> sideInputForWindow);
      }
    case Materializations.MULTIMAP_MATERIALIZATION_URN:
      {
        ViewFn<MultimapView, T> viewFn = (ViewFn<MultimapView, T>) view.getViewFn();
        Coder<?> keyCoder = ((KvCoder<?, ?>) view.getCoderInternal()).getKeyCoder();
        return viewFn.apply(
            InMemoryMultimapSideInputView.fromIterable(keyCoder, (Iterable) sideInputForWindow));
      }
    default:
      throw new IllegalStateException(
          String.format(
              "Unknown side input materialization format requested '%s'",
              view.getViewFn().getMaterialization().getUrn()));
  }
}
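For the multimap case the key coder matters because keys are matched by their coder-determined structural value, not by equals(). A minimal sketch of building such a view directly; the data is illustrative, while InMemoryMultimapSideInputView.fromIterable is the same Beam helper used above:

  // Illustrative: build a multimap view over KVs, keyed via the key coder.
  Coder<String> keyCoder = StringUtf8Coder.of();
  Iterable<KV<String, Integer>> entries =
      Arrays.asList(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3));
  Materializations.MultimapView<String, Integer> view =
      InMemoryMultimapSideInputView.fromIterable(keyCoder, entries);
  // view.get("a") yields [1, 2]; lookups compare keys by encoded form.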