Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class SparkBatchPortablePipelineTranslator, method translateGroupByKey.
private static <K, V> void translateGroupByKey(
    PTransformNode transformNode, RunnerApi.Pipeline pipeline, SparkTranslationContext context) {
  RunnerApi.Components components = pipeline.getComponents();
  String inputId = getInputId(transformNode);
  Dataset inputDataset = context.popDataset(inputId);
  JavaRDD<WindowedValue<KV<K, V>>> inputRdd = ((BoundedDataset<KV<K, V>>) inputDataset).getRDD();
  WindowedValueCoder<KV<K, V>> inputCoder = getWindowedValueCoder(inputId, components);
  KvCoder<K, V> inputKvCoder = (KvCoder<K, V>) inputCoder.getValueCoder();
  Coder<K> inputKeyCoder = inputKvCoder.getKeyCoder();
  Coder<V> inputValueCoder = inputKvCoder.getValueCoder();
  WindowingStrategy windowingStrategy = getWindowingStrategy(inputId, components);
  WindowFn<Object, BoundedWindow> windowFn = windowingStrategy.getWindowFn();
  WindowedValue.WindowedValueCoder<V> wvCoder =
      WindowedValue.FullWindowedValueCoder.of(inputValueCoder, windowFn.windowCoder());
  JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKeyAndWindow;
  Partitioner partitioner = getPartitioner(context);
  // As this is batch, we can ignore triggering and allowed lateness parameters.
  if (windowingStrategy.getWindowFn().equals(new GlobalWindows())
      && windowingStrategy.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW)) {
    // we can drop the windows and recover them later
    groupedByKeyAndWindow =
        GroupNonMergingWindowsFunctions.groupByKeyInGlobalWindow(
            inputRdd, inputKeyCoder, inputValueCoder, partitioner);
  } else if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
    // we can have a memory sensitive translation for non-merging windows
    groupedByKeyAndWindow =
        GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
            inputRdd, inputKeyCoder, inputValueCoder, windowingStrategy, partitioner);
  } else {
    JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
        GroupCombineFunctions.groupByKeyOnly(inputRdd, inputKeyCoder, wvCoder, partitioner);
    // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
    groupedByKeyAndWindow =
        groupedByKeyOnly.flatMap(
            new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                windowingStrategy,
                new TranslationUtils.InMemoryStateInternalsFactory<>(),
                SystemReduceFn.buffering(inputValueCoder),
                context.serializablePipelineOptions));
  }
  context.pushDataset(getOutputId(transformNode), new BoundedDataset<>(groupedByKeyAndWindow));
}
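The branch taken above depends only on the input's windowing strategy. Here is a minimal sketch (hypothetical class GbkBranchSketch, public SDK API only) of which path common strategies would follow; the isNonMerging() test is just a stand-in for the runner-internal GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow check, which applies further conditions.

import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Sessions;
import org.apache.beam.sdk.transforms.windowing.TimestampCombiner;
import org.apache.beam.sdk.values.WindowingStrategy;
import org.joda.time.Duration;

public class GbkBranchSketch {
  static String branchFor(WindowingStrategy<?, ?> ws) {
    if (ws.getWindowFn().equals(new GlobalWindows())
        && ws.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW)) {
      return "groupByKeyInGlobalWindow"; // windows dropped and recovered later
    } else if (ws.getWindowFn().isNonMerging()) { // stand-in for the real eligibility check
      return "groupByKeyAndWindow"; // memory-sensitive non-merging path
    } else {
      return "groupByKeyOnly + GroupAlsoByWindow"; // merging windows, in-memory state
    }
  }

  public static void main(String[] args) {
    // the default strategy (global windows, END_OF_WINDOW) takes the fast path
    System.out.println(branchFor(WindowingStrategy.of(new GlobalWindows())));
    // fixed windows never merge, so they group by key and window
    System.out.println(branchFor(WindowingStrategy.of(FixedWindows.of(Duration.standardMinutes(1)))));
    // sessions merge, so they fall back to GroupAlsoByWindow
    System.out.println(branchFor(WindowingStrategy.of(Sessions.withGapDuration(Duration.standardMinutes(1)))));
  }
}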
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class ParDoTranslatorBatch, method translateTransform.
@Override
public void translateTransform(
    PTransform<PCollection<InputT>, PCollectionTuple> transform, AbstractTranslationContext context) {
  String stepName = context.getCurrentTransform().getFullName();

  // Check for not supported advanced features
  // TODO: add support of Splittable DoFn
  DoFn<InputT, OutputT> doFn = getDoFn(context);
  checkState(
      !DoFnSignatures.isSplittable(doFn),
      "Not expected to directly translate splittable DoFn, should have been overridden: %s",
      doFn);
  // TODO: add support of states and timers
  checkState(
      !DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");
  checkState(
      !DoFnSignatures.requiresTimeSortedInput(doFn),
      "@RequiresTimeSortedInput is not supported for the moment");
  DoFnSchemaInformation doFnSchemaInformation =
      ParDoTranslation.getSchemaInformation(context.getCurrentTransform());

  // Init main variables
  PValue input = context.getInput();
  Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
  Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
  TupleTag<?> mainOutputTag = getTupleTag(context);
  List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
  WindowingStrategy<?, ?> windowingStrategy =
      ((PCollection<InputT>) input).getWindowingStrategy();
  Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
  Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();

  // construct a map from side input to WindowingStrategy so that
  // the DoFn runner can map main-input windows to side input windows
  List<PCollectionView<?>> sideInputs = getSideInputs(context);
  Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
  for (PCollectionView<?> sideInput : sideInputs) {
    sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
  }
  SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);

  Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
  MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();

  List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
  for (TupleTag<?> tag : outputTags) {
    if (!tag.equals(mainOutputTag)) {
      additionalOutputTags.add(tag);
    }
  }

  Map<String, PCollectionView<?>> sideInputMapping =
      ParDoTranslation.getSideInputMapping(context.getCurrentTransform());

  @SuppressWarnings("unchecked")
  DoFnFunction<InputT, OutputT> doFnWrapper =
      new DoFnFunction(
          metricsAccum,
          stepName,
          doFn,
          windowingStrategy,
          sideInputStrategies,
          context.getSerializableOptions(),
          additionalOutputTags,
          mainOutputTag,
          inputCoder,
          outputCoderMap,
          broadcastStateData,
          doFnSchemaInformation,
          sideInputMapping);

  MultiOutputCoder multipleOutputCoder =
      MultiOutputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
  Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs =
      inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
  if (outputs.entrySet().size() > 1) {
    allOutputs.persist();
    for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
      pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
    }
  } else {
    Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
    Coder<WindowedValue<?>> windowedValueCoder =
        (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
    Dataset<WindowedValue<?>> outputDataset =
        allOutputs.map(
            (MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2,
            EncoderHelpers.fromBeamCoder(windowedValueCoder));
    context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
  }
}
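For context, the tag-splitting above serves multi-output ParDos authored with the public SDK API. A minimal sketch of such a pipeline (the names evens/odds are illustrative; running it needs a runner such as the DirectRunner on the classpath):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class MultiOutputSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();
    // one main output tag plus one additional tag, mirroring additionalOutputTags above
    TupleTag<Integer> evens = new TupleTag<Integer>() {};
    TupleTag<Integer> odds = new TupleTag<Integer>() {};
    PCollectionTuple outputs =
        p.apply(Create.of(1, 2, 3, 4))
            .apply(
                ParDo.of(
                        new DoFn<Integer, Integer>() {
                          @ProcessElement
                          public void process(ProcessContext c) {
                            if (c.element() % 2 == 0) {
                              c.output(c.element()); // main output
                            } else {
                              c.output(odds, c.element()); // additional output
                            }
                          }
                        })
                    .withOutputTags(evens, TupleTagList.of(odds)));
    p.run().waitUntilFinish();
  }
}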
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class TransformTranslator, method groupByKey.
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<KV<K, V>>> inRDD =
          ((BoundedDataset<KV<K, V>>) context.borrowDataset(transform)).getRDD();
      final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> windowingStrategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();

      // --- coders.
      final Coder<K> keyCoder = coder.getKeyCoder();
      final WindowedValue.WindowedValueCoder<V> wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());

      JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKey;
      Partitioner partitioner = getPartitioner(context);
      // As this is batch, we can ignore triggering and allowed lateness parameters.
      if (windowingStrategy.getWindowFn().equals(new GlobalWindows())
          && windowingStrategy.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW)) {
        // we can drop the windows and recover them later
        groupedByKey =
            GroupNonMergingWindowsFunctions.groupByKeyInGlobalWindow(
                inRDD, keyCoder, coder.getValueCoder(), partitioner);
      } else if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
        // we can have a memory sensitive translation for non-merging windows
        groupedByKey =
            GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
                inRDD, keyCoder, coder.getValueCoder(), windowingStrategy, partitioner);
      } else {
        // --- group by key only.
        JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
            GroupCombineFunctions.groupByKeyOnly(inRDD, keyCoder, wvCoder, partitioner);
        // --- now group also by window.
        // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
        groupedByKey =
            groupedByKeyOnly.flatMap(
                new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                    windowingStrategy,
                    new TranslationUtils.InMemoryStateInternalsFactory<>(),
                    SystemReduceFn.buffering(coder.getValueCoder()),
                    context.getSerializableOptions()));
      }
      context.putDataset(transform, new BoundedDataset<>(groupedByKey));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
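The wvCoder assembled above pairs the value coder with the window coder so each value can carry its window through the key-only grouping. A small sketch of the same assembly with concrete coders (String/Integer chosen only for illustration; WindowedValue lives in org.apache.beam.sdk.util in the Beam versions these translators come from):

import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;
import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.apache.beam.sdk.util.WindowedValue;

public class WvCoderSketch {
  public static void main(String[] args) {
    KvCoder<String, Integer> kvCoder = KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of());
    // same shape as in the translator: value coder paired with the window coder
    WindowedValue.WindowedValueCoder<Integer> wvCoder =
        WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), IntervalWindow.getCoder());
    System.out.println(wvCoder);
  }
}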
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class SparkCombineFnTest, method testSessionCombineFn.
@Test
public void testSessionCombineFn() throws Exception {
  WindowingStrategy<Object, IntervalWindow> strategy =
      WindowingStrategy.of(Sessions.withGapDuration(Duration.millis(1000)));
  SparkCombineFn<KV<String, Integer>, Integer, Long, Long> sparkCombineFn =
      SparkCombineFn.keyed(combineFn, opts, Collections.emptyMap(), strategy);
  Instant now = Instant.ofEpochMilli(0);
  WindowedValue<KV<String, Integer>> first =
      input("key", 1, now.plus(Duration.millis(5000)), strategy.getWindowFn());
  WindowedValue<KV<String, Integer>> second =
      input("key", 2, now.plus(Duration.millis(1000)), strategy.getWindowFn());
  WindowedValue<KV<String, Integer>> third =
      input("key", 3, now.plus(Duration.millis(500)), strategy.getWindowFn());
  SparkCombineFn.WindowedAccumulator<KV<String, Integer>, Integer, Long, ?> c1 =
      sparkCombineFn.createCombiner(first);
  SparkCombineFn.WindowedAccumulator<KV<String, Integer>, Integer, Long, ?> c2 =
      sparkCombineFn.createCombiner(third);
  sparkCombineFn.mergeValue(c1, second);
  SparkCombineFn.WindowedAccumulator<KV<String, Integer>, Integer, Long, ?> c3 =
      sparkCombineFn.mergeCombiners(c1, c2);
  Iterable<WindowedValue<Long>> output = sparkCombineFn.extractOutput(c3);
  assertEquals(2, Iterables.size(output));
  List<String> format =
      StreamSupport.stream(output.spliterator(), false)
          .map(val -> val.getValue() + ":" + val.getTimestamp().getMillis())
          .collect(Collectors.toList());
  assertEquals(Lists.newArrayList("5:1999", "1:5999"), format);
}
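The expected strings follow from session-window arithmetic: with a 1000 ms gap each element is first assigned the window [ts, ts + 1000), overlapping windows merge, and END_OF_WINDOW stamps outputs with maxTimestamp(), i.e. window end minus 1 ms. A sketch of that arithmetic using the SDK's IntervalWindow:

import org.apache.beam.sdk.transforms.windowing.IntervalWindow;
import org.joda.time.Instant;

public class SessionMathSketch {
  public static void main(String[] args) {
    // elements at 500 ms (value 3) and 1000 ms (value 2): their windows overlap, so they merge
    IntervalWindow w3 = new IntervalWindow(new Instant(500), new Instant(1500));
    IntervalWindow w2 = new IntervalWindow(new Instant(1000), new Instant(2000));
    IntervalWindow merged = w3.span(w2); // [500, 2000)
    System.out.println(merged.maxTimestamp().getMillis()); // 1999 -> output "5:1999" (3 + 2 = 5)

    // the element at 5000 ms (value 1) stays in its own session
    IntervalWindow w1 = new IntervalWindow(new Instant(5000), new Instant(6000));
    System.out.println(w1.maxTimestamp().getMillis()); // 5999 -> output "1:5999"
  }
}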
Use of org.apache.beam.sdk.values.WindowingStrategy in project beam by apache.
The class PCollectionViewTranslatorBatch, method translateNode.
@Override
public void translateNode(
    View.CreatePCollectionView<ElemT, ViewT> transform, Twister2BatchTranslationContext context) {
  BatchTSet<WindowedValue<ElemT>> inputDataSet =
      context.getInputDataSet(context.getInput(transform));
  @SuppressWarnings("unchecked")
  AppliedPTransform<
          PCollection<ElemT>,
          PCollection<ElemT>,
          PTransform<PCollection<ElemT>, PCollection<ElemT>>>
      application =
          (AppliedPTransform<
                  PCollection<ElemT>,
                  PCollection<ElemT>,
                  PTransform<PCollection<ElemT>, PCollection<ElemT>>>)
              context.getCurrentTransform();
  org.apache.beam.sdk.values.PCollectionView<ViewT> input;
  PCollection<ElemT> inputPCol = context.getInput(transform);
  final Coder coder = inputPCol.getCoder();
  WindowingStrategy windowingStrategy = inputPCol.getWindowingStrategy();
  WindowFn windowFn = windowingStrategy.getWindowFn();
  try {
    input = CreatePCollectionViewTranslation.getView(application);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  switch (input.getViewFn().getMaterialization().getUrn()) {
    case Materializations.MULTIMAP_MATERIALIZATION_URN:
      KvCoder kvCoder = (KvCoder<?, ?>) coder;
      final Coder keyCoder = kvCoder.getKeyCoder();
      final WindowedValue.WindowedValueCoder kvwvCoder =
          WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), windowFn.windowCoder());
      BatchTSet<WindowedValue<ElemT>> multimapMaterialization =
          inputDataSet
              .direct()
              .map(new MapToTupleFunction<>(keyCoder, kvwvCoder))
              .allGather()
              .map(new ByteToWindowFunctionPrimitive(keyCoder, kvwvCoder));
      context.setSideInputDataSet(input.getTagInternal().getId(), multimapMaterialization);
      break;
    case Materializations.ITERABLE_MATERIALIZATION_URN:
      final WindowedValue.WindowedValueCoder wvCoder =
          WindowedValue.FullWindowedValueCoder.of(coder, windowFn.windowCoder());
      BatchTSet<WindowedValue<ElemT>> iterableMaterialization =
          inputDataSet
              .direct()
              .map(new ElemToBytesFunction<>(wvCoder))
              .allGather()
              .map(new ByteToElemFunction(wvCoder));
      context.setSideInputDataSet(input.getTagInternal().getId(), iterableMaterialization);
      break;
    default:
      throw new UnsupportedOperationException(
          "Unknown side input materialization " + input.getViewFn().getMaterialization().getUrn());
  }
}
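At the SDK level, the URN switched on above comes from the view's ViewFn. Which materialization a given View transform uses has changed across Beam versions, so this hedged sketch inspects the URN rather than asserting a particular constant:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class ViewUrnSketch {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create();
    PCollection<Integer> ints = p.apply(Create.of(1, 2, 3));
    PCollectionView<Iterable<Integer>> view = ints.apply(View.asIterable());
    // the same call the translator switches on
    System.out.println(view.getViewFn().getMaterialization().getUrn());
  }
}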