Use of org.apache.beam.sdk.transforms.GroupByKey in the Apache Beam project.
Class StreamingTransformTranslator, method groupByKey.
/**
 * Creates the evaluator that translates a {@link GroupByKey} over an unbounded (streaming)
 * dataset.
 *
 * <p>The translation runs in two stages: each micro-batch RDD is first grouped by key only, and
 * the resulting stream is then handed to {@code SparkGroupAlsoByWindowViaWindowSet} together
 * with the input's windowing strategy. The output dataset keeps the stream-source ids of the
 * input dataset.
 *
 * @param <K> key type of the grouped elements
 * @param <V> value type of the grouped elements
 * @param <W> window type of the input's windowing strategy
 * @return an evaluator for {@code GroupByKey<K, V>}
 */
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      // Input stream plus the ids of the sources that feed it.
      @SuppressWarnings("unchecked")
      UnboundedDataset<KV<K, V>> unboundedInput =
          (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
      List<Integer> sourceIds = unboundedInput.getStreamSources();
      JavaDStream<WindowedValue<KV<K, V>>> elementStream = unboundedInput.getDStream();

      // Coder, runtime context and windowing information taken from the transform's input.
      @SuppressWarnings("unchecked")
      final KvCoder<K, V> kvCoder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      final SparkRuntimeContext sparkRuntimeContext = context.getRuntimeContext();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> strategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> fn = (WindowFn<Object, W>) strategy.getWindowFn();

      // Values are encoded together with their window, using the window fn's window coder.
      final WindowedValue.WindowedValueCoder<V> windowedValueCoder =
          WindowedValue.FullWindowedValueCoder.of(kvCoder.getValueCoder(), fn.windowCoder());

      // Stage 1: per-RDD grouping by key only.
      Function<JavaRDD<WindowedValue<KV<K, V>>>, JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>>>
          groupByKeyOnlyFn =
              new Function<JavaRDD<WindowedValue<KV<K, V>>>, JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>>>() {
                @Override
                public JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> call(
                    JavaRDD<WindowedValue<KV<K, V>>> rdd) throws Exception {
                  return GroupCombineFunctions.groupByKeyOnly(
                      rdd, kvCoder.getKeyCoder(), windowedValueCoder);
                }
              };
      JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> keyedStream =
          elementStream.transform(groupByKeyOnlyFn);

      // Stage 2: group the keyed stream by window as well.
      JavaDStream<WindowedValue<KV<K, Iterable<V>>>> groupedStream =
          SparkGroupAlsoByWindowViaWindowSet.groupAlsoByWindow(
              keyedStream,
              kvCoder.getKeyCoder(),
              windowedValueCoder,
              strategy,
              sparkRuntimeContext,
              sourceIds);

      context.putDataset(transform, new UnboundedDataset<>(groupedStream, sourceIds));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
Use of org.apache.beam.sdk.transforms.GroupByKey in the Apache Beam project.
Class BatchLoads, method expandTriggered.
/**
 * Expands the pipeline when the user has requested periodically-triggered file writes.
 *
 * <p>The input is rewindowed into the global window and written to files (statically or
 * dynamically sharded, depending on {@code numFileShards}). The user's triggering frequency is
 * then applied to the written files before they are partitioned and loaded; multi-partition
 * results go through temp tables and a rename step, single-partition results are loaded
 * directly, and both sets of successful writes are flattened into the returned result.
 *
 * @param input destination-keyed elements to write
 * @return the result produced by {@code writeResult} for all successful writes
 */
private WriteResult expandTriggered(PCollection<KV<DestinationT, ElementT>> input) {
  Pipeline pipeline = input.getPipeline();

  // One job-id prefix view per job type; the temp-file prefix view is built from the load one.
  final PCollectionView<String> loadJobIdPrefixView =
      createJobIdPrefixView(pipeline, JobType.LOAD);
  final PCollectionView<String> tempLoadJobIdPrefixView =
      createJobIdPrefixView(pipeline, JobType.TEMP_TABLE_LOAD);
  final PCollectionView<String> copyJobIdPrefixView =
      createJobIdPrefixView(pipeline, JobType.COPY);
  final PCollectionView<String> tempFilePrefixView =
      createTempFilePrefixView(pipeline, loadJobIdPrefixView);

  final boolean staticSharding = numFileShards > 0;

  final Window<KV<DestinationT, ElementT>> globalRewindow;
  if (staticSharding) {
    // The user-supplied triggeringFrequency is often chosen to control how many BigQuery load
    // jobs are generated, to stay under BigQuery's daily load-job quota. With a large value we
    // would otherwise buffer all data until the trigger fires, so files are also written as
    // soon as a threshold number of records is ready. Only the actual BigQuery load uses the
    // user-supplied trigger alone, which lets the data be offloaded to the filesystem.
    globalRewindow =
        Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
            .triggering(
                Repeatedly.forever(
                    AfterFirst.of(
                        AfterProcessingTime.pastFirstElementInPane()
                            .plusDelayOf(triggeringFrequency),
                        AfterPane.elementCountAtLeast(FILE_TRIGGERING_RECORD_COUNT))))
            .discardingFiredPanes();
  } else {
    // With dynamic sharding the default trigger is used, since the sharding transform also
    // batches elements to avoid generating too many tiny files. The user trigger is applied
    // right after the writes to limit the number of load jobs.
    globalRewindow =
        Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
            .triggering(DefaultTrigger.of())
            .discardingFiredPanes();
  }
  PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow =
      input.apply("rewindowIntoGlobal", globalRewindow);

  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles;
  if (staticSharding) {
    writtenFiles = writeStaticallyShardedFiles(inputInGlobalWindow, tempFilePrefixView);
  } else {
    writtenFiles = writeDynamicallyShardedFilesTriggered(inputInGlobalWindow, tempFilePrefixView);
  }

  // Apply the user's trigger before we start generating BigQuery load jobs.
  Window<WriteBundlesToFiles.Result<DestinationT>> userTriggerWindow =
      Window.<WriteBundlesToFiles.Result<DestinationT>>into(new GlobalWindows())
          .triggering(
              Repeatedly.forever(
                  AfterProcessingTime.pastFirstElementInPane().plusDelayOf(triggeringFrequency)))
          .discardingFiredPanes();
  PCollection<WriteBundlesToFiles.Result<DestinationT>> triggeredFiles =
      writtenFiles.apply("applyUserTrigger", userTriggerWindow);

  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag =
      new TupleTag<>("multiPartitionsTag");
  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag =
      new TupleTag<>("singlePartitionTag");

  // With non-default triggered output, the side-input technique used in expandUntriggered is
  // not available, so the result list is fed in as a main input instead. A GroupByKey is
  // applied first for determinism.
  PCollection<Iterable<WriteBundlesToFiles.Result<DestinationT>>> filesPerDestination =
      triggeredFiles
          .apply("AttachDestinationKey", WithKeys.of(result -> result.destination))
          .setCoder(
              KvCoder.of(destinationCoder, WriteBundlesToFiles.ResultCoder.of(destinationCoder)))
          .apply("GroupFilesByDestination", GroupByKey.create())
          .apply("ExtractResultValues", Values.create());
  PCollectionTuple partitions =
      filesPerDestination.apply(
          "WritePartitionTriggered",
          ParDo.of(
                  new WritePartition<>(
                      singletonTable,
                      dynamicDestinations,
                      tempFilePrefixView,
                      maxFilesPerPartition,
                      maxBytesPerPartition,
                      multiPartitionsTag,
                      singlePartitionTag,
                      rowWriterFactory))
              .withSideInputs(tempFilePrefixView)
              .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));

  // Multi-partition path: load into temp tables, then gather the results under a single void
  // key and hand them to the rename step.
  PCollection<KV<TableDestination, WriteTables.Result>> tempTables =
      writeTempTables(partitions.get(multiPartitionsTag), tempLoadJobIdPrefixView);
  Window<KV<TableDestination, WriteTables.Result>> tempTablesWindow =
      Window.<KV<TableDestination, WriteTables.Result>>into(new GlobalWindows())
          .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)));
  PCollection<TableDestination> successfulMultiPartitionWrites =
      tempTables
          .apply("Window Into Global Windows", tempTablesWindow)
          .apply("Add Void Key", WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), tempTables.getCoder()))
          .apply("GroupByKey", GroupByKey.create())
          .apply("Extract Values", Values.create())
          .apply(
              "WriteRenameTriggered",
              ParDo.of(
                      new WriteRename(
                          bigQueryServices,
                          copyJobIdPrefixView,
                          writeDisposition,
                          createDisposition,
                          maxRetryJobs,
                          kmsKey,
                          loadJobProjectId))
                  .withSideInputs(copyJobIdPrefixView));

  // Single-partition path: load directly, then rewindow the results.
  Window<TableDestination> singlePartitionWindow =
      Window.<TableDestination>into(new GlobalWindows())
          .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)));
  PCollection<TableDestination> successfulSinglePartitionWrites =
      writeSinglePartition(partitions.get(singlePartitionTag), loadJobIdPrefixView)
          .apply("RewindowSinglePartitionResults", singlePartitionWindow);

  PCollectionList<TableDestination> successfulWriteList =
      PCollectionList.of(successfulMultiPartitionWrites).and(successfulSinglePartitionWrites);
  PCollection<TableDestination> allSuccessfulWrites =
      successfulWriteList.apply(Flatten.pCollections());
  return writeResult(pipeline, allSuccessfulWrites);
}
Use of org.apache.beam.sdk.transforms.GroupByKey in the Apache Beam project.
Class TransformTranslator, method groupByKey.
/**
 * Creates the evaluator that translates a {@link GroupByKey} over a bounded (batch) dataset.
 *
 * <p>One of three grouping paths is chosen from the input's windowing strategy:
 *
 * <ul>
 *   <li>global windows with an end-of-window timestamp combiner: group in the global window;
 *   <li>strategies accepted by {@code GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow}:
 *       group by key and window together;
 *   <li>anything else: group by key only, then group also by window via
 *       {@code SparkGroupAlsoByWindowViaOutputBufferFn}.
 * </ul>
 *
 * @param <K> key type of the grouped elements
 * @param <V> value type of the grouped elements
 * @param <W> window type of the input's windowing strategy
 * @return an evaluator for {@code GroupByKey<K, V>}
 */
private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
  return new TransformEvaluator<GroupByKey<K, V>>() {
    @Override
    public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
      @SuppressWarnings("unchecked")
      JavaRDD<WindowedValue<KV<K, V>>> inputRdd =
          ((BoundedDataset<KV<K, V>>) context.borrowDataset(transform)).getRDD();
      final KvCoder<K, V> kvCoder = (KvCoder<K, V>) context.getInput(transform).getCoder();
      @SuppressWarnings("unchecked")
      final WindowingStrategy<?, W> strategy =
          (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final WindowFn<Object, W> fn = (WindowFn<Object, W>) strategy.getWindowFn();

      // --- coders.
      final Coder<K> keyCoder = kvCoder.getKeyCoder();
      final Coder<V> valueCoder = kvCoder.getValueCoder();
      final WindowedValue.WindowedValueCoder<V> windowedValueCoder =
          WindowedValue.FullWindowedValueCoder.of(valueCoder, fn.windowCoder());

      Partitioner partitioner = getPartitioner(context);

      // As this is batch, triggering and allowed lateness parameters can be ignored.
      final boolean globalWindowsAtEndOfWindow =
          fn.equals(new GlobalWindows())
              && strategy.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW);

      final JavaRDD<WindowedValue<KV<K, Iterable<V>>>> grouped;
      if (globalWindowsAtEndOfWindow) {
        // The windows can be dropped here and recovered later.
        grouped =
            GroupNonMergingWindowsFunctions.groupByKeyInGlobalWindow(
                inputRdd, keyCoder, valueCoder, partitioner);
      } else if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(strategy)) {
        // Non-merging windows allow a memory-sensitive translation.
        grouped =
            GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
                inputRdd, keyCoder, valueCoder, strategy, partitioner);
      } else {
        // --- group by key only.
        JavaRDD<KV<K, Iterable<WindowedValue<V>>>> keyedOnly =
            GroupCombineFunctions.groupByKeyOnly(
                inputRdd, keyCoder, windowedValueCoder, partitioner);
        // --- now group also by window; for batch this uses an in-memory StateInternals.
        grouped =
            keyedOnly.flatMap(
                new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                    strategy,
                    new TranslationUtils.InMemoryStateInternalsFactory<>(),
                    SystemReduceFn.buffering(valueCoder),
                    context.getSerializableOptions()));
      }

      context.putDataset(transform, new BoundedDataset<>(grouped));
    }

    @Override
    public String toNativeString() {
      return "groupByKey()";
    }
  };
}
Aggregations