Example 1 with GroupByKey

Use of org.apache.beam.sdk.transforms.GroupByKey in the apache/beam project.

From the class StreamingTransformTranslator, method groupByKey:

private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
    return new TransformEvaluator<GroupByKey<K, V>>() {

        @Override
        public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked") UnboundedDataset<KV<K, V>> inputDataset = (UnboundedDataset<KV<K, V>>) context.borrowDataset(transform);
            List<Integer> streamSources = inputDataset.getStreamSources();
            JavaDStream<WindowedValue<KV<K, V>>> dStream = inputDataset.getDStream();
            @SuppressWarnings("unchecked") final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            @SuppressWarnings("unchecked") final WindowingStrategy<?, W> windowingStrategy = (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked") final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();
            //--- coders.
            final WindowedValue.WindowedValueCoder<V> wvCoder = WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
            // --- group by key only.
            JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupedByKeyStream =
                dStream.transform(
                    new Function<
                            JavaRDD<WindowedValue<KV<K, V>>>,
                            JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>>>() {

                        @Override
                        public JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> call(
                                JavaRDD<WindowedValue<KV<K, V>>> rdd) throws Exception {
                            return GroupCombineFunctions.groupByKeyOnly(
                                rdd, coder.getKeyCoder(), wvCoder);
                        }
                    });
            // --- now group also by window.
            JavaDStream<WindowedValue<KV<K, Iterable<V>>>> outStream =
                SparkGroupAlsoByWindowViaWindowSet.groupAlsoByWindow(
                    groupedByKeyStream, coder.getKeyCoder(), wvCoder,
                    windowingStrategy, runtimeContext, streamSources);
            context.putDataset(transform, new UnboundedDataset<>(outStream, streamSources));
        }

        @Override
        public String toNativeString() {
            return "groupByKey()";
        }
    };
}
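For context, this evaluator runs when a pipeline applies GroupByKey and the Spark runner translates it in streaming mode. Below is a minimal sketch of the user-side transform being translated; the pipeline, data, and class name are illustrative assumptions, not part of the translator above.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class GroupByKeyExample {

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        PCollection<KV<String, Integer>> input =
            p.apply(Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)));
        // GroupByKey gathers all values that share a key within each window; this is
        // the transform the evaluator above maps onto Spark's DStream machinery.
        PCollection<KV<String, Iterable<Integer>>> grouped =
            input.apply(GroupByKey.<String, Integer>create());
        p.run().waitUntilFinish();
    }
}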

Example 2 with GroupByKey

Use of org.apache.beam.sdk.transforms.GroupByKey in the apache/beam project.

From the class BatchLoads, method expandTriggered:

// Expand the pipeline when the user has requested periodically-triggered file writes.
private WriteResult expandTriggered(PCollection<KV<DestinationT, ElementT>> input) {
    Pipeline p = input.getPipeline();
    final PCollectionView<String> loadJobIdPrefixView = createJobIdPrefixView(p, JobType.LOAD);
    final PCollectionView<String> tempLoadJobIdPrefixView = createJobIdPrefixView(p, JobType.TEMP_TABLE_LOAD);
    final PCollectionView<String> copyJobIdPrefixView = createJobIdPrefixView(p, JobType.COPY);
    final PCollectionView<String> tempFilePrefixView = createTempFilePrefixView(p, loadJobIdPrefixView);
    PCollection<WriteBundlesToFiles.Result<DestinationT>> results;
    if (numFileShards > 0) {
        // The user-supplied triggeringFrequency is often chosen to control how many BigQuery load
        // jobs are generated, to prevent going over BigQuery's daily quota for load jobs. If this
        // is set to a large value, we currently have to buffer all the data until the trigger
        // fires. Instead, we ensure that the files are written once a threshold number of records
        // is ready, and use only the user-supplied trigger on the actual BigQuery load. This
        // allows us to offload the data to the filesystem.
        PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow =
            input.apply(
                "rewindowIntoGlobal",
                Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
                    .triggering(
                        Repeatedly.forever(
                            AfterFirst.of(
                                AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(triggeringFrequency),
                                AfterPane.elementCountAtLeast(FILE_TRIGGERING_RECORD_COUNT))))
                    .discardingFiredPanes());
        results = writeStaticallyShardedFiles(inputInGlobalWindow, tempFilePrefixView);
    } else {
        // In the case of dynamic sharding, however, we use a default trigger, since the transform
        // that performs the sharding also batches elements to avoid generating too many tiny
        // files. The user's trigger is applied right after the writes to limit the number of
        // load jobs.
        PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow =
            input.apply(
                "rewindowIntoGlobal",
                Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
                    .triggering(DefaultTrigger.of())
                    .discardingFiredPanes());
        results = writeDynamicallyShardedFilesTriggered(inputInGlobalWindow, tempFilePrefixView);
    }
    // Apply the user's trigger before we start generating BigQuery load jobs.
    results =
        results.apply(
            "applyUserTrigger",
            Window.<WriteBundlesToFiles.Result<DestinationT>>into(new GlobalWindows())
                .triggering(
                    Repeatedly.forever(
                        AfterProcessingTime.pastFirstElementInPane()
                            .plusDelayOf(triggeringFrequency)))
                .discardingFiredPanes());
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag =
        new TupleTag<>("multiPartitionsTag");
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag =
        new TupleTag<>("singlePartitionTag");
    // If we have non-default triggered output, we can't use the side-input technique used in
    // expandUntriggered. Instead, we make the result list a main input and apply a GroupByKey
    // first for determinism.
    PCollectionTuple partitions =
        results
            .apply("AttachDestinationKey", WithKeys.of(result -> result.destination))
            .setCoder(
                KvCoder.of(destinationCoder, WriteBundlesToFiles.ResultCoder.of(destinationCoder)))
            .apply("GroupFilesByDestination", GroupByKey.create())
            .apply("ExtractResultValues", Values.create())
            .apply(
                "WritePartitionTriggered",
                ParDo.of(
                        new WritePartition<>(
                            singletonTable,
                            dynamicDestinations,
                            tempFilePrefixView,
                            maxFilesPerPartition,
                            maxBytesPerPartition,
                            multiPartitionsTag,
                            singlePartitionTag,
                            rowWriterFactory))
                    .withSideInputs(tempFilePrefixView)
                    .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
    PCollection<KV<TableDestination, WriteTables.Result>> tempTables =
        writeTempTables(partitions.get(multiPartitionsTag), tempLoadJobIdPrefixView);
    PCollection<TableDestination> successfulMultiPartitionWrites =
        tempTables
            .apply(
                "Window Into Global Windows",
                Window.<KV<TableDestination, WriteTables.Result>>into(new GlobalWindows())
                    .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))))
            .apply("Add Void Key", WithKeys.of((Void) null))
            .setCoder(KvCoder.of(VoidCoder.of(), tempTables.getCoder()))
            .apply("GroupByKey", GroupByKey.create())
            .apply("Extract Values", Values.create())
            .apply(
                "WriteRenameTriggered",
                ParDo.of(
                        new WriteRename(
                            bigQueryServices,
                            copyJobIdPrefixView,
                            writeDisposition,
                            createDisposition,
                            maxRetryJobs,
                            kmsKey,
                            loadJobProjectId))
                    .withSideInputs(copyJobIdPrefixView));
    PCollection<TableDestination> successfulSinglePartitionWrites =
        writeSinglePartition(partitions.get(singlePartitionTag), loadJobIdPrefixView)
            .apply(
                "RewindowSinglePartitionResults",
                Window.<TableDestination>into(new GlobalWindows())
                    .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))));
    PCollectionList<TableDestination> allSuccessfulWrites = PCollectionList.of(successfulMultiPartitionWrites).and(successfulSinglePartitionWrites);
    return writeResult(p, allSuccessfulWrites.apply(Flatten.pCollections()));
}
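From the pipeline author's side, this triggered expansion is reached by writing an unbounded collection with file loads and a triggering frequency. A hedged sketch follows; the method name, destination table, and chosen values are assumptions for illustration, not taken from BatchLoads itself.

import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

static void writeWithTriggeredFileLoads(PCollection<TableRow> rows, TableSchema schema) {
    rows.apply(
        BigQueryIO.writeTableRows()
            .to("my-project:my_dataset.my_table") // hypothetical destination
            .withSchema(schema)
            .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
            // The user trigger that expandTriggered applies before load jobs.
            .withTriggeringFrequency(Duration.standardMinutes(5))
            // numFileShards > 0 selects the statically sharded branch above.
            .withNumFileShards(100));
}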

Example 3 with GroupByKey

Use of org.apache.beam.sdk.transforms.GroupByKey in the apache/beam project.

From the class TransformTranslator, method groupByKey:

private static <K, V, W extends BoundedWindow> TransformEvaluator<GroupByKey<K, V>> groupByKey() {
    return new TransformEvaluator<GroupByKey<K, V>>() {

        @Override
        public void evaluate(GroupByKey<K, V> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked") JavaRDD<WindowedValue<KV<K, V>>> inRDD = ((BoundedDataset<KV<K, V>>) context.borrowDataset(transform)).getRDD();
            final KvCoder<K, V> coder = (KvCoder<K, V>) context.getInput(transform).getCoder();
            @SuppressWarnings("unchecked") final WindowingStrategy<?, W> windowingStrategy = (WindowingStrategy<?, W>) context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked") final WindowFn<Object, W> windowFn = (WindowFn<Object, W>) windowingStrategy.getWindowFn();
            // --- coders.
            final Coder<K> keyCoder = coder.getKeyCoder();
            final WindowedValue.WindowedValueCoder<V> wvCoder = WindowedValue.FullWindowedValueCoder.of(coder.getValueCoder(), windowFn.windowCoder());
            JavaRDD<WindowedValue<KV<K, Iterable<V>>>> groupedByKey;
            Partitioner partitioner = getPartitioner(context);
            // As this is batch, we can ignore triggering and allowed lateness parameters.
            if (windowingStrategy.getWindowFn().equals(new GlobalWindows())
                && windowingStrategy.getTimestampCombiner().equals(TimestampCombiner.END_OF_WINDOW)) {
                // we can drop the windows and recover them later
                groupedByKey =
                    GroupNonMergingWindowsFunctions.groupByKeyInGlobalWindow(
                        inRDD, keyCoder, coder.getValueCoder(), partitioner);
            } else if (GroupNonMergingWindowsFunctions.isEligibleForGroupByWindow(windowingStrategy)) {
                // we can have a memory-sensitive translation for non-merging windows
                groupedByKey =
                    GroupNonMergingWindowsFunctions.groupByKeyAndWindow(
                        inRDD, keyCoder, coder.getValueCoder(), windowingStrategy, partitioner);
            } else {
                // --- group by key only.
                JavaRDD<KV<K, Iterable<WindowedValue<V>>>> groupedByKeyOnly =
                    GroupCombineFunctions.groupByKeyOnly(inRDD, keyCoder, wvCoder, partitioner);
                // --- now group also by window.
                // for batch, GroupAlsoByWindow uses an in-memory StateInternals.
                groupedByKey =
                    groupedByKeyOnly.flatMap(
                        new SparkGroupAlsoByWindowViaOutputBufferFn<>(
                            windowingStrategy,
                            new TranslationUtils.InMemoryStateInternalsFactory<>(),
                            SystemReduceFn.buffering(coder.getValueCoder()),
                            context.getSerializableOptions()));
            }
            context.putDataset(transform, new BoundedDataset<>(groupedByKey));
        }

        @Override
        public String toNativeString() {
            return "groupByKey()";
        }
    };
}
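Which of the three branches runs is determined entirely by the input's windowing strategy: default global windowing with END_OF_WINDOW timestamp combining takes the fastest path, while non-merging windows such as fixed windows should be eligible for the memory-sensitive groupByKeyAndWindow path. A small hedged sketch of a batch pipeline fragment that exercises the windowed case; the method and variable names are illustrative assumptions.

import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

static PCollection<KV<String, Iterable<Long>>> groupPerMinute(
        PCollection<KV<String, Long>> events) {
    return events
        // Fixed windows are non-merging, so this grouping should take the
        // groupByKeyAndWindow branch above rather than the general
        // GroupAlsoByWindow fallback.
        .apply(Window.<KV<String, Long>>into(FixedWindows.of(Duration.standardMinutes(1))))
        .apply(GroupByKey.<String, Long>create());
}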