
Example 1 with ShardedKey

use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.

the class BatchLoads, method writeDynamicallyShardedFilesUntriggered.

// Writes input data to dynamically-sharded, per-bundle files without triggering. Input records
// are spilled to new files if memory is constrained. Returns a PCollection of filename,
// file byte size, and table destination.
PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFilesUntriggered(PCollection<KV<DestinationT, ElementT>> input, PCollectionView<String> tempFilePrefix) {
    TupleTag<WriteBundlesToFiles.Result<DestinationT>> writtenFilesTag = new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {
    };
    TupleTag<KV<ShardedKey<DestinationT>, ElementT>> unwrittedRecordsTag = new TupleTag<KV<ShardedKey<DestinationT>, ElementT>>("unwrittenRecords") {
    };
    PCollectionTuple writeBundlesTuple = input.apply("WriteBundlesToFiles", ParDo.of(new WriteBundlesToFiles<>(tempFilePrefix, unwrittedRecordsTag, maxNumWritersPerBundle, maxFileSize, rowWriterFactory)).withSideInputs(tempFilePrefix).withOutputTags(writtenFilesTag, TupleTagList.of(unwrittedRecordsTag)));
    PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles = writeBundlesTuple.get(writtenFilesTag).setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
    PCollection<KV<ShardedKey<DestinationT>, ElementT>> unwrittenRecords = writeBundlesTuple.get(unwrittedRecordsTag).setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder));
    // If the bundles contain too many output tables to be written inline to files (due to memory
    // limits), any unwritten records will be spilled to the unwrittenRecordsTag PCollection.
    // Group these records by key, and write the files after grouping. Since the records are grouped
    // by key, we can ensure that only one file is open at a time in each bundle.
    PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFilesGrouped = writeShardedRecords(unwrittenRecords, tempFilePrefix);
    // PCollection of filename, file byte size, and table destination.
    return PCollectionList.of(writtenFiles).and(writtenFilesGrouped).apply("FlattenFiles", Flatten.pCollections()).setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
Also used : ShardedKey(org.apache.beam.sdk.values.ShardedKey) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) KV(org.apache.beam.sdk.values.KV) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result)
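
The spilled-records path above works because each record is re-keyed by ShardedKey<DestinationT> before grouping, so one destination's rows can be spread over several shards instead of funneling through a single key. Below is a minimal standalone sketch of that keying and the matching coder, using a String destination and a made-up shard number rather than anything taken from BatchLoads; package names and the of(key, shardNumber) factory follow the usage shown in these examples and may vary across Beam versions.

import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.ShardedKeyCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.ShardedKey;

public class ShardedKeySpillSketch {
    public static void main(String[] args) {
        // Hypothetical destination and shard number; WriteBundlesToFiles chooses the shard when it spills.
        String destination = "project-id:dataset-id.table";
        int shard = 3;

        // A spilled record is re-keyed by (destination, shard) so the later GroupByKey spreads the load.
        KV<ShardedKey<String>, String> spilled =
            KV.of(ShardedKey.of(destination, shard), "serialized-row");

        // Mirrors KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder) from the snippet above.
        KvCoder<ShardedKey<String>, String> coder =
            KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), StringUtf8Coder.of());

        System.out.println(spilled.getKey().getKey() + " -> " + coder);
    }
}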

Example 2 with ShardedKey

use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.

the class BigQueryIOWriteTest, method testWritePartition.

private void testWritePartition(long numTables, long numFilesPerTable, long fileSize, long expectedNumPartitionsPerTable) throws Exception {
    p.enableAbandonedNodeEnforcement(false);
    // In the case where a static destination is specified (i.e. not through a dynamic table
    // function) and there is no input data, WritePartition will generate an empty table. This
    // code is to test that path.
    boolean isSingleton = numTables == 1 && numFilesPerTable == 0;
    DynamicDestinations<String, TableDestination> dynamicDestinations = new DynamicDestinationsHelpers.ConstantTableDestinations<>(ValueProvider.StaticValueProvider.of("SINGLETON"), "", false);
    List<ShardedKey<TableDestination>> expectedPartitions = Lists.newArrayList();
    if (isSingleton) {
        expectedPartitions.add(ShardedKey.of(new TableDestination("SINGLETON", ""), 1));
    } else {
        for (int i = 0; i < numTables; ++i) {
            for (int j = 1; j <= expectedNumPartitionsPerTable; ++j) {
                String tableName = String.format("project-id:dataset-id.tables%05d", i);
                expectedPartitions.add(ShardedKey.of(new TableDestination(tableName, ""), j));
            }
        }
    }
    List<WriteBundlesToFiles.Result<TableDestination>> files = Lists.newArrayList();
    Map<String, List<String>> filenamesPerTable = Maps.newHashMap();
    for (int i = 0; i < numTables; ++i) {
        String tableName = String.format("project-id:dataset-id.tables%05d", i);
        List<String> filenames = filenamesPerTable.computeIfAbsent(tableName, k -> Lists.newArrayList());
        for (int j = 0; j < numFilesPerTable; ++j) {
            String fileName = String.format("%s_files%05d", tableName, j);
            filenames.add(fileName);
            files.add(new WriteBundlesToFiles.Result<>(fileName, fileSize, new TableDestination(tableName, "")));
        }
    }
    TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>> multiPartitionsTag = new TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>>("multiPartitionsTag") {
    };
    TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>> singlePartitionTag = new TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>>("singlePartitionTag") {
    };
    String tempFilePrefix = testFolder.newFolder("BigQueryIOTest").getAbsolutePath();
    PCollectionView<String> tempFilePrefixView = p.apply(Create.of(tempFilePrefix)).apply(View.asSingleton());
    WritePartition<TableDestination> writePartition = new WritePartition<>(isSingleton, dynamicDestinations, tempFilePrefixView, BatchLoads.DEFAULT_MAX_FILES_PER_PARTITION, BatchLoads.DEFAULT_MAX_BYTES_PER_PARTITION, multiPartitionsTag, singlePartitionTag, RowWriterFactory.tableRows(SerializableFunctions.identity(), SerializableFunctions.identity()));
    DoFnTester<Iterable<WriteBundlesToFiles.Result<TableDestination>>, KV<ShardedKey<TableDestination>, WritePartition.Result>> tester = DoFnTester.of(writePartition);
    tester.setSideInput(tempFilePrefixView, GlobalWindow.INSTANCE, tempFilePrefix);
    tester.processElement(files);
    List<KV<ShardedKey<TableDestination>, WritePartition.Result>> partitions;
    if (expectedNumPartitionsPerTable > 1) {
        partitions = tester.takeOutputElements(multiPartitionsTag);
    } else {
        partitions = tester.takeOutputElements(singlePartitionTag);
    }
    List<ShardedKey<TableDestination>> partitionsResult = Lists.newArrayList();
    Map<String, List<String>> filesPerTableResult = Maps.newHashMap();
    for (KV<ShardedKey<TableDestination>, WritePartition.Result> partition : partitions) {
        String table = partition.getKey().getKey().getTableSpec();
        partitionsResult.add(partition.getKey());
        List<String> tableFilesResult = filesPerTableResult.computeIfAbsent(table, k -> Lists.newArrayList());
        tableFilesResult.addAll(partition.getValue().getFilenames());
    }
    assertThat(partitionsResult, containsInAnyOrder(Iterables.toArray(expectedPartitions, ShardedKey.class)));
    if (isSingleton) {
        assertEquals(1, filesPerTableResult.size());
        List<String> singletonFiles = filesPerTableResult.values().iterator().next();
        assertTrue(Files.exists(Paths.get(singletonFiles.get(0))));
        assertThat(Files.readAllBytes(Paths.get(singletonFiles.get(0))).length, equalTo(0));
    } else {
        assertEquals(filenamesPerTable, filesPerTableResult);
    }
    for (List<String> filenames : filesPerTableResult.values()) {
        for (String filename : filenames) {
            Files.deleteIfExists(Paths.get(filename));
        }
    }
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteTables.Result) ShardedKey(org.apache.beam.sdk.values.ShardedKey) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) KV(org.apache.beam.sdk.values.KV)
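
For intuition, the expectedNumPartitionsPerTable parameter encodes simple packing arithmetic: with equal-sized files, a table's files spill into a new partition whenever adding one more file would exceed either the per-partition file-count cap or the byte cap. The following rough standalone sketch shows that arithmetic under the equal-file-size assumption, with made-up caps rather than the BatchLoads defaults; the real WritePartition packs files greedily and may differ at the boundaries.

public class PartitionCountSketch {
    // Approximate number of partitions per table for equal-sized files under both caps.
    static long expectedPartitions(
            long numFiles, long fileSize, long maxFilesPerPartition, long maxBytesPerPartition) {
        if (numFiles == 0) {
            return 1; // a singleton destination with no data still produces one (empty) partition
        }
        long filesPerPartitionByBytes =
            fileSize <= 0 ? maxFilesPerPartition : Math.max(1, maxBytesPerPartition / fileSize);
        long filesPerPartition = Math.min(maxFilesPerPartition, filesPerPartitionByBytes);
        return (numFiles + filesPerPartition - 1) / filesPerPartition; // ceiling division
    }

    public static void main(String[] args) {
        // 25 files of 100 bytes each, at most 10 files or 1000 bytes per partition -> 3 partitions
        System.out.println(expectedPartitions(25, 100, 10, 1000));
    }
}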

Example 3 with ShardedKey

use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.

the class BatchLoads, method expandUntriggered.

// Expand the pipeline when the user has not requested periodically-triggered file writes.
public WriteResult expandUntriggered(PCollection<KV<DestinationT, ElementT>> input) {
    Pipeline p = input.getPipeline();
    final PCollectionView<String> loadJobIdPrefixView = createJobIdPrefixView(p, JobType.LOAD);
    final PCollectionView<String> tempLoadJobIdPrefixView = createJobIdPrefixView(p, JobType.TEMP_TABLE_LOAD);
    final PCollectionView<String> copyJobIdPrefixView = createJobIdPrefixView(p, JobType.COPY);
    final PCollectionView<String> tempFilePrefixView = createTempFilePrefixView(p, loadJobIdPrefixView);
    PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow = input.apply("rewindowIntoGlobal", Window.<KV<DestinationT, ElementT>>into(new GlobalWindows()).triggering(DefaultTrigger.of()).discardingFiredPanes());
    PCollection<WriteBundlesToFiles.Result<DestinationT>> results = (numFileShards == 0) ? writeDynamicallyShardedFilesUntriggered(inputInGlobalWindow, tempFilePrefixView) : writeStaticallyShardedFiles(inputInGlobalWindow, tempFilePrefixView);
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag = new TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>>("multiPartitionsTag") {
    };
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag = new TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>>("singlePartitionTag") {
    };
    // This transform will look at the set of files written for each table, and if any table has
    // too many files or bytes, will partition that table's files into multiple partitions for
    // loading.
    PCollectionTuple partitions = results.apply("ReifyResults", new ReifyAsIterable<>()).setCoder(IterableCoder.of(WriteBundlesToFiles.ResultCoder.of(destinationCoder))).apply("WritePartitionUntriggered", ParDo.of(new WritePartition<>(singletonTable, dynamicDestinations, tempFilePrefixView, maxFilesPerPartition, maxBytesPerPartition, multiPartitionsTag, singlePartitionTag, rowWriterFactory)).withSideInputs(tempFilePrefixView).withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
    PCollection<TableDestination> successfulSinglePartitionWrites = writeSinglePartition(partitions.get(singlePartitionTag), loadJobIdPrefixView);
    PCollection<TableDestination> successfulMultiPartitionWrites = writeTempTables(partitions.get(multiPartitionsTag), tempLoadJobIdPrefixView).apply("ReifyRenameInput", new ReifyAsIterable<>()).apply("WriteRenameUntriggered", ParDo.of(new WriteRename(bigQueryServices, copyJobIdPrefixView, writeDisposition, createDisposition, maxRetryJobs, kmsKey, loadJobProjectId)).withSideInputs(copyJobIdPrefixView));
    PCollectionList<TableDestination> allSuccessfulWrites = PCollectionList.of(successfulSinglePartitionWrites).and(successfulMultiPartitionWrites);
    return writeResult(p, allSuccessfulWrites.apply(Flatten.pCollections()));
}
Also used : GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) TupleTag(org.apache.beam.sdk.values.TupleTag) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) ShardedKey(org.apache.beam.sdk.values.ShardedKey) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
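
The untriggered expansion leans on a single multi-output ParDo: WritePartition tags each ShardedKey'd partition as either "single" or "multi", and downstream transforms pull them out of the resulting PCollectionTuple. Here is a stripped-down sketch of that tagging pattern with toy strings instead of partitions; the routing rule is invented, and running it assumes a runner such as the Direct Runner is on the classpath.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class MultiOutputSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create();
        // Anonymous subclasses keep the element type, as with multiPartitionsTag/singlePartitionTag above.
        final TupleTag<String> singleTag = new TupleTag<String>("single") {};
        final TupleTag<String> multiTag = new TupleTag<String>("multi") {};

        PCollectionTuple routed =
            p.apply(Create.of("small", "a-much-larger-element"))
                .apply(
                    "Route",
                    ParDo.of(
                            new DoFn<String, String>() {
                                @ProcessElement
                                public void process(ProcessContext c) {
                                    // Invented rule: long elements go to the additional "multi" output.
                                    if (c.element().length() > 10) {
                                        c.output(multiTag, c.element());
                                    } else {
                                        c.output(c.element());
                                    }
                                }
                            })
                        .withOutputTags(singleTag, TupleTagList.of(multiTag)));

        PCollection<String> single = routed.get(singleTag);
        PCollection<String> multi = routed.get(multiTag);
        p.run().waitUntilFinish();
    }
}

The same pattern scales to any number of additional outputs; the main tag is passed first to withOutputTags and the rest through TupleTagList, exactly as WritePartitionUntriggered does above.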

Example 4 with ShardedKey

use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.

the class BatchLoads, method expandTriggered.

// Expand the pipeline when the user has requested periodically-triggered file writes.
private WriteResult expandTriggered(PCollection<KV<DestinationT, ElementT>> input) {
    Pipeline p = input.getPipeline();
    final PCollectionView<String> loadJobIdPrefixView = createJobIdPrefixView(p, JobType.LOAD);
    final PCollectionView<String> tempLoadJobIdPrefixView = createJobIdPrefixView(p, JobType.TEMP_TABLE_LOAD);
    final PCollectionView<String> copyJobIdPrefixView = createJobIdPrefixView(p, JobType.COPY);
    final PCollectionView<String> tempFilePrefixView = createTempFilePrefixView(p, loadJobIdPrefixView);
    PCollection<WriteBundlesToFiles.Result<DestinationT>> results;
    if (numFileShards > 0) {
        // The user-supplied triggeringFrequency is often chosen to control how many BigQuery load
        // jobs are generated, to prevent going over BigQuery's daily quota for load jobs. If this
        // is set to a large value, currently we have to buffer all the data until the trigger fires.
        // Instead we ensure that the files are written if a threshold number of records are ready.
        // We use only the user-supplied trigger on the actual BigQuery load. This allows us to
        // offload the data to the filesystem.
        PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow = input.apply("rewindowIntoGlobal", Window.<KV<DestinationT, ElementT>>into(new GlobalWindows()).triggering(Repeatedly.forever(AfterFirst.of(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(triggeringFrequency), AfterPane.elementCountAtLeast(FILE_TRIGGERING_RECORD_COUNT)))).discardingFiredPanes());
        results = writeStaticallyShardedFiles(inputInGlobalWindow, tempFilePrefixView);
    } else {
        // In the case of dynamic sharding, however, we use a default trigger, since the transform
        // that performs the sharding also batches elements to avoid generating too many tiny files.
        // The user's trigger is applied right after the writes to limit the number of load jobs.
        PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow = input.apply("rewindowIntoGlobal", Window.<KV<DestinationT, ElementT>>into(new GlobalWindows()).triggering(DefaultTrigger.of()).discardingFiredPanes());
        results = writeDynamicallyShardedFilesTriggered(inputInGlobalWindow, tempFilePrefixView);
    }
    // Apply the user's trigger before we start generating BigQuery load jobs.
    results = results.apply("applyUserTrigger", Window.<WriteBundlesToFiles.Result<DestinationT>>into(new GlobalWindows()).triggering(Repeatedly.forever(AfterProcessingTime.pastFirstElementInPane().plusDelayOf(triggeringFrequency))).discardingFiredPanes());
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag = new TupleTag<>("multiPartitionsTag");
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag = new TupleTag<>("singlePartitionTag");
    // If we have non-default triggered output, we can't use the side-input technique used in
    // expandUntriggered. Instead make the result list a main input. Apply a GroupByKey first for
    // determinism.
    PCollectionTuple partitions = results.apply("AttachDestinationKey", WithKeys.of(result -> result.destination)).setCoder(KvCoder.of(destinationCoder, WriteBundlesToFiles.ResultCoder.of(destinationCoder))).apply("GroupFilesByDestination", GroupByKey.create()).apply("ExtractResultValues", Values.create()).apply("WritePartitionTriggered", ParDo.of(new WritePartition<>(singletonTable, dynamicDestinations, tempFilePrefixView, maxFilesPerPartition, maxBytesPerPartition, multiPartitionsTag, singlePartitionTag, rowWriterFactory)).withSideInputs(tempFilePrefixView).withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
    PCollection<KV<TableDestination, WriteTables.Result>> tempTables = writeTempTables(partitions.get(multiPartitionsTag), tempLoadJobIdPrefixView);
    PCollection<TableDestination> successfulMultiPartitionWrites = tempTables.apply("Window Into Global Windows", Window.<KV<TableDestination, WriteTables.Result>>into(new GlobalWindows()).triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))).apply("Add Void Key", WithKeys.of((Void) null)).setCoder(KvCoder.of(VoidCoder.of(), tempTables.getCoder())).apply("GroupByKey", GroupByKey.create()).apply("Extract Values", Values.create()).apply("WriteRenameTriggered", ParDo.of(new WriteRename(bigQueryServices, copyJobIdPrefixView, writeDisposition, createDisposition, maxRetryJobs, kmsKey, loadJobProjectId)).withSideInputs(copyJobIdPrefixView));
    PCollection<TableDestination> successfulSinglePartitionWrites = writeSinglePartition(partitions.get(singlePartitionTag), loadJobIdPrefixView).apply("RewindowSinglePartitionResults", Window.<TableDestination>into(new GlobalWindows()).triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))));
    PCollectionList<TableDestination> allSuccessfulWrites = PCollectionList.of(successfulMultiPartitionWrites).and(successfulSinglePartitionWrites);
    return writeResult(p, allSuccessfulWrites.apply(Flatten.pCollections()));
}
Also used : LoggerFactory(org.slf4j.LoggerFactory) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) PCollectionList(org.apache.beam.sdk.values.PCollectionList) Strings(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings) Create(org.apache.beam.sdk.transforms.Create) TableRow(com.google.api.services.bigquery.model.TableRow) Window(org.apache.beam.sdk.transforms.windowing.Window) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) ValueProvider(org.apache.beam.sdk.options.ValueProvider) Keys(org.apache.beam.sdk.transforms.Keys) ShardedKey(org.apache.beam.sdk.values.ShardedKey) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) KvCoder(org.apache.beam.sdk.coders.KvCoder) Repeatedly(org.apache.beam.sdk.transforms.windowing.Repeatedly) Set(java.util.Set) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) BigQueryHelpers.resolveTempLocation(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.resolveTempLocation) List(java.util.List) DefaultTrigger(org.apache.beam.sdk.transforms.windowing.DefaultTrigger) ParDo(org.apache.beam.sdk.transforms.ParDo) VisibleForTesting(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting) AfterFirst(org.apache.beam.sdk.transforms.windowing.AfterFirst) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) AfterPane(org.apache.beam.sdk.transforms.windowing.AfterPane) NullableCoder(org.apache.beam.sdk.coders.NullableCoder) Values(org.apache.beam.sdk.transforms.Values) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) Duration(org.joda.time.Duration) Coder(org.apache.beam.sdk.coders.Coder) View(org.apache.beam.sdk.transforms.View) TupleTagList(org.apache.beam.sdk.values.TupleTagList) PTransform(org.apache.beam.sdk.transforms.PTransform) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) TupleTag(org.apache.beam.sdk.values.TupleTag) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) JobType(org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.JobType) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Nullable(org.checkerframework.checker.nullness.qual.Nullable) GroupIntoBatches(org.apache.beam.sdk.transforms.GroupIntoBatches) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) DoFn(org.apache.beam.sdk.transforms.DoFn) Reshuffle(org.apache.beam.sdk.transforms.Reshuffle) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WithKeys(org.apache.beam.sdk.transforms.WithKeys) Logger(org.slf4j.Logger) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) ShardedKeyCoder(org.apache.beam.sdk.coders.ShardedKeyCoder) PCollection(org.apache.beam.sdk.values.PCollection) AfterProcessingTime(org.apache.beam.sdk.transforms.windowing.AfterProcessingTime) WriteDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) 
Collections(java.util.Collections)
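
The heart of the triggered path is the file-writing trigger: fire whenever the user-supplied delay elapses or a record-count threshold is hit, whichever comes first, so a large triggeringFrequency does not force all data to be buffered. Below is a standalone windowing sketch of that composite trigger; the duration and count are placeholders (FILE_TRIGGERING_RECORD_COUNT is a BatchLoads constant not shown here), .withAllowedLateness(Duration.ZERO) is added only to keep this fragment valid on its own, and a runner on the classpath is assumed as in the previous sketch.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.windowing.AfterFirst;
import org.apache.beam.sdk.transforms.windowing.AfterPane;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Repeatedly;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

public class EarlyFileTriggerSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create();
        Duration triggeringFrequency = Duration.standardMinutes(10); // user-supplied in BatchLoads
        int fileTriggeringRecordCount = 500_000;                     // placeholder threshold

        // Files are written when either the delay elapses or enough records have arrived.
        PCollection<String> early =
            p.apply(Create.of("a", "b", "c"))
                .apply(
                    Window.<String>into(new GlobalWindows())
                        .triggering(
                            Repeatedly.forever(
                                AfterFirst.of(
                                    AfterProcessingTime.pastFirstElementInPane()
                                        .plusDelayOf(triggeringFrequency),
                                    AfterPane.elementCountAtLeast(fileTriggeringRecordCount))))
                        .withAllowedLateness(Duration.ZERO)
                        .discardingFiredPanes());

        p.run().waitUntilFinish();
    }
}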

Example 5 with ShardedKey

use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.

the class StreamingWriteTables, method writeAndGetErrors.

private <T> PCollectionTuple writeAndGetErrors(PCollection<KV<TableDestination, ElementT>> input, TupleTag<T> failedInsertsTag, AtomicCoder<T> coder, ErrorContainer<T> errorContainer) {
    BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);
    if (autoSharding && deterministicRecordIdFn == null) {
        // If runner-determined dynamic sharding is enabled, group TableRows on table destinations
        // that may be sharded at runtime. Otherwise, we choose a fixed number of shards per
        // table destination, following the logic in the other branch below.
        PCollection<KV<String, TableRowInfo<ElementT>>> unshardedTagged = input.apply("MapToTableSpec", MapElements.via(new SimpleFunction<KV<TableDestination, ElementT>, KV<String, ElementT>>() {

            @Override
            public KV<String, ElementT> apply(KV<TableDestination, ElementT> input) {
                return KV.of(input.getKey().getTableSpec(), input.getValue());
            }
        })).setCoder(KvCoder.of(StringUtf8Coder.of(), elementCoder)).apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>())).setCoder(KvCoder.of(StringUtf8Coder.of(), TableRowInfoCoder.of(elementCoder)));
        // The stateful BatchedStreamingWrite below lets the runner shard each destination and at
        // the same time batches the TableRows to be inserted to BigQuery.
        return unshardedTagged.apply("StreamingWrite", new BatchedStreamingWrite<>(bigQueryServices, retryPolicy, failedInsertsTag, coder, errorContainer, skipInvalidRows, ignoreUnknownValues, ignoreInsertIds, toTableRow, toFailsafeTableRow).viaStateful());
    } else {
        // We create 50 keys per BigQuery table to generate output on. This is few enough that we
        // get good batching into BigQuery's insert calls, and enough that we can max out the
        // streaming insert quota.
        int numShards = options.getNumStreamingKeys();
        PCollection<KV<ShardedKey<String>, TableRowInfo<ElementT>>> shardedTagged = input.apply("ShardTableWrites", ParDo.of(new GenerateShardedTable<>(numShards))).setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), elementCoder)).apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>(deterministicRecordIdFn))).setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of(elementCoder)));
        if (deterministicRecordIdFn == null) {
            // If not using a deterministic function for record ids, we must apply a reshuffle to ensure
            // determinism on the generated ids.
            shardedTagged = shardedTagged.apply(Reshuffle.of());
        }
        return shardedTagged.apply("GlobalWindow", Window.<KV<ShardedKey<String>, TableRowInfo<ElementT>>>into(new GlobalWindows()).triggering(DefaultTrigger.of()).discardingFiredPanes()).apply("StripShardId", MapElements.via(new SimpleFunction<KV<ShardedKey<String>, TableRowInfo<ElementT>>, KV<String, TableRowInfo<ElementT>>>() {

            @Override
            public KV<String, TableRowInfo<ElementT>> apply(KV<ShardedKey<String>, TableRowInfo<ElementT>> input) {
                return KV.of(input.getKey().getKey(), input.getValue());
            }
        })).setCoder(KvCoder.of(StringUtf8Coder.of(), TableRowInfoCoder.of(elementCoder))).apply("StreamingWrite", new BatchedStreamingWrite<>(bigQueryServices, retryPolicy, failedInsertsTag, coder, errorContainer, skipInvalidRows, ignoreUnknownValues, ignoreInsertIds, toTableRow, toFailsafeTableRow).viaDoFnFinalization());
    }
}
Also used : GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) KV(org.apache.beam.sdk.values.KV) ShardedKey(org.apache.beam.sdk.values.ShardedKey) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction)
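
Conceptually, the fixed-sharding branch just turns a table spec into one of numShards ShardedKeys, so inserts for a hot table do not funnel through a single key yet still batch well. A tiny sketch of that shard assignment follows; the random choice is illustrative (the real GenerateShardedTable may pick shards differently), and ShardedKey.of(key, shardNumber) again follows the usage shown in these examples.

import java.util.concurrent.ThreadLocalRandom;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.ShardedKey;

public class ShardAssignmentSketch {
    // Spread rows for one table over numShards keys (the comment above uses 50 per table by default).
    static KV<ShardedKey<String>, String> assignShard(String tableSpec, String row, int numShards) {
        int shard = ThreadLocalRandom.current().nextInt(numShards);
        return KV.of(ShardedKey.of(tableSpec, shard), row);
    }

    public static void main(String[] args) {
        KV<ShardedKey<String>, String> keyed =
            assignShard("project-id:dataset-id.table", "{\"x\": 1}", 50);
        // Downstream, StripShardId recovers the table spec with getKey().getKey(), as in the snippet above.
        System.out.println(keyed.getKey().getKey());
    }
}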

Aggregations

KV (org.apache.beam.sdk.values.KV) 5
ShardedKey (org.apache.beam.sdk.values.ShardedKey) 5
TupleTag (org.apache.beam.sdk.values.TupleTag) 4
Result (org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) 3
GlobalWindows (org.apache.beam.sdk.transforms.windowing.GlobalWindows) 3
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple) 3
List (java.util.List) 2
Pipeline (org.apache.beam.sdk.Pipeline) 2
SimpleFunction (org.apache.beam.sdk.transforms.SimpleFunction) 2
TableRow (com.google.api.services.bigquery.model.TableRow) 1
ArrayList (java.util.ArrayList) 1
Collections (java.util.Collections) 1
Set (java.util.Set) 1
ThreadLocalRandom (java.util.concurrent.ThreadLocalRandom) 1
Coder (org.apache.beam.sdk.coders.Coder) 1
IterableCoder (org.apache.beam.sdk.coders.IterableCoder) 1
KvCoder (org.apache.beam.sdk.coders.KvCoder) 1
NullableCoder (org.apache.beam.sdk.coders.NullableCoder) 1
ShardedKeyCoder (org.apache.beam.sdk.coders.ShardedKeyCoder) 1
VoidCoder (org.apache.beam.sdk.coders.VoidCoder) 1