use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.
the class BatchLoads method writeDynamicallyShardedFilesUntriggered.
// Writes the input to dynamically-sharded, per-bundle files without any triggering. Records are
// spilled to new files when memory is constrained. The returned PCollection carries, for every
// file written, its filename, its byte size, and its table destination.
PCollection<WriteBundlesToFiles.Result<DestinationT>> writeDynamicallyShardedFilesUntriggered(
    PCollection<KV<DestinationT, ElementT>> input, PCollectionView<String> tempFilePrefix) {
  // Main output: one result per file written directly by WriteBundlesToFiles.
  TupleTag<WriteBundlesToFiles.Result<DestinationT>> filesTag =
      new TupleTag<WriteBundlesToFiles.Result<DestinationT>>("writtenFiles") {};
  // Additional output: records that WriteBundlesToFiles did not write inline.
  TupleTag<KV<ShardedKey<DestinationT>, ElementT>> spilledRecordsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, ElementT>>("unwrittenRecords") {};

  PCollectionTuple bundleOutputs =
      input.apply(
          "WriteBundlesToFiles",
          ParDo.of(
                  new WriteBundlesToFiles<>(
                      tempFilePrefix,
                      spilledRecordsTag,
                      maxNumWritersPerBundle,
                      maxFileSize,
                      rowWriterFactory))
              .withSideInputs(tempFilePrefix)
              .withOutputTags(filesTag, TupleTagList.of(spilledRecordsTag)));

  PCollection<WriteBundlesToFiles.Result<DestinationT>> directlyWritten =
      bundleOutputs.get(filesTag).setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
  PCollection<KV<ShardedKey<DestinationT>, ElementT>> spilledRecords =
      bundleOutputs
          .get(spilledRecordsTag)
          .setCoder(KvCoder.of(ShardedKeyCoder.of(destinationCoder), elementCoder));

  // When a bundle has too many output tables to write inline (because of memory limits), the
  // records that were not written arrive on the spilled-records output. They are grouped by key
  // and written after grouping; since records are grouped by key, only one file needs to be
  // open at a time in each bundle.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenAfterGrouping =
      writeShardedRecords(spilledRecords, tempFilePrefix);

  // Merge both sources of file results into the single PCollection that is returned.
  return PCollectionList.of(directlyWritten)
      .and(writtenAfterGrouping)
      .apply("FlattenFiles", Flatten.pCollections())
      .setCoder(WriteBundlesToFiles.ResultCoder.of(destinationCoder));
}
use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.
the class BigQueryIOWriteTest method testWritePartition.
// Feeds synthetic file results for numTables tables (numFilesPerTable files each, every file of
// size fileSize) through WritePartition using a DoFnTester, then checks that the emitted sharded
// keys and the filenames grouped per table match what is expected.
private void testWritePartition(
    long numTables, long numFilesPerTable, long fileSize, long expectedNumPartitionsPerTable)
    throws Exception {
  p.enableAbandonedNodeEnforcement(false);

  // When a static destination is specified (i.e. not through a dynamic table function) and
  // there is no input data, WritePartition will generate an empty table. This flag selects the
  // code that tests that path.
  boolean singletonCase = numTables == 1 && numFilesPerTable == 0;
  DynamicDestinations<String, TableDestination> constantDestinations =
      new DynamicDestinationsHelpers.ConstantTableDestinations<>(
          ValueProvider.StaticValueProvider.of("SINGLETON"), "", false);

  // Expected output keys: a single shard for the singleton table, otherwise shards
  // 1..expectedNumPartitionsPerTable for every table.
  List<ShardedKey<TableDestination>> expectedKeys = Lists.newArrayList();
  if (!singletonCase) {
    for (int tableIdx = 0; tableIdx < numTables; ++tableIdx) {
      String spec = String.format("project-id:dataset-id.tables%05d", tableIdx);
      for (int shard = 1; shard <= expectedNumPartitionsPerTable; ++shard) {
        expectedKeys.add(ShardedKey.of(new TableDestination(spec, ""), shard));
      }
    }
  } else {
    expectedKeys.add(ShardedKey.of(new TableDestination("SINGLETON", ""), 1));
  }

  // Input to the DoFn, plus the per-table filename lists we expect to get back.
  List<WriteBundlesToFiles.Result<TableDestination>> inputFiles = Lists.newArrayList();
  Map<String, List<String>> expectedFilesByTable = Maps.newHashMap();
  for (int tableIdx = 0; tableIdx < numTables; ++tableIdx) {
    String spec = String.format("project-id:dataset-id.tables%05d", tableIdx);
    List<String> namesForTable =
        expectedFilesByTable.computeIfAbsent(spec, unused -> Lists.newArrayList());
    for (int fileIdx = 0; fileIdx < numFilesPerTable; ++fileIdx) {
      String name = String.format("%s_files%05d", spec, fileIdx);
      namesForTable.add(name);
      inputFiles.add(
          new WriteBundlesToFiles.Result<>(name, fileSize, new TableDestination(spec, "")));
    }
  }

  TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>> multiPartitionsTag =
      new TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>>(
          "multiPartitionsTag") {};
  TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>> singlePartitionTag =
      new TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>>(
          "singlePartitionTag") {};

  String tempFilePrefix = testFolder.newFolder("BigQueryIOTest").getAbsolutePath();
  PCollectionView<String> tempFilePrefixView =
      p.apply(Create.of(tempFilePrefix)).apply(View.asSingleton());

  WritePartition<TableDestination> partitionFn =
      new WritePartition<>(
          singletonCase,
          constantDestinations,
          tempFilePrefixView,
          BatchLoads.DEFAULT_MAX_FILES_PER_PARTITION,
          BatchLoads.DEFAULT_MAX_BYTES_PER_PARTITION,
          multiPartitionsTag,
          singlePartitionTag,
          RowWriterFactory.tableRows(
              SerializableFunctions.identity(), SerializableFunctions.identity()));

  DoFnTester<
          Iterable<WriteBundlesToFiles.Result<TableDestination>>,
          KV<ShardedKey<TableDestination>, WritePartition.Result>>
      fnTester = DoFnTester.of(partitionFn);
  fnTester.setSideInput(tempFilePrefixView, GlobalWindow.INSTANCE, tempFilePrefix);
  fnTester.processElement(inputFiles);

  // The output is read from the multi-partition tag only when more than one partition per
  // table is expected; otherwise it is read from the single-partition tag.
  TupleTag<KV<ShardedKey<TableDestination>, WritePartition.Result>> outputTag =
      expectedNumPartitionsPerTable > 1 ? multiPartitionsTag : singlePartitionTag;
  List<KV<ShardedKey<TableDestination>, WritePartition.Result>> emitted =
      fnTester.takeOutputElements(outputTag);

  // Collect the emitted keys and regroup the emitted filenames by table spec.
  List<ShardedKey<TableDestination>> actualKeys = Lists.newArrayList();
  Map<String, List<String>> actualFilesByTable = Maps.newHashMap();
  for (KV<ShardedKey<TableDestination>, WritePartition.Result> entry : emitted) {
    ShardedKey<TableDestination> shardedKey = entry.getKey();
    String tableSpec = shardedKey.getKey().getTableSpec();
    actualKeys.add(shardedKey);
    actualFilesByTable
        .computeIfAbsent(tableSpec, unused -> Lists.newArrayList())
        .addAll(entry.getValue().getFilenames());
  }

  assertThat(actualKeys, containsInAnyOrder(Iterables.toArray(expectedKeys, ShardedKey.class)));

  if (!singletonCase) {
    assertEquals(expectedFilesByTable, actualFilesByTable);
  } else {
    // Exactly one table is expected, and its first file must exist on disk and be empty.
    assertEquals(1, actualFilesByTable.size());
    List<String> onlyTableFiles = actualFilesByTable.values().iterator().next();
    String firstFile = onlyTableFiles.get(0);
    assertTrue(Files.exists(Paths.get(firstFile)));
    assertThat(Files.readAllBytes(Paths.get(firstFile)).length, equalTo(0));
  }

  // Remove any file named in the output that exists on disk.
  for (List<String> tableFiles : actualFilesByTable.values()) {
    for (String path : tableFiles) {
      Files.deleteIfExists(Paths.get(path));
    }
  }
}
use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.
the class BatchLoads method expandUntriggered.
// Expands the pipeline for the case where the user has not requested periodically-triggered
// file writes: write files, partition them per table, load single partitions directly, load
// multi-partition tables through temp tables followed by a rename step, and return the
// flattened set of successful writes wrapped in a WriteResult.
public WriteResult expandUntriggered(PCollection<KV<DestinationT, ElementT>> input) {
  Pipeline pipeline = input.getPipeline();

  // One job-id prefix view per job type, plus the temp-file prefix derived from the load prefix.
  PCollectionView<String> loadJobIdPrefixView = createJobIdPrefixView(pipeline, JobType.LOAD);
  PCollectionView<String> tempLoadJobIdPrefixView =
      createJobIdPrefixView(pipeline, JobType.TEMP_TABLE_LOAD);
  PCollectionView<String> copyJobIdPrefixView = createJobIdPrefixView(pipeline, JobType.COPY);
  PCollectionView<String> tempFilePrefixView =
      createTempFilePrefixView(pipeline, loadJobIdPrefixView);

  PCollection<KV<DestinationT, ElementT>> globallyWindowed =
      input.apply(
          "rewindowIntoGlobal",
          Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
              .triggering(DefaultTrigger.of())
              .discardingFiredPanes());

  // A shard count of zero selects dynamic sharding; any other value selects static sharding.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles;
  if (numFileShards == 0) {
    writtenFiles = writeDynamicallyShardedFilesUntriggered(globallyWindowed, tempFilePrefixView);
  } else {
    writtenFiles = writeStaticallyShardedFiles(globallyWindowed, tempFilePrefixView);
  }

  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag =
      new TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>>(
          "multiPartitionsTag") {};
  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag =
      new TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>>(
          "singlePartitionTag") {};

  // This transform will look at the set of files written for each table, and if any table has
  // too many files or bytes, will partition that table's files into multiple partitions for
  // loading.
  PCollectionTuple partitioned =
      writtenFiles
          .apply("ReifyResults", new ReifyAsIterable<>())
          .setCoder(IterableCoder.of(WriteBundlesToFiles.ResultCoder.of(destinationCoder)))
          .apply(
              "WritePartitionUntriggered",
              ParDo.of(
                      new WritePartition<>(
                          singletonTable,
                          dynamicDestinations,
                          tempFilePrefixView,
                          maxFilesPerPartition,
                          maxBytesPerPartition,
                          multiPartitionsTag,
                          singlePartitionTag,
                          rowWriterFactory))
                  .withSideInputs(tempFilePrefixView)
                  .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));

  // Single-partition output goes straight to writeSinglePartition.
  PCollection<TableDestination> singlePartitionWrites =
      writeSinglePartition(partitioned.get(singlePartitionTag), loadJobIdPrefixView);

  // Multi-partition output is first written to temp tables, then handed to WriteRename.
  PCollection<KV<TableDestination, WriteTables.Result>> tempTables =
      writeTempTables(partitioned.get(multiPartitionsTag), tempLoadJobIdPrefixView);
  PCollection<TableDestination> multiPartitionWrites =
      tempTables
          .apply("ReifyRenameInput", new ReifyAsIterable<>())
          .apply(
              "WriteRenameUntriggered",
              ParDo.of(
                      new WriteRename(
                          bigQueryServices,
                          copyJobIdPrefixView,
                          writeDisposition,
                          createDisposition,
                          maxRetryJobs,
                          kmsKey,
                          loadJobProjectId))
                  .withSideInputs(copyJobIdPrefixView));

  PCollection<TableDestination> everySuccessfulWrite =
      PCollectionList.of(singlePartitionWrites)
          .and(multiPartitionWrites)
          .apply(Flatten.pCollections());
  return writeResult(pipeline, everySuccessfulWrite);
}
use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.
the class BatchLoads method expandTriggered.
// Expands the pipeline for the case where the user has requested periodically-triggered file
// writes: write files, apply the user's trigger, partition the files per destination, load
// single partitions directly, load multi-partition tables through temp tables followed by a
// rename step, and return the flattened set of successful writes wrapped in a WriteResult.
private WriteResult expandTriggered(PCollection<KV<DestinationT, ElementT>> input) {
  Pipeline pipeline = input.getPipeline();

  // One job-id prefix view per job type, plus the temp-file prefix derived from the load prefix.
  PCollectionView<String> loadJobIdPrefixView = createJobIdPrefixView(pipeline, JobType.LOAD);
  PCollectionView<String> tempLoadJobIdPrefixView =
      createJobIdPrefixView(pipeline, JobType.TEMP_TABLE_LOAD);
  PCollectionView<String> copyJobIdPrefixView = createJobIdPrefixView(pipeline, JobType.COPY);
  PCollectionView<String> tempFilePrefixView =
      createTempFilePrefixView(pipeline, loadJobIdPrefixView);

  PCollection<WriteBundlesToFiles.Result<DestinationT>> writtenFiles;
  if (numFileShards > 0) {
    // The user-supplied triggeringFrequency is often chosen to control how many BigQuery load
    // jobs are generated, to prevent going over BigQuery's daily quota for load jobs. With a
    // large value, all the data would have to be buffered until the trigger fires. Instead,
    // files are also written once a threshold number of records is ready, and the
    // user-supplied trigger alone is used on the actual BigQuery load. This allows the data to
    // be offloaded to the filesystem.
    Window<KV<DestinationT, ElementT>> timeOrCountWindow =
        Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
            .triggering(
                Repeatedly.forever(
                    AfterFirst.of(
                        AfterProcessingTime.pastFirstElementInPane()
                            .plusDelayOf(triggeringFrequency),
                        AfterPane.elementCountAtLeast(FILE_TRIGGERING_RECORD_COUNT))))
            .discardingFiredPanes();
    writtenFiles =
        writeStaticallyShardedFiles(
            input.apply("rewindowIntoGlobal", timeOrCountWindow), tempFilePrefixView);
  } else {
    // With dynamic sharding a default trigger is used instead, because the transform that
    // performs the sharding also batches elements to avoid generating too many tiny files.
    // The user trigger is applied right after the writes to limit the number of load jobs.
    Window<KV<DestinationT, ElementT>> defaultTriggerWindow =
        Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
            .triggering(DefaultTrigger.of())
            .discardingFiredPanes();
    writtenFiles =
        writeDynamicallyShardedFilesTriggered(
            input.apply("rewindowIntoGlobal", defaultTriggerWindow), tempFilePrefixView);
  }

  // Apply the user's trigger before we start generating BigQuery load jobs.
  PCollection<WriteBundlesToFiles.Result<DestinationT>> userTriggeredFiles =
      writtenFiles.apply(
          "applyUserTrigger",
          Window.<WriteBundlesToFiles.Result<DestinationT>>into(new GlobalWindows())
              .triggering(
                  Repeatedly.forever(
                      AfterProcessingTime.pastFirstElementInPane()
                          .plusDelayOf(triggeringFrequency)))
              .discardingFiredPanes());

  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag =
      new TupleTag<>("multiPartitionsTag");
  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag =
      new TupleTag<>("singlePartitionTag");

  // With non-default triggered output, the side-input technique used in expandUntriggered
  // can't be used. Instead the result list is made a main input, with a GroupByKey applied
  // first for determinism.
  PCollectionTuple partitioned =
      userTriggeredFiles
          .apply("AttachDestinationKey", WithKeys.of(result -> result.destination))
          .setCoder(
              KvCoder.of(destinationCoder, WriteBundlesToFiles.ResultCoder.of(destinationCoder)))
          .apply("GroupFilesByDestination", GroupByKey.create())
          .apply("ExtractResultValues", Values.create())
          .apply(
              "WritePartitionTriggered",
              ParDo.of(
                      new WritePartition<>(
                          singletonTable,
                          dynamicDestinations,
                          tempFilePrefixView,
                          maxFilesPerPartition,
                          maxBytesPerPartition,
                          multiPartitionsTag,
                          singlePartitionTag,
                          rowWriterFactory))
                  .withSideInputs(tempFilePrefixView)
                  .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));

  // Multi-partition output: load into temp tables, gather the results under a single void key
  // in the global window (firing on every element), then hand the grouped values to WriteRename.
  PCollection<KV<TableDestination, WriteTables.Result>> tempTables =
      writeTempTables(partitioned.get(multiPartitionsTag), tempLoadJobIdPrefixView);
  PCollection<TableDestination> multiPartitionWrites =
      tempTables
          .apply(
              "Window Into Global Windows",
              Window.<KV<TableDestination, WriteTables.Result>>into(new GlobalWindows())
                  .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))))
          .apply("Add Void Key", WithKeys.of((Void) null))
          .setCoder(KvCoder.of(VoidCoder.of(), tempTables.getCoder()))
          .apply("GroupByKey", GroupByKey.create())
          .apply("Extract Values", Values.create())
          .apply(
              "WriteRenameTriggered",
              ParDo.of(
                      new WriteRename(
                          bigQueryServices,
                          copyJobIdPrefixView,
                          writeDisposition,
                          createDisposition,
                          maxRetryJobs,
                          kmsKey,
                          loadJobProjectId))
                  .withSideInputs(copyJobIdPrefixView));

  // Single-partition output: load directly, then rewindow the results with the same
  // fire-on-every-element trigger used on the multi-partition path.
  PCollection<TableDestination> singlePartitionWrites =
      writeSinglePartition(partitioned.get(singlePartitionTag), loadJobIdPrefixView)
          .apply(
              "RewindowSinglePartitionResults",
              Window.<TableDestination>into(new GlobalWindows())
                  .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))));

  PCollection<TableDestination> everySuccessfulWrite =
      PCollectionList.of(multiPartitionWrites)
          .and(singlePartitionWrites)
          .apply(Flatten.pCollections());
  return writeResult(pipeline, everySuccessfulWrite);
}
use of org.apache.beam.sdk.values.ShardedKey in project beam by apache.
the class StreamingWriteTables method writeAndGetErrors.
// Tags every input element with a unique id and applies a BatchedStreamingWrite, returning the
// PCollectionTuple that the write produces. Two paths exist: one keyed by plain table spec
// (used when autoSharding is on and no deterministic record-id function is set) and one keyed
// by a sharded table spec.
private <T> PCollectionTuple writeAndGetErrors(
    PCollection<KV<TableDestination, ElementT>> input,
    TupleTag<T> failedInsertsTag,
    AtomicCoder<T> coder,
    ErrorContainer<T> errorContainer) {
  BigQueryOptions options = input.getPipeline().getOptions().as(BigQueryOptions.class);

  if (autoSharding && deterministicRecordIdFn == null) {
    // If runner determined dynamic sharding is enabled, group TableRows on table destinations
    // that may be sharded during the runtime. Otherwise, a fixed number of shards per table
    // destination is chosen, following the logic in the other branch.
    PCollection<KV<String, TableRowInfo<ElementT>>> taggedByTable =
        input
            .apply(
                "MapToTableSpec",
                MapElements.via(
                    new SimpleFunction<KV<TableDestination, ElementT>, KV<String, ElementT>>() {
                      @Override
                      public KV<String, ElementT> apply(KV<TableDestination, ElementT> element) {
                        // Replace the TableDestination key with its table spec string.
                        return KV.of(element.getKey().getTableSpec(), element.getValue());
                      }
                    }))
            .setCoder(KvCoder.of(StringUtf8Coder.of(), elementCoder))
            .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>()))
            .setCoder(KvCoder.of(StringUtf8Coder.of(), TableRowInfoCoder.of(elementCoder)));

    // NOTE(review): the original comment here lost its first line; what remains says this step
    // at the same time batches the TableRows to be inserted to BigQuery.
    return taggedByTable.apply(
        "StreamingWrite",
        new BatchedStreamingWrite<>(
                bigQueryServices,
                retryPolicy,
                failedInsertsTag,
                coder,
                errorContainer,
                skipInvalidRows,
                ignoreUnknownValues,
                ignoreInsertIds,
                toTableRow,
                toFailsafeTableRow)
            .viaStateful());
  } else {
    // A fixed number of keys is created per BigQuery table to generate output on; the count is
    // read from BigQueryOptions#getNumStreamingKeys. It should be few enough to get good
    // batching into BigQuery's insert calls, and enough to max out the streaming insert quota.
    int keysPerTable = options.getNumStreamingKeys();
    PCollection<KV<ShardedKey<String>, TableRowInfo<ElementT>>> taggedByShard =
        input
            .apply("ShardTableWrites", ParDo.of(new GenerateShardedTable<>(keysPerTable)))
            .setCoder(KvCoder.of(ShardedKeyCoder.of(StringUtf8Coder.of()), elementCoder))
            .apply("TagWithUniqueIds", ParDo.of(new TagWithUniqueIds<>(deterministicRecordIdFn)))
            .setCoder(
                KvCoder.of(
                    ShardedKeyCoder.of(StringUtf8Coder.of()), TableRowInfoCoder.of(elementCoder)));

    if (deterministicRecordIdFn == null) {
      // If not using a deterministic function for record ids, we must apply a reshuffle to
      // ensure determinism on the generated ids.
      taggedByShard = taggedByShard.apply(Reshuffle.of());
    }

    return taggedByShard
        .apply(
            "GlobalWindow",
            Window.<KV<ShardedKey<String>, TableRowInfo<ElementT>>>into(new GlobalWindows())
                .triggering(DefaultTrigger.of())
                .discardingFiredPanes())
        .apply(
            "StripShardId",
            MapElements.via(
                new SimpleFunction<
                    KV<ShardedKey<String>, TableRowInfo<ElementT>>,
                    KV<String, TableRowInfo<ElementT>>>() {
                  @Override
                  public KV<String, TableRowInfo<ElementT>> apply(
                      KV<ShardedKey<String>, TableRowInfo<ElementT>> element) {
                    // Drop the shard wrapper and keep only the underlying table spec key.
                    return KV.of(element.getKey().getKey(), element.getValue());
                  }
                }))
        .setCoder(KvCoder.of(StringUtf8Coder.of(), TableRowInfoCoder.of(elementCoder)))
        .apply(
            "StreamingWrite",
            new BatchedStreamingWrite<>(
                    bigQueryServices,
                    retryPolicy,
                    failedInsertsTag,
                    coder,
                    errorContainer,
                    skipInvalidRows,
                    ignoreUnknownValues,
                    ignoreInsertIds,
                    toTableRow,
                    toFailsafeTableRow)
                .viaDoFnFinalization());
  }
}
Aggregations