Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache: the class CalculateSchemas, method expand.
@Override
public PCollectionView<Map<DestinationT, String>> expand(
    PCollection<KV<DestinationT, TableRow>> input) {
  List<PCollectionView<?>> sideInputs = Lists.newArrayList();
  sideInputs.addAll(dynamicDestinations.getSideInputs());
  return input
      .apply("Keys", Keys.<DestinationT>create())
      .apply("Distinct Keys", Distinct.<DestinationT>create())
      .apply("GetSchemas",
          ParDo.of(new DoFn<DestinationT, KV<DestinationT, String>>() {
            @ProcessElement
            public void processElement(ProcessContext c) throws Exception {
              dynamicDestinations.setSideInputAccessorFromProcessContext(c);
              TableSchema tableSchema = dynamicDestinations.getSchema(c.element());
              if (tableSchema != null) {
                // If the createDisposition is CREATE_NEVER, then there's no need for a
                // schema, and getSchema might return null. In this case, we simply
                // leave it out of the map.
                c.output(KV.of(c.element(), BigQueryHelpers.toJsonString(tableSchema)));
              }
            }
          }).withSideInputs(sideInputs))
      .apply("asMap", View.<DestinationT, String>asMap());
}
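For context, the Map-valued view returned by expand above is meant to be consumed downstream through ParDo side inputs. The following standalone sketch is not taken from the Beam codebase; the class name SchemaSideInputExample, the table names, and the placeholder schema strings are illustrative only. It shows the usual pattern: register the view with withSideInputs and read it per element via ProcessContext.sideInput.

import java.util.Map;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class SchemaSideInputExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    // Hypothetical map of destination -> JSON schema, materialized as a map-valued side input.
    PCollectionView<Map<String, String>> schemasView =
        p.apply("CreateSchemas", Create.of(KV.of("tableA", "{...}"), KV.of("tableB", "{...}")))
            .apply("SchemasAsMap", View.<String, String>asMap());
    PCollection<String> destinations =
        p.apply("CreateDestinations", Create.of("tableA", "tableB"));
    destinations.apply("LookupSchema",
        ParDo.of(new DoFn<String, KV<String, String>>() {
          @ProcessElement
          public void processElement(ProcessContext c) {
            // The side input is read per element, per window, from the materialized map.
            Map<String, String> schemas = c.sideInput(schemasView);
            c.output(KV.of(c.element(), schemas.get(c.element())));
          }
        }).withSideInputs(schemasView));
    p.run().waitUntilFinish();
  }
}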
Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache: the class FnApiStateAccessor, method get.
@Override
@Nullable
public <T> T get(PCollectionView<T> view, BoundedWindow window) {
  TupleTag<?> tag = view.getTagInternal();
  SideInputSpec sideInputSpec = sideInputSpecMap.get(tag);
  checkArgument(sideInputSpec != null, "Attempting to access unknown side input %s.", view);
  ByteString.Output encodedWindowOut = ByteString.newOutput();
  try {
    sideInputSpec.getWindowCoder()
        .encode(sideInputSpec.getWindowMappingFn().getSideInputWindow(window), encodedWindowOut);
  } catch (IOException e) {
    throw new IllegalStateException(e);
  }
  ByteString encodedWindow = encodedWindowOut.toByteString();
  // Build the state key that identifies this side input: transform id, side input id, and the
  // encoded target window.
  StateKey.Builder cacheKeyBuilder = StateKey.newBuilder();
  switch (sideInputSpec.getAccessPattern()) {
    case Materializations.ITERABLE_MATERIALIZATION_URN:
      cacheKeyBuilder.getIterableSideInputBuilder()
          .setTransformId(ptransformId).setSideInputId(tag.getId()).setWindow(encodedWindow);
      break;
    case Materializations.MULTIMAP_MATERIALIZATION_URN:
      checkState(sideInputSpec.getCoder() instanceof KvCoder,
          "Expected %s but received %s.", KvCoder.class, sideInputSpec.getCoder().getClass());
      cacheKeyBuilder.getMultimapKeysSideInputBuilder()
          .setTransformId(ptransformId).setSideInputId(tag.getId()).setWindow(encodedWindow);
      break;
    default:
      throw new IllegalStateException(String.format(
          "This SDK is only capable of dealing with %s materializations "
              + "but was asked to handle %s for PCollectionView with tag %s.",
          ImmutableList.of(
              Materializations.ITERABLE_MATERIALIZATION_URN,
              Materializations.MULTIMAP_MATERIALIZATION_URN),
          sideInputSpec.getAccessPattern(), tag));
  }
  // Materialize (and cache) the side input by decoding it through the view's ViewFn.
  return (T) stateKeyObjectCache.computeIfAbsent(cacheKeyBuilder.build(), key -> {
    switch (sideInputSpec.getAccessPattern()) {
      case Materializations.ITERABLE_MATERIALIZATION_URN:
        return sideInputSpec.getViewFn().apply(
            new IterableSideInput<>(getCacheFor(key), beamFnStateClient,
                processBundleInstructionId.get(), key, sideInputSpec.getCoder()));
      case Materializations.MULTIMAP_MATERIALIZATION_URN:
        return sideInputSpec.getViewFn().apply(
            new MultimapSideInput<>(getCacheFor(key), beamFnStateClient,
                processBundleInstructionId.get(), key,
                ((KvCoder) sideInputSpec.getCoder()).getKeyCoder(),
                ((KvCoder) sideInputSpec.getCoder()).getValueCoder()));
      default:
        throw new IllegalStateException(String.format(
            "This SDK is only capable of dealing with %s materializations "
                + "but was asked to handle %s for PCollectionView with tag %s.",
            ImmutableList.of(
                Materializations.ITERABLE_MATERIALIZATION_URN,
                Materializations.MULTIMAP_MATERIALIZATION_URN),
            sideInputSpec.getAccessPattern(), tag));
    }
  });
}
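The two access patterns handled above correspond to how a view's contents are materialized for the SDK harness. As a rough illustration, here is a standalone sketch, not Beam runner code; the mapping of built-in views to materializations is stated as an assumption about recent SDK versions. Keyed views are the ones that require a KvCoder, which is what the checkState above enforces.

import java.util.List;
import java.util.Map;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class ViewMaterializationExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<KV<String, Integer>> kvs =
        p.apply(Create.of(KV.of("a", 1), KV.of("b", 2), KV.of("b", 3)));
    // A list-style view; assumed to be served as an "iterable" side input in recent SDKs.
    PCollectionView<List<KV<String, Integer>>> asList = kvs.apply("AsList", View.asList());
    // A keyed view; assumed to be served as a "multimap" side input in recent SDKs, which is
    // why its elements must be encoded with a KvCoder.
    PCollectionView<Map<String, Iterable<Integer>>> asMultimap =
        kvs.apply("AsMultimap", View.asMultimap());
    p.run().waitUntilFinish();
  }
}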
Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache: the class BatchLoads, method expandTriggered.
// Expand the pipeline when the user has requested periodically-triggered file writes.
private WriteResult expandTriggered(PCollection<KV<DestinationT, ElementT>> input) {
  Pipeline p = input.getPipeline();
  final PCollectionView<String> loadJobIdPrefixView = createJobIdPrefixView(p, JobType.LOAD);
  final PCollectionView<String> tempLoadJobIdPrefixView = createJobIdPrefixView(p, JobType.TEMP_TABLE_LOAD);
  final PCollectionView<String> copyJobIdPrefixView = createJobIdPrefixView(p, JobType.COPY);
  final PCollectionView<String> tempFilePrefixView = createTempFilePrefixView(p, loadJobIdPrefixView);
  PCollection<WriteBundlesToFiles.Result<DestinationT>> results;
  if (numFileShards > 0) {
    // The user-supplied triggeringFrequency is often chosen to control how many BigQuery load
    // jobs are generated, to prevent going over BigQuery's daily quota for load jobs. If this
    // is set to a large value, currently we have to buffer all the data until the trigger fires.
    // Instead we ensure that the files are written if a threshold number of records are ready.
    // We use only the user-supplied trigger on the actual BigQuery load. This allows us to
    // offload the data to the filesystem.
    PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow =
        input.apply("rewindowIntoGlobal",
            Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
                .triggering(Repeatedly.forever(AfterFirst.of(
                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(triggeringFrequency),
                    AfterPane.elementCountAtLeast(FILE_TRIGGERING_RECORD_COUNT))))
                .discardingFiredPanes());
    results = writeStaticallyShardedFiles(inputInGlobalWindow, tempFilePrefixView);
  } else {
    // In the case of dynamic sharding, however, we use a default trigger, since the transform
    // that performs the sharding also batches elements to avoid generating too many tiny files.
    // The user trigger is applied right after the writes to limit the number of load jobs.
    PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow =
        input.apply("rewindowIntoGlobal",
            Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
                .triggering(DefaultTrigger.of())
                .discardingFiredPanes());
    results = writeDynamicallyShardedFilesTriggered(inputInGlobalWindow, tempFilePrefixView);
  }
  // Apply the user's trigger before we start generating BigQuery load jobs.
  results = results.apply("applyUserTrigger",
      Window.<WriteBundlesToFiles.Result<DestinationT>>into(new GlobalWindows())
          .triggering(Repeatedly.forever(
              AfterProcessingTime.pastFirstElementInPane().plusDelayOf(triggeringFrequency)))
          .discardingFiredPanes());
  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag = new TupleTag<>("multiPartitionsTag");
  TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag = new TupleTag<>("singlePartitionTag");
  // If we have non-default triggered output, we can't use the side-input technique used in
  // expandUntriggered. Instead make the result list a main input. Apply a GroupByKey first for
  // determinism.
  PCollectionTuple partitions = results
      .apply("AttachDestinationKey", WithKeys.of(result -> result.destination))
      .setCoder(KvCoder.of(destinationCoder, WriteBundlesToFiles.ResultCoder.of(destinationCoder)))
      .apply("GroupFilesByDestination", GroupByKey.create())
      .apply("ExtractResultValues", Values.create())
      .apply("WritePartitionTriggered",
          ParDo.of(new WritePartition<>(singletonTable, dynamicDestinations, tempFilePrefixView,
                  maxFilesPerPartition, maxBytesPerPartition, multiPartitionsTag,
                  singlePartitionTag, rowWriterFactory))
              .withSideInputs(tempFilePrefixView)
              .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
  PCollection<KV<TableDestination, WriteTables.Result>> tempTables =
      writeTempTables(partitions.get(multiPartitionsTag), tempLoadJobIdPrefixView);
  PCollection<TableDestination> successfulMultiPartitionWrites = tempTables
      .apply("Window Into Global Windows",
          Window.<KV<TableDestination, WriteTables.Result>>into(new GlobalWindows())
              .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))))
      .apply("Add Void Key", WithKeys.of((Void) null))
      .setCoder(KvCoder.of(VoidCoder.of(), tempTables.getCoder()))
      .apply("GroupByKey", GroupByKey.create())
      .apply("Extract Values", Values.create())
      .apply("WriteRenameTriggered",
          ParDo.of(new WriteRename(bigQueryServices, copyJobIdPrefixView, writeDisposition,
                  createDisposition, maxRetryJobs, kmsKey, loadJobProjectId))
              .withSideInputs(copyJobIdPrefixView));
  PCollection<TableDestination> successfulSinglePartitionWrites =
      writeSinglePartition(partitions.get(singlePartitionTag), loadJobIdPrefixView)
          .apply("RewindowSinglePartitionResults",
              Window.<TableDestination>into(new GlobalWindows())
                  .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))));
  PCollectionList<TableDestination> allSuccessfulWrites =
      PCollectionList.of(successfulMultiPartitionWrites).and(successfulSinglePartitionWrites);
  return writeResult(p, allSuccessfulWrites.apply(Flatten.pCollections()));
}
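The re-windowing trick used above — move everything into the global window and let a composite trigger decide when output is cut — can be reproduced in isolation. Below is a minimal sketch; the class name RewindowIntoGlobalExample, the five-minute delay, and the 1000-element count are illustrative and are not values taken from BatchLoads.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.windowing.AfterFirst;
import org.apache.beam.sdk.transforms.windowing.AfterPane;
import org.apache.beam.sdk.transforms.windowing.AfterProcessingTime;
import org.apache.beam.sdk.transforms.windowing.GlobalWindows;
import org.apache.beam.sdk.transforms.windowing.Repeatedly;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.PCollection;
import org.joda.time.Duration;

public class RewindowIntoGlobalExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollection<String> input = p.apply(Create.of("a", "b", "c"));
    // Re-window into the global window and fire whenever either condition is met: a
    // processing-time delay has elapsed since the first element of the pane, or the pane has
    // accumulated at least 1000 elements. Fired panes are discarded, so each element is
    // emitted downstream only once.
    PCollection<String> triggered =
        input.apply("RewindowIntoGlobal",
            Window.<String>into(new GlobalWindows())
                .triggering(Repeatedly.forever(AfterFirst.of(
                    AfterProcessingTime.pastFirstElementInPane()
                        .plusDelayOf(Duration.standardMinutes(5)),
                    AfterPane.elementCountAtLeast(1000))))
                .discardingFiredPanes());
    p.run().waitUntilFinish();
  }
}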
Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache: the class BigQueryIOWriteTest, method testWriteWithDynamicTables.
@Test
public void testWriteWithDynamicTables() throws Exception {
  List<Integer> inserts = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    inserts.add(i);
  }
  // Create a windowing strategy that puts the input into five different windows depending on
  // record value.
  WindowFn<Integer, PartitionedGlobalWindow> windowFn =
      new PartitionedGlobalWindows<>(i -> Integer.toString(i % 5));
  final Map<Integer, TableDestination> targetTables = Maps.newHashMap();
  Map<String, String> schemas = Maps.newHashMap();
  for (int i = 0; i < 5; i++) {
    TableDestination destination =
        new TableDestination("project-id:dataset-id" + ".table-id-" + i, "");
    targetTables.put(i, destination);
    // Make sure each target table has its own custom schema.
    schemas.put(destination.getTableSpec(),
        toJsonString(new TableSchema().setFields(ImmutableList.of(
            new TableFieldSchema().setName("name").setType("STRING"),
            new TableFieldSchema().setName("number").setType("INTEGER"),
            new TableFieldSchema().setName("custom_" + i).setType("STRING")))));
  }
  SerializableFunction<ValueInSingleWindow<Integer>, TableDestination> tableFunction =
      input -> {
        PartitionedGlobalWindow window = (PartitionedGlobalWindow) input.getWindow();
        // Check that we can access the element as well here and that it matches the window.
        checkArgument(
            window.value.equals(Integer.toString(input.getValue() % 5)), "Incorrect element");
        return targetTables.get(input.getValue() % 5);
      };
  PCollection<Integer> input = p.apply("CreateSource", Create.of(inserts));
  if (useStreaming) {
    input = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
  }
  PCollectionView<Map<String, String>> schemasView =
      p.apply("CreateSchemaMap", Create.of(schemas)).apply("ViewSchemaAsMap", View.asMap());
  input
      .apply(Window.into(windowFn))
      .apply(BigQueryIO.<Integer>write()
          .to(tableFunction)
          .withFormatFunction(
              i -> new TableRow().set("name", "number" + i).set("number", Integer.toString(i)))
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withSchemaFromView(schemasView)
          .withTestServices(fakeBqServices)
          .withoutValidation());
  p.run();
  for (int i = 0; i < 5; ++i) {
    String tableId = String.format("table-id-%d", i);
    String tableSpec = String.format("project-id:dataset-id.%s", tableId);
    // Verify that the table was created with the correct schema.
    assertThat(
        toJsonString(
            fakeDatasetService
                .getTable(new TableReference()
                    .setProjectId("project-id").setDatasetId("dataset-id").setTableId(tableId))
                .getSchema()),
        equalTo(schemas.get(tableSpec)));
    // Verify that the table has the expected contents.
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", tableId),
        containsInAnyOrder(
            new TableRow().set("name", String.format("number%d", i))
                .set("number", Integer.toString(i)),
            new TableRow().set("name", String.format("number%d", i + 5))
                .set("number", Integer.toString(i + 5))));
  }
}
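The schema map wired in through withSchemaFromView above plays the same role as the getSchema/getSideInputs pair on a DynamicDestinations, which is what the CalculateSchemas snippet at the top of this page consumes. A hypothetical DynamicDestinations along those lines might look like the sketch below; the class name PerShardDestinations and the table naming are illustrative, and the sketch assumes BigQueryHelpers.fromJsonString is accessible for parsing the stored JSON schema (any JSON parser that can rebuild a TableSchema would do).

import java.util.Collections;
import java.util.List;
import java.util.Map;
import com.google.api.services.bigquery.model.TableSchema;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.ValueInSingleWindow;

// Hypothetical DynamicDestinations: routes each integer to one of five tables and resolves the
// table schema from a map-valued side input (table spec -> JSON schema), mirroring the schemas
// map built in the test above.
class PerShardDestinations extends DynamicDestinations<Integer, String> {
  private final PCollectionView<Map<String, String>> schemasView;

  PerShardDestinations(PCollectionView<Map<String, String>> schemasView) {
    this.schemasView = schemasView;
  }

  @Override
  public List<PCollectionView<?>> getSideInputs() {
    // Declaring the view here is what lets sideInput() work below; it is also what
    // CalculateSchemas picks up via dynamicDestinations.getSideInputs() at the top of this page.
    return Collections.<PCollectionView<?>>singletonList(schemasView);
  }

  @Override
  public String getDestination(ValueInSingleWindow<Integer> element) {
    return "project-id:dataset-id.table-id-" + (element.getValue() % 5);
  }

  @Override
  public TableDestination getTable(String tableSpec) {
    return new TableDestination(tableSpec, "");
  }

  @Override
  public TableSchema getSchema(String tableSpec) {
    String json = sideInput(schemasView).get(tableSpec);
    // Assumes BigQueryHelpers.fromJsonString is accessible here; returning null is acceptable
    // when no schema is needed (for example with CREATE_NEVER).
    return json == null ? null : BigQueryHelpers.fromJsonString(json, TableSchema.class);
  }
}

A write would then use BigQueryIO.<Integer>write().to(new PerShardDestinations(schemasView)) in place of the to(tableFunction)/withSchemaFromView pair used in the test.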
Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache: the class ParDoTranslation, method getSideInputs.
public static List<PCollectionView<?>> getSideInputs(AppliedPTransform<?, ?, ?> application)
    throws IOException {
  PTransform<?, ?> transform = application.getTransform();
  if (transform instanceof ParDo.MultiOutput) {
    // A ParDo that has not been converted to a proto exposes its side inputs directly.
    return ((ParDo.MultiOutput<?, ?>) transform)
        .getSideInputs().values().stream().collect(Collectors.toList());
  }
  // Otherwise, round-trip the transform through the Runner API payload and rehydrate each
  // declared side input as a PCollectionView.
  SdkComponents sdkComponents = SdkComponents.create(application.getPipeline().getOptions());
  RunnerApi.PTransform parDoProto = PTransformTranslation.toProto(application, sdkComponents);
  ParDoPayload payload = ParDoPayload.parseFrom(parDoProto.getSpec().getPayload());
  List<PCollectionView<?>> views = new ArrayList<>();
  RehydratedComponents components = RehydratedComponents.forComponents(sdkComponents.toComponents());
  for (Map.Entry<String, SideInput> sideInputEntry : payload.getSideInputsMap().entrySet()) {
    String sideInputTag = sideInputEntry.getKey();
    RunnerApi.SideInput sideInput = sideInputEntry.getValue();
    PCollection<?> originalPCollection =
        checkNotNull(
            (PCollection<?>) application.getInputs().get(new TupleTag<>(sideInputTag)),
            "no input with tag %s", sideInputTag);
    views.add(PCollectionViewTranslation.viewFromProto(
        sideInput, sideInputTag, originalPCollection, parDoProto, components));
  }
  return views;
}
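The instanceof ParDo.MultiOutput branch above covers the common case where the transform was built directly with withSideInputs and withOutputTags. A small standalone sketch of such a transform is shown below; the class name SideInputTranslationExample, the threshold value, and the tag names are illustrative. Applied to this transform, getSideInputs would return just the threshold view.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.PCollectionView;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class SideInputTranslationExample {
  public static void main(String[] args) {
    Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
    PCollectionView<Integer> threshold =
        p.apply("CreateThreshold", Create.of(5)).apply("AsSingleton", View.asSingleton());
    TupleTag<Integer> bigTag = new TupleTag<Integer>() {};
    TupleTag<Integer> smallTag = new TupleTag<Integer>() {};
    // ParDo.of(...).withSideInputs(...).withOutputTags(...) yields a ParDo.MultiOutput, which is
    // exactly what the translation code above reads side inputs from.
    PCollectionTuple split =
        p.apply("CreateValues", Create.of(1, 4, 7, 10))
            .apply("SplitByThreshold",
                ParDo.of(new DoFn<Integer, Integer>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) {
                    if (c.element() >= c.sideInput(threshold)) {
                      c.output(c.element());
                    } else {
                      c.output(smallTag, c.element());
                    }
                  }
                }).withSideInputs(threshold).withOutputTags(bigTag, TupleTagList.of(smallTag)));
    PCollection<Integer> big = split.get(bigTag);
    PCollection<Integer> small = split.get(smallTag);
    p.run().waitUntilFinish();
  }
}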