
Example 31 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class CalculateSchemas, method expand:

@Override
public PCollectionView<Map<DestinationT, String>> expand(PCollection<KV<DestinationT, TableRow>> input) {
    List<PCollectionView<?>> sideInputs = Lists.newArrayList();
    sideInputs.addAll(dynamicDestinations.getSideInputs());
    return input.apply("Keys", Keys.<DestinationT>create()).apply("Distinct Keys", Distinct.<DestinationT>create()).apply("GetSchemas", ParDo.of(new DoFn<DestinationT, KV<DestinationT, String>>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            dynamicDestinations.setSideInputAccessorFromProcessContext(c);
            TableSchema tableSchema = dynamicDestinations.getSchema(c.element());
            if (tableSchema != null) {
                // If the createDisposition is CREATE_NEVER, then there's no need for a
                // schema, and getSchema might return null. In this case, we simply
                // leave it out of the map.
                c.output(KV.of(c.element(), BigQueryHelpers.toJsonString(tableSchema)));
            }
        }
    }).withSideInputs(sideInputs)).apply("asMap", View.<DestinationT, String>asMap());
}
Also used : PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFn(org.apache.beam.sdk.transforms.DoFn) TableSchema(com.google.api.services.bigquery.model.TableSchema)
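
For orientation, here is a minimal sketch of how the map view returned by expand might be consumed downstream. The DoFn and its wiring are illustrative, not part of the Beam source; the view is read with ProcessContext.sideInput and must be registered on the consuming ParDo via withSideInputs.

// Hypothetical consumer of the PCollectionView<Map<DestinationT, String>> built above.
class FilterKnownSchemasFn<DestinationT> extends DoFn<KV<DestinationT, TableRow>, KV<DestinationT, TableRow>> {

    private final PCollectionView<Map<DestinationT, String>> schemaMapView;

    FilterKnownSchemasFn(PCollectionView<Map<DestinationT, String>> schemaMapView) {
        this.schemaMapView = schemaMapView;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        // Destinations written with CREATE_NEVER may be absent from the map.
        Map<DestinationT, String> schemas = c.sideInput(schemaMapView);
        if (schemas.containsKey(c.element().getKey())) {
            c.output(c.element());
        }
    }
}

// Wiring (illustrative): the view must be declared as a side input on the ParDo.
// rows.apply(ParDo.of(new FilterKnownSchemasFn<>(schemaMapView)).withSideInputs(schemaMapView));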

Example 32 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class FnApiStateAccessor, method get:

@Override
@Nullable
public <T> T get(PCollectionView<T> view, BoundedWindow window) {
    TupleTag<?> tag = view.getTagInternal();
    SideInputSpec sideInputSpec = sideInputSpecMap.get(tag);
    checkArgument(sideInputSpec != null, "Attempting to access unknown side input %s.", view);
    ByteString.Output encodedWindowOut = ByteString.newOutput();
    try {
        sideInputSpec.getWindowCoder().encode(sideInputSpec.getWindowMappingFn().getSideInputWindow(window), encodedWindowOut);
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
    ByteString encodedWindow = encodedWindowOut.toByteString();
    StateKey.Builder cacheKeyBuilder = StateKey.newBuilder();
    switch (sideInputSpec.getAccessPattern()) {
        case Materializations.ITERABLE_MATERIALIZATION_URN:
            cacheKeyBuilder
                .getIterableSideInputBuilder()
                .setTransformId(ptransformId)
                .setSideInputId(tag.getId())
                .setWindow(encodedWindow);
            break;
        case Materializations.MULTIMAP_MATERIALIZATION_URN:
            checkState(
                sideInputSpec.getCoder() instanceof KvCoder,
                "Expected %s but received %s.",
                KvCoder.class,
                sideInputSpec.getCoder().getClass());
            cacheKeyBuilder
                .getMultimapKeysSideInputBuilder()
                .setTransformId(ptransformId)
                .setSideInputId(tag.getId())
                .setWindow(encodedWindow);
            break;
        default:
            throw new IllegalStateException(
                String.format(
                    "This SDK is only capable of dealing with %s materializations "
                        + "but was asked to handle %s for PCollectionView with tag %s.",
                    ImmutableList.of(
                        Materializations.ITERABLE_MATERIALIZATION_URN,
                        Materializations.MULTIMAP_MATERIALIZATION_URN),
                    sideInputSpec.getAccessPattern(),
                    tag));
    }
    return (T) stateKeyObjectCache.computeIfAbsent(
        cacheKeyBuilder.build(),
        key -> {
            switch (sideInputSpec.getAccessPattern()) {
                case Materializations.ITERABLE_MATERIALIZATION_URN:
                    return sideInputSpec.getViewFn().apply(
                        new IterableSideInput<>(
                            getCacheFor(key),
                            beamFnStateClient,
                            processBundleInstructionId.get(),
                            key,
                            sideInputSpec.getCoder()));
                case Materializations.MULTIMAP_MATERIALIZATION_URN:
                    return sideInputSpec.getViewFn().apply(
                        new MultimapSideInput<>(
                            getCacheFor(key),
                            beamFnStateClient,
                            processBundleInstructionId.get(),
                            key,
                            ((KvCoder) sideInputSpec.getCoder()).getKeyCoder(),
                            ((KvCoder) sideInputSpec.getCoder()).getValueCoder()));
                default:
                    throw new IllegalStateException(
                        String.format(
                            "This SDK is only capable of dealing with %s materializations "
                                + "but was asked to handle %s for PCollectionView with tag %s.",
                            ImmutableList.of(
                                Materializations.ITERABLE_MATERIALIZATION_URN,
                                Materializations.MULTIMAP_MATERIALIZATION_URN),
                            sideInputSpec.getAccessPattern(),
                            tag));
            }
        });
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) CombineFn(org.apache.beam.sdk.transforms.Combine.CombineFn) CombineFnWithContext(org.apache.beam.sdk.transforms.CombineWithContext.CombineFnWithContext) SetState(org.apache.beam.sdk.state.SetState) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) Coder(org.apache.beam.sdk.coders.Coder) ValueState(org.apache.beam.sdk.state.ValueState) StateContext(org.apache.beam.sdk.state.StateContext) Function(java.util.function.Function) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) CacheToken(org.apache.beam.model.fnexecution.v1.BeamFnApi.ProcessBundleRequest.CacheToken) MapState(org.apache.beam.sdk.state.MapState) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) Cache(org.apache.beam.fn.harness.Cache) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Materializations(org.apache.beam.sdk.transforms.Materializations) CombineFnUtil(org.apache.beam.sdk.util.CombineFnUtil) Nullable(org.checkerframework.checker.nullness.qual.Nullable) SideInputReader(org.apache.beam.runners.core.SideInputReader) KvCoder(org.apache.beam.sdk.coders.KvCoder) OrderedListState(org.apache.beam.sdk.state.OrderedListState) Iterator(java.util.Iterator) Collection(java.util.Collection) ReadableStates(org.apache.beam.sdk.state.ReadableStates) IOException(java.io.IOException) BeamFnApi(org.apache.beam.model.fnexecution.v1.BeamFnApi) List(java.util.List) BagState(org.apache.beam.sdk.state.BagState) CombiningState(org.apache.beam.sdk.state.CombiningState) WatermarkHoldState(org.apache.beam.sdk.state.WatermarkHoldState) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Caches(org.apache.beam.fn.harness.Caches) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) StateBinder(org.apache.beam.sdk.state.StateBinder) StateKey(org.apache.beam.model.fnexecution.v1.BeamFnApi.StateKey) ThrowingRunnable(org.apache.beam.sdk.function.ThrowingRunnable) ReadableState(org.apache.beam.sdk.state.ReadableState)
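
For context, the two URNs handled above correspond, roughly, to how the view was built in user code: View.asIterable produces an iterable-materialized view, while View.asMap and View.asMultimap produce multimap-materialized views over KVs, which is why the multimap branch insists on a KvCoder. A minimal sketch under those assumptions (pipeline contents are illustrative):

Pipeline p = Pipeline.create();

// Iterable materialization (ITERABLE_MATERIALIZATION_URN).
PCollection<String> words = p.apply("Words", Create.of("a", "b", "b"));
PCollectionView<Iterable<String>> iterableView = words.apply("AsIterable", View.asIterable());

// Multimap materialization (MULTIMAP_MATERIALIZATION_URN); the element coder is a KvCoder.
PCollection<KV<String, Long>> counts = p.apply("Counts", Create.of(KV.of("a", 1L), KV.of("b", 2L)));
PCollectionView<Map<String, Long>> mapView = counts.apply("AsMap", View.asMap());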

Example 33 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class BatchLoads, method expandTriggered:

// Expand the pipeline when the user has requested periodically-triggered file writes.
private WriteResult expandTriggered(PCollection<KV<DestinationT, ElementT>> input) {
    Pipeline p = input.getPipeline();
    final PCollectionView<String> loadJobIdPrefixView = createJobIdPrefixView(p, JobType.LOAD);
    final PCollectionView<String> tempLoadJobIdPrefixView = createJobIdPrefixView(p, JobType.TEMP_TABLE_LOAD);
    final PCollectionView<String> copyJobIdPrefixView = createJobIdPrefixView(p, JobType.COPY);
    final PCollectionView<String> tempFilePrefixView = createTempFilePrefixView(p, loadJobIdPrefixView);
    PCollection<WriteBundlesToFiles.Result<DestinationT>> results;
    if (numFileShards > 0) {
        // The user-supplied triggeringFrequency is often chosen to control how many BigQuery load
        // jobs are generated, to prevent going over BigQuery's daily quota for load jobs. If this
        // is set to a large value, currently we have to buffer all the data until the trigger fires.
        // Instead we ensure that the files are written if a threshold number of records are ready.
        // We use only the user-supplied trigger on the actual BigQuery load. This allows us to
        // offload the data to the filesystem.
        PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow =
            input.apply(
                "rewindowIntoGlobal",
                Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
                    .triggering(
                        Repeatedly.forever(
                            AfterFirst.of(
                                AfterProcessingTime.pastFirstElementInPane()
                                    .plusDelayOf(triggeringFrequency),
                                AfterPane.elementCountAtLeast(FILE_TRIGGERING_RECORD_COUNT))))
                    .discardingFiredPanes());
        results = writeStaticallyShardedFiles(inputInGlobalWindow, tempFilePrefixView);
    } else {
        // In the dynamic-sharding case, however, we use a default trigger, since the transform
        // that performs sharding also batches elements to avoid generating too many tiny files.
        // The user's trigger is applied right after the writes to limit the number of load jobs.
        PCollection<KV<DestinationT, ElementT>> inputInGlobalWindow =
            input.apply(
                "rewindowIntoGlobal",
                Window.<KV<DestinationT, ElementT>>into(new GlobalWindows())
                    .triggering(DefaultTrigger.of())
                    .discardingFiredPanes());
        results = writeDynamicallyShardedFilesTriggered(inputInGlobalWindow, tempFilePrefixView);
    }
    // Apply the user's trigger before we start generating BigQuery load jobs.
    results = results.apply(
        "applyUserTrigger",
        Window.<WriteBundlesToFiles.Result<DestinationT>>into(new GlobalWindows())
            .triggering(
                Repeatedly.forever(
                    AfterProcessingTime.pastFirstElementInPane().plusDelayOf(triggeringFrequency)))
            .discardingFiredPanes());
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> multiPartitionsTag = new TupleTag<>("multiPartitionsTag");
    TupleTag<KV<ShardedKey<DestinationT>, WritePartition.Result>> singlePartitionTag = new TupleTag<>("singlePartitionTag");
    // If we have non-default triggered output, we can't use the side-input technique used in
    // expandUntriggered. Instead make the result list a main input. Apply a GroupByKey first for
    // determinism.
    PCollectionTuple partitions =
        results
            .apply("AttachDestinationKey", WithKeys.of(result -> result.destination))
            .setCoder(KvCoder.of(destinationCoder, WriteBundlesToFiles.ResultCoder.of(destinationCoder)))
            .apply("GroupFilesByDestination", GroupByKey.create())
            .apply("ExtractResultValues", Values.create())
            .apply(
                "WritePartitionTriggered",
                ParDo.of(
                        new WritePartition<>(
                            singletonTable,
                            dynamicDestinations,
                            tempFilePrefixView,
                            maxFilesPerPartition,
                            maxBytesPerPartition,
                            multiPartitionsTag,
                            singlePartitionTag,
                            rowWriterFactory))
                    .withSideInputs(tempFilePrefixView)
                    .withOutputTags(multiPartitionsTag, TupleTagList.of(singlePartitionTag)));
    PCollection<KV<TableDestination, WriteTables.Result>> tempTables = writeTempTables(partitions.get(multiPartitionsTag), tempLoadJobIdPrefixView);
    PCollection<TableDestination> successfulMultiPartitionWrites =
        tempTables
            .apply(
                "Window Into Global Windows",
                Window.<KV<TableDestination, WriteTables.Result>>into(new GlobalWindows())
                    .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))))
            .apply("Add Void Key", WithKeys.of((Void) null))
            .setCoder(KvCoder.of(VoidCoder.of(), tempTables.getCoder()))
            .apply("GroupByKey", GroupByKey.create())
            .apply("Extract Values", Values.create())
            .apply(
                "WriteRenameTriggered",
                ParDo.of(
                        new WriteRename(
                            bigQueryServices,
                            copyJobIdPrefixView,
                            writeDisposition,
                            createDisposition,
                            maxRetryJobs,
                            kmsKey,
                            loadJobProjectId))
                    .withSideInputs(copyJobIdPrefixView));
    PCollection<TableDestination> successfulSinglePartitionWrites =
        writeSinglePartition(partitions.get(singlePartitionTag), loadJobIdPrefixView)
            .apply(
                "RewindowSinglePartitionResults",
                Window.<TableDestination>into(new GlobalWindows())
                    .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1))));
    PCollectionList<TableDestination> allSuccessfulWrites =
        PCollectionList.of(successfulMultiPartitionWrites).and(successfulSinglePartitionWrites);
    return writeResult(p, allSuccessfulWrites.apply(Flatten.pCollections()));
}
Also used : LoggerFactory(org.slf4j.LoggerFactory) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) PCollectionList(org.apache.beam.sdk.values.PCollectionList) Strings(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings) Create(org.apache.beam.sdk.transforms.Create) TableRow(com.google.api.services.bigquery.model.TableRow) Window(org.apache.beam.sdk.transforms.windowing.Window) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteBundlesToFiles.Result) ValueProvider(org.apache.beam.sdk.options.ValueProvider) Keys(org.apache.beam.sdk.transforms.Keys) ShardedKey(org.apache.beam.sdk.values.ShardedKey) Flatten(org.apache.beam.sdk.transforms.Flatten) MapElements(org.apache.beam.sdk.transforms.MapElements) KvCoder(org.apache.beam.sdk.coders.KvCoder) Repeatedly(org.apache.beam.sdk.transforms.windowing.Repeatedly) Set(java.util.Set) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) BigQueryHelpers.resolveTempLocation(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.resolveTempLocation) List(java.util.List) DefaultTrigger(org.apache.beam.sdk.transforms.windowing.DefaultTrigger) ParDo(org.apache.beam.sdk.transforms.ParDo) VisibleForTesting(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting) AfterFirst(org.apache.beam.sdk.transforms.windowing.AfterFirst) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) AfterPane(org.apache.beam.sdk.transforms.windowing.AfterPane) NullableCoder(org.apache.beam.sdk.coders.NullableCoder) Values(org.apache.beam.sdk.transforms.Values) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) Duration(org.joda.time.Duration) Coder(org.apache.beam.sdk.coders.Coder) View(org.apache.beam.sdk.transforms.View) TupleTagList(org.apache.beam.sdk.values.TupleTagList) PTransform(org.apache.beam.sdk.transforms.PTransform) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) TupleTag(org.apache.beam.sdk.values.TupleTag) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) JobType(org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.JobType) Pipeline(org.apache.beam.sdk.Pipeline) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Nullable(org.checkerframework.checker.nullness.qual.Nullable) GroupIntoBatches(org.apache.beam.sdk.transforms.GroupIntoBatches) GcsPath(org.apache.beam.sdk.extensions.gcp.util.gcsfs.GcsPath) DoFn(org.apache.beam.sdk.transforms.DoFn) Reshuffle(org.apache.beam.sdk.transforms.Reshuffle) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WithKeys(org.apache.beam.sdk.transforms.WithKeys) Logger(org.slf4j.Logger) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) ShardedKeyCoder(org.apache.beam.sdk.coders.ShardedKeyCoder) PCollection(org.apache.beam.sdk.values.PCollection) AfterProcessingTime(org.apache.beam.sdk.transforms.windowing.AfterProcessingTime) WriteDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) 
Collections(java.util.Collections)
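
At the user-facing level, this triggered path is reached through BigQueryIO's file-loads options rather than by calling expandTriggered directly. A minimal sketch, assuming an unbounded PCollection<TableRow> named rows; the table spec and schema are placeholders:

TableSchema schema =
    new TableSchema()
        .setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING")));

rows.apply(
    "TriggeredFileLoads",
    BigQueryIO.writeTableRows()
        .to("project-id:dataset_id.table_id")
        .withSchema(schema)
        .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
        // A triggering frequency on an unbounded input requests periodically-triggered loads.
        .withTriggeringFrequency(Duration.standardMinutes(5))
        // numFileShards > 0 selects the statically sharded branch above.
        .withNumFileShards(100)
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
        .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));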

Example 34 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class BigQueryIOWriteTest, method testWriteWithDynamicTables:

@Test
public void testWriteWithDynamicTables() throws Exception {
    List<Integer> inserts = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
        inserts.add(i);
    }
    // Create a windowing strategy that puts the input into five different windows depending on
    // record value.
    WindowFn<Integer, PartitionedGlobalWindow> windowFn = new PartitionedGlobalWindows<>(i -> Integer.toString(i % 5));
    final Map<Integer, TableDestination> targetTables = Maps.newHashMap();
    Map<String, String> schemas = Maps.newHashMap();
    for (int i = 0; i < 5; i++) {
        TableDestination destination = new TableDestination("project-id:dataset-id" + ".table-id-" + i, "");
        targetTables.put(i, destination);
        // Give each target table its own custom schema (with a distinct custom_<i> field).
        schemas.put(
            destination.getTableSpec(),
            toJsonString(
                new TableSchema()
                    .setFields(
                        ImmutableList.of(
                            new TableFieldSchema().setName("name").setType("STRING"),
                            new TableFieldSchema().setName("number").setType("INTEGER"),
                            new TableFieldSchema().setName("custom_" + i).setType("STRING")))));
    }
    SerializableFunction<ValueInSingleWindow<Integer>, TableDestination> tableFunction = input -> {
        PartitionedGlobalWindow window = (PartitionedGlobalWindow) input.getWindow();
        // Check that we can access the element as well here and that it matches the window.
        checkArgument(window.value.equals(Integer.toString(input.getValue() % 5)), "Incorrect element");
        return targetTables.get(input.getValue() % 5);
    };
    PCollection<Integer> input = p.apply("CreateSource", Create.of(inserts));
    if (useStreaming) {
        input = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
    }
    PCollectionView<Map<String, String>> schemasView =
        p.apply("CreateSchemaMap", Create.of(schemas))
            .apply("ViewSchemaAsMap", View.asMap());
    input
        .apply(Window.into(windowFn))
        .apply(
            BigQueryIO.<Integer>write()
                .to(tableFunction)
                .withFormatFunction(
                    i -> new TableRow().set("name", "number" + i).set("number", Integer.toString(i)))
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withSchemaFromView(schemasView)
                .withTestServices(fakeBqServices)
                .withoutValidation());
    p.run();
    for (int i = 0; i < 5; ++i) {
        String tableId = String.format("table-id-%d", i);
        String tableSpec = String.format("project-id:dataset-id.%s", tableId);
        // Verify that the table was created with the correct schema.
        assertThat(
            toJsonString(
                fakeDatasetService
                    .getTable(
                        new TableReference()
                            .setProjectId("project-id")
                            .setDatasetId("dataset-id")
                            .setTableId(tableId))
                    .getSchema()),
            equalTo(schemas.get(tableSpec)));
        // Verify that the table has the expected contents.
        assertThat(
            fakeDatasetService.getAllRows("project-id", "dataset-id", tableId),
            containsInAnyOrder(
                new TableRow()
                    .set("name", String.format("number%d", i))
                    .set("number", Integer.toString(i)),
                new TableRow()
                    .set("name", String.format("number%d", i + 5))
                    .set("number", Integer.toString(i + 5))));
    }
}
Also used : ExpectedLogs(org.apache.beam.sdk.testing.ExpectedLogs) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) ValueInSingleWindow(org.apache.beam.sdk.values.ValueInSingleWindow) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Encoder(org.apache.avro.io.Encoder) ResultCoder(org.apache.beam.sdk.io.gcp.bigquery.WritePartition.ResultCoder) Matcher(java.util.regex.Matcher) DoFnTester(org.apache.beam.sdk.transforms.DoFnTester) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) EnumSet(java.util.EnumSet) ValueProvider(org.apache.beam.sdk.options.ValueProvider) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) KvCoder(org.apache.beam.sdk.coders.KvCoder) Matchers.allOf(org.hamcrest.Matchers.allOf) Set(java.util.Set) WindowFn(org.apache.beam.sdk.transforms.windowing.WindowFn) FieldType(org.apache.beam.sdk.schemas.Schema.FieldType) Serializable(java.io.Serializable) IncompatibleWindowException(org.apache.beam.sdk.transforms.windowing.IncompatibleWindowException) Assert.assertFalse(org.junit.Assert.assertFalse) AutoValue(com.google.auto.value.AutoValue) TestStream(org.apache.beam.sdk.testing.TestStream) Matchers.is(org.hamcrest.Matchers.is) DisplayDataMatchers.hasDisplayItem(org.apache.beam.sdk.transforms.display.DisplayDataMatchers.hasDisplayItem) Write(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write) Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method) Preconditions.checkNotNull(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull) KV(org.apache.beam.sdk.values.KV) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) View(org.apache.beam.sdk.transforms.View) ArrayList(java.util.ArrayList) GenericData(org.apache.avro.generic.GenericData) Distinct(org.apache.beam.sdk.transforms.Distinct) Multimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Multimap) TupleTag(org.apache.beam.sdk.values.TupleTag) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) StreamSupport(java.util.stream.StreamSupport) JavaFieldSchema(org.apache.beam.sdk.schemas.JavaFieldSchema) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Row(org.apache.beam.sdk.values.Row) Result(org.apache.beam.sdk.io.gcp.bigquery.WriteTables.Result) Before(org.junit.Before) TableReference(com.google.api.services.bigquery.model.TableReference) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Files(java.nio.file.Files) PAssert(org.apache.beam.sdk.testing.PAssert) NonMergingWindowFn(org.apache.beam.sdk.transforms.windowing.NonMergingWindowFn) Parameter(org.junit.runners.Parameterized.Parameter) Assert.assertTrue(org.junit.Assert.assertTrue) IOException(java.io.IOException) ShardedKeyCoder(org.apache.beam.sdk.coders.ShardedKeyCoder) Test(org.junit.Test) 
Schema(org.apache.beam.sdk.schemas.Schema) File(java.io.File) Assert.assertNull(org.junit.Assert.assertNull) Paths(java.nio.file.Paths) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) AtomicCoder(org.apache.beam.sdk.coders.AtomicCoder) DefaultSchema(org.apache.beam.sdk.schemas.annotations.DefaultSchema) FakeJobService(org.apache.beam.sdk.io.gcp.testing.FakeJobService) Assert.assertEquals(org.junit.Assert.assertEquals) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) TimePartitioning(com.google.api.services.bigquery.model.TimePartitioning) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) After(org.junit.After) TableRow(com.google.api.services.bigquery.model.TableRow) Assert.fail(org.junit.Assert.fail) TableSchema(com.google.api.services.bigquery.model.TableSchema) ArrayListMultimap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ArrayListMultimap) ShardedKey(org.apache.beam.sdk.values.ShardedKey) Parameterized(org.junit.runners.Parameterized) MapElements(org.apache.beam.sdk.transforms.MapElements) DatumWriter(org.apache.avro.io.DatumWriter) Collection(java.util.Collection) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) CreateDisposition(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition) Description(org.junit.runner.Description) Collectors(java.util.stream.Collectors) List(java.util.List) Clustering(com.google.api.services.bigquery.model.Clustering) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) TableDataInsertAllResponse(com.google.api.services.bigquery.model.TableDataInsertAllResponse) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pattern(java.util.regex.Pattern) ErrorProto(com.google.api.services.bigquery.model.ErrorProto) Statement(org.junit.runners.model.Statement) TestRule(org.junit.rules.TestRule) Parameters(org.junit.runners.Parameterized.Parameters) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) SerializableFunctions(org.apache.beam.sdk.transforms.SerializableFunctions) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) SchemaUpdateOption(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.SchemaUpdateOption) WindowMappingFn(org.apache.beam.sdk.transforms.windowing.WindowMappingFn) SchemaCreate(org.apache.beam.sdk.schemas.annotations.SchemaCreate) Job(com.google.api.services.bigquery.model.Job) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ExpectedException(org.junit.rules.ExpectedException) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) OutputStream(java.io.OutputStream) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) GenericRecord(org.apache.avro.generic.GenericRecord) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) Matchers(org.hamcrest.Matchers) PCollection(org.apache.beam.sdk.values.PCollection) Table(com.google.api.services.bigquery.model.Table) Rule(org.junit.Rule) Instant(org.joda.time.Instant) Collections(java.util.Collections) JobConfigurationLoad(com.google.api.services.bigquery.model.JobConfigurationLoad) 
TemporaryFolder(org.junit.rules.TemporaryFolder) InputStream(java.io.InputStream)
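
Outside the test harness, the schema-from-view mechanism exercised here boils down to publishing a Map of table spec to JSON schema as a view and handing it to the sink. A hedged sketch, with illustrative table and schema values and an assumed PCollection<TableRow> named rows:

// Map of table spec -> JSON schema, exposed to BigQueryIO as a side input.
PCollectionView<Map<String, String>> schemasView =
    p.apply(
            "SchemaMap",
            Create.of(
                ImmutableMap.of(
                    "project-id:dataset-id.table-0",
                    "{\"fields\":[{\"name\":\"name\",\"type\":\"STRING\"}]}")))
        .apply("AsMap", View.asMap());

rows.apply(
    BigQueryIO.writeTableRows()
        .to(value -> new TableDestination("project-id:dataset-id.table-0", ""))
        // The schema for each destination is looked up in the view at write time.
        .withSchemaFromView(schemasView));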

Example 35 with PCollectionView

Use of org.apache.beam.sdk.values.PCollectionView in project beam by apache.

From class ParDoTranslation, method getSideInputs:

public static List<PCollectionView<?>> getSideInputs(AppliedPTransform<?, ?, ?> application) throws IOException {
    PTransform<?, ?> transform = application.getTransform();
    if (transform instanceof ParDo.MultiOutput) {
        return ((ParDo.MultiOutput<?, ?>) transform).getSideInputs().values().stream().collect(Collectors.toList());
    }
    SdkComponents sdkComponents = SdkComponents.create(application.getPipeline().getOptions());
    RunnerApi.PTransform parDoProto = PTransformTranslation.toProto(application, sdkComponents);
    ParDoPayload payload = ParDoPayload.parseFrom(parDoProto.getSpec().getPayload());
    List<PCollectionView<?>> views = new ArrayList<>();
    RehydratedComponents components = RehydratedComponents.forComponents(sdkComponents.toComponents());
    for (Map.Entry<String, SideInput> sideInputEntry : payload.getSideInputsMap().entrySet()) {
        String sideInputTag = sideInputEntry.getKey();
        RunnerApi.SideInput sideInput = sideInputEntry.getValue();
        PCollection<?> originalPCollection =
            checkNotNull(
                (PCollection<?>) application.getInputs().get(new TupleTag<>(sideInputTag)),
                "no input with tag %s",
                sideInputTag);
        views.add(PCollectionViewTranslation.viewFromProto(sideInput, sideInputTag, originalPCollection, parDoProto, components));
    }
    return views;
}
Also used : ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) ArrayList(java.util.ArrayList) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) PCollectionView(org.apache.beam.sdk.values.PCollectionView) ParDo(org.apache.beam.sdk.transforms.ParDo) Map(java.util.Map) HashMap(java.util.HashMap) MultiOutput(org.apache.beam.sdk.transforms.ParDo.MultiOutput)
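
The instanceof branch above covers the common case where the transform is still a Java ParDo object, so its side inputs can be read directly instead of being rehydrated from the ParDoPayload proto. A minimal sketch; the DoFn, the config PCollection of KVs, and the Map-returning getSideInputs signature (as used at the top of the method above) are assumptions:

PCollectionView<Map<String, String>> configView =
    config.apply("ConfigAsMap", View.asMap());

ParDo.SingleOutput<String, String> parDo =
    ParDo.of(
            new DoFn<String, String>() {
                @ProcessElement
                public void processElement(ProcessContext c) {
                    // The side input is available because it is declared below.
                    c.output(c.element() + c.sideInput(configView).size());
                }
            })
        .withSideInputs(configView);

// Equivalent of the MultiOutput branch in getSideInputs above.
List<PCollectionView<?>> views = new ArrayList<>(parDo.getSideInputs().values());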

Aggregations

PCollectionView (org.apache.beam.sdk.values.PCollectionView): 67
Map (java.util.Map): 29
HashMap (java.util.HashMap): 28
Test (org.junit.Test): 28
TupleTag (org.apache.beam.sdk.values.TupleTag): 27
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 22
Coder (org.apache.beam.sdk.coders.Coder): 21
KV (org.apache.beam.sdk.values.KV): 20
Instant (org.joda.time.Instant): 20
KvCoder (org.apache.beam.sdk.coders.KvCoder): 18
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 18
PCollection (org.apache.beam.sdk.values.PCollection): 18
DoFn (org.apache.beam.sdk.transforms.DoFn): 16
ArrayList (java.util.ArrayList): 15
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow): 14
List (java.util.List): 13
ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap): 13
IOException (java.io.IOException): 12
RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 12
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 10