
Example 21 with PCollection

Use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

From the class SideInputReference, method fromSideInputId.

/**
 * Create a side input reference from a SideInputId proto and components.
 */
public static SideInputReference fromSideInputId(SideInputId sideInputId, RunnerApi.Components components) {
    String transformId = sideInputId.getTransformId();
    String localName = sideInputId.getLocalName();
    PTransform transform = components.getTransformsOrThrow(transformId);
    // The side input's PCollection is the transform input registered under the local name.
    String collectionId = transform.getInputsOrThrow(localName);
    PCollection collection = components.getPcollectionsOrThrow(collectionId);
    return SideInputReference.of(
        PipelineNode.pTransform(transformId, transform),
        localName,
        PipelineNode.pCollection(collectionId, collection));
}
Also used: PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection), PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform)
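
For context, a minimal sketch of how this factory might be invoked. The transform id and local name below, and the `components` instance, are illustrative assumptions rather than part of the original example; SideInputId is the proto message accepted by the method above.

// Hypothetical usage; assumes `components` already contains a transform
// "parDo" whose inputs map has an entry under the local name "side0".
SideInputId sideInputId = SideInputId.newBuilder()
    .setTransformId("parDo")
    .setLocalName("side0")
    .build();
SideInputReference ref = SideInputReference.fromSideInputId(sideInputId, components);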

Example 22 with PCollection

Use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

From the class InsertFetchAndFilterStreamingSideInputNodesTest, method testSdkParDoWithSideInput.

@Test
public void testSdkParDoWithSideInput() throws Exception {
    Pipeline p = Pipeline.create();
    PCollection<String> pc = p.apply(Create.of("a", "b", "c"));
    PCollectionView<List<String>> pcView = pc.apply(View.asList());
    pc.apply(ParDo.of(new TestDoFn()).withSideInputs(pcView));
    RunnerApi.Pipeline pipeline = PipelineTranslation.toProto(p);
    // Build the input network: predecessor -> mainInput -> sideInputParDo.
    Node predecessor = createParDoNode("predecessor");
    InstructionOutputNode mainInput = InstructionOutputNode.create(new InstructionOutput(), "fakeId");
    Node sideInputParDo = createParDoNode(findParDoWithSideInput(pipeline));
    MutableNetwork<Node, Edge> network = createEmptyNetwork();
    network.addNode(predecessor);
    network.addNode(mainInput);
    network.addNode(sideInputParDo);
    network.addEdge(predecessor, mainInput, DefaultEdge.create());
    network.addEdge(mainInput, sideInputParDo, DefaultEdge.create());
    network = InsertFetchAndFilterStreamingSideInputNodes.with(pipeline).forNetwork(network);
    // The rewrite should splice a fetch-and-filter node in front of the ParDo's main input:
    // predecessor -> mainInputClone -> fetchAndFilter -> mainInput -> sideInputParDo.
    Node mainInputClone = InstructionOutputNode.create(mainInput.getInstructionOutput(), "fakeId");
    Node fetchAndFilter =
        FetchAndFilterStreamingSideInputsNode.create(
            pcView.getWindowingStrategyInternal(),
            ImmutableMap.of(
                pcView,
                ParDoTranslation.translateWindowMappingFn(
                    pcView.getWindowMappingFn(), SdkComponents.create(PipelineOptionsFactory.create()))),
            NameContextsForTests.nameContextForTest());
    MutableNetwork<Node, Edge> expectedNetwork = createEmptyNetwork();
    expectedNetwork.addNode(predecessor);
    expectedNetwork.addNode(mainInputClone);
    expectedNetwork.addNode(fetchAndFilter);
    expectedNetwork.addNode(mainInput);
    expectedNetwork.addNode(sideInputParDo);
    expectedNetwork.addEdge(predecessor, mainInputClone, DefaultEdge.create());
    expectedNetwork.addEdge(mainInputClone, fetchAndFilter, DefaultEdge.create());
    expectedNetwork.addEdge(fetchAndFilter, mainInput, DefaultEdge.create());
    expectedNetwork.addEdge(mainInput, sideInputParDo, DefaultEdge.create());
    assertThatNetworksAreIdentical(expectedNetwork, network);
}
Also used: FetchAndFilterStreamingSideInputsNode (org.apache.beam.runners.dataflow.worker.graph.Nodes.FetchAndFilterStreamingSideInputsNode), InstructionOutputNode (org.apache.beam.runners.dataflow.worker.graph.Nodes.InstructionOutputNode), ParallelInstructionNode (org.apache.beam.runners.dataflow.worker.graph.Nodes.ParallelInstructionNode), Node (org.apache.beam.runners.dataflow.worker.graph.Nodes.Node), InstructionOutput (com.google.api.services.dataflow.model.InstructionOutput), Pipeline (org.apache.beam.sdk.Pipeline), RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi), ArrayList (java.util.ArrayList), List (java.util.List), Edge (org.apache.beam.runners.dataflow.worker.graph.Edges.Edge), DefaultEdge (org.apache.beam.runners.dataflow.worker.graph.Edges.DefaultEdge), Test (org.junit.Test)
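
The createEmptyNetwork() call above is a test-local helper. A minimal equivalent built on the (vendored) Guava graph API might look like the sketch below; the builder flags are assumptions chosen to match what the test needs, not copied from the real helper.

// Sketch: a mutable directed network over the worker graph's Node/Edge types.
// Parallel edges are allowed since a network rewrite may connect the same
// pair of nodes more than once.
MutableNetwork<Node, Edge> emptyNetwork =
    NetworkBuilder.directed().allowsSelfLoops(false).allowsParallelEdges(true).build();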

Example 23 with PCollection

Use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

From the class RegisterNodeFunction, method transformSideInputForSdk.

/**
 * Modifies the process bundle descriptor and updates the PTransform that the SDK harness will
 * see so that it uses length-prefixed coders for the side input PCollection and its windowing
 * strategy.
 */
private static void transformSideInputForSdk(
        RunnerApi.Pipeline pipeline,
        RunnerApi.PTransform originalPTransform,
        String sideInputTag,
        ProcessBundleDescriptor.Builder processBundleDescriptor,
        RunnerApi.PTransform.Builder updatedPTransform) {
    RunnerApi.PCollection sideInputPCollection =
        pipeline.getComponents().getPcollectionsOrThrow(originalPTransform.getInputsOrThrow(sideInputTag));
    RunnerApi.WindowingStrategy sideInputWindowingStrategy =
        pipeline.getComponents().getWindowingStrategiesOrThrow(sideInputPCollection.getWindowingStrategyId());
    // TODO: We should not length prefix the window or key for the SDK side since the
    // key and window are already length delimited via protobuf itself. But we need to
    // maintain the length prefixing within the Runner harness to match the bytes that were
    // materialized to the side input sink.
    // We take the original pipeline coders and add any coders we have added when processing side
    // inputs before building new length prefixed variants.
    RunnerApi.Components.Builder componentsBuilder = pipeline.getComponents().toBuilder();
    componentsBuilder.putAllCoders(processBundleDescriptor.getCodersMap());
    String updatedSdkSideInputCoderId =
        LengthPrefixUnknownCoders.addLengthPrefixedCoder(
            sideInputPCollection.getCoderId(), componentsBuilder, false);
    String updatedSdkSideInputWindowCoderId =
        LengthPrefixUnknownCoders.addLengthPrefixedCoder(
            sideInputWindowingStrategy.getWindowCoderId(), componentsBuilder, false);
    processBundleDescriptor.putAllCoders(componentsBuilder.getCodersMap());
    String updatedSdkWindowingStrategyId =
        SyntheticComponents.uniqueId(
            sideInputPCollection.getWindowingStrategyId() + "-runner_generated",
            processBundleDescriptor.getWindowingStrategiesMap().keySet()::contains);
    processBundleDescriptor.putWindowingStrategies(
        updatedSdkWindowingStrategyId,
        sideInputWindowingStrategy.toBuilder().setWindowCoderId(updatedSdkSideInputWindowCoderId).build());
    RunnerApi.PCollection updatedSdkSideInputPcollection =
        sideInputPCollection.toBuilder()
            .setCoderId(updatedSdkSideInputCoderId)
            .setWindowingStrategyId(updatedSdkWindowingStrategyId)
            .build();
    // Replace the contents of the PCollection with the updated side input PCollection
    // specification and insert it into the updated PTransform.
    processBundleDescriptor.putPcollections(
        originalPTransform.getInputsOrThrow(sideInputTag), updatedSdkSideInputPcollection);
    updatedPTransform.putInputs(sideInputTag, originalPTransform.getInputsOrThrow(sideInputTag));
}
Also used: SdkComponents (org.apache.beam.runners.core.construction.SdkComponents), SyntheticComponents (org.apache.beam.runners.core.construction.SyntheticComponents), RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi), Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString), ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)
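
To make the length-prefixing above concrete, here is a hedged SDK-level sketch of what wrapping a coder in a length prefix means for the encoded bytes. It uses the Java SDK's LengthPrefixCoder and StringUtf8Coder (org.apache.beam.sdk.coders) with CoderUtils (org.apache.beam.sdk.util) rather than the proto-level helper in the method above.

// Sketch: LengthPrefixCoder encodes each element as <varint length><payload>,
// matching the framing the runner harness maintains for materialized side inputs.
Coder<String> prefixed = LengthPrefixCoder.of(StringUtf8Coder.of());
byte[] bytes = CoderUtils.encodeToByteArray(prefixed, "hello");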

Example 24 with PCollection

Use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

From the class ReadSourcePortableTest, method testExecution.

@Test(timeout = 120_000)
public void testExecution() throws Exception {
    PipelineOptions options = PipelineOptionsFactory.fromArgs("--experiments=use_deprecated_read").create();
    options.setRunner(CrashingRunner.class);
    options.as(FlinkPipelineOptions.class).setFlinkMaster("[local]");
    options.as(FlinkPipelineOptions.class).setStreaming(isStreaming);
    options.as(FlinkPipelineOptions.class).setParallelism(2);
    options.as(PortablePipelineOptions.class).setDefaultEnvironmentType(Environments.ENVIRONMENT_EMBEDDED);
    Pipeline p = Pipeline.create(options);
    PCollection<Long> result =
        p.apply(Read.from(new Source(10)))
            .apply(Window.into(FixedWindows.of(Duration.millis(1))));
    PAssert.that(result).containsInAnyOrder(ImmutableList.of(0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L));
    SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReads(p);
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
    // With the deprecated-read experiment, primitive Read transforms should survive translation.
    List<RunnerApi.PTransform> readTransforms =
        pipelineProto.getComponents().getTransformsMap().values().stream()
            .filter(transform -> transform.getSpec().getUrn().equals(PTransformTranslation.READ_TRANSFORM_URN))
            .collect(Collectors.toList());
    assertThat(readTransforms, not(empty()));
    // Execute the pipeline and poll until it completes, failing fast if the job reaches FAILED.
    JobInvocation jobInvocation =
        FlinkJobInvoker.create(null)
            .createJobInvocation(
                "fakeId",
                "fakeRetrievalToken",
                flinkJobExecutor,
                pipelineProto,
                options.as(FlinkPipelineOptions.class),
                new FlinkPipelineRunner(options.as(FlinkPipelineOptions.class), null, Collections.emptyList()));
    jobInvocation.start();
    while (jobInvocation.getState() != JobState.Enum.DONE) {
        assertThat(jobInvocation.getState(), not(JobState.Enum.FAILED));
        Thread.sleep(100);
    }
}
Also used: SerializableCoder (org.apache.beam.sdk.coders.SerializableCoder), BeforeClass (org.junit.BeforeClass), PortablePipelineOptions (org.apache.beam.sdk.options.PortablePipelineOptions), UnboundedSource (org.apache.beam.sdk.io.UnboundedSource), Matchers.not (org.hamcrest.Matchers.not), Duration (org.joda.time.Duration), RunWith (org.junit.runner.RunWith), Parameters (org.junit.runners.Parameterized.Parameters), LoggerFactory (org.slf4j.LoggerFactory), Coder (org.apache.beam.sdk.coders.Coder), PipelineTranslation (org.apache.beam.runners.core.construction.PipelineTranslation), PipelineOptionsFactory (org.apache.beam.sdk.options.PipelineOptionsFactory), Environments (org.apache.beam.runners.core.construction.Environments), JobInvocation (org.apache.beam.runners.jobsubmission.JobInvocation), Read (org.apache.beam.sdk.io.Read), Window (org.apache.beam.sdk.transforms.windowing.Window), MatcherAssert.assertThat (org.hamcrest.MatcherAssert.assertThat), Pipeline (org.apache.beam.sdk.Pipeline), NoSuchElementException (java.util.NoSuchElementException), PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), Nullable (org.checkerframework.checker.nullness.qual.Nullable), Parameterized (org.junit.runners.Parameterized), RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi), Matchers.empty (org.hamcrest.Matchers.empty), AfterClass (org.junit.AfterClass), PTransformTranslation (org.apache.beam.runners.core.construction.PTransformTranslation), MoreExecutors (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.MoreExecutors), Logger (org.slf4j.Logger), PAssert (org.apache.beam.sdk.testing.PAssert), Parameter (org.junit.runners.Parameterized.Parameter), FixedWindows (org.apache.beam.sdk.transforms.windowing.FixedWindows), SplittableParDo (org.apache.beam.runners.core.construction.SplittableParDo), Test (org.junit.Test), PCollection (org.apache.beam.sdk.values.PCollection), Collectors (java.util.stream.Collectors), Executors (java.util.concurrent.Executors), Serializable (java.io.Serializable), TimeUnit (java.util.concurrent.TimeUnit), CrashingRunner (org.apache.beam.sdk.testing.CrashingRunner), List (java.util.List), BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow), Instant (org.joda.time.Instant), ListeningExecutorService (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.util.concurrent.ListeningExecutorService), ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList), Collections (java.util.Collections), JobState (org.apache.beam.model.jobmanagement.v1.JobApi.JobState)
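
As a related sketch, the same URN filter can collect transform ids instead of payloads, which is often handier when cross-referencing the components map; the variable names here are assumptions for illustration.

// Sketch: gather the ids of all primitive Read transforms in the proto
// (PTransformTranslation.READ_TRANSFORM_URN is "beam:transform:read:v1").
List<String> readTransformIds =
    pipelineProto.getComponents().getTransformsMap().entrySet().stream()
        .filter(e -> PTransformTranslation.READ_TRANSFORM_URN.equals(e.getValue().getSpec().getUrn()))
        .map(Map.Entry::getKey)
        .collect(Collectors.toList());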

Example 25 with PCollection

Use of org.apache.beam.model.pipeline.v1.RunnerApi.PCollection in project beam by apache.

From the class DataflowPipelineTranslatorTest, method testMultiGraphPipelineSerialization.

@Test
public void testMultiGraphPipelineSerialization() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    Pipeline p = Pipeline.create(options);
    PCollection<Integer> input = p.begin().apply(Create.of(1, 2, 3));
    input.apply(new UnrelatedOutputCreator());
    input.apply(new UnboundOutputCreator());
    DataflowPipelineTranslator t =
        DataflowPipelineTranslator.fromOptions(PipelineOptionsFactory.as(DataflowPipelineOptions.class));
    // Check that translation doesn't fail.
    SdkComponents sdkComponents = createSdkComponents(options);
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
    JobSpecification jobSpecification =
        t.translate(p, pipelineProto, sdkComponents, DataflowRunner.fromOptions(options), Collections.emptyList());
    assertAllStepOutputsHaveUniqueIds(jobSpecification.getJob());
}
Also used: RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi), DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions), JobSpecification (org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification), SdkComponents (org.apache.beam.runners.core.construction.SdkComponents), Pipeline (org.apache.beam.sdk.Pipeline), Test (org.junit.Test)
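
The createSdkComponents(options) call above is a test-local helper. A minimal stand-in, under the assumption that wiring up an SdkComponents instance from the options is all the test requires (the real helper may do more, such as registering a specific environment), could be:

// Hypothetical stand-in for the test-local helper; SdkComponents.create(PipelineOptions)
// derives a default environment from the given options.
private static SdkComponents createSdkComponents(PipelineOptions options) {
    return SdkComponents.create(options);
}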

Aggregations

RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 45 uses
Test (org.junit.Test): 45 uses
Pipeline (org.apache.beam.sdk.Pipeline): 25 uses
PTransform (org.apache.beam.model.pipeline.v1.RunnerApi.PTransform): 24 uses
PCollection (org.apache.beam.model.pipeline.v1.RunnerApi.PCollection): 22 uses
PTransformNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode): 22 uses
Map (java.util.Map): 21 uses
Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components): 21 uses
PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode): 21 uses
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 17 uses
ArrayList (java.util.ArrayList): 16 uses
HashMap (java.util.HashMap): 14 uses
Environment (org.apache.beam.model.pipeline.v1.RunnerApi.Environment): 13 uses
SdkComponents (org.apache.beam.runners.core.construction.SdkComponents): 13 uses
PCollection (org.apache.beam.sdk.values.PCollection): 12 uses
Coder (org.apache.beam.sdk.coders.Coder): 11 uses
KV (org.apache.beam.sdk.values.KV): 11 uses
Collection (java.util.Collection): 10 uses
ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap): 10 uses
IOException (java.io.IOException): 9 uses