
Example 56 with Pipeline

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

The class DataflowPipelineTranslatorTest, method testNetworkConfigMissing.

@Test
public void testNetworkConfigMissing() throws IOException {
    DataflowPipelineOptions options = buildPipelineOptions();
    Pipeline p = buildPipeline(options);
    p.traverseTopologically(new RecordingPipelineVisitor());
    SdkComponents sdkComponents = createSdkComponents(options);
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
    Job job =
        DataflowPipelineTranslator.fromOptions(options)
            .translate(
                p,
                pipelineProto,
                sdkComponents,
                DataflowRunner.fromOptions(options),
                Collections.emptyList())
            .getJob();
    assertEquals(1, job.getEnvironment().getWorkerPools().size());
    assertNull(job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) Job(com.google.api.services.dataflow.model.Job) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
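
The helpers buildPipelineOptions, buildPipeline, and createSdkComponents are defined elsewhere in DataflowPipelineTranslatorTest and are not shown in this snippet. A minimal sketch of what they plausibly look like, assuming Dataflow defaults; the project, temp location, region, and container image below are placeholder values, not the actual test fixtures:

private static DataflowPipelineOptions buildPipelineOptions() {
    // Placeholder option values; the real helper configures the full Dataflow test harness.
    DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
    options.setRunner(DataflowRunner.class);
    options.setProject("some-project");
    options.setTempLocation("gs://some-bucket/temp");
    options.setRegion("us-central1");
    return options;
}

private static SdkComponents createSdkComponents(PipelineOptions options) {
    SdkComponents sdkComponents = SdkComponents.create(options);
    // Register a default docker environment so translated transforms receive an environment id.
    sdkComponents.registerEnvironment(
        Environments.createDockerEnvironment("apache/beam_java8_sdk:latest"));
    return sdkComponents;
}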

Example 57 with Pipeline

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

The class DataflowPipelineTranslatorTest, method testTaggedNamesOverridden.

/**
 * Test that in translation the names for the output collections of a multi-output ParDo - a
 * special case because the user can name tags - are overridden to be what the Dataflow service
 * expects.
 */
@Test
public void testTaggedNamesOverridden() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    options.setStreaming(false);
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    Pipeline pipeline = Pipeline.create(options);
    TupleTag<Integer> tag1 = new TupleTag<Integer>("frazzle") {};
    TupleTag<Integer> tag2 = new TupleTag<Integer>("bazzle") {};
    TupleTag<Integer> tag3 = new TupleTag<Integer>() {};
    PCollectionTuple outputs = pipeline.apply(Create.of(3)).apply(
        ParDo.of(new DoFn<Integer, Integer>() {
            @ProcessElement
            public void drop() {}
        }).withOutputTags(tag1, TupleTagList.of(tag2).and(tag3)));
    outputs.get(tag1).setName("bizbazzle");
    outputs.get(tag2).setName("gonzaggle");
    outputs.get(tag3).setName("froonazzle");
    runner.replaceV1Transforms(pipeline);
    SdkComponents sdkComponents = createSdkComponents(options);
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
    Job job = translator.translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList()).getJob();
    // The ParDo step
    Step step = job.getSteps().get(1);
    String stepName = getString(step.getProperties(), PropertyNames.USER_NAME);
    List<Map<String, Object>> outputInfos = Structs.getListOfMaps(step.getProperties(), PropertyNames.OUTPUT_INFO, null);
    assertThat(outputInfos.size(), equalTo(3));
    // The names set by the user _and_ the tags _must_ be ignored, or metrics will not show up.
    for (int i = 0; i < outputInfos.size(); ++i) {
        assertThat(getString(outputInfos.get(i), PropertyNames.USER_NAME), equalTo(String.format("%s.out%s", stepName, i)));
    }
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) TupleTag(org.apache.beam.sdk.values.TupleTag) Step(com.google.api.services.dataflow.model.Step) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) Pipeline(org.apache.beam.sdk.Pipeline) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) Job(com.google.api.services.dataflow.model.Job) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) Test(org.junit.Test)
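
For context on what the test exercises: a multi-output ParDo declares one main output tag plus a TupleTagList of additional tags, and each tagged output is a separately named PCollection. A minimal sketch in isolation; the input PCollection<Integer> and the routing logic are illustrative, not part of the test:

final TupleTag<Integer> mainTag = new TupleTag<Integer>("main") {};
final TupleTag<Integer> sideTag = new TupleTag<Integer>("side") {};
PCollectionTuple results = input.apply(
    ParDo.of(new DoFn<Integer, Integer>() {
        @ProcessElement
        public void process(@Element Integer n, MultiOutputReceiver out) {
            // Route evens to the main output, odds to the side output.
            out.get(n % 2 == 0 ? mainTag : sideTag).output(n);
        }
    }).withOutputTags(mainTag, TupleTagList.of(sideTag)));

Whatever names are later set via results.get(mainTag).setName(...), the Dataflow translator rewrites them to the <stepName>.outN pattern that the assertion loop above checks.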

Example 58 with Pipeline

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

The class DataflowPipelineTranslatorTest, method testResourceHintsTranslationsResolvesHintsOnOptionsAndComposites.

@Test
public void testResourceHintsTranslationsResolvesHintsOnOptionsAndComposites() {
    ResourceHintsOptions options = PipelineOptionsFactory.as(ResourceHintsOptions.class);
    options.setResourceHints(Arrays.asList("accelerator=set_via_options", "minRam=1B"));
    Pipeline pipeline = Pipeline.create(options);
    PCollection<byte[]> root = pipeline.apply(Impulse.create());
    root.apply(new Outer().setResourceHints(ResourceHints.create().withAccelerator("set_on_outer_transform").withMinRam(20)));
    root.apply("Leaf", ParDo.of(new IdentityDoFn<byte[]>()));
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, false);
    assertThat(
        pipelineProto
            .getComponents()
            .getEnvironmentsMap()
            .get(getLeafTransform(pipelineProto, "Leaf").getEnvironmentId())
            .getResourceHintsMap(),
        org.hamcrest.Matchers.allOf(
            org.hamcrest.Matchers.hasEntry(
                "beam:resources:min_ram_bytes:v1", ByteString.copyFromUtf8("1")),
            org.hamcrest.Matchers.hasEntry(
                "beam:resources:accelerator:v1", ByteString.copyFromUtf8("set_via_options"))));
    assertThat(
        pipelineProto
            .getComponents()
            .getEnvironmentsMap()
            .get(getLeafTransform(pipelineProto, "Innermost").getEnvironmentId())
            .getResourceHintsMap(),
        org.hamcrest.Matchers.allOf(
            org.hamcrest.Matchers.hasEntry(
                "beam:resources:min_ram_bytes:v1", ByteString.copyFromUtf8("20")),
            org.hamcrest.Matchers.hasEntry(
                "beam:resources:accelerator:v1", ByteString.copyFromUtf8("set_in_inner_transform"))));
}
Also used : ResourceHintsOptions(org.apache.beam.sdk.transforms.resourcehints.ResourceHintsOptions) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
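
The Outer composite (whose leaf transform is named "Innermost") is defined elsewhere in the test; its inner transform sets its own hints, which is why the second assertion sees minRam=20 and accelerator=set_in_inner_transform rather than the values from options. The resolution rule: hints on a transform override hints on its enclosing composite, which in turn override pipeline-level options. A minimal sketch of the two ends of that chain, reusing the IdentityDoFn placeholder from above (hint values here are illustrative):

// Pipeline-wide defaults supplied via options...
ResourceHintsOptions hintOptions = PipelineOptionsFactory.as(ResourceHintsOptions.class);
hintOptions.setResourceHints(Arrays.asList("accelerator=gpu", "minRam=1GB"));
Pipeline p = Pipeline.create(hintOptions);
// ...overridden per transform; the innermost setting wins during translation.
p.apply(Impulse.create())
    .apply("Hinted", ParDo.of(new IdentityDoFn<byte[]>())
        .setResourceHints(ResourceHints.create().withMinRam(20)));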

Example 59 with Pipeline

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

The class SparkPipelineRunner, method run.

@Override
public PortablePipelineResult run(RunnerApi.Pipeline pipeline, JobInfo jobInfo) {
    SparkPortablePipelineTranslator translator;
    boolean isStreaming = pipelineOptions.isStreaming() || hasUnboundedPCollections(pipeline);
    if (isStreaming) {
        translator = new SparkStreamingPortablePipelineTranslator();
    } else {
        translator = new SparkBatchPortablePipelineTranslator();
    }
    // Expand any splittable DoFns within the graph to enable sizing and splitting of bundles.
    Pipeline pipelineWithSdfExpanded = ProtoOverrides.updateTransform(PTransformTranslation.PAR_DO_TRANSFORM_URN, pipeline, SplittableParDoExpander.createSizedReplacement());
    // Don't let the fuser fuse any subcomponents of native transforms.
    Pipeline trimmedPipeline = TrivialNativeTransformExpander.forKnownUrns(pipelineWithSdfExpanded, translator.knownUrns());
    // Fused pipeline proto.
    // TODO: Consider supporting partially-fused graphs.
    RunnerApi.Pipeline fusedPipeline =
        trimmedPipeline.getComponents().getTransformsMap().values().stream()
                .anyMatch(proto -> ExecutableStage.URN.equals(proto.getSpec().getUrn()))
            ? trimmedPipeline
            : GreedyPipelineFuser.fuse(trimmedPipeline).toPipeline();
    prepareFilesToStage(pipelineOptions);
    PortablePipelineResult result;
    final JavaSparkContext jsc = SparkContextFactory.getSparkContext(pipelineOptions);
    final long startTime = Instant.now().getMillis();
    EventLoggingListener eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);
    // Initialize accumulators.
    AggregatorsAccumulator.init(pipelineOptions, jsc);
    MetricsEnvironment.setMetricsSupported(true);
    MetricsAccumulator.init(pipelineOptions, jsc);
    final SparkTranslationContext context = translator.createTranslationContext(jsc, pipelineOptions, jobInfo);
    final ExecutorService executorService = Executors.newSingleThreadExecutor();
    LOG.info(String.format("Running job %s on Spark master %s", jobInfo.jobId(), jsc.master()));
    if (isStreaming) {
        final JavaStreamingContext jssc = ((SparkStreamingTranslationContext) context).getStreamingContext();
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));
        // Register user-defined listeners.
        for (JavaStreamingListener listener : pipelineOptions.as(SparkContextOptions.class).getListeners()) {
            LOG.info("Registered listener {}." + listener.getClass().getSimpleName());
            jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
        }
        // Register Watermarks listener to broadcast the advanced WMs.
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new GlobalWatermarkHolder.WatermarkAdvancingStreamingListener()));
        jssc.checkpoint(pipelineOptions.getCheckpointDir());
        // Obtain timeout from options.
        Long timeout = pipelineOptions.as(SparkPortableStreamingPipelineOptions.class).getStreamingTimeoutMs();
        final Future<?> submissionFuture = executorService.submit(() -> {
            translator.translate(fusedPipeline, context);
            LOG.info(String.format("Job %s: Pipeline translated successfully. Computing outputs", jobInfo.jobId()));
            context.computeOutputs();
            jssc.start();
            try {
                jssc.awaitTerminationOrTimeout(timeout);
            } catch (InterruptedException e) {
                LOG.warn("Streaming context interrupted, shutting down.", e);
            }
            jssc.stop();
            LOG.info(String.format("Job %s finished.", jobInfo.jobId()));
        });
        result = new SparkPipelineResult.PortableStreamingMode(submissionFuture, jssc);
    } else {
        final Future<?> submissionFuture = executorService.submit(() -> {
            translator.translate(fusedPipeline, context);
            LOG.info(String.format("Job %s: Pipeline translated successfully. Computing outputs", jobInfo.jobId()));
            context.computeOutputs();
            LOG.info(String.format("Job %s finished.", jobInfo.jobId()));
        });
        result = new SparkPipelineResult.PortableBatchMode(submissionFuture, jsc);
    }
    executorService.shutdown();
    result.waitUntilFinish();
    MetricsPusher metricsPusher = new MetricsPusher(MetricsAccumulator.getInstance().value(), pipelineOptions.as(MetricsOptions.class), result);
    metricsPusher.start();
    if (eventLoggingListener != null) {
        eventLoggingListener.onApplicationStart(SparkCompat.buildSparkListenerApplicationStart(jsc, pipelineOptions, startTime, result));
        eventLoggingListener.onApplicationEnd(new SparkListenerApplicationEnd(Instant.now().getMillis()));
        eventLoggingListener.stop();
    }
    return result;
}
Also used : MetricsAccumulator(org.apache.beam.runners.spark.metrics.MetricsAccumulator) ArtifactApi(org.apache.beam.model.jobmanagement.v1.ArtifactApi) LoggerFactory(org.slf4j.LoggerFactory) GreedyPipelineFuser(org.apache.beam.runners.core.construction.graph.GreedyPipelineFuser) PortablePipelineRunner(org.apache.beam.runners.jobsubmission.PortablePipelineRunner) SparkCompat(org.apache.beam.runners.spark.util.SparkCompat) Future(java.util.concurrent.Future) JobInfo(org.apache.beam.runners.fnexecution.provisioning.JobInfo) SparkListenerApplicationEnd(org.apache.spark.scheduler.SparkListenerApplicationEnd) SparkStreamingPortablePipelineTranslator(org.apache.beam.runners.spark.translation.SparkStreamingPortablePipelineTranslator) CmdLineParser(org.kohsuke.args4j.CmdLineParser) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) SparkPortablePipelineTranslator(org.apache.beam.runners.spark.translation.SparkPortablePipelineTranslator) Struct(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.Struct) UUID(java.util.UUID) TrivialNativeTransformExpander(org.apache.beam.runners.core.construction.graph.TrivialNativeTransformExpander) Option(org.kohsuke.args4j.Option) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) Executors(java.util.concurrent.Executors) MetricsPusher(org.apache.beam.runners.core.metrics.MetricsPusher) CmdLineException(org.kohsuke.args4j.CmdLineException) ProtoOverrides(org.apache.beam.runners.core.construction.graph.ProtoOverrides) AggregatorsAccumulator(org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PipelineOptionsTranslation(org.apache.beam.runners.core.construction.PipelineOptionsTranslation) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) SparkCommon.startEventLoggingListener(org.apache.beam.runners.spark.util.SparkCommon.startEventLoggingListener) SparkBatchPortablePipelineTranslator(org.apache.beam.runners.spark.translation.SparkBatchPortablePipelineTranslator) PortablePipelineResult(org.apache.beam.runners.jobsubmission.PortablePipelineResult) SparkTranslationContext(org.apache.beam.runners.spark.translation.SparkTranslationContext) PipelineTranslatorUtils.hasUnboundedPCollections(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.hasUnboundedPCollections) GlobalWatermarkHolder(org.apache.beam.runners.spark.util.GlobalWatermarkHolder) JavaStreamingListenerWrapper(org.apache.spark.streaming.api.java.JavaStreamingListenerWrapper) ExecutorService(java.util.concurrent.ExecutorService) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) JavaStreamingListener(org.apache.spark.streaming.api.java.JavaStreamingListener) Logger(org.slf4j.Logger) PortablePipelineJarUtils(org.apache.beam.runners.jobsubmission.PortablePipelineJarUtils) SparkStreamingTranslationContext(org.apache.beam.runners.spark.translation.SparkStreamingTranslationContext) SparkContextFactory(org.apache.beam.runners.spark.translation.SparkContextFactory) SplittableParDoExpander(org.apache.beam.runners.core.construction.graph.SplittableParDoExpander) MetricsEnvironment(org.apache.beam.sdk.metrics.MetricsEnvironment) MetricsOptions(org.apache.beam.sdk.metrics.MetricsOptions) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) SparkCommonPipelineOptions.prepareFilesToStage(org.apache.beam.runners.spark.SparkCommonPipelineOptions.prepareFilesToStage) Preconditions(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions) Instant(org.joda.time.Instant) Nullable(edu.umd.cs.findbugs.annotations.Nullable) EventLoggingListener(org.apache.spark.scheduler.EventLoggingListener) FileSystems(org.apache.beam.sdk.io.FileSystems)
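
A minimal sketch of driving this entry point once a RunnerApi.Pipeline proto is in hand; the JobInfo field values and the Spark master URL are placeholders:

// Hypothetical invocation; jobId, jobName, and retrievalToken are placeholder values.
SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
options.setSparkMaster("local[4]");
JobInfo jobInfo = JobInfo.create(
    "job-1", "my-job", "retrieval-token", PipelineOptionsTranslation.toProto(options));
SparkPipelineRunner runner = new SparkPipelineRunner(options);
// run(...) blocks internally via result.waitUntilFinish() before returning.
PortablePipelineResult result = runner.run(pipelineProto, jobInfo);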

Example 60 with Pipeline

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.

The class KafkaIOExternalTest, method testConstructKafkaWrite.

@Test
public void testConstructKafkaWrite() throws Exception {
    String topic = "topic";
    String keySerializer = "org.apache.kafka.common.serialization.ByteArraySerializer";
    String valueSerializer = "org.apache.kafka.common.serialization.LongSerializer";
    ImmutableMap<String, String> producerConfig = ImmutableMap.<String, String>builder().put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "server1:port,server2:port").put("retries", "3").build();
    ExternalTransforms.ExternalConfigurationPayload payload =
        encodeRow(
            Row.withSchema(
                    Schema.of(
                        Field.of("topic", FieldType.STRING),
                        Field.of("producer_config", FieldType.map(FieldType.STRING, FieldType.STRING)),
                        Field.of("key_serializer", FieldType.STRING),
                        Field.of("value_serializer", FieldType.STRING)))
                .withFieldValue("topic", topic)
                .withFieldValue("producer_config", producerConfig)
                .withFieldValue("key_serializer", keySerializer)
                .withFieldValue("value_serializer", valueSerializer)
                .build());
    Pipeline p = Pipeline.create();
    p.apply(Impulse.create()).apply(WithKeys.of("key"));
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
    String inputPCollection = Iterables.getOnlyElement(Iterables.getLast(pipelineProto.getComponents().getTransformsMap().values()).getOutputsMap().values());
    ExpansionApi.ExpansionRequest request =
        ExpansionApi.ExpansionRequest.newBuilder()
            .setComponents(pipelineProto.getComponents())
            .setTransform(
                RunnerApi.PTransform.newBuilder()
                    .setUniqueName("test")
                    .putInputs("input", inputPCollection)
                    .setSpec(
                        RunnerApi.FunctionSpec.newBuilder()
                            .setUrn(org.apache.beam.sdk.io.kafka.KafkaIO.Write.External.URN)
                            .setPayload(payload.toByteString())))
            .setNamespace("test_namespace")
            .build();
    ExpansionService expansionService = new ExpansionService();
    TestStreamObserver<ExpansionApi.ExpansionResponse> observer = new TestStreamObserver<>();
    expansionService.expand(request, observer);
    ExpansionApi.ExpansionResponse result = observer.result;
    RunnerApi.PTransform transform = result.getTransform();
    assertThat(transform.getSubtransformsList(), Matchers.hasItem(MatchesPattern.matchesPattern(".*Kafka-ProducerRecord.*")));
    assertThat(transform.getSubtransformsList(), Matchers.hasItem(MatchesPattern.matchesPattern(".*KafkaIO-WriteRecords.*")));
    assertThat(transform.getInputsCount(), Matchers.is(1));
    assertThat(transform.getOutputsCount(), Matchers.is(0));
    RunnerApi.PTransform writeComposite = result.getComponents().getTransformsOrThrow(transform.getSubtransforms(1));
    RunnerApi.PTransform writeParDo = result.getComponents().getTransformsOrThrow(result.getComponents().getTransformsOrThrow(writeComposite.getSubtransforms(0)).getSubtransforms(0));
    RunnerApi.ParDoPayload parDoPayload = RunnerApi.ParDoPayload.parseFrom(writeParDo.getSpec().getPayload());
    DoFn kafkaWriter = ParDoTranslation.getDoFn(parDoPayload);
    assertThat(kafkaWriter, Matchers.instanceOf(KafkaWriter.class));
    KafkaIO.WriteRecords spec = (KafkaIO.WriteRecords) Whitebox.getInternalState(kafkaWriter, "spec");
    assertThat(spec.getProducerConfig(), Matchers.is(producerConfig));
    assertThat(spec.getTopic(), Matchers.is(topic));
    assertThat(spec.getKeySerializer().getName(), Matchers.is(keySerializer));
    assertThat(spec.getValueSerializer().getName(), Matchers.is(valueSerializer));
}
Also used : ExpansionService(org.apache.beam.sdk.expansion.service.ExpansionService) ExternalTransforms(org.apache.beam.model.pipeline.v1.ExternalTransforms) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Pipeline(org.apache.beam.sdk.Pipeline) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) ExpansionApi(org.apache.beam.model.expansion.v1.ExpansionApi) ExternalConfigurationPayload(org.apache.beam.model.pipeline.v1.ExternalTransforms.ExternalConfigurationPayload) Test(org.junit.Test)
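
For comparison, the same write configured directly against the Java KafkaIO API rather than through the expansion service; the input is assumed to be a PCollection<KV<byte[], Long>> matching the serializers in the payload:

// Direct (non-portable) equivalent of the expanded transform; values mirror the test's payload.
input.apply(
    KafkaIO.<byte[], Long>write()
        .withBootstrapServers("server1:port,server2:port")
        .withTopic("topic")
        .withKeySerializer(ByteArraySerializer.class)
        .withValueSerializer(LongSerializer.class)
        .withProducerConfigUpdates(ImmutableMap.<String, Object>of("retries", "3")));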

Aggregations

RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 117 uses
Test (org.junit.Test): 87 uses
Pipeline (org.apache.beam.sdk.Pipeline): 82 uses
SdkComponents (org.apache.beam.runners.core.construction.SdkComponents): 44 uses
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 43 uses
DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions): 38 uses
Map (java.util.Map): 32 uses
KV (org.apache.beam.sdk.values.KV): 26 uses
Job (com.google.api.services.dataflow.model.Job): 25 uses
Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString): 24 uses
KvCoder (org.apache.beam.sdk.coders.KvCoder): 24 uses
Components (org.apache.beam.model.pipeline.v1.RunnerApi.Components): 23 uses
Coder (org.apache.beam.sdk.coders.Coder): 23 uses
ArrayList (java.util.ArrayList): 22 uses
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 22 uses
HashMap (java.util.HashMap): 20 uses
List (java.util.List): 20 uses
ExecutableStage (org.apache.beam.runners.core.construction.graph.ExecutableStage): 19 uses
IOException (java.io.IOException): 18 uses
PCollection (org.apache.beam.sdk.values.PCollection): 18 uses