Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
In the class DataflowPipelineTranslatorTest, the method testNetworkConfigMissing.
@Test
public void testNetworkConfigMissing() throws IOException {
DataflowPipelineOptions options = buildPipelineOptions();
Pipeline p = buildPipeline(options);
p.traverseTopologically(new RecordingPipelineVisitor());
SdkComponents sdkComponents = createSdkComponents(options);
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p, sdkComponents, true);
Job job =
    DataflowPipelineTranslator.fromOptions(options)
        .translate(
            p,
            pipelineProto,
            sdkComponents,
            DataflowRunner.fromOptions(options),
            Collections.emptyList())
        .getJob();
assertEquals(1, job.getEnvironment().getWorkerPools().size());
assertNull(job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
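The helpers buildPipelineOptions() and buildPipeline() are defined elsewhere in the test class and are not shown here. A minimal sketch of what they might look like, assuming standard Beam SDK imports, a fake GCS location, and a stubbed credential/GcsUtil setup (the exact options set by the real helpers may differ):
// Sketch only - assumed helpers, not the exact code from DataflowPipelineTranslatorTest.
private static DataflowPipelineOptions buildPipelineOptions() {
  DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
  options.setRunner(DataflowRunner.class);
  options.setProject("some-project");
  options.setRegion("some-region");
  options.setTempLocation("gs://some-bucket/temp");
  // The real test would also stub out GCS/credential access so that translation
  // never touches live GCP services.
  return options;
}

private static Pipeline buildPipeline(DataflowPipelineOptions options) {
  // A trivial read/write pipeline is enough for translation-only tests.
  Pipeline p = Pipeline.create(options);
  p.apply("ReadMyFile", TextIO.read().from("gs://bucket/object"))
      .apply("WriteMyFile", TextIO.write().to("gs://bucket/object"));
  return p;
}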
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
In the class DataflowPipelineTranslatorTest, the method testTaggedNamesOverridden.
/**
 * Tests that in translation the names of the output collections of a multi-output ParDo - a
 * special case because the user can name the tags - are overridden to be what the Dataflow
 * service expects.
 */
@Test
public void testTaggedNamesOverridden() throws Exception {
DataflowPipelineOptions options = buildPipelineOptions();
DataflowRunner runner = DataflowRunner.fromOptions(options);
options.setStreaming(false);
DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
Pipeline pipeline = Pipeline.create(options);
TupleTag<Integer> tag1 = new TupleTag<Integer>("frazzle") {};
TupleTag<Integer> tag2 = new TupleTag<Integer>("bazzle") {};
TupleTag<Integer> tag3 = new TupleTag<Integer>() {};
PCollectionTuple outputs =
    pipeline
        .apply(Create.of(3))
        .apply(
            ParDo.of(
                    new DoFn<Integer, Integer>() {
                      @ProcessElement
                      public void drop() {}
                    })
                .withOutputTags(tag1, TupleTagList.of(tag2).and(tag3)));
outputs.get(tag1).setName("bizbazzle");
outputs.get(tag2).setName("gonzaggle");
outputs.get(tag3).setName("froonazzle");
runner.replaceV1Transforms(pipeline);
SdkComponents sdkComponents = createSdkComponents(options);
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
Job job = translator.translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList()).getJob();
// The ParDo step
Step step = job.getSteps().get(1);
String stepName = getString(step.getProperties(), PropertyNames.USER_NAME);
List<Map<String, Object>> outputInfos = Structs.getListOfMaps(step.getProperties(), PropertyNames.OUTPUT_INFO, null);
assertThat(outputInfos.size(), equalTo(3));
// The names set by the user _and_ the tags _must_ be ignored, or metrics will not show up.
for (int i = 0; i < outputInfos.size(); ++i) {
assertThat(getString(outputInfos.get(i), PropertyNames.USER_NAME), equalTo(String.format("%s.out%s", stepName, i)));
}
}
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
In the class DataflowPipelineTranslatorTest, the method testResourceHintsTranslationsResolvesHintsOnOptionsAndComposites.
@Test
public void testResourceHintsTranslationsResolvesHintsOnOptionsAndComposites() {
ResourceHintsOptions options = PipelineOptionsFactory.as(ResourceHintsOptions.class);
options.setResourceHints(Arrays.asList("accelerator=set_via_options", "minRam=1B"));
Pipeline pipeline = Pipeline.create(options);
PCollection<byte[]> root = pipeline.apply(Impulse.create());
root.apply(new Outer().setResourceHints(ResourceHints.create().withAccelerator("set_on_outer_transform").withMinRam(20)));
root.apply("Leaf", ParDo.of(new IdentityDoFn<byte[]>()));
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, false);
assertThat(
    pipelineProto
        .getComponents()
        .getEnvironmentsMap()
        .get(getLeafTransform(pipelineProto, "Leaf").getEnvironmentId())
        .getResourceHintsMap(),
    org.hamcrest.Matchers.allOf(
        org.hamcrest.Matchers.hasEntry(
            "beam:resources:min_ram_bytes:v1", ByteString.copyFromUtf8("1")),
        org.hamcrest.Matchers.hasEntry(
            "beam:resources:accelerator:v1", ByteString.copyFromUtf8("set_via_options"))));
assertThat(
    pipelineProto
        .getComponents()
        .getEnvironmentsMap()
        .get(getLeafTransform(pipelineProto, "Innermost").getEnvironmentId())
        .getResourceHintsMap(),
    org.hamcrest.Matchers.allOf(
        org.hamcrest.Matchers.hasEntry(
            "beam:resources:min_ram_bytes:v1", ByteString.copyFromUtf8("20")),
        org.hamcrest.Matchers.hasEntry(
            "beam:resources:accelerator:v1", ByteString.copyFromUtf8("set_in_inner_transform"))));
}
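The composite Outer, its inner "Innermost" ParDo, and the getLeafTransform helper are defined elsewhere in the test class. A rough sketch of the shape they would need for the assertions above to hold - class names, the IdentityDoFn body, and the helper are assumptions, not the exact Beam test code:
// Sketch only - assumed shapes. The composite sets min RAM, the inner ParDo named
// "Innermost" sets its own accelerator hint, and both are expected to resolve into
// the leaf transform's environment.
private static class IdentityDoFn<T> extends DoFn<T, T> {
  @ProcessElement
  public void processElement(@Element T element, OutputReceiver<T> out) {
    out.output(element);
  }
}

private static class Inner extends PTransform<PCollection<byte[]>, PCollection<byte[]>> {
  @Override
  public PCollection<byte[]> expand(PCollection<byte[]> input) {
    // The accelerator hint set here should override the one set via pipeline options.
    return input.apply(
        "Innermost",
        ParDo.of(new IdentityDoFn<byte[]>())
            .setResourceHints(ResourceHints.create().withAccelerator("set_in_inner_transform")));
  }
}

private static class Outer extends PTransform<PCollection<byte[]>, PCollection<byte[]>> {
  @Override
  public PCollection<byte[]> expand(PCollection<byte[]> input) {
    return input.apply("Inner", new Inner());
  }
}

// Assumed helper: find the leaf (no subtransforms) PTransform whose unique name contains `name`.
private static RunnerApi.PTransform getLeafTransform(RunnerApi.Pipeline proto, String name) {
  for (RunnerApi.PTransform transform : proto.getComponents().getTransformsMap().values()) {
    if (transform.getUniqueName().contains(name) && transform.getSubtransformsCount() == 0) {
      return transform;
    }
  }
  throw new IllegalArgumentException("No leaf transform found matching: " + name);
}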
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
In the class SparkPipelineRunner, the method run.
@Override
public PortablePipelineResult run(RunnerApi.Pipeline pipeline, JobInfo jobInfo) {
SparkPortablePipelineTranslator translator;
boolean isStreaming = pipelineOptions.isStreaming() || hasUnboundedPCollections(pipeline);
if (isStreaming) {
translator = new SparkStreamingPortablePipelineTranslator();
} else {
translator = new SparkBatchPortablePipelineTranslator();
}
// Expand any splittable DoFns within the graph to enable sizing and splitting of bundles.
Pipeline pipelineWithSdfExpanded = ProtoOverrides.updateTransform(PTransformTranslation.PAR_DO_TRANSFORM_URN, pipeline, SplittableParDoExpander.createSizedReplacement());
// Don't let the fuser fuse any subcomponents of native transforms.
Pipeline trimmedPipeline = TrivialNativeTransformExpander.forKnownUrns(pipelineWithSdfExpanded, translator.knownUrns());
// Fused pipeline proto.
// TODO: Consider supporting partially-fused graphs.
RunnerApi.Pipeline fusedPipeline =
    trimmedPipeline.getComponents().getTransformsMap().values().stream()
            .anyMatch(proto -> ExecutableStage.URN.equals(proto.getSpec().getUrn()))
        ? trimmedPipeline
        : GreedyPipelineFuser.fuse(trimmedPipeline).toPipeline();
prepareFilesToStage(pipelineOptions);
PortablePipelineResult result;
final JavaSparkContext jsc = SparkContextFactory.getSparkContext(pipelineOptions);
final long startTime = Instant.now().getMillis();
EventLoggingListener eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);
// Initialize accumulators.
AggregatorsAccumulator.init(pipelineOptions, jsc);
MetricsEnvironment.setMetricsSupported(true);
MetricsAccumulator.init(pipelineOptions, jsc);
final SparkTranslationContext context = translator.createTranslationContext(jsc, pipelineOptions, jobInfo);
final ExecutorService executorService = Executors.newSingleThreadExecutor();
LOG.info(String.format("Running job %s on Spark master %s", jobInfo.jobId(), jsc.master()));
if (isStreaming) {
final JavaStreamingContext jssc = ((SparkStreamingTranslationContext) context).getStreamingContext();
jssc.addStreamingListener(new JavaStreamingListenerWrapper(new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
jssc.addStreamingListener(new JavaStreamingListenerWrapper(new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));
// Register user-defined listeners.
for (JavaStreamingListener listener : pipelineOptions.as(SparkContextOptions.class).getListeners()) {
LOG.info("Registered listener {}." + listener.getClass().getSimpleName());
jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
}
// Register Watermarks listener to broadcast the advanced WMs.
jssc.addStreamingListener(new JavaStreamingListenerWrapper(new GlobalWatermarkHolder.WatermarkAdvancingStreamingListener()));
jssc.checkpoint(pipelineOptions.getCheckpointDir());
// Obtain timeout from options.
Long timeout = pipelineOptions.as(SparkPortableStreamingPipelineOptions.class).getStreamingTimeoutMs();
final Future<?> submissionFuture = executorService.submit(() -> {
translator.translate(fusedPipeline, context);
LOG.info(String.format("Job %s: Pipeline translated successfully. Computing outputs", jobInfo.jobId()));
context.computeOutputs();
jssc.start();
try {
jssc.awaitTerminationOrTimeout(timeout);
} catch (InterruptedException e) {
LOG.warn("Streaming context interrupted, shutting down.", e);
}
jssc.stop();
LOG.info(String.format("Job %s finished.", jobInfo.jobId()));
});
result = new SparkPipelineResult.PortableStreamingMode(submissionFuture, jssc);
} else {
final Future<?> submissionFuture = executorService.submit(() -> {
translator.translate(fusedPipeline, context);
LOG.info(String.format("Job %s: Pipeline translated successfully. Computing outputs", jobInfo.jobId()));
context.computeOutputs();
LOG.info(String.format("Job %s finished.", jobInfo.jobId()));
});
result = new SparkPipelineResult.PortableBatchMode(submissionFuture, jsc);
}
executorService.shutdown();
result.waitUntilFinish();
MetricsPusher metricsPusher = new MetricsPusher(MetricsAccumulator.getInstance().value(), pipelineOptions.as(MetricsOptions.class), result);
metricsPusher.start();
if (eventLoggingListener != null) {
eventLoggingListener.onApplicationStart(SparkCompat.buildSparkListenerApplicationStart(jsc, pipelineOptions, startTime, result));
eventLoggingListener.onApplicationEnd(new SparkListenerApplicationEnd(Instant.now().getMillis()));
eventLoggingListener.stop();
}
return result;
}
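The hasUnboundedPCollections(pipeline) call near the top decides whether the streaming translator is needed. A minimal sketch of such a check over the RunnerApi proto - the actual helper in the runner may differ in details:
// Sketch: treat the job as streaming if any PCollection in the proto is unbounded.
private static boolean hasUnboundedPCollections(RunnerApi.Pipeline pipeline) {
  return pipeline.getComponents().getPcollectionsMap().values().stream()
      .anyMatch(pc -> pc.getIsBounded() == RunnerApi.IsBounded.Enum.UNBOUNDED);
}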
Use of org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline in project beam by apache.
In the class KafkaIOExternalTest, the method testConstructKafkaWrite.
@Test
public void testConstructKafkaWrite() throws Exception {
String topic = "topic";
String keySerializer = "org.apache.kafka.common.serialization.ByteArraySerializer";
String valueSerializer = "org.apache.kafka.common.serialization.LongSerializer";
ImmutableMap<String, String> producerConfig = ImmutableMap.<String, String>builder().put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "server1:port,server2:port").put("retries", "3").build();
ExternalTransforms.ExternalConfigurationPayload payload =
    encodeRow(
        Row.withSchema(
                Schema.of(
                    Field.of("topic", FieldType.STRING),
                    Field.of("producer_config", FieldType.map(FieldType.STRING, FieldType.STRING)),
                    Field.of("key_serializer", FieldType.STRING),
                    Field.of("value_serializer", FieldType.STRING)))
            .withFieldValue("topic", topic)
            .withFieldValue("producer_config", producerConfig)
            .withFieldValue("key_serializer", keySerializer)
            .withFieldValue("value_serializer", valueSerializer)
            .build());
Pipeline p = Pipeline.create();
p.apply(Impulse.create()).apply(WithKeys.of("key"));
RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
String inputPCollection =
    Iterables.getOnlyElement(
        Iterables.getLast(pipelineProto.getComponents().getTransformsMap().values())
            .getOutputsMap()
            .values());
ExpansionApi.ExpansionRequest request =
    ExpansionApi.ExpansionRequest.newBuilder()
        .setComponents(pipelineProto.getComponents())
        .setTransform(
            RunnerApi.PTransform.newBuilder()
                .setUniqueName("test")
                .putInputs("input", inputPCollection)
                .setSpec(
                    RunnerApi.FunctionSpec.newBuilder()
                        .setUrn(org.apache.beam.sdk.io.kafka.KafkaIO.Write.External.URN)
                        .setPayload(payload.toByteString())))
        .setNamespace("test_namespace")
        .build();
ExpansionService expansionService = new ExpansionService();
TestStreamObserver<ExpansionApi.ExpansionResponse> observer = new TestStreamObserver<>();
expansionService.expand(request, observer);
ExpansionApi.ExpansionResponse result = observer.result;
RunnerApi.PTransform transform = result.getTransform();
assertThat(transform.getSubtransformsList(), Matchers.hasItem(MatchesPattern.matchesPattern(".*Kafka-ProducerRecord.*")));
assertThat(transform.getSubtransformsList(), Matchers.hasItem(MatchesPattern.matchesPattern(".*KafkaIO-WriteRecords.*")));
assertThat(transform.getInputsCount(), Matchers.is(1));
assertThat(transform.getOutputsCount(), Matchers.is(0));
RunnerApi.PTransform writeComposite = result.getComponents().getTransformsOrThrow(transform.getSubtransforms(1));
RunnerApi.PTransform writeParDo = result.getComponents().getTransformsOrThrow(result.getComponents().getTransformsOrThrow(writeComposite.getSubtransforms(0)).getSubtransforms(0));
RunnerApi.ParDoPayload parDoPayload = RunnerApi.ParDoPayload.parseFrom(writeParDo.getSpec().getPayload());
DoFn kafkaWriter = ParDoTranslation.getDoFn(parDoPayload);
assertThat(kafkaWriter, Matchers.instanceOf(KafkaWriter.class));
KafkaIO.WriteRecords spec = (KafkaIO.WriteRecords) Whitebox.getInternalState(kafkaWriter, "spec");
assertThat(spec.getProducerConfig(), Matchers.is(producerConfig));
assertThat(spec.getTopic(), Matchers.is(topic));
assertThat(spec.getKeySerializer().getName(), Matchers.is(keySerializer));
assertThat(spec.getValueSerializer().getName(), Matchers.is(valueSerializer));
}
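The encodeRow(...) helper used above packs the configuration Row into an ExternalConfigurationPayload. A sketch of how that encoding might look, assuming the payload carries the Row's schema plus the row bytes encoded with RowCoder - the real test helper may be implemented differently:
// Sketch only (assumed): serialize the Row's schema and the row itself into the payload.
private static ExternalTransforms.ExternalConfigurationPayload encodeRow(Row row) {
  ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
  try {
    RowCoder.of(row.getSchema()).encode(row, outputStream);
  } catch (IOException e) {
    throw new RuntimeException("Failed to encode configuration row", e);
  }
  return ExternalTransforms.ExternalConfigurationPayload.newBuilder()
      .setSchema(SchemaTranslation.schemaToProto(row.getSchema(), true))
      .setPayload(ByteString.copyFrom(outputStream.toByteArray()))
      .build();
}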