Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.
The class SparkRunner, method run().
@Override
public SparkPipelineResult run(final Pipeline pipeline) {
  LOG.info("Executing pipeline using the SparkRunner.");

  final SparkPipelineResult result;
  final Future<?> startPipeline;
  final SparkPipelineTranslator translator;
  final ExecutorService executorService = Executors.newSingleThreadExecutor();

  MetricsEnvironment.setMetricsSupported(true);

  // Visit the pipeline to determine the translation mode.
  detectTranslationMode(pipeline);

  // TODO(BEAM-10670): Use SDF read as default when we address performance issue.
  if (!ExperimentalOptions.hasExperiment(pipeline.getOptions(), "beam_fn_api")) {
    SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReadsIfNecessary(pipeline);
  }

  pipeline.replaceAll(SparkTransformOverrides.getDefaultOverrides(pipelineOptions.isStreaming()));
  prepareFilesToStage(pipelineOptions);

  final long startTime = Instant.now().getMillis();
  EventLoggingListener eventLoggingListener = null;
  JavaSparkContext jsc = null;

  if (pipelineOptions.isStreaming()) {
    CheckpointDir checkpointDir = new CheckpointDir(pipelineOptions.getCheckpointDir());
    SparkRunnerStreamingContextFactory streamingContextFactory =
        new SparkRunnerStreamingContextFactory(pipeline, pipelineOptions, checkpointDir);
    final JavaStreamingContext jssc =
        JavaStreamingContext.getOrCreate(
            checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);
    jsc = jssc.sparkContext();
    eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);

    // Checkpoint aggregator/metrics values.
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));

    // Register user-defined listeners.
    for (JavaStreamingListener listener :
        pipelineOptions.as(SparkContextOptions.class).getListeners()) {
      LOG.info("Registered listener {}.", listener.getClass().getSimpleName());
      jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
    }

    // Register the Watermarks listener to broadcast the advanced watermarks.
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(new WatermarkAdvancingStreamingListener()));

    // initAccumulators is also called in SparkRunnerStreamingContextFactory, but the factory is
    // not invoked when resuming from a checkpoint. When not resuming from a checkpoint,
    // initAccumulators is called twice, which is fine because it is idempotent.
    initAccumulators(pipelineOptions, jssc.sparkContext());

    startPipeline =
        executorService.submit(
            () -> {
              LOG.info("Starting streaming pipeline execution.");
              jssc.start();
            });
    executorService.shutdown();

    result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
  } else {
    jsc = SparkContextFactory.getSparkContext(pipelineOptions);
    eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);

    final EvaluationContext evaluationContext =
        new EvaluationContext(jsc, pipeline, pipelineOptions);
    translator = new TransformTranslator.Translator();

    // Update the cache candidates.
    updateCacheCandidates(pipeline, translator, evaluationContext);
    initAccumulators(pipelineOptions, jsc);

    startPipeline =
        executorService.submit(
            () -> {
              pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
              evaluationContext.computeOutputs();
              LOG.info("Batch pipeline execution complete.");
            });
    executorService.shutdown();

    result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
  }

  if (pipelineOptions.getEnableSparkMetricSinks()) {
    registerMetricsSource(pipelineOptions.getAppName());
  }

  // It would have been better to create the MetricsPusher from runner-core, but we need the
  // runner-specific MetricsContainerStepMap.
  MetricsPusher metricsPusher =
      new MetricsPusher(
          MetricsAccumulator.getInstance().value(),
          pipelineOptions.as(MetricsOptions.class),
          result);
  metricsPusher.start();

  if (eventLoggingListener != null && jsc != null) {
    eventLoggingListener.onApplicationStart(
        SparkCompat.buildSparkListenerApplicationStart(jsc, pipelineOptions, startTime, result));
    eventLoggingListener.onApplicationEnd(
        new SparkListenerApplicationEnd(Instant.now().getMillis()));
    eventLoggingListener.stop();
  }

  return result;
}
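For context, here is a minimal sketch of how a user pipeline ends up in the run() method above. It is not taken from the Beam sources: the transforms and option values are illustrative, and imports are omitted as in the other snippets on this page.

// Sketch: submitting a pipeline to the SparkRunner. Option values are illustrative.
SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
options.setRunner(SparkRunner.class);
options.setSparkMaster("local[4]"); // illustrative; any Spark master URL works

Pipeline p = Pipeline.create(options);
p.apply(Create.of("foo", "bar", "foo")).apply(Count.perElement());

// Pipeline.run() dispatches to SparkRunner.run(), which returns the SparkPipelineResult
// assembled above; waitUntilFinish() blocks on the Future submitted to the executor service.
SparkPipelineResult result = (SparkPipelineResult) p.run();
result.waitUntilFinish();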
Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.
The class SparkRunnerDebugger, method run().
@Override
public SparkPipelineResult run(Pipeline pipeline) {
  boolean isStreaming =
      options.isStreaming() || options.as(TestSparkPipelineOptions.class).isForceStreaming();

  // TODO(BEAM-10670): Use SDF read as default when we address performance issue.
  if (!ExperimentalOptions.hasExperiment(pipeline.getOptions(), "beam_fn_api")) {
    SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReadsIfNecessary(pipeline);
  }

  JavaSparkContext jsc = new JavaSparkContext("local[1]", "Debug_Pipeline");
  JavaStreamingContext jssc =
      new JavaStreamingContext(jsc, new org.apache.spark.streaming.Duration(1000));

  SparkRunner.initAccumulators(options, jsc);

  TransformTranslator.Translator translator = new TransformTranslator.Translator();
  SparkNativePipelineVisitor visitor;
  if (isStreaming) {
    SparkPipelineTranslator streamingTranslator =
        new StreamingTransformTranslator.Translator(translator);
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
    visitor = new SparkNativePipelineVisitor(streamingTranslator, ctxt);
  } else {
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
    visitor = new SparkNativePipelineVisitor(translator, ctxt);
  }

  pipeline.traverseTopologically(visitor);
  jsc.stop();

  String debugString = visitor.getDebugString();
  LOG.info("Translated Native Spark pipeline:\n" + debugString);
  return new DebugSparkPipelineResult(debugString);
}
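For comparison with the production runner, a hedged sketch of selecting SparkRunnerDebugger, which translates the pipeline and returns a debug string instead of running a Spark job. The transforms are illustrative; the runner selection and the getDebugString() call are the point.

// Sketch: running a pipeline through SparkRunnerDebugger. Transform choices are illustrative.
SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
options.setRunner(SparkRunnerDebugger.class);

Pipeline p = Pipeline.create(options);
p.apply(Create.of("foo", "bar")).apply(Count.perElement());

// run() traverses the pipeline with SparkNativePipelineVisitor (as shown above) and wraps
// the resulting debug string in a DebugSparkPipelineResult; no Spark job is executed.
SparkRunnerDebugger.DebugSparkPipelineResult result =
    (SparkRunnerDebugger.DebugSparkPipelineResult) p.run();
String nativePipeline = result.getDebugString();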
Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.
The class StreamingTransformTranslator, method combineGrouped().
private static <K, InputT, OutputT>
    TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>> combineGrouped() {
  return new TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>>() {
    @Override
    public void evaluate(
        final Combine.GroupedValues<K, InputT, OutputT> transform, EvaluationContext context) {
      // Get the applied combine function.
      PCollection<? extends KV<K, ? extends Iterable<InputT>>> input = context.getInput(transform);
      final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
      @SuppressWarnings("unchecked")
      final CombineWithContext.CombineFnWithContext<InputT, ?, OutputT> fn =
          (CombineWithContext.CombineFnWithContext<InputT, ?, OutputT>)
              CombineFnUtil.toFnWithContext(transform.getFn());
      @SuppressWarnings("unchecked")
      UnboundedDataset<KV<K, Iterable<InputT>>> unboundedDataset =
          (UnboundedDataset<KV<K, Iterable<InputT>>>) context.borrowDataset(transform);
      JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> dStream = unboundedDataset.getDStream();
      final SerializablePipelineOptions options = context.getSerializableOptions();
      final SparkPCollectionView pviews = context.getPViews();
      JavaDStream<WindowedValue<KV<K, OutputT>>> outStream =
          dStream.transform(
              rdd -> {
                SparkCombineFn<KV<K, InputT>, InputT, ?, OutputT> combineFnWithContext =
                    SparkCombineFn.keyed(
                        fn,
                        options,
                        TranslationUtils.getSideInputs(
                            transform.getSideInputs(), new JavaSparkContext(rdd.context()), pviews),
                        windowingStrategy);
                return rdd.map(new TranslationUtils.CombineGroupedValues<>(combineFnWithContext));
              });
      context.putDataset(
          transform, new UnboundedDataset<>(outStream, unboundedDataset.getStreamSources()));
    }

    @Override
    public String toNativeString() {
      return "map(new <fn>())";
    }
  };
}
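The evaluator above handles Combine.GroupedValues on the streaming path. Below is a minimal sketch of the pipeline shape that produces that transform, assuming an existing Pipeline p; the key/value types and the Sum combiner are illustrative.

// Sketch: GroupByKey followed by Combine.groupedValues(...) yields the Combine.GroupedValues
// transform that combineGrouped() translates into a map over the grouped DStream.
PCollection<KV<String, Integer>> keyed =
    p.apply(Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)));

PCollection<KV<String, Iterable<Integer>>> grouped =
    keyed.apply(GroupByKey.<String, Integer>create());
PCollection<KV<String, Integer>> sums =
    grouped.apply(Combine.<String, Integer, Integer>groupedValues(Sum.ofIntegers()));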
Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.
The class CacheTest, method shouldCacheTest().
@Test
public void shouldCacheTest() {
  SparkPipelineOptions options = createOptions();
  options.setCacheDisabled(true);
  Pipeline pipeline = Pipeline.create(options);

  Values<String> valuesTransform = Create.of("foo", "bar");
  PCollection pCollection = mock(PCollection.class);

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  ctxt.getCacheCandidates().put(pCollection, 2L);

  // Caching is disabled, so nothing should be cached.
  assertFalse(ctxt.shouldCache(valuesTransform, pCollection));

  // With caching enabled, a PCollection consumed more than once is cached.
  options.setCacheDisabled(false);
  assertTrue(ctxt.shouldCache(valuesTransform, pCollection));

  // Even then, the output of a GroupByKey is not cached.
  GroupByKey<String, String> gbkTransform = GroupByKey.create();
  assertFalse(ctxt.shouldCache(gbkTransform, pCollection));
}
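Both CacheTest methods call a createOptions() helper that is not shown on this page. A plausible reconstruction is sketched below purely for self-containedness; the helper name comes from the tests, but its body here is an assumption and the real CacheTest may configure its options differently.

// Hypothetical reconstruction of the createOptions() helper referenced above; the real
// implementation in CacheTest may differ.
private static SparkPipelineOptions createOptions() {
  SparkPipelineOptions options = PipelineOptionsFactory.create().as(SparkPipelineOptions.class);
  options.setRunner(TestSparkRunner.class); // assumption: the tests target the Spark test runner
  return options;
}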
Use of org.apache.beam.runners.spark.translation.EvaluationContext in project beam by apache.
The class CacheTest, method cacheCandidatesUpdaterTest().
/**
 * Checks how the cache candidates map is populated by the runner when evaluating the pipeline.
 */
@Test
public void cacheCandidatesUpdaterTest() {
  SparkPipelineOptions options = createOptions();
  Pipeline pipeline = Pipeline.create(options);
  PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));

  // First use of pCollection.
  pCollection.apply(Count.globally());

  // Second use of pCollection.
  PCollectionView<List<String>> view = pCollection.apply(View.asList());

  // Internally, View.asList() creates a PCollection that underlies the PCollectionView. That
  // PCollection should not be cached, because the SparkRunner does not go through it to access
  // the PCollectionView.
  pipeline
      .apply(Create.of("foo", "baz"))
      .apply(
          ParDo.of(
                  new DoFn<String, String>() {
                    @ProcessElement
                    public void processElement(ProcessContext processContext) {
                      if (processContext.sideInput(view).contains(processContext.element())) {
                        processContext.output(processContext.element());
                      }
                    }
                  })
              .withSideInputs(view));

  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
  SparkRunner.CacheVisitor cacheVisitor =
      new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
  pipeline.traverseTopologically(cacheVisitor);

  // pCollection is consumed twice, so it is a cache candidate with a usage count of 2.
  assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
  // It is the only PCollection used more than once.
  assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
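Tying the two tests together: the usage count computed by the CacheVisitor is what shouldCache() consults at translation time. A hedged follow-up sketch, assuming caching is left enabled in createOptions().

// Sketch only: with a usage count of 2 and caching enabled, the doubly-consumed PCollection
// should be reported as cacheable. Create.of(...) stands in for its producing transform here.
assertTrue(ctxt.shouldCache(Create.of("foo", "bar"), pCollection));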