Search in sources :

Example 1 with WatermarksListener

use of org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarksListener in project beam by apache.

In the class SparkRunner, method run.

/**
 * Starts executing the given pipeline on Spark and returns a handle to the running job.
 *
 * <p>The actual start is submitted to a single-thread executor, so this method returns
 * without waiting for the pipeline to finish; the returned result wraps the {@link Future}
 * of that submitted task together with the Spark context in use.
 *
 * <p>In streaming mode a {@code JavaStreamingContext} is obtained via
 * {@code JavaStreamingContext.getOrCreate} using the configured checkpoint directory, the
 * accumulator-checkpointing, user-defined and watermark listeners are registered on it, and
 * the context is started on the executor thread. In batch mode an {@code EvaluationContext}
 * is created and the pipeline is traversed and its outputs computed on the executor thread.
 *
 * @param pipeline the pipeline to execute
 * @return a streaming-mode or batch-mode {@code SparkPipelineResult}, depending on
 *     {@code mOptions.isStreaming()}
 */
@Override
public SparkPipelineResult run(final Pipeline pipeline) {
    LOG.info("Executing pipeline using the SparkRunner.");
    final SparkPipelineResult result;
    final Future<?> startPipeline;
    final ExecutorService executorService = Executors.newSingleThreadExecutor();
    MetricsEnvironment.setMetricsSupported(true);
    // visit the pipeline to determine the translation mode
    detectTranslationMode(pipeline);
    if (mOptions.isStreaming()) {
        CheckpointDir checkpointDir = new CheckpointDir(mOptions.getCheckpointDir());
        SparkRunnerStreamingContextFactory streamingContextFactory = new SparkRunnerStreamingContextFactory(pipeline, mOptions, checkpointDir);
        final JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);
        // Checkpoint aggregator/metrics values
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));
        // register user-defined listeners.
        for (JavaStreamingListener listener : mOptions.as(SparkContextOptions.class).getListeners()) {
            // Pass the name as a logger argument so the "{}" placeholder is actually substituted
            // (concatenating it onto the format string would print a literal "{}").
            LOG.info("Registered listener {}.", listener.getClass().getSimpleName());
            jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
        }
        // register Watermarks listener to broadcast the advanced WMs.
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new WatermarksListener(jssc)));
        // The reason we call initAccumulators here even though it is called in
        // SparkRunnerStreamingContextFactory is because the factory is not called when resuming
        // from checkpoint (When not resuming from checkpoint initAccumulators will be called twice
        // but this is fine since it is idempotent).
        initAccumulators(mOptions, jssc.sparkContext());
        startPipeline = executorService.submit(new Runnable() {

            @Override
            public void run() {
                LOG.info("Starting streaming pipeline execution.");
                jssc.start();
            }
        });
        result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
    } else {
        // create the evaluation context
        final JavaSparkContext jsc = SparkContextFactory.getSparkContext(mOptions);
        final EvaluationContext evaluationContext = new EvaluationContext(jsc, pipeline, mOptions);
        // The translator is only needed for batch execution, so it is scoped to this branch.
        final SparkPipelineTranslator translator = new TransformTranslator.Translator();
        // update the cache candidates
        updateCacheCandidates(pipeline, translator, evaluationContext);
        initAccumulators(mOptions, jsc);
        startPipeline = executorService.submit(new Runnable() {

            @Override
            public void run() {
                pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
                evaluationContext.computeOutputs();
                LOG.info("Batch pipeline execution complete.");
            }
        });
        result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
    }
    // Exactly one task is ever submitted. shutdown() still lets that task run to completion
    // but allows the executor's worker thread to terminate afterwards instead of lingering
    // for the lifetime of the JVM.
    executorService.shutdown();
    if (mOptions.getEnableSparkMetricSinks()) {
        registerMetricsSource(mOptions.getAppName());
    }
    return result;
}
Also used : JavaStreamingListenerWrapper(org.apache.spark.streaming.api.java.JavaStreamingListenerWrapper) JavaStreamingListener(org.apache.spark.streaming.api.java.JavaStreamingListener) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) SparkRunnerStreamingContextFactory(org.apache.beam.runners.spark.translation.streaming.SparkRunnerStreamingContextFactory) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) TransformTranslator(org.apache.beam.runners.spark.translation.TransformTranslator) WatermarksListener(org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarksListener) ExecutorService(java.util.concurrent.ExecutorService) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) SparkPipelineTranslator(org.apache.beam.runners.spark.translation.SparkPipelineTranslator) CheckpointDir(org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir)

Aggregations

ExecutorService (java.util.concurrent.ExecutorService)1 EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext)1 SparkPipelineTranslator (org.apache.beam.runners.spark.translation.SparkPipelineTranslator)1 TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator)1 TransformTranslator (org.apache.beam.runners.spark.translation.TransformTranslator)1 CheckpointDir (org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir)1 SparkRunnerStreamingContextFactory (org.apache.beam.runners.spark.translation.streaming.SparkRunnerStreamingContextFactory)1 WatermarksListener (org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarksListener)1 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)1 JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext)1 JavaStreamingListener (org.apache.spark.streaming.api.java.JavaStreamingListener)1 JavaStreamingListenerWrapper (org.apache.spark.streaming.api.java.JavaStreamingListenerWrapper)1