Search in sources :

Example 1 with WatermarkAdvancingStreamingListener

Use of org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarkAdvancingStreamingListener in the project beam by apache.

From the class SparkRunner, method run.

/**
 * Executes the given pipeline with the SparkRunner.
 *
 * <p>Depending on {@code pipelineOptions.isStreaming()}, the pipeline is either started on a
 * {@code JavaStreamingContext} (obtained via {@code JavaStreamingContext.getOrCreate} against the
 * configured checkpoint directory) or translated and evaluated as a batch job on a
 * {@code JavaSparkContext}. In both cases the actual work is submitted to a single-thread
 * executor, and the returned result wraps the {@link Future} of that submission.
 *
 * @param pipeline the pipeline to execute; transform overrides are applied to it in place
 * @return a {@code SparkPipelineResult.StreamingMode} or {@code SparkPipelineResult.BatchMode}
 *     wrapping the submitted execution
 */
@Override
public SparkPipelineResult run(final Pipeline pipeline) {
    LOG.info("Executing pipeline using the SparkRunner.");
    final SparkPipelineResult result;
    final Future<?> startPipeline;
    final ExecutorService executorService = Executors.newSingleThreadExecutor();
    MetricsEnvironment.setMetricsSupported(true);
    // visit the pipeline to determine the translation mode
    detectTranslationMode(pipeline);
    // TODO(BEAM-10670): Use SDF read as default when we address performance issue.
    if (!ExperimentalOptions.hasExperiment(pipeline.getOptions(), "beam_fn_api")) {
        SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReadsIfNecessary(pipeline);
    }
    pipeline.replaceAll(SparkTransformOverrides.getDefaultOverrides(pipelineOptions.isStreaming()));
    prepareFilesToStage(pipelineOptions);
    final long startTime = Instant.now().getMillis();
    EventLoggingListener eventLoggingListener = null;
    JavaSparkContext jsc = null;
    if (pipelineOptions.isStreaming()) {
        CheckpointDir checkpointDir = new CheckpointDir(pipelineOptions.getCheckpointDir());
        SparkRunnerStreamingContextFactory streamingContextFactory = new SparkRunnerStreamingContextFactory(pipeline, pipelineOptions, checkpointDir);
        final JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);
        jsc = jssc.sparkContext();
        eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);
        // Checkpoint aggregator/metrics values
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));
        // register user-defined listeners.
        for (JavaStreamingListener listener : pipelineOptions.as(SparkContextOptions.class).getListeners()) {
            // Pass the name as a logging argument so the {} placeholder is actually substituted.
            LOG.info("Registered listener {}.", listener.getClass().getSimpleName());
            jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
        }
        // register Watermarks listener to broadcast the advanced WMs.
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new WatermarkAdvancingStreamingListener()));
        // The reason we call initAccumulators here even though it is called in
        // SparkRunnerStreamingContextFactory is because the factory is not called when resuming
        // from checkpoint (When not resuming from checkpoint initAccumulators will be called twice
        // but this is fine since it is idempotent).
        initAccumulators(pipelineOptions, jssc.sparkContext());
        startPipeline = executorService.submit(() -> {
            LOG.info("Starting streaming pipeline execution.");
            jssc.start();
        });
        // No further tasks are submitted; let the worker thread exit once the task completes.
        executorService.shutdown();
        result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
    } else {
        jsc = SparkContextFactory.getSparkContext(pipelineOptions);
        eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);
        final EvaluationContext evaluationContext = new EvaluationContext(jsc, pipeline, pipelineOptions);
        // The translator is only needed for batch evaluation, so it is scoped to this branch.
        final SparkPipelineTranslator translator = new TransformTranslator.Translator();
        // update the cache candidates
        updateCacheCandidates(pipeline, translator, evaluationContext);
        initAccumulators(pipelineOptions, jsc);
        startPipeline = executorService.submit(() -> {
            pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
            evaluationContext.computeOutputs();
            LOG.info("Batch pipeline execution complete.");
        });
        // No further tasks are submitted; let the worker thread exit once the task completes.
        executorService.shutdown();
        result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
    }
    if (pipelineOptions.getEnableSparkMetricSinks()) {
        registerMetricsSource(pipelineOptions.getAppName());
    }
    // it would have been better to create MetricsPusher from runner-core but we need
    // runner-specific
    // MetricsContainerStepMap
    MetricsPusher metricsPusher = new MetricsPusher(MetricsAccumulator.getInstance().value(), pipelineOptions.as(MetricsOptions.class), result);
    metricsPusher.start();
    if (eventLoggingListener != null && jsc != null) {
        eventLoggingListener.onApplicationStart(SparkCompat.buildSparkListenerApplicationStart(jsc, pipelineOptions, startTime, result));
        eventLoggingListener.onApplicationEnd(new SparkListenerApplicationEnd(Instant.now().getMillis()));
        eventLoggingListener.stop();
    }
    return result;
}
Also used : MetricsOptions(org.apache.beam.sdk.metrics.MetricsOptions) JavaStreamingListenerWrapper(org.apache.spark.streaming.api.java.JavaStreamingListenerWrapper) JavaStreamingListener(org.apache.spark.streaming.api.java.JavaStreamingListener) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) SparkRunnerStreamingContextFactory(org.apache.beam.runners.spark.translation.streaming.SparkRunnerStreamingContextFactory) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) TransformTranslator(org.apache.beam.runners.spark.translation.TransformTranslator) SparkListenerApplicationEnd(org.apache.spark.scheduler.SparkListenerApplicationEnd) ExecutorService(java.util.concurrent.ExecutorService) WatermarkAdvancingStreamingListener(org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarkAdvancingStreamingListener) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) MetricsPusher(org.apache.beam.runners.core.metrics.MetricsPusher) SparkPipelineTranslator(org.apache.beam.runners.spark.translation.SparkPipelineTranslator) CheckpointDir(org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir) SparkCommon.startEventLoggingListener(org.apache.beam.runners.spark.util.SparkCommon.startEventLoggingListener) EventLoggingListener(org.apache.spark.scheduler.EventLoggingListener)

Aggregations

ExecutorService (java.util.concurrent.ExecutorService)1 MetricsPusher (org.apache.beam.runners.core.metrics.MetricsPusher)1 EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext)1 SparkPipelineTranslator (org.apache.beam.runners.spark.translation.SparkPipelineTranslator)1 TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator)1 TransformTranslator (org.apache.beam.runners.spark.translation.TransformTranslator)1 CheckpointDir (org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir)1 SparkRunnerStreamingContextFactory (org.apache.beam.runners.spark.translation.streaming.SparkRunnerStreamingContextFactory)1 WatermarkAdvancingStreamingListener (org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarkAdvancingStreamingListener)1 SparkCommon.startEventLoggingListener (org.apache.beam.runners.spark.util.SparkCommon.startEventLoggingListener)1 MetricsOptions (org.apache.beam.sdk.metrics.MetricsOptions)1 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)1 EventLoggingListener (org.apache.spark.scheduler.EventLoggingListener)1 SparkListenerApplicationEnd (org.apache.spark.scheduler.SparkListenerApplicationEnd)1 JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext)1 JavaStreamingListener (org.apache.spark.streaming.api.java.JavaStreamingListener)1 JavaStreamingListenerWrapper (org.apache.spark.streaming.api.java.JavaStreamingListenerWrapper)1