Use of org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir in project beam by apache.
The init method of the class MetricsAccumulator:
/**
 * Initializes the metrics accumulator if it has not been initialized yet. This method is
 * idempotent.
 */
public static void init(SparkPipelineOptions opts, JavaSparkContext jsc) {
  if (instance == null) {
    synchronized (MetricsAccumulator.class) {
      if (instance == null) {
        Optional<CheckpointDir> maybeCheckpointDir =
            opts.isStreaming()
                ? Optional.of(new CheckpointDir(opts.getCheckpointDir()))
                : Optional.absent();
        MetricsContainerStepMap metricsContainerStepMap = new SparkMetricsContainerStepMap();
        MetricsContainerStepMapAccumulator accumulator =
            new MetricsContainerStepMapAccumulator(metricsContainerStepMap);
        if (maybeCheckpointDir.isPresent()) {
          Optional<MetricsContainerStepMap> maybeRecoveredValue =
              recoverValueFromCheckpoint(jsc, maybeCheckpointDir.get());
          if (maybeRecoveredValue.isPresent()) {
            accumulator = new MetricsContainerStepMapAccumulator(maybeRecoveredValue.get());
          }
        }
        jsc.sc().register(accumulator, ACCUMULATOR_NAME);
        instance = accumulator;
      }
    }
    LOG.info("Instantiated metrics accumulator: " + instance.value());
  } else {
    instance.reset();
  }
}
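The initAccumulators helper that SparkRunner.run (below) invokes is a thin delegator to this method; a minimal sketch, assuming the companion AggregatorsAccumulator exposes the same idempotent init(opts, jsc) signature:

// Sketch of the calling side. Called both from SparkRunnerStreamingContextFactory
// and from SparkRunner.run; safe to invoke repeatedly because both init methods
// are idempotent.
public static void initAccumulators(SparkPipelineOptions opts, JavaSparkContext jsc) {
  AggregatorsAccumulator.init(opts, jsc);
  MetricsAccumulator.init(opts, jsc);
}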
Use of org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir in project beam by apache.
The run method of the class SparkRunner:
@Override
public SparkPipelineResult run(final Pipeline pipeline) {
  LOG.info("Executing pipeline using the SparkRunner.");

  final SparkPipelineResult result;
  final Future<?> startPipeline;
  final SparkPipelineTranslator translator;
  final ExecutorService executorService = Executors.newSingleThreadExecutor();

  MetricsEnvironment.setMetricsSupported(true);

  // visit the pipeline to determine the translation mode
  detectTranslationMode(pipeline);

  // TODO(BEAM-10670): Use SDF read as default when we address performance issue.
  if (!ExperimentalOptions.hasExperiment(pipeline.getOptions(), "beam_fn_api")) {
    SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReadsIfNecessary(pipeline);
  }

  pipeline.replaceAll(SparkTransformOverrides.getDefaultOverrides(pipelineOptions.isStreaming()));
  prepareFilesToStage(pipelineOptions);

  final long startTime = Instant.now().getMillis();
  EventLoggingListener eventLoggingListener = null;
  JavaSparkContext jsc = null;

  if (pipelineOptions.isStreaming()) {
    CheckpointDir checkpointDir = new CheckpointDir(pipelineOptions.getCheckpointDir());
    SparkRunnerStreamingContextFactory streamingContextFactory =
        new SparkRunnerStreamingContextFactory(pipeline, pipelineOptions, checkpointDir);
    final JavaStreamingContext jssc =
        JavaStreamingContext.getOrCreate(
            checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);
    jsc = jssc.sparkContext();
    eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);

    // Checkpoint aggregator/metrics values
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));

    // register user-defined listeners.
    for (JavaStreamingListener listener :
        pipelineOptions.as(SparkContextOptions.class).getListeners()) {
LOG.info("Registered listener {}." + listener.getClass().getSimpleName());
      jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
    }

    // register Watermarks listener to broadcast the advanced WMs.
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(new WatermarkAdvancingStreamingListener()));
    // We call initAccumulators here even though SparkRunnerStreamingContextFactory also calls
    // it, because the factory is not invoked when resuming from a checkpoint. (When not
    // resuming from a checkpoint, initAccumulators runs twice, which is fine since it is
    // idempotent.)
    initAccumulators(pipelineOptions, jssc.sparkContext());
    startPipeline =
        executorService.submit(
            () -> {
              LOG.info("Starting streaming pipeline execution.");
              jssc.start();
            });
    executorService.shutdown();

    result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
  } else {
    jsc = SparkContextFactory.getSparkContext(pipelineOptions);
    eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);
    final EvaluationContext evaluationContext =
        new EvaluationContext(jsc, pipeline, pipelineOptions);
    translator = new TransformTranslator.Translator();

    // update the cache candidates
    updateCacheCandidates(pipeline, translator, evaluationContext);

    initAccumulators(pipelineOptions, jsc);

    startPipeline =
        executorService.submit(
            () -> {
              pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
              evaluationContext.computeOutputs();
              LOG.info("Batch pipeline execution complete.");
            });
    executorService.shutdown();

    result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
  }

  if (pipelineOptions.getEnableSparkMetricSinks()) {
    registerMetricsSource(pipelineOptions.getAppName());
  }
  // It would have been better to create the MetricsPusher in runner-core, but we need the
  // runner-specific MetricsContainerStepMap.
  MetricsPusher metricsPusher =
      new MetricsPusher(
          MetricsAccumulator.getInstance().value(),
          pipelineOptions.as(MetricsOptions.class),
          result);
  metricsPusher.start();

  if (eventLoggingListener != null && jsc != null) {
    eventLoggingListener.onApplicationStart(
        SparkCompat.buildSparkListenerApplicationStart(jsc, pipelineOptions, startTime, result));
    eventLoggingListener.onApplicationEnd(
        new SparkListenerApplicationEnd(Instant.now().getMillis()));
    eventLoggingListener.stop();
  }

  return result;
}
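For run() to take the streaming branch above, the pipeline must be launched with streaming enabled and a checkpoint directory configured; a minimal launch sketch, where the checkpoint path and the omitted transforms are illustrative placeholders rather than values from the snippets above:

public static void main(String[] args) {
  SparkPipelineOptions options =
      PipelineOptionsFactory.fromArgs(args).as(SparkPipelineOptions.class);
  options.setRunner(SparkRunner.class);
  options.setStreaming(true);
  // Root directory from which CheckpointDir derives the Spark checkpoint path
  // passed to JavaStreamingContext.getOrCreate above; the path is a placeholder.
  options.setCheckpointDir("/tmp/beam-spark-checkpoint");

  Pipeline pipeline = Pipeline.create(options);
  // ... apply streaming transforms here ...
  pipeline.run().waitUntilFinish();
}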