Example 6 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in the apache/beam project.

From class SparkRunner, method run:

@Override
public SparkPipelineResult run(final Pipeline pipeline) {
    LOG.info("Executing pipeline using the SparkRunner.");
    final SparkPipelineResult result;
    final Future<?> startPipeline;
    final SparkPipelineTranslator translator;
    final ExecutorService executorService = Executors.newSingleThreadExecutor();
    MetricsEnvironment.setMetricsSupported(true);
    // visit the pipeline to determine the translation mode
    detectTranslationMode(pipeline);
    // TODO(BEAM-10670): Use SDF read as default when we address performance issue.
    if (!ExperimentalOptions.hasExperiment(pipeline.getOptions(), "beam_fn_api")) {
        SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReadsIfNecessary(pipeline);
    }
    pipeline.replaceAll(SparkTransformOverrides.getDefaultOverrides(pipelineOptions.isStreaming()));
    prepareFilesToStage(pipelineOptions);
    final long startTime = Instant.now().getMillis();
    EventLoggingListener eventLoggingListener = null;
    JavaSparkContext jsc = null;
    if (pipelineOptions.isStreaming()) {
        CheckpointDir checkpointDir = new CheckpointDir(pipelineOptions.getCheckpointDir());
        SparkRunnerStreamingContextFactory streamingContextFactory = new SparkRunnerStreamingContextFactory(pipeline, pipelineOptions, checkpointDir);
        final JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);
        jsc = jssc.sparkContext();
        eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);
        // Checkpoint aggregator/metrics values
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));
        // register user-defined listeners.
        for (JavaStreamingListener listener : pipelineOptions.as(SparkContextOptions.class).getListeners()) {
            LOG.info("Registered listener {}." + listener.getClass().getSimpleName());
            jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
        }
        // register the watermarks listener to broadcast the advanced watermarks.
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new WatermarkAdvancingStreamingListener()));
        // We call initAccumulators here even though SparkRunnerStreamingContextFactory
        // also calls it, because the factory is not invoked when resuming from a
        // checkpoint. (When not resuming from a checkpoint, initAccumulators runs twice,
        // which is fine since it is idempotent.)
        initAccumulators(pipelineOptions, jssc.sparkContext());
        startPipeline = executorService.submit(() -> {
            LOG.info("Starting streaming pipeline execution.");
            jssc.start();
        });
        executorService.shutdown();
        result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
    } else {
        jsc = SparkContextFactory.getSparkContext(pipelineOptions);
        eventLoggingListener = startEventLoggingListener(jsc, pipelineOptions, startTime);
        final EvaluationContext evaluationContext = new EvaluationContext(jsc, pipeline, pipelineOptions);
        translator = new TransformTranslator.Translator();
        // update the cache candidates
        updateCacheCandidates(pipeline, translator, evaluationContext);
        initAccumulators(pipelineOptions, jsc);
        startPipeline = executorService.submit(() -> {
            pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
            evaluationContext.computeOutputs();
            LOG.info("Batch pipeline execution complete.");
        });
        executorService.shutdown();
        result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
    }
    if (pipelineOptions.getEnableSparkMetricSinks()) {
        registerMetricsSource(pipelineOptions.getAppName());
    }
    // it would have been better to create MetricsPusher from runner-core but we need
    // runner-specific
    // MetricsContainerStepMap
    MetricsPusher metricsPusher = new MetricsPusher(MetricsAccumulator.getInstance().value(), pipelineOptions.as(MetricsOptions.class), result);
    metricsPusher.start();
    if (eventLoggingListener != null && jsc != null) {
        eventLoggingListener.onApplicationStart(SparkCompat.buildSparkListenerApplicationStart(jsc, pipelineOptions, startTime, result));
        eventLoggingListener.onApplicationEnd(new SparkListenerApplicationEnd(Instant.now().getMillis()));
        eventLoggingListener.stop();
    }
    return result;
}
Also used:

import static org.apache.beam.runners.spark.util.SparkCommon.startEventLoggingListener;

import java.util.concurrent.ExecutorService;
import org.apache.beam.runners.core.metrics.MetricsPusher;
import org.apache.beam.runners.spark.translation.EvaluationContext;
import org.apache.beam.runners.spark.translation.SparkPipelineTranslator;
import org.apache.beam.runners.spark.translation.TransformEvaluator;
import org.apache.beam.runners.spark.translation.TransformTranslator;
import org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir;
import org.apache.beam.runners.spark.translation.streaming.SparkRunnerStreamingContextFactory;
import org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarkAdvancingStreamingListener;
import org.apache.beam.sdk.metrics.MetricsOptions;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.scheduler.EventLoggingListener;
import org.apache.spark.scheduler.SparkListenerApplicationEnd;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.api.java.JavaStreamingListener;
import org.apache.spark.streaming.api.java.JavaStreamingListenerWrapper;
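
For orientation, here is a minimal sketch of how a user pipeline reaches the run method above: the caller configures SparkPipelineOptions, sets the runner, and calls Pipeline.run(). The class name and the transforms below are illustrative, not part of the Beam source.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.SparkRunner;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;

public class SparkRunnerLaunchSketch {
    public static void main(String[] args) {
        // Configure the Spark runner; local[*] keeps execution in-process.
        SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
        options.setRunner(SparkRunner.class);
        options.setSparkMaster("local[*]");

        Pipeline pipeline = Pipeline.create(options);
        pipeline.apply(Create.of("foo", "bar", "foo")).apply(Count.perElement());

        // Pipeline.run() dispatches to the SparkRunner.run(Pipeline) method shown above.
        pipeline.run().waitUntilFinish();
    }
}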

Example 7 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in the apache/beam project.

From class SparkRunnerDebugger, method run:

@Override
public SparkPipelineResult run(Pipeline pipeline) {
    boolean isStreaming = options.isStreaming() || options.as(TestSparkPipelineOptions.class).isForceStreaming();
    // TODO(BEAM-10670): Use SDF read as default when we address performance issue.
    if (!ExperimentalOptions.hasExperiment(pipeline.getOptions(), "beam_fn_api")) {
        SplittableParDo.convertReadBasedSplittableDoFnsToPrimitiveReadsIfNecessary(pipeline);
    }
    JavaSparkContext jsc = new JavaSparkContext("local[1]", "Debug_Pipeline");
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new org.apache.spark.streaming.Duration(1000));
    SparkRunner.initAccumulators(options, jsc);
    TransformTranslator.Translator translator = new TransformTranslator.Translator();
    SparkNativePipelineVisitor visitor;
    if (isStreaming) {
        SparkPipelineTranslator streamingTranslator = new StreamingTransformTranslator.Translator(translator);
        EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
        visitor = new SparkNativePipelineVisitor(streamingTranslator, ctxt);
    } else {
        EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
        visitor = new SparkNativePipelineVisitor(translator, ctxt);
    }
    pipeline.traverseTopologically(visitor);
    jsc.stop();
    String debugString = visitor.getDebugString();
    LOG.info("Translated Native Spark pipeline:\n" + debugString);
    return new DebugSparkPipelineResult(debugString);
}
Also used:

import org.apache.beam.runners.spark.translation.EvaluationContext;
import org.apache.beam.runners.spark.translation.SparkPipelineTranslator;
import org.apache.beam.runners.spark.translation.TransformTranslator;
import org.apache.beam.runners.spark.translation.streaming.StreamingTransformTranslator;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
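
A minimal sketch of how this debugger might be invoked. The cast mirrors the DebugSparkPipelineResult constructed at the end of run() above; note that getDebugString() may be package-private in the Beam source (the debugger's own tests live in org.apache.beam.runners.spark), so treat the final call as an assumption.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.SparkRunnerDebugger;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;

public class DebuggerSketch {
    public static void main(String[] args) {
        SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
        // The debugger translates the pipeline to Spark-native operations without executing it.
        options.setRunner(SparkRunnerDebugger.class);

        Pipeline pipeline = Pipeline.create(options);
        pipeline.apply(Create.of("foo", "bar")).apply(Count.perElement());

        // run() returns the DebugSparkPipelineResult built at the end of run() above.
        SparkRunnerDebugger.DebugSparkPipelineResult result =
            (SparkRunnerDebugger.DebugSparkPipelineResult) pipeline.run();
        // Assumption: getDebugString() is accessible from the caller's package.
        System.out.println(result.getDebugString());
    }
}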

Example 8 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in the apache/beam project.

From class StreamingTransformTranslator, method combineGrouped:

private static <K, InputT, OutputT> TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>> combineGrouped() {
    return new TransformEvaluator<Combine.GroupedValues<K, InputT, OutputT>>() {

        @Override
        public void evaluate(final Combine.GroupedValues<K, InputT, OutputT> transform, EvaluationContext context) {
            // get the applied combine function.
            PCollection<? extends KV<K, ? extends Iterable<InputT>>> input = context.getInput(transform);
            final WindowingStrategy<?, ?> windowingStrategy = input.getWindowingStrategy();
            @SuppressWarnings("unchecked") final CombineWithContext.CombineFnWithContext<InputT, ?, OutputT> fn = (CombineWithContext.CombineFnWithContext<InputT, ?, OutputT>) CombineFnUtil.toFnWithContext(transform.getFn());
            @SuppressWarnings("unchecked") UnboundedDataset<KV<K, Iterable<InputT>>> unboundedDataset = (UnboundedDataset<KV<K, Iterable<InputT>>>) context.borrowDataset(transform);
            JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> dStream = unboundedDataset.getDStream();
            final SerializablePipelineOptions options = context.getSerializableOptions();
            final SparkPCollectionView pviews = context.getPViews();
            JavaDStream<WindowedValue<KV<K, OutputT>>> outStream = dStream.transform(rdd -> {
                SparkCombineFn<KV<K, InputT>, InputT, ?, OutputT> combineFnWithContext = SparkCombineFn.keyed(fn, options, TranslationUtils.getSideInputs(transform.getSideInputs(), new JavaSparkContext(rdd.context()), pviews), windowingStrategy);
                return rdd.map(new TranslationUtils.CombineGroupedValues<>(combineFnWithContext));
            });
            context.putDataset(transform, new UnboundedDataset<>(outStream, unboundedDataset.getStreamSources()));
        }

        @Override
        public String toNativeString() {
            return "map(new <fn>())";
        }
    };
}
Also used:

import org.apache.beam.runners.core.construction.SerializablePipelineOptions;
import org.apache.beam.runners.spark.translation.EvaluationContext;
import org.apache.beam.runners.spark.translation.SparkPCollectionView;
import org.apache.beam.runners.spark.translation.TransformEvaluator;
import org.apache.beam.runners.spark.translation.TranslationUtils;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.CombineWithContext;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.KV;
import org.apache.spark.api.java.JavaSparkContext;
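
The evaluator above handles Combine.GroupedValues, which a pipeline author typically produces by applying Combine.groupedValues after a GroupByKey. A minimal sketch using standard Beam transforms (the class name is illustrative):

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.GroupByKey;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class CombineGroupedValuesSketch {
    public static void main(String[] args) {
        Pipeline pipeline = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());

        PCollection<KV<String, Integer>> input =
            pipeline.apply(Create.of(KV.of("a", 1), KV.of("a", 2), KV.of("b", 3)));

        // GroupByKey yields KV<K, Iterable<V>>; Combine.groupedValues folds each Iterable
        // with the supplied CombineFn. On the Spark streaming runner this produces the
        // Combine.GroupedValues transform that combineGrouped() above evaluates.
        PCollection<KV<String, Integer>> sums =
            input.apply(GroupByKey.create())
                 .apply(Combine.groupedValues(Sum.ofIntegers()));

        pipeline.run().waitUntilFinish();
    }
}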

Example 9 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in the apache/beam project.

From class CacheTest, method shouldCacheTest:

@Test
public void shouldCacheTest() {
    SparkPipelineOptions options = createOptions();
    options.setCacheDisabled(true);
    Pipeline pipeline = Pipeline.create(options);
    Values<String> valuesTransform = Create.of("foo", "bar");
    PCollection pCollection = mock(PCollection.class);
    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
    ctxt.getCacheCandidates().put(pCollection, 2L);
    assertFalse(ctxt.shouldCache(valuesTransform, pCollection));
    options.setCacheDisabled(false);
    assertTrue(ctxt.shouldCache(valuesTransform, pCollection));
    GroupByKey<String, String> gbkTransform = GroupByKey.create();
    assertFalse(ctxt.shouldCache(gbkTransform, pCollection));
}
Also used:

import org.apache.beam.runners.spark.translation.EvaluationContext;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.values.PCollection;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.Test;
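
Both CacheTest methods call a createOptions() helper that this page does not show. A hypothetical reconstruction follows; the actual helper in CacheTest may differ, and both the TestSparkRunner choice and the local master setting are assumptions.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.TestSparkRunner;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

// Hypothetical sketch of the helper, as it might appear inside CacheTest.
private static SparkPipelineOptions createOptions() {
    SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
    options.setRunner(TestSparkRunner.class); // assumption: the test-oriented Spark runner
    options.setSparkMaster("local[1]");       // assumption: single-threaded local master
    return options;
}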

Example 10 with EvaluationContext

Use of org.apache.beam.runners.spark.translation.EvaluationContext in the apache/beam project.

From class CacheTest, method cacheCandidatesUpdaterTest:

/**
 * Test checks how the cache candidates map is populated by the runner when evaluating the
 * pipeline.
 */
@Test
public void cacheCandidatesUpdaterTest() {
    SparkPipelineOptions options = createOptions();
    Pipeline pipeline = Pipeline.create(options);
    PCollection<String> pCollection = pipeline.apply(Create.of("foo", "bar"));
    // First use of pCollection.
    pCollection.apply(Count.globally());
    // Second use of pCollection.
    PCollectionView<List<String>> view = pCollection.apply(View.asList());
    // Internally, View.asList() creates a PCollection that underlies the PCollectionView.
    // That PCollection should not be cached: the SparkRunner never reads it directly,
    // only through the PCollectionView.
    pipeline.apply(Create.of("foo", "baz")).apply(ParDo.of(new DoFn<String, String>() {

        @ProcessElement
        public void processElement(ProcessContext processContext) {
            if (processContext.sideInput(view).contains(processContext.element())) {
                processContext.output(processContext.element());
            }
        }
    }).withSideInputs(view));
    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options);
    SparkRunner.CacheVisitor cacheVisitor = new SparkRunner.CacheVisitor(new TransformTranslator.Translator(), ctxt);
    pipeline.traverseTopologically(cacheVisitor);
    assertEquals(2L, (long) ctxt.getCacheCandidates().get(pCollection));
    assertEquals(1L, ctxt.getCacheCandidates().values().stream().filter(l -> l > 1).count());
}
Also used:

import java.util.List;
import org.apache.beam.runners.spark.translation.EvaluationContext;
import org.apache.beam.runners.spark.translation.TransformTranslator;
import org.apache.beam.sdk.Pipeline;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.Test;

Aggregations

EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext): 10 uses
TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator): 6 uses
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 6 uses
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 5 uses
TransformTranslator (org.apache.beam.runners.spark.translation.TransformTranslator): 4 uses
JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext): 4 uses
SparkPipelineTranslator (org.apache.beam.runners.spark.translation.SparkPipelineTranslator): 3 uses
KV (org.apache.beam.sdk.values.KV): 3 uses
PCollection (org.apache.beam.sdk.values.PCollection): 3 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 3 uses
LinkedBlockingQueue (java.util.concurrent.LinkedBlockingQueue): 2 uses
SerializablePipelineOptions (org.apache.beam.runners.core.construction.SerializablePipelineOptions): 2 uses
SparkPCollectionView (org.apache.beam.runners.spark.translation.SparkPCollectionView): 2 uses
TranslationUtils (org.apache.beam.runners.spark.translation.TranslationUtils): 2 uses
Pipeline (org.apache.beam.sdk.Pipeline): 2 uses
KvCoder (org.apache.beam.sdk.coders.KvCoder): 2 uses
TupleTag (org.apache.beam.sdk.values.TupleTag): 2 uses
JavaDStream (org.apache.spark.streaming.api.java.JavaDStream): 2 uses
Test (org.junit.Test): 2 uses
ArrayList (java.util.ArrayList): 1 use