Example 11 with JavaStreamingContext

Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project beam by apache.

From the class TrackStreamingSourcesTest, method testTrackFlattened:

@Test
public void testTrackFlattened() {
    options.setRunner(SparkRunner.class);
    JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));
    Pipeline p = Pipeline.create(options);
    CreateStream<Integer> queueStream1 = CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis())).emptyBatch();
    CreateStream<Integer> queueStream2 = CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis())).emptyBatch();
    PCollection<Integer> pcol1 = p.apply(queueStream1);
    PCollection<Integer> pcol2 = p.apply(queueStream2);
    PCollection<Integer> flattened = PCollectionList.of(pcol1).and(pcol2).apply(Flatten.<Integer>pCollections());
    flattened.apply(ParDo.of(new PassthroughFn<>()));
    p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
    assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
Also used: JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext), JavaSparkContext(org.apache.spark.api.java.JavaSparkContext), Pipeline(org.apache.beam.sdk.Pipeline), Test(org.junit.Test)
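A note on the pattern above: the test wraps an existing JavaSparkContext in a JavaStreamingContext, so batch and streaming code share a single Spark context. Here is a minimal standalone sketch of that constructor, assuming a local master and an illustrative 500 ms batch interval (class and app names are hypothetical):

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class SharedContextSketch {
    public static void main(String[] args) {
        // One JavaSparkContext serves both batch and streaming work.
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("shared-context-sketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        // Wrap it in a streaming context; the Duration is the micro-batch interval.
        JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(500));
        // ... define DStreams here, then jssc.start() and await termination ...
        jssc.stop(true, false); // also stops the wrapped JavaSparkContext
    }
}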

Example 12 with JavaStreamingContext

Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project beam by apache.

From the class SparkRunner, method run:

@Override
public SparkPipelineResult run(final Pipeline pipeline) {
    LOG.info("Executing pipeline using the SparkRunner.");
    final SparkPipelineResult result;
    final Future<?> startPipeline;
    final SparkPipelineTranslator translator;
    final ExecutorService executorService = Executors.newSingleThreadExecutor();
    MetricsEnvironment.setMetricsSupported(true);
    // visit the pipeline to determine the translation mode
    detectTranslationMode(pipeline);
    if (mOptions.isStreaming()) {
        CheckpointDir checkpointDir = new CheckpointDir(mOptions.getCheckpointDir());
        SparkRunnerStreamingContextFactory streamingContextFactory = new SparkRunnerStreamingContextFactory(pipeline, mOptions, checkpointDir);
        final JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);
        // Checkpoint aggregator/metrics values
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));
        // register user-defined listeners.
        for (JavaStreamingListener listener : mOptions.as(SparkContextOptions.class).getListeners()) {
            LOG.info("Registered listener {}." + listener.getClass().getSimpleName());
            jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
        }
        // register Watermarks listener to broadcast the advanced WMs.
        jssc.addStreamingListener(new JavaStreamingListenerWrapper(new WatermarksListener(jssc)));
        // The reason we call initAccumulators here even though it is called in
        // SparkRunnerStreamingContextFactory is because the factory is not called when resuming
        // from checkpoint (When not resuming from checkpoint initAccumulators will be called twice
        // but this is fine since it is idempotent).
        initAccumulators(mOptions, jssc.sparkContext());
        startPipeline = executorService.submit(new Runnable() {

            @Override
            public void run() {
                LOG.info("Starting streaming pipeline execution.");
                jssc.start();
            }
        });
        result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
    } else {
        // create the evaluation context
        final JavaSparkContext jsc = SparkContextFactory.getSparkContext(mOptions);
        final EvaluationContext evaluationContext = new EvaluationContext(jsc, pipeline, mOptions);
        translator = new TransformTranslator.Translator();
        // update the cache candidates
        updateCacheCandidates(pipeline, translator, evaluationContext);
        initAccumulators(mOptions, jsc);
        startPipeline = executorService.submit(new Runnable() {

            @Override
            public void run() {
                pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
                evaluationContext.computeOutputs();
                LOG.info("Batch pipeline execution complete.");
            }
        });
        result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
    }
    if (mOptions.getEnableSparkMetricSinks()) {
        registerMetricsSource(mOptions.getAppName());
    }
    return result;
}
Also used: JavaStreamingListenerWrapper(org.apache.spark.streaming.api.java.JavaStreamingListenerWrapper), JavaStreamingListener(org.apache.spark.streaming.api.java.JavaStreamingListener), TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator), SparkRunnerStreamingContextFactory(org.apache.beam.runners.spark.translation.streaming.SparkRunnerStreamingContextFactory), JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext), TransformTranslator(org.apache.beam.runners.spark.translation.TransformTranslator), WatermarksListener(org.apache.beam.runners.spark.util.GlobalWatermarkHolder.WatermarksListener), ExecutorService(java.util.concurrent.ExecutorService), JavaSparkContext(org.apache.spark.api.java.JavaSparkContext), EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext), SparkPipelineTranslator(org.apache.beam.runners.spark.translation.SparkPipelineTranslator), CheckpointDir(org.apache.beam.runners.spark.translation.streaming.Checkpoint.CheckpointDir)
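The pivotal call here is JavaStreamingContext.getOrCreate: when the checkpoint directory already holds state, Spark rebuilds the context (and the DStream graph) from it and never invokes the factory, which is exactly why initAccumulators has to be repeated outside the factory. A minimal sketch of the same recovery pattern, assuming Spark's Function0 factory overload and an illustrative checkpoint path:

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function0;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class CheckpointRecoverySketch {
    private static final String CHECKPOINT_DIR = "/tmp/demo-checkpoint"; // illustrative path

    public static void main(String[] args) throws Exception {
        Function0<JavaStreamingContext> factory = () -> {
            // Invoked only when no checkpoint exists yet.
            SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("checkpoint-sketch");
            JavaStreamingContext created = new JavaStreamingContext(conf, Durations.seconds(1));
            created.checkpoint(CHECKPOINT_DIR);
            // ... define the DStream graph here; it is replayed from the checkpoint on restart ...
            return created;
        };
        // On a restart, the context and its DStream graph are reconstructed from the checkpoint
        // and the factory is skipped entirely.
        JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(CHECKPOINT_DIR, factory);
        jssc.start();
        jssc.awaitTermination();
    }
}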

Example 13 with JavaStreamingContext

Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project beam by apache.

From the class SparkRunnerDebugger, method run:

@Override
public SparkPipelineResult run(Pipeline pipeline) {
    JavaSparkContext jsc = new JavaSparkContext("local[1]", "Debug_Pipeline");
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new org.apache.spark.streaming.Duration(1000));
    SparkRunner.initAccumulators(options, jsc);
    TransformTranslator.Translator translator = new TransformTranslator.Translator();
    SparkNativePipelineVisitor visitor;
    if (options.isStreaming() || (options instanceof TestSparkPipelineOptions && ((TestSparkPipelineOptions) options).isForceStreaming())) {
        SparkPipelineTranslator streamingTranslator = new StreamingTransformTranslator.Translator(translator);
        EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
        visitor = new SparkNativePipelineVisitor(streamingTranslator, ctxt);
    } else {
        EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
        visitor = new SparkNativePipelineVisitor(translator, ctxt);
    }
    pipeline.traverseTopologically(visitor);
    jsc.stop();
    String debugString = visitor.getDebugString();
    LOG.info("Translated Native Spark pipeline:\n" + debugString);
    return new DebugSparkPipelineResult(debugString);
}
Also used: JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext), TransformTranslator(org.apache.beam.runners.spark.translation.TransformTranslator), StreamingTransformTranslator(org.apache.beam.runners.spark.translation.streaming.StreamingTransformTranslator), SparkPipelineTranslator(org.apache.beam.runners.spark.translation.SparkPipelineTranslator), JavaSparkContext(org.apache.spark.api.java.JavaSparkContext), EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext)
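Note that the debugger never calls jssc.start(): the streaming context exists only so streaming transforms can be translated, and jsc.stop() tears everything down after traversal. For comparison, a JavaStreamingContext can also be built directly from a SparkConf instead of wrapping a pre-built JavaSparkContext; a minimal sketch with illustrative names:

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class DirectStreamingContextSketch {
    public static void main(String[] args) {
        // Build the streaming context straight from a SparkConf;
        // the underlying SparkContext is created internally.
        SparkConf conf = new SparkConf().setMaster("local[1]").setAppName("Debug_Pipeline");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
        // ... translate or inspect the pipeline graph without ever calling jssc.start() ...
        jssc.stop(true, false); // stop(stopSparkContext, stopGracefully)
    }
}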

Example 14 with JavaStreamingContext

Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project cdap by caskdata.

From the class MockSource, method getStream:

@Override
public JavaDStream<StructuredRecord> getStream(StreamingContext context) throws Exception {
    Schema schema = Schema.parseJson(conf.schema);
    List<String> recordsAsStrings = new Gson().fromJson(conf.records, STRING_LIST_TYPE);
    final List<StructuredRecord> inputRecords = new ArrayList<>();
    for (String recordStr : recordsAsStrings) {
        inputRecords.add(StructuredRecordStringConverter.fromJsonString(recordStr, schema));
    }
    JavaStreamingContext jssc = context.getSparkStreamingContext();
    return jssc.receiverStream(new Receiver<StructuredRecord>(StorageLevel.MEMORY_ONLY()) {

        @Override
        public StorageLevel storageLevel() {
            return StorageLevel.MEMORY_ONLY();
        }

        @Override
        public void onStart() {
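            // Replay the configured records on a background thread, pacing them by conf.intervalMillis.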
            new Thread() {

                @Override
                public void run() {
                    for (StructuredRecord record : inputRecords) {
                        if (isStarted()) {
                            store(record);
                            try {
                                TimeUnit.MILLISECONDS.sleep(conf.intervalMillis);
                            } catch (InterruptedException e) {
                                throw new RuntimeException(e);
                            }
                        }
                    }
                }

                @Override
                public void interrupt() {
                    super.interrupt();
                }
            }.start();
        }

        @Override
        public void onStop() {
        }
    });
}
Also used: Schema(co.cask.cdap.api.data.schema.Schema), ArrayList(java.util.ArrayList), Gson(com.google.gson.Gson), StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord), JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext), StorageLevel(org.apache.spark.storage.StorageLevel)
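The custom Receiver above gives the test full control over emission timing via store() and a paced sleep. When a test only needs a fixed set of in-memory records and no timing control, JavaStreamingContext.queueStream is a lighter alternative, since each queued RDD becomes one micro-batch. A minimal sketch, with the record values purely illustrative:

import java.util.Arrays;
import java.util.LinkedList;
import java.util.Queue;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class QueueStreamSketch {
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("queue-stream-sketch");
        JavaSparkContext jsc = new JavaSparkContext(conf);
        JavaStreamingContext jssc = new JavaStreamingContext(jsc, Durations.seconds(1));

        // Each queued RDD becomes the input of one micro-batch.
        Queue<JavaRDD<String>> queue = new LinkedList<>();
        queue.add(jsc.parallelize(Arrays.asList("record-1", "record-2")));
        queue.add(jsc.parallelize(Arrays.asList("record-3")));

        JavaDStream<String> stream = jssc.queueStream(queue);
        stream.print();

        jssc.start();
        jssc.awaitTerminationOrTimeout(5000); // run briefly, then shut down
        jssc.stop(true, false);
    }
}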

Example 15 with JavaStreamingContext

Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project cdap by caskdata.

From the class SparkStreamingPipelineDriver, method run:

@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
    final DataStreamsPipelineSpec pipelineSpec = GSON.fromJson(sec.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(SUPPORTED_PLUGIN_TYPES).addConnections(pipelineSpec.getConnections());
    for (StageSpec stageSpec : pipelineSpec.getStages()) {
        phaseBuilder.addStage(StageInfo.builder(stageSpec.getName(), stageSpec.getPlugin().getType())
            .addInputs(stageSpec.getInputs())
            .addOutputs(stageSpec.getOutputs())
            .addInputSchemas(stageSpec.getInputSchemas())
            .setOutputSchema(stageSpec.getOutputSchema())
            .setErrorSchema(stageSpec.getErrorSchema())
            .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
            .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
            .build());
    }
    final PipelinePhase pipelinePhase = phaseBuilder.build();
    boolean checkpointsDisabled = pipelineSpec.isCheckpointsDisabled();
    String checkpointDir = null;
    if (!checkpointsDisabled) {
        // Get the location of the checkpoint directory.
        String pipelineName = sec.getApplicationSpecification().getName();
        String relativeCheckpointDir = pipelineSpec.getCheckpointDirectory();
        // there isn't any way to instantiate the fileset except in a TxRunnable, so need to use a reference.
        final AtomicReference<Location> checkpointBaseRef = new AtomicReference<>();
        Transactionals.execute(sec, new TxRunnable() {

            @Override
            public void run(DatasetContext context) throws Exception {
                FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
                checkpointBaseRef.set(checkpointFileSet.getBaseLocation());
            }
        }, Exception.class);
        Location pipelineCheckpointDir = checkpointBaseRef.get().append(pipelineName).append(relativeCheckpointDir);
        checkpointDir = pipelineCheckpointDir.toURI().toString();
    }
    JavaStreamingContext jssc = run(pipelineSpec, pipelinePhase, sec, checkpointDir);
    jssc.start();
    boolean stopped = false;
    try {
        // most programs will just keep running forever.
        // however, when CDAP stops the program, we get an interrupted exception.
        // at that point, we need to call stop on jssc, otherwise the program will hang and never stop.
        stopped = jssc.awaitTerminationOrTimeout(Long.MAX_VALUE);
    } finally {
        if (!stopped) {
            jssc.stop(true, pipelineSpec.isStopGracefully());
        }
    }
}
Also used: FileSet(co.cask.cdap.api.dataset.lib.FileSet), AtomicReference(java.util.concurrent.atomic.AtomicReference), JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext), PipelinePhase(co.cask.cdap.etl.common.PipelinePhase), TxRunnable(co.cask.cdap.api.TxRunnable), StageSpec(co.cask.cdap.etl.spec.StageSpec), DatasetContext(co.cask.cdap.api.data.DatasetContext), Location(org.apache.twill.filesystem.Location)
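The start / awaitTerminationOrTimeout / stop sequence in this driver is the standard lifecycle for an embedded streaming job: block until the framework interrupts the program, then stop gracefully so in-flight batches can finish. A minimal sketch of the same shutdown discipline (names and the graceful flag are illustrative):

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class StreamingLifecycleSketch {
    public static void main(String[] args) throws Exception {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("lifecycle-sketch");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
        // ... define the DStream graph here ...
        jssc.start();
        boolean stopped = false;
        try {
            // Blocks until the job terminates or the timeout elapses; returns false on timeout.
            stopped = jssc.awaitTerminationOrTimeout(Long.MAX_VALUE);
        } finally {
            if (!stopped) {
                // stop(stopSparkContext = true, stopGracefully = true): let in-flight batches finish.
                jssc.stop(true, true);
            }
        }
    }
}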

Aggregations

JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext): 16
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 8
Duration (org.apache.spark.streaming.Duration): 5
EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext): 4
SparkConf (org.apache.spark.SparkConf): 4
SparkPipelineTranslator (org.apache.beam.runners.spark.translation.SparkPipelineTranslator): 3
TransformTranslator (org.apache.beam.runners.spark.translation.TransformTranslator): 3
JavaRDD (org.apache.spark.api.java.JavaRDD): 3
TransformEvaluator (org.apache.beam.runners.spark.translation.TransformEvaluator): 2
Pipeline (org.apache.beam.sdk.Pipeline): 2
Test (org.junit.Test): 2
TxRunnable (co.cask.cdap.api.TxRunnable): 1
DatasetContext (co.cask.cdap.api.data.DatasetContext): 1
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 1
Schema (co.cask.cdap.api.data.schema.Schema): 1
FileSet (co.cask.cdap.api.dataset.lib.FileSet): 1
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 1
PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext): 1
StageSpec (co.cask.cdap.etl.spec.StageSpec): 1
KafkaIO (com.cloudera.dataflow.io.KafkaIO): 1