Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project beam by apache.
From the class TrackStreamingSourcesTest, the method testTrackFlattened:
@Test
public void testTrackFlattened() {
  options.setRunner(SparkRunner.class);
  JavaSparkContext jsc = SparkContextFactory.getSparkContext(options);
  JavaStreamingContext jssc =
      new JavaStreamingContext(
          jsc, new org.apache.spark.streaming.Duration(options.getBatchIntervalMillis()));
  Pipeline p = Pipeline.create(options);
  CreateStream<Integer> queueStream1 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  CreateStream<Integer> queueStream2 =
      CreateStream.of(VarIntCoder.of(), Duration.millis(options.getBatchIntervalMillis()))
          .emptyBatch();
  PCollection<Integer> pcol1 = p.apply(queueStream1);
  PCollection<Integer> pcol2 = p.apply(queueStream2);
  PCollection<Integer> flattened =
      PCollectionList.of(pcol1).and(pcol2).apply(Flatten.<Integer>pCollections());
  flattened.apply(ParDo.of(new PassthroughFn<>()));
  p.traverseTopologically(new StreamingSourceTracker(jssc, p, ParDo.MultiOutput.class, 0, 1));
  assertThat(StreamingSourceTracker.numAssertions, equalTo(1));
}
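For readers unfamiliar with the constructor used above, here is a minimal standalone sketch of creating a JavaStreamingContext from an existing JavaSparkContext and then stopping only the streaming context. The master, app name, and batch interval are illustrative assumptions, not values from the Beam test; keeping the SparkContext alive on stop mirrors how the test reuses the shared context it obtains from SparkContextFactory.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class StreamingContextSketch {
  public static void main(String[] args) {
    // Master, app name and batch interval below are assumptions for illustration.
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("jssc-sketch");
    JavaSparkContext jsc = new JavaSparkContext(conf);
    JavaStreamingContext jssc = new JavaStreamingContext(jsc, new Duration(500));
    // ... DStream graph would be defined here before start() ...
    // Stopping with 'false' leaves the underlying JavaSparkContext running,
    // the same way the test above keeps reusing its shared context.
    jssc.stop(false);
    jsc.stop();
  }
}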
Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project beam by apache.
From the class SparkRunner, the method run:
@Override
public SparkPipelineResult run(final Pipeline pipeline) {
  LOG.info("Executing pipeline using the SparkRunner.");
  final SparkPipelineResult result;
  final Future<?> startPipeline;
  final SparkPipelineTranslator translator;
  final ExecutorService executorService = Executors.newSingleThreadExecutor();
  MetricsEnvironment.setMetricsSupported(true);
  // Visit the pipeline to determine the translation mode.
  detectTranslationMode(pipeline);
  if (mOptions.isStreaming()) {
    CheckpointDir checkpointDir = new CheckpointDir(mOptions.getCheckpointDir());
    SparkRunnerStreamingContextFactory streamingContextFactory =
        new SparkRunnerStreamingContextFactory(pipeline, mOptions, checkpointDir);
    final JavaStreamingContext jssc =
        JavaStreamingContext.getOrCreate(
            checkpointDir.getSparkCheckpointDir().toString(), streamingContextFactory);
    // Checkpoint aggregator/metrics values.
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new AggregatorsAccumulator.AccumulatorCheckpointingSparkListener()));
    jssc.addStreamingListener(
        new JavaStreamingListenerWrapper(
            new MetricsAccumulator.AccumulatorCheckpointingSparkListener()));
    // Register user-defined listeners.
    for (JavaStreamingListener listener : mOptions.as(SparkContextOptions.class).getListeners()) {
      LOG.info("Registered listener {}.", listener.getClass().getSimpleName());
      jssc.addStreamingListener(new JavaStreamingListenerWrapper(listener));
    }
    // Register the Watermarks listener to broadcast the advanced WMs.
    jssc.addStreamingListener(new JavaStreamingListenerWrapper(new WatermarksListener(jssc)));
    // The reason we call initAccumulators here even though it is called in
    // SparkRunnerStreamingContextFactory is that the factory is not called when resuming
    // from checkpoint. (When not resuming from checkpoint, initAccumulators will be called
    // twice, but this is fine since it is idempotent.)
    initAccumulators(mOptions, jssc.sparkContext());
    startPipeline =
        executorService.submit(
            new Runnable() {
              @Override
              public void run() {
                LOG.info("Starting streaming pipeline execution.");
                jssc.start();
              }
            });
    result = new SparkPipelineResult.StreamingMode(startPipeline, jssc);
  } else {
    // Create the evaluation context.
    final JavaSparkContext jsc = SparkContextFactory.getSparkContext(mOptions);
    final EvaluationContext evaluationContext = new EvaluationContext(jsc, pipeline, mOptions);
    translator = new TransformTranslator.Translator();
    // Update the cache candidates.
    updateCacheCandidates(pipeline, translator, evaluationContext);
    initAccumulators(mOptions, jsc);
    startPipeline =
        executorService.submit(
            new Runnable() {
              @Override
              public void run() {
                pipeline.traverseTopologically(new Evaluator(translator, evaluationContext));
                evaluationContext.computeOutputs();
                LOG.info("Batch pipeline execution complete.");
              }
            });
    result = new SparkPipelineResult.BatchMode(startPipeline, jsc);
  }
  if (mOptions.getEnableSparkMetricSinks()) {
    registerMetricsSource(mOptions.getAppName());
  }
  return result;
}
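The streaming branch above relies on JavaStreamingContext.getOrCreate to either recover a context (and its DStream graph) from a checkpoint or build a fresh one from a factory. A minimal Spark-only sketch of that general pattern, independent of Beam, is shown below; the checkpoint path, batch interval, and the placeholder socket source inside the factory are assumptions for illustration.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.Function0;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class GetOrCreateSketch {
  public static void main(String[] args) throws Exception {
    final String checkpointPath = "/tmp/checkpoints"; // assumption: any durable location

    Function0<JavaStreamingContext> factory = new Function0<JavaStreamingContext>() {
      @Override
      public JavaStreamingContext call() {
        SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("getOrCreate-sketch");
        JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));
        jssc.checkpoint(checkpointPath);
        // The DStream graph must be defined inside the factory so it can be rebuilt
        // when the context is *not* recovered from a checkpoint.
        jssc.socketTextStream("localhost", 9999).print(); // placeholder source and output
        return jssc;
      }
    };

    // Recovers the context from the checkpoint if one exists; otherwise invokes the
    // factory. This is the same pattern SparkRunner uses above.
    JavaStreamingContext jssc = JavaStreamingContext.getOrCreate(checkpointPath, factory);
    jssc.start();
    jssc.awaitTermination();
  }
}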
Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project beam by apache.
From the class SparkRunnerDebugger, the method run:
@Override
public SparkPipelineResult run(Pipeline pipeline) {
  JavaSparkContext jsc = new JavaSparkContext("local[1]", "Debug_Pipeline");
  JavaStreamingContext jssc =
      new JavaStreamingContext(jsc, new org.apache.spark.streaming.Duration(1000));
  SparkRunner.initAccumulators(options, jsc);
  TransformTranslator.Translator translator = new TransformTranslator.Translator();
  SparkNativePipelineVisitor visitor;
  if (options.isStreaming()
      || (options instanceof TestSparkPipelineOptions
          && ((TestSparkPipelineOptions) options).isForceStreaming())) {
    SparkPipelineTranslator streamingTranslator =
        new StreamingTransformTranslator.Translator(translator);
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
    visitor = new SparkNativePipelineVisitor(streamingTranslator, ctxt);
  } else {
    EvaluationContext ctxt = new EvaluationContext(jsc, pipeline, options, jssc);
    visitor = new SparkNativePipelineVisitor(translator, ctxt);
  }
  pipeline.traverseTopologically(visitor);
  jsc.stop();
  String debugString = visitor.getDebugString();
  LOG.info("Translated Native Spark pipeline:\n" + debugString);
  return new DebugSparkPipelineResult(debugString);
}
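If one wanted to exercise this debug translation from user code, a plausible sketch is to select SparkRunnerDebugger through the pipeline options before running, assuming it is selectable via setRunner like other runners; the transform and option values below are assumptions for illustration.

import org.apache.beam.runners.spark.SparkPipelineOptions;
import org.apache.beam.runners.spark.SparkRunnerDebugger;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class DebuggerUsageSketch {
  public static void main(String[] args) {
    SparkPipelineOptions options = PipelineOptionsFactory.as(SparkPipelineOptions.class);
    options.setRunner(SparkRunnerDebugger.class); // translate only, do not execute
    Pipeline p = Pipeline.create(options);
    p.apply(Create.of(1, 2, 3));
    // run() invokes SparkRunnerDebugger.run(...) shown above, which logs the
    // translated native Spark pipeline instead of executing it.
    p.run();
  }
}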
Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project cdap by caskdata.
From the class MockSource, the method getStream:
@Override
public JavaDStream<StructuredRecord> getStream(StreamingContext context) throws Exception {
  Schema schema = Schema.parseJson(conf.schema);
  List<String> recordsAsStrings = new Gson().fromJson(conf.records, STRING_LIST_TYPE);
  final List<StructuredRecord> inputRecords = new ArrayList<>();
  for (String recordStr : recordsAsStrings) {
    inputRecords.add(StructuredRecordStringConverter.fromJsonString(recordStr, schema));
  }
  JavaStreamingContext jsc = context.getSparkStreamingContext();
  return jsc.receiverStream(new Receiver<StructuredRecord>(StorageLevel.MEMORY_ONLY()) {

    @Override
    public StorageLevel storageLevel() {
      return StorageLevel.MEMORY_ONLY();
    }

    @Override
    public void onStart() {
      // Feed the parsed records to Spark on a background thread, pausing between records.
      new Thread() {

        @Override
        public void run() {
          for (StructuredRecord record : inputRecords) {
            if (isStarted()) {
              store(record);
              try {
                TimeUnit.MILLISECONDS.sleep(conf.intervalMillis);
              } catch (InterruptedException e) {
                throw new RuntimeException(e);
              }
            }
          }
        }

        @Override
        public void interrupt() {
          super.interrupt();
        }
      }.start();
    }

    @Override
    public void onStop() {
      // No cleanup needed for this mock source.
    }
  });
}
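For context, jsc.receiverStream hooks a custom Receiver into the streaming graph. A stripped-down, standalone sketch of that pattern (not part of the CDAP mock source; the class name, records, and timings are assumptions) could look like this:

import org.apache.spark.SparkConf;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.receiver.Receiver;

public class ReceiverStreamSketch {

  // A trivial receiver that emits a fixed set of strings once started.
  static class FixedStringsReceiver extends Receiver<String> {
    FixedStringsReceiver() {
      super(StorageLevel.MEMORY_ONLY());
    }

    @Override
    public void onStart() {
      new Thread(() -> {
        for (String s : new String[] {"a", "b", "c"}) {
          if (!isStopped()) {
            store(s); // hand records to Spark Streaming
          }
        }
      }).start();
    }

    @Override
    public void onStop() {
      // No resources to release in this sketch.
    }
  }

  public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("receiver-sketch");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));
    JavaDStream<String> stream = jssc.receiverStream(new FixedStringsReceiver());
    stream.print(); // consume the stream so the graph has an output
    jssc.start();
    jssc.awaitTerminationOrTimeout(5000);
    jssc.stop(true, true);
  }
}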
Use of org.apache.spark.streaming.api.java.JavaStreamingContext in project cdap by caskdata.
From the class SparkStreamingPipelineDriver, the method run:
@Override
public void run(final JavaSparkExecutionContext sec) throws Exception {
  final DataStreamsPipelineSpec pipelineSpec =
      GSON.fromJson(
          sec.getSpecification().getProperty(Constants.PIPELINEID), DataStreamsPipelineSpec.class);
  PipelinePhase.Builder phaseBuilder =
      PipelinePhase.builder(SUPPORTED_PLUGIN_TYPES).addConnections(pipelineSpec.getConnections());
  for (StageSpec stageSpec : pipelineSpec.getStages()) {
    phaseBuilder.addStage(
        StageInfo.builder(stageSpec.getName(), stageSpec.getPlugin().getType())
            .addInputs(stageSpec.getInputs())
            .addOutputs(stageSpec.getOutputs())
            .addInputSchemas(stageSpec.getInputSchemas())
            .setOutputSchema(stageSpec.getOutputSchema())
            .setErrorSchema(stageSpec.getErrorSchema())
            .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
            .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
            .build());
  }
  final PipelinePhase pipelinePhase = phaseBuilder.build();
  boolean checkpointsDisabled = pipelineSpec.isCheckpointsDisabled();
  String checkpointDir = null;
  if (!checkpointsDisabled) {
    // Get the location of the checkpoint directory.
    String pipelineName = sec.getApplicationSpecification().getName();
    String relativeCheckpointDir = pipelineSpec.getCheckpointDirectory();
    // There isn't any way to instantiate the fileset except in a TxRunnable, so we need to use
    // a reference.
    final AtomicReference<Location> checkpointBaseRef = new AtomicReference<>();
    Transactionals.execute(sec, new TxRunnable() {
      @Override
      public void run(DatasetContext context) throws Exception {
        FileSet checkpointFileSet = context.getDataset(DataStreamsApp.CHECKPOINT_FILESET);
        checkpointBaseRef.set(checkpointFileSet.getBaseLocation());
      }
    }, Exception.class);
    Location pipelineCheckpointDir =
        checkpointBaseRef.get().append(pipelineName).append(relativeCheckpointDir);
    checkpointDir = pipelineCheckpointDir.toURI().toString();
  }
  JavaStreamingContext jssc = run(pipelineSpec, pipelinePhase, sec, checkpointDir);
  jssc.start();
  boolean stopped = false;
  try {
    // Most programs will just keep running forever. However, when CDAP stops the program, we
    // get an interrupted exception; at that point we need to call stop on jssc, otherwise the
    // program will hang and never stop.
    stopped = jssc.awaitTerminationOrTimeout(Long.MAX_VALUE);
  } finally {
    if (!stopped) {
      jssc.stop(true, pipelineSpec.isStopGracefully());
    }
  }
}
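The start/awaitTerminationOrTimeout/stop sequence above is the standard JavaStreamingContext lifecycle. A bare-bones sketch of the same pattern outside CDAP, with an added JVM shutdown hook, is shown below; the placeholder socket source, timeout, and graceful-stop flags are assumptions for illustration.

import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaStreamingContext;

public class LifecycleSketch {
  public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("lifecycle-sketch");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, new Duration(1000));
    // Placeholder source and output so the context has something to run.
    jssc.socketTextStream("localhost", 9999).print();

    // Stop the context (and the SparkContext) if the JVM is terminated externally.
    Runtime.getRuntime().addShutdownHook(new Thread(() -> jssc.stop(true, true)));

    jssc.start();
    // Block until the context terminates or the (illustrative) timeout elapses, then
    // stop it explicitly if it is still running, as the CDAP driver above does.
    boolean terminated = jssc.awaitTerminationOrTimeout(60_000);
    if (!terminated) {
      jssc.stop(true, true);
    }
  }
}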