Use of co.cask.cdap.etl.spark.SparkPipelineRuntime in project cdap by caskdata.
The class StreamingBatchSinkFunction, method call().
@Override
public Void call(JavaRDD<T> data, Time batchTime) throws Exception {
  if (data.isEmpty()) {
    return null;
  }
  final long logicalStartTime = batchTime.milliseconds();
  MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(sec), logicalStartTime,
                                                       sec.getSecureStore(), sec.getNamespace());
  PluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                               stageSpec.isStageLoggingEnabled(),
                                                               stageSpec.isProcessTimingEnabled());
  final SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
  final String stageName = stageSpec.getName();
  final BatchSink<Object, Object, Object> batchSink = pluginContext.newPluginInstance(stageName, evaluator);
  final PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec, logicalStartTime);
  boolean isPrepared = false;
  boolean isDone = false;
  try {
    sec.execute(new TxRunnable() {
      @Override
      public void run(DatasetContext datasetContext) throws Exception {
        SparkBatchSinkContext sinkContext =
          new SparkBatchSinkContext(sinkFactory, sec, datasetContext, pipelineRuntime, stageSpec);
        batchSink.prepareRun(sinkContext);
      }
    });
    isPrepared = true;
    PluginFunctionContext pluginFunctionContext =
      new PluginFunctionContext(stageSpec, sec, pipelineRuntime.getArguments().asMap(), batchTime.milliseconds(),
                                new NoopStageStatisticsCollector());
    PairFlatMapFunc<T, Object, Object> sinkFunction = new BatchSinkFunction<T, Object, Object>(pluginFunctionContext);
    sinkFactory.writeFromRDD(data.flatMapToPair(Compat.convert(sinkFunction)), sec, stageName,
                             Object.class, Object.class);
    isDone = true;
    sec.execute(new TxRunnable() {
      @Override
      public void run(DatasetContext datasetContext) throws Exception {
        SparkBatchSinkContext sinkContext =
          new SparkBatchSinkContext(sinkFactory, sec, datasetContext, pipelineRuntime, stageSpec);
        batchSink.onRunFinish(true, sinkContext);
      }
    });
  } catch (Exception e) {
    LOG.error("Error writing to sink {} for the batch for time {}.", stageName, logicalStartTime, e);
  } finally {
    if (isPrepared && !isDone) {
      sec.execute(new TxRunnable() {
        @Override
        public void run(DatasetContext datasetContext) throws Exception {
          SparkBatchSinkContext sinkContext =
            new SparkBatchSinkContext(sinkFactory, sec, datasetContext, pipelineRuntime, stageSpec);
          batchSink.onRunFinish(false, sinkContext);
        }
      });
    }
  }
  return null;
}
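In both streaming sink functions on this page, SparkPipelineRuntime is built once per micro-batch from the JavaSparkExecutionContext and the batch's logical start time, and its resolved arguments are what the snippet passes to PluginFunctionContext. A minimal sketch of just that step, assuming the same sec and batchTime variables as above; runtimeArgs is an illustrative name, and the Map<String, String> type is an assumption suggested by the PluginFunctionContext call in the snippet:

long logicalStartTime = batchTime.milliseconds();
PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec, logicalStartTime);
// Runtime arguments resolved for this micro-batch; the snippet above hands these to PluginFunctionContext.
// (Assumption: asMap() yields a Map<String, String>.)
Map<String, String> runtimeArgs = pipelineRuntime.getArguments().asMap();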
Use of co.cask.cdap.etl.spark.SparkPipelineRuntime in project cdap by caskdata.
The class DStreamCollection, method compute().
@Override
public <U> SparkCollection<U> compute(final StageSpec stageSpec, SparkCompute<T, U> compute) throws Exception {
  final SparkCompute<T, U> wrappedCompute =
    new DynamicSparkCompute<>(new DynamicDriverContext(stageSpec, sec, new NoopStageStatisticsCollector()), compute);
  Transactionals.execute(sec, new TxRunnable() {
    @Override
    public void run(DatasetContext datasetContext) throws Exception {
      PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
      SparkExecutionPluginContext sparkPluginContext =
        new BasicSparkExecutionPluginContext(sec, JavaSparkContext.fromSparkContext(stream.context().sparkContext()),
                                             datasetContext, pipelineRuntime, stageSpec);
      wrappedCompute.initialize(sparkPluginContext);
    }
  }, Exception.class);
  return wrap(stream.transform(new ComputeTransformFunction<>(sec, stageSpec, wrappedCompute)));
}
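This driver-side path builds SparkPipelineRuntime from the execution context alone, whereas the per-batch sink functions add the batch's logical start time. A short sketch contrasting the two constructor forms used on this page, assuming a JavaSparkExecutionContext sec and a Spark Streaming Time batchTime are in scope:

// Driver-side usage (compute, publishAlerts, store): no batch timestamp is needed.
PipelineRuntime driverRuntime = new SparkPipelineRuntime(sec);
// Per-micro-batch usage (the streaming sink functions): carry the batch's logical start time.
PipelineRuntime batchRuntime = new SparkPipelineRuntime(sec, batchTime.milliseconds());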
Use of co.cask.cdap.etl.spark.SparkPipelineRuntime in project cdap by caskdata.
The class StreamingSparkSinkFunction, method call().
@Override
public Void call(JavaRDD<T> data, Time batchTime) throws Exception {
  if (data.isEmpty()) {
    return null;
  }
  final long logicalStartTime = batchTime.milliseconds();
  MacroEvaluator evaluator = new DefaultMacroEvaluator(new BasicArguments(sec), logicalStartTime,
                                                       sec.getSecureStore(), sec.getNamespace());
  final PluginContext pluginContext = new SparkPipelinePluginContext(sec.getPluginContext(), sec.getMetrics(),
                                                                     stageSpec.isStageLoggingEnabled(),
                                                                     stageSpec.isProcessTimingEnabled());
  final PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec, batchTime.milliseconds());
  final String stageName = stageSpec.getName();
  final SparkSink<T> sparkSink = pluginContext.newPluginInstance(stageName, evaluator);
  boolean isPrepared = false;
  boolean isDone = false;
  try {
    sec.execute(new TxRunnable() {
      @Override
      public void run(DatasetContext datasetContext) throws Exception {
        SparkPluginContext context =
          new BasicSparkPluginContext(null, pipelineRuntime, stageSpec, datasetContext, sec.getAdmin());
        sparkSink.prepareRun(context);
      }
    });
    isPrepared = true;
    final SparkExecutionPluginContext sparkExecutionPluginContext =
      new SparkStreamingExecutionContext(sec, JavaSparkContext.fromSparkContext(data.rdd().context()),
                                         logicalStartTime, stageSpec);
    final JavaRDD<T> countedRDD =
      data.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null)).cache();
    sec.execute(new TxRunnable() {
      @Override
      public void run(DatasetContext context) throws Exception {
        sparkSink.run(sparkExecutionPluginContext, countedRDD);
      }
    });
    isDone = true;
    sec.execute(new TxRunnable() {
      @Override
      public void run(DatasetContext datasetContext) throws Exception {
        SparkPluginContext context =
          new BasicSparkPluginContext(null, pipelineRuntime, stageSpec, datasetContext, sec.getAdmin());
        sparkSink.onRunFinish(true, context);
      }
    });
  } catch (Exception e) {
    LOG.error("Error while executing sink {} for the batch for time {}.", stageName, logicalStartTime, e);
  } finally {
    if (isPrepared && !isDone) {
      sec.execute(new TxRunnable() {
        @Override
        public void run(DatasetContext datasetContext) throws Exception {
          SparkPluginContext context =
            new BasicSparkPluginContext(null, pipelineRuntime, stageSpec, datasetContext, sec.getAdmin());
          sparkSink.onRunFinish(false, context);
        }
      });
    }
  }
  return null;
}
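As in the batch sink above, the work is split across three transactions: prepareRun, the sink's run over the metric-counted RDD, and onRunFinish with a success flag; the isPrepared/isDone flags ensure onRunFinish(false, ...) fires in the finally block if the run did not complete. The records.in counting itself is a one-line wrap, repeated here with a descriptive comment and assuming the same data, stageName, and sec as in the snippet:

// Emit a "records.in" metric per element for this stage, and cache so Spark does not recompute
// the mapped RDD when the sink consumes it inside the transaction.
JavaRDD<T> countedRDD = data.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null)).cache();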
Use of co.cask.cdap.etl.spark.SparkPipelineRuntime in project cdap by caskdata.
The class RDDCollection, method publishAlerts().
@Override
public void publishAlerts(StageSpec stageSpec, StageStatisticsCollector collector) throws Exception {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  AlertPublisher alertPublisher = pluginFunctionContext.createPlugin();
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  AlertPublisherContext alertPublisherContext =
    new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, sec.getMessagingContext(), sec.getAdmin());
  alertPublisher.initialize(alertPublisherContext);
  StageMetrics stageMetrics = new DefaultStageMetrics(sec.getMetrics(), stageSpec.getName());
  TrackedIterator<Alert> trackedAlerts =
    new TrackedIterator<>(((JavaRDD<Alert>) rdd).collect().iterator(), stageMetrics, Constants.Metrics.RECORDS_IN);
  alertPublisher.publish(trackedAlerts);
  alertPublisher.destroy();
}
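The publisher follows an initialize, publish, destroy lifecycle, and the collected alerts are wrapped in a TrackedIterator so the stage's records.in metric reflects what the publisher actually consumes. A minimal sketch of that lifecycle, assuming alertPublisher, alertPublisherContext, and stageMetrics as in the snippet; alerts is an illustrative placeholder for the collected alert iterator built above:

alertPublisher.initialize(alertPublisherContext);
// Count each alert against the stage's records.in metric as the publisher consumes it.
TrackedIterator<Alert> trackedAlerts = new TrackedIterator<>(alerts, stageMetrics, Constants.Metrics.RECORDS_IN);
alertPublisher.publish(trackedAlerts);
// Let the publisher release its resources once the batch of alerts has been published.
alertPublisher.destroy();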
Use of co.cask.cdap.etl.spark.SparkPipelineRuntime in project cdap by caskdata.
The class RDDCollection, method store().
@Override
public void store(StageSpec stageSpec, SparkSink<T> sink) throws Exception {
  String stageName = stageSpec.getName();
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  SparkExecutionPluginContext sparkPluginContext =
    new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, pipelineRuntime, stageSpec);
  JavaRDD<T> countedRDD = rdd.map(new CountingFunction<T>(stageName, sec.getMetrics(), "records.in", null)).cache();
  sink.run(sparkPluginContext, countedRDD);
}
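store is the batch-mode counterpart of StreamingSparkSinkFunction above: it builds a driver-side SparkPipelineRuntime, wraps the RDD for records.in metrics, and hands it straight to SparkSink.run without the per-batch prepareRun/onRunFinish bookkeeping. A minimal usage sketch, assuming an RDDCollection<T> named collection (an illustrative name) plus a StageSpec and a SparkSink<T> plugin instance are already available:

// Write the collection through the sink plugin for this stage; failures propagate to the caller.
collection.store(stageSpec, sink);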