Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.
The class ETLSpark, method initialize().
@Override
public void initialize() throws Exception {
  SparkClientContext context = getContext();
  cleanupFiles = new ArrayList<>();
  CompositeFinisher.Builder finishers = CompositeFinisher.builder();
  // Configure Spark: raise PermGen for the driver and executors, disable speculative execution,
  // and apply any pipeline-level properties from the phase spec.
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
  sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
  sparkConf.set("spark.speculation", "false");
  context.setSparkConf(sparkConf);
  Map<String, String> properties = context.getSpecification().getProperties();
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }
  MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(), context.getLogicalStartTime(), context, context.getNamespace());
  SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
  SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
  Map<String, Integer> stagePartitions = new HashMap<>();
  PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
  // Call prepareRun() on every stage in the phase and register it with the composite finisher;
  // record partition counts for aggregators and joiners.
  for (StageInfo stageInfo : phaseSpec.getPhase()) {
    String stageName = stageInfo.getName();
    String pluginType = stageInfo.getPluginType();
    if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
      BatchConfigurable<BatchSourceContext> batchSource = pluginContext.newPluginInstance(stageName, evaluator);
      BatchSourceContext sourceContext = new SparkBatchSourceContext(sourceFactory, context, stageInfo);
      batchSource.prepareRun(sourceContext);
      finishers.add(batchSource, sourceContext);
    } else if (BatchSink.PLUGIN_TYPE.equals(pluginType)) {
      BatchConfigurable<BatchSinkContext> batchSink = pluginContext.newPluginInstance(stageName, evaluator);
      BatchSinkContext sinkContext = new SparkBatchSinkContext(sinkFactory, context, null, stageInfo);
      batchSink.prepareRun(sinkContext);
      finishers.add(batchSink, sinkContext);
    } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
      BatchConfigurable<SparkPluginContext> sparkSink = pluginContext.newPluginInstance(stageName, evaluator);
      SparkPluginContext sparkPluginContext = new BasicSparkPluginContext(context, stageInfo);
      sparkSink.prepareRun(sparkPluginContext);
      finishers.add(sparkSink, sparkPluginContext);
    } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
      BatchAggregator aggregator = pluginContext.newPluginInstance(stageName, evaluator);
      DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, stageInfo);
      aggregator.prepareRun(aggregatorContext);
      finishers.add(aggregator, aggregatorContext);
      stagePartitions.put(stageName, aggregatorContext.getNumPartitions());
    } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
      BatchJoiner joiner = pluginContext.newPluginInstance(stageName, evaluator);
      DefaultJoinerContext sparkJoinerContext = new DefaultJoinerContext(context, stageInfo);
      joiner.prepareRun(sparkJoinerContext);
      finishers.add(joiner, sparkJoinerContext);
      stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
    }
  }
  // Serialize the source/sink factories and partition counts, then localize the file
  // so the Spark program can read the wiring back at run time.
  File configFile = File.createTempFile("HydratorSpark", ".config");
  cleanupFiles.add(configFile);
  try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
    SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
    writer.write(GSON.toJson(sourceSinkInfo));
  }
  finisher = finishers.build();
  context.localize("HydratorSpark.config", configFile.toURI());
}
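A note on the config handoff above: initialize() serializes the source/sink factories and partition counts to a temp file with Gson and localizes it under the name "HydratorSpark.config" so the Spark program can pick it up. Below is a minimal, self-contained sketch of that write-then-read round trip; PipelineWiring is a hypothetical stand-in for SparkBatchSourceSinkFactoryInfo, whose real fields are not shown in this snippet.

import com.google.gson.Gson;

import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;

public class ConfigRoundTrip {
  private static final Gson GSON = new Gson();

  // Hypothetical stand-in for SparkBatchSourceSinkFactoryInfo; the real class also carries the
  // serialized source and sink factories, not just the partition counts.
  static class PipelineWiring {
    Map<String, Integer> stagePartitions = new HashMap<>();
  }

  // Driver side: write the wiring as JSON, as initialize() does before localizing the file.
  static void write(PipelineWiring wiring, Path configFile) throws Exception {
    try (Writer writer = Files.newBufferedWriter(configFile, StandardCharsets.UTF_8)) {
      writer.write(GSON.toJson(wiring));
    }
  }

  // Program side: read the localized "HydratorSpark.config" back into the same shape.
  static PipelineWiring read(Path localizedFile) throws Exception {
    try (Reader reader = Files.newBufferedReader(localizedFile, StandardCharsets.UTF_8)) {
      return GSON.fromJson(reader, PipelineWiring.class);
    }
  }
}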
Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.
The class ETLWorker, method initializeTransforms().
private void initializeTransforms(WorkerContext context, Map<String, TransformDetail> transformDetailMap, PipelinePhase pipeline) throws Exception {
  Set<StageInfo> transformInfos = pipeline.getStagesOfType(Transform.PLUGIN_TYPE);
  Preconditions.checkArgument(transformInfos != null);
  tranformIdToDatasetName = new HashMap<>(transformInfos.size());
  for (StageInfo transformInfo : transformInfos) {
    String transformName = transformInfo.getName();
    try {
      Transform<?, ?> transform = context.newPluginInstance(transformName);
      transform = new WrappedTransform<>(transform, Caller.DEFAULT);
      WorkerRealtimeContext transformContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), transformInfo);
      LOG.debug("Transform Class : {}", transform.getClass().getName());
      transform.initialize(transformContext);
      StageMetrics stageMetrics = new DefaultStageMetrics(metrics, transformName);
      transformDetailMap.put(transformName, new TransformDetail(new TrackedTransform<>(transform, stageMetrics, context.getDataTracer(transformName)), pipeline.getStageOutputs(transformName)));
      if (transformInfo.getErrorDatasetName() != null) {
        tranformIdToDatasetName.put(transformName, transformInfo.getErrorDatasetName());
      }
    } catch (InstantiationException e) {
      LOG.error("Unable to instantiate Transform", e);
      Throwables.propagate(e);
    }
  }
}
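The method above layers wrappers around each transform plugin before registering it: WrappedTransform with Caller.DEFAULT, then TrackedTransform with the stage metrics and a data tracer. A minimal sketch of that decorator layering, using simplified stand-in interfaces rather than the real CDAP types:

import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

// Simplified stand-in for the Transform contract; not the real CDAP interface.
interface SimpleTransform<IN, OUT> {
  List<OUT> transform(IN input) throws Exception;
}

// Decorator that counts incoming records before delegating, mirroring what a metrics-tracking
// wrapper such as TrackedTransform adds around the plugin instance.
class CountingTransform<IN, OUT> implements SimpleTransform<IN, OUT> {
  private final SimpleTransform<IN, OUT> delegate;
  private final AtomicLong recordsIn = new AtomicLong();

  CountingTransform(SimpleTransform<IN, OUT> delegate) {
    this.delegate = delegate;
  }

  @Override
  public List<OUT> transform(IN input) throws Exception {
    recordsIn.incrementAndGet();  // record the metric, then hand off to the wrapped transform
    return delegate.transform(input);
  }

  long recordsIn() {
    return recordsIn.get();
  }
}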
Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.
The class ETLWorker, method initializeSinks().
@SuppressWarnings("unchecked")
private void initializeSinks(WorkerContext context, Map<String, TransformDetail> transformationMap, PipelinePhase pipeline) throws Exception {
  Set<StageInfo> sinkInfos = pipeline.getStagesOfType(RealtimeSink.PLUGIN_TYPE);
  sinks = new HashMap<>(sinkInfos.size());
  for (StageInfo sinkInfo : sinkInfos) {
    String sinkName = sinkInfo.getName();
    RealtimeSink sink = context.newPluginInstance(sinkName);
    sink = new LoggedRealtimeSink(sinkName, sink);
    WorkerRealtimeContext sinkContext = new WorkerRealtimeContext(context, metrics, new TxLookupProvider(context), sinkInfo);
    LOG.debug("Sink Class : {}", sink.getClass().getName());
    sink.initialize(sinkContext);
    sink = new TrackedRealtimeSink(sink, new DefaultStageMetrics(metrics, sinkName));
    Transformation identityTransformation = new Transformation() {
      @Override
      public void transform(Object input, Emitter emitter) throws Exception {
        emitter.emit(input);
      }
    };
    // We use an identity transformation to simplify executing transformations in the pipeline (similar to ETLMapReduce);
    // since we want to emit metrics during the write to the sink and not during this transformation, we use NoOpMetrics.
    TrackedTransform trackedTransform = new TrackedTransform(identityTransformation, new DefaultStageMetrics(metrics, sinkName), TrackedTransform.RECORDS_IN, null, context.getDataTracer(sinkName));
    transformationMap.put(sinkInfo.getName(), new TransformDetail(trackedTransform, new HashSet<String>()));
    sinks.put(sinkInfo.getName(), sink);
  }
}
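Each realtime sink above is registered in the transformation map behind an identity Transformation, so records can be pushed through the sink stage with the same interface as any other stage while the actual write stays in the sink. A small self-contained sketch of that idea, with SimpleEmitter and SimpleTransformation as illustrative stand-ins for the real CDAP Emitter and Transformation:

import java.util.ArrayList;
import java.util.List;

// Stand-ins for the Emitter/Transformation pair used above; not the real CDAP interfaces.
interface SimpleEmitter<T> {
  void emit(T value);
}

interface SimpleTransformation<IN, OUT> {
  void transform(IN input, SimpleEmitter<OUT> emitter) throws Exception;
}

// The sink stage can be driven like any other stage: records pass through unchanged and the
// downstream sink performs the actual write.
class IdentityTransformation<T> implements SimpleTransformation<T, T> {
  @Override
  public void transform(T input, SimpleEmitter<T> emitter) {
    emitter.emit(input);
  }
}

class IdentityDemo {
  public static void main(String[] args) throws Exception {
    List<String> collected = new ArrayList<>();
    new IdentityTransformation<String>().transform("record-1", collected::add);
    System.out.println(collected);  // prints [record-1]
  }
}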
Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.
The class DynamicSparkCompute, method lazyInit().
// When checkpointing is enabled and Spark is loading DStream operations from an existing checkpoint,
// delegate will be null and the initialize() method won't have been called. So we need to instantiate
// the delegate and initialize it.
private void lazyInit(final JavaSparkContext jsc) throws Exception {
  if (delegate == null) {
    PluginFunctionContext pluginFunctionContext = dynamicDriverContext.getPluginFunctionContext();
    delegate = pluginFunctionContext.createPlugin();
    final StageInfo stageInfo = pluginFunctionContext.getStageInfo();
    final JavaSparkExecutionContext sec = dynamicDriverContext.getSparkExecutionContext();
    Transactionals.execute(sec, new TxRunnable() {
      @Override
      public void run(DatasetContext datasetContext) throws Exception {
        SparkExecutionPluginContext sparkPluginContext = new BasicSparkExecutionPluginContext(sec, jsc, datasetContext, stageInfo);
        delegate.initialize(sparkPluginContext);
      }
    }, Exception.class);
  }
}
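lazyInit() guards against the checkpoint-recovery case by creating and initializing the delegate only when it is still null. The following is a generic sketch of that lazy-initialization guard, with Initializable and LazyDelegate as illustrative stand-ins; unlike the original it adds a synchronized/volatile guard, and it omits the transactional context that DynamicSparkCompute wraps around initialize().

import java.util.function.Supplier;

// Illustrative stand-in for a plugin that must be initialized before use.
interface Initializable {
  void initialize() throws Exception;
}

class LazyDelegate<T extends Initializable> {
  private final Supplier<T> factory;
  private volatile T delegate;

  LazyDelegate(Supplier<T> factory) {
    this.factory = factory;
  }

  // Create and initialize the delegate the first time it is needed, e.g. after an operation
  // is restored from a checkpoint without its initialize() having run.
  synchronized T get() throws Exception {
    if (delegate == null) {
      T created = factory.get();
      created.initialize();
      delegate = created;
    }
    return delegate;
  }
}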
Use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.
The class TransformExecutorFactory, method create().
/**
 * Create a transform executor for the specified pipeline. Will instantiate and initialize all sources,
 * transforms, and sinks in the pipeline.
 *
 * @param pipeline the pipeline to create a transform executor for
 * @return executor for the pipeline
 * @throws InstantiationException if there was an error instantiating a plugin
 * @throws Exception if there was an error initializing a plugin
 */
public <KEY_OUT, VAL_OUT> PipeTransformExecutor<T> create(PipelinePhase pipeline, OutputWriter<KEY_OUT, VAL_OUT> outputWriter, Map<String, ErrorOutputWriter<Object, Object>> transformErrorSinkMap) throws Exception {
  Map<String, PipeTransformDetail> transformations = new HashMap<>();
  Set<String> sources = pipeline.getSources();
  // Record the input and output schemas for every stage.
  for (String pluginType : pipeline.getPluginTypes()) {
    for (StageInfo stageInfo : pipeline.getStagesOfType(pluginType)) {
      String stageName = stageInfo.getName();
      outputSchemas.put(stageName, stageInfo.getOutputSchema());
      perStageInputSchemas.put(stageName, stageInfo.getInputSchemas());
    }
  }
  // Recursively set PipeTransformDetail for all the stages, starting from the sources.
  for (String source : sources) {
    setPipeTransformDetail(pipeline, source, transformations, transformErrorSinkMap, outputWriter);
  }
  // sourceStageName will be null in reducers, so handle that case by starting from the pipeline sources.
  Set<String> startingPoints = (sourceStageName == null) ? pipeline.getSources() : Sets.newHashSet(sourceStageName);
  return new PipeTransformExecutor<>(transformations, startingPoints);
}
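create() first records stage schemas, then walks the pipeline from its sources so that every reachable stage gets a PipeTransformDetail. A rough sketch of that recursive wiring over a plain adjacency map, with StageDetail and the Map-based graph standing in for PipeTransformDetail and PipelinePhase:

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Illustrative stand-in for PipeTransformDetail.
class StageDetail {
  final String stageName;

  StageDetail(String stageName) {
    this.stageName = stageName;
  }
}

class PipelineWiringSketch {
  // Depth-first walk from one stage: register it once, then recurse into its outputs.
  static void wireFrom(String stage, Map<String, Set<String>> stageOutputs, Map<String, StageDetail> details) {
    if (details.containsKey(stage)) {
      return;  // already wired via another path
    }
    details.put(stage, new StageDetail(stage));
    for (String next : stageOutputs.getOrDefault(stage, Collections.emptySet())) {
      wireFrom(next, stageOutputs, details);
    }
  }

  // Wire every stage reachable from the pipeline sources, mirroring the loop over sources in create().
  static Map<String, StageDetail> wire(Set<String> sources, Map<String, Set<String>> stageOutputs) {
    Map<String, StageDetail> details = new HashMap<>();
    for (String source : sources) {
      wireFrom(source, stageOutputs, details);
    }
    return details;
  }
}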