Use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.
The class ETLWorkflow, method initialize.
/**
 * Initializes the workflow and instantiates the pipeline's ending actions.
 *
 * <p>After delegating to the superclass, the pipeline spec is read as JSON from the
 * {@code "pipeline.spec"} workflow property. Each ending action listed in the spec is created
 * as a plugin instance and stored in {@code postActions} under the action's name, in the order
 * the spec returns them.
 *
 * @param context the workflow context used to read the spec and create plugin instances
 * @throws Exception propagated from the superclass initialization or from plugin instantiation
 */
@Override
public void initialize(WorkflowContext context) throws Exception {
  super.initialize(context);
  postActions = new LinkedHashMap<>();

  String specJson = context.getWorkflowSpecification().getProperty("pipeline.spec");
  BatchPipelineSpec pipelineSpec = GSON.fromJson(specJson, BatchPipelineSpec.class);

  // Plugin properties are resolved through this evaluator when each plugin is instantiated.
  MacroEvaluator evaluator = new DefaultMacroEvaluator(
    context.getToken(),
    context.getRuntimeArguments(),
    context.getLogicalStartTime(),
    context,
    context.getNamespace());

  for (ActionSpec endingAction : pipelineSpec.getEndingActions()) {
    String actionName = endingAction.getName();
    PostAction postAction = (PostAction) context.newPluginInstance(actionName, evaluator);
    postActions.put(actionName, postAction);
  }
}
Use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.
The class ETLSpark, method initialize.
/**
 * Prepares the Spark program for one batch pipeline phase.
 *
 * <p>The method builds a {@code SparkConf} and hands it to the client context, then reads the
 * phase spec from the program specification properties. Every source, sink, Spark sink,
 * aggregator and joiner stage in the phase is instantiated, has {@code prepareRun} called on it,
 * and is registered with a composite finisher. Finally the source/sink factories and the
 * per-stage partition counts are written as JSON to a temporary file, which is localized under
 * the name {@code HydratorSpark.config}.
 *
 * @throws Exception propagated from plugin instantiation, {@code prepareRun}, or config file I/O
 */
@Override
public void initialize() throws Exception {
  SparkClientContext clientContext = getContext();
  cleanupFiles = new ArrayList<>();
  CompositeFinisher.Builder finisherBuilder = CompositeFinisher.builder();

  SparkConf conf = new SparkConf();
  conf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
  conf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
  conf.set("spark.speculation", "false");
  clientContext.setSparkConf(conf);

  // The phase spec is stored as JSON in the program specification properties.
  String phaseSpecJson = clientContext.getSpecification().getProperties().get(Constants.PIPELINEID);
  BatchPhaseSpec phaseSpec = GSON.fromJson(phaseSpecJson, BatchPhaseSpec.class);

  // Pipeline-level properties are set on the same SparkConf object that was already passed to the context.
  // NOTE(review): presumably the context keeps a reference rather than a copy - confirm.
  for (Map.Entry<String, String> entry : phaseSpec.getPipelineProperties().entrySet()) {
    conf.set(entry.getKey(), entry.getValue());
  }

  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
    clientContext.getWorkflowToken(),
    clientContext.getRuntimeArguments(),
    clientContext.getLogicalStartTime(),
    clientContext,
    clientContext.getNamespace());
  SparkBatchSourceFactory sources = new SparkBatchSourceFactory();
  SparkBatchSinkFactory sinks = new SparkBatchSinkFactory();
  Map<String, Integer> partitionsByStage = new HashMap<>();
  PluginContext plugins = new SparkPipelinePluginContext(
    clientContext,
    clientContext.getMetrics(),
    phaseSpec.isStageLoggingEnabled(),
    phaseSpec.isProcessTimingEnabled());

  // Instantiate and prepare every stage whose plugin type is handled here; other types are skipped.
  for (StageInfo stage : phaseSpec.getPhase()) {
    String name = stage.getName();
    String type = stage.getPluginType();

    if (BatchSource.PLUGIN_TYPE.equals(type)) {
      BatchConfigurable<BatchSourceContext> source = plugins.newPluginInstance(name, macroEvaluator);
      BatchSourceContext sourceCtx = new SparkBatchSourceContext(sources, clientContext, stage);
      source.prepareRun(sourceCtx);
      finisherBuilder.add(source, sourceCtx);
    } else if (BatchSink.PLUGIN_TYPE.equals(type)) {
      BatchConfigurable<BatchSinkContext> sink = plugins.newPluginInstance(name, macroEvaluator);
      BatchSinkContext sinkCtx = new SparkBatchSinkContext(sinks, clientContext, null, stage);
      sink.prepareRun(sinkCtx);
      finisherBuilder.add(sink, sinkCtx);
    } else if (SparkSink.PLUGIN_TYPE.equals(type)) {
      BatchConfigurable<SparkPluginContext> sparkSink = plugins.newPluginInstance(name, macroEvaluator);
      SparkPluginContext sparkSinkCtx = new BasicSparkPluginContext(clientContext, stage);
      sparkSink.prepareRun(sparkSinkCtx);
      finisherBuilder.add(sparkSink, sparkSinkCtx);
    } else if (BatchAggregator.PLUGIN_TYPE.equals(type)) {
      BatchAggregator aggregator = plugins.newPluginInstance(name, macroEvaluator);
      DefaultAggregatorContext aggregatorCtx = new DefaultAggregatorContext(clientContext, stage);
      aggregator.prepareRun(aggregatorCtx);
      finisherBuilder.add(aggregator, aggregatorCtx);
      // The partition count is read after prepareRun and recorded for this stage.
      partitionsByStage.put(name, aggregatorCtx.getNumPartitions());
    } else if (BatchJoiner.PLUGIN_TYPE.equals(type)) {
      BatchJoiner joiner = plugins.newPluginInstance(name, macroEvaluator);
      DefaultJoinerContext joinerCtx = new DefaultJoinerContext(clientContext, stage);
      joiner.prepareRun(joinerCtx);
      finisherBuilder.add(joiner, joinerCtx);
      partitionsByStage.put(name, joinerCtx.getNumPartitions());
    }
  }

  // Serialize the factories and partition counts to a temp file; it is tracked in cleanupFiles.
  File configFile = File.createTempFile("HydratorSpark", ".config");
  cleanupFiles.add(configFile);
  try (Writer configWriter = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
    SparkBatchSourceSinkFactoryInfo factoryInfo =
      new SparkBatchSourceSinkFactoryInfo(sources, sinks, partitionsByStage);
    configWriter.write(GSON.toJson(factoryInfo));
  }

  finisher = finisherBuilder.build();
  clientContext.localize("HydratorSpark.config", configFile.toURI());
}
Use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.
The class SmartWorkflow, method initialize.
/**
 * Initializes the workflow, logs the pipeline start, and instantiates the pipeline's ending actions.
 *
 * <p>The pipeline spec is read as JSON from the workflow property named by
 * {@code Constants.PIPELINE_SPEC_KEY} and stored in {@code spec}. Each ending action in the spec
 * is created through a {@code PipelinePluginContext} and stored in {@code postActions} under the
 * action's name.
 *
 * @param context the workflow context supplying runtime arguments, specifications and the token
 * @throws Exception propagated from the superclass initialization, user lookup, or plugin instantiation
 */
@Override
public void initialize(WorkflowContext context) throws Exception {
  super.initialize(context);

  // Log who started the pipeline and with which runtime arguments, rendered as "key=value, key=value".
  String arguments = Joiner.on(", ").withKeyValueSeparator("=").join(context.getRuntimeArguments());
  String pipelineName = context.getApplicationSpecification().getName();
  String userName = UserGroupInformation.getCurrentUser().getShortUserName();
  WRAPPERLOGGER.info("Pipeline '{}' is started by user '{}' with arguments {}", pipelineName, userName, arguments);

  postActions = new LinkedHashMap<>();
  String specJson = context.getWorkflowSpecification().getProperty(Constants.PIPELINE_SPEC_KEY);
  spec = GSON.fromJson(specJson, BatchPipelineSpec.class);

  MacroEvaluator evaluator = new DefaultMacroEvaluator(
    context.getToken(),
    context.getRuntimeArguments(),
    context.getLogicalStartTime(),
    context,
    context.getNamespace());
  PluginContext plugins = new PipelinePluginContext(
    context, workflowMetrics, spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled());

  for (ActionSpec endingAction : spec.getEndingActions()) {
    String actionName = endingAction.getName();
    PostAction postAction = (PostAction) plugins.newPluginInstance(actionName, evaluator);
    postActions.put(actionName, postAction);
  }

  WRAPPERLOGGER.info("Pipeline '{}' running", pipelineName);
}
Use of co.cask.cdap.api.macro.MacroEvaluator in project cdap by caskdata.
The class SparkStreamingPipelineRunner, method getSource.
/**
 * Creates the streaming source for a stage and returns its output stream as a {@code SparkCollection}.
 *
 * <p>When checkpoints are disabled the source is created through a {@code PluginFunctionContext}.
 * Otherwise it is instantiated with an {@code ErrorMacroEvaluator}, because macros cannot be
 * evaluated for sources restored from a checkpoint. The resulting stream is limited to the preview
 * record count when the stage's data tracer is enabled, then passed through a counting transform
 * (metric {@code "records.out"}) and an output-wrapping map.
 *
 * @param stageInfo the source stage to instantiate
 * @return a collection backed by the counted and wrapped DStream of the source
 * @throws Exception propagated from plugin creation or from the source's {@code getStream} call
 */
@Override
protected SparkCollection<Tuple2<Boolean, Object>> getSource(StageInfo stageInfo) throws Exception {
  String stageName = stageInfo.getName();

  StreamingSource<Object> source;
  if (!checkpointsDisabled) {
    // Macros must be rejected in any StreamingSource when checkpoints are enabled.
    // With checkpointing on, Spark Streaming serializes every InputDStream into the checkpoint, so on
    // recovery the InputDStream is deserialized straight from the checkpoint rather than being
    // instantiated through CDAP. That leaves no point at which macros could be evaluated for a source
    // loaded from a checkpoint. Other pipeline stages avoid the problem by instantiating the plugin
    // dynamically inside each DStream function, but that is not possible for InputDStreams because
    // their constructor registers the stream in the context DAG (a constructor with global side effects).
    // TODO: (HYDRATOR-1030) figure out how to do this at configure time instead of run time
    MacroEvaluator rejectingEvaluator = new ErrorMacroEvaluator(
      "Due to spark limitations, macro evaluation is not allowed in streaming sources when checkpointing "
        + "is enabled.");
    PluginContext plugins = new SparkPipelinePluginContext(
      sec.getPluginContext(), sec.getMetrics(), spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled());
    source = plugins.newPluginInstance(stageName, rejectingEvaluator);
  } else {
    source = new PluginFunctionContext(stageInfo, sec).createPlugin();
  }

  DataTracer tracer = sec.getDataTracer(stageName);
  StreamingContext sourceContext = new DefaultStreamingContext(stageInfo, sec, streamingContext);
  JavaDStream<Object> stream = source.getStream(sourceContext);
  if (tracer.isEnabled()) {
    // A new function is created for each RDD, so the limit applies per RDD and not to the whole DStream.
    stream = stream.transform(new LimitingFunction<>(spec.getNumOfRecordsPreview()));
  }

  JavaDStream<Tuple2<Boolean, Object>> wrappedStream = stream
    .transform(new CountingTransformFunction<>(stageName, sec.getMetrics(), "records.out", tracer))
    .map(new WrapOutputTransformFunction<>());
  return new DStreamCollection<>(sec, wrappedStream);
}
Aggregations