Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.
From the class MultiSinkFunction, method initializeBranchExecutors.
/**
 * Creates one transform executor per "group source" stage and records which upstream outputs feed
 * each of those stages.
 *
 * <p>Populates the {@code emitter}, {@code executorFactory}, {@code branchExecutors} and
 * {@code inputConnections} fields.
 *
 * @throws IllegalStateException if an executor cannot be created for one of the branches
 */
private void initializeBranchExecutors() {
  emitter = new DefaultEmitter<>();
  PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(
    pipelineRuntime.getPluginContext(), pipelineRuntime.getMetrics(), phaseSpec, new SingleConnectorFactory());
  MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
    pipelineRuntime.getArguments(), pipelineRuntime.getLogicalStartTime(), pipelineRuntime.getSecureStore(),
    pipelineRuntime.getServiceDiscoverer(), pipelineRuntime.getNamespace());
  executorFactory = new SparkTransformExecutorFactory(
    pluginInstantiator, macroEvaluator, null, collectors, dataTracers, pipelineRuntime, emitter);

  /*
     If the dag is:

             |--> t1 --> k1
       s1 --|
             |--> k2
                  ^
         s2 ------|

     the group is t1, k1, and k2.
   */
  PipelinePhase pipelinePhase = phaseSpec.getPhase();
  branchExecutors = new HashMap<>();
  inputConnections = new HashMap<>();

  for (String stageName : group) {
    // A stage only counts as a group "source" when at least one of its inputs lies outside the group;
    // stages fed exclusively from within the group are reached through some other source's branch.
    boolean hasInputOutsideGroup = !Sets.difference(pipelinePhase.getStageInputs(stageName), group).isEmpty();
    if (!hasInputOutsideGroup) {
      continue;
    }

    // The branch is the part of the pipeline that starts at this source.
    // With the example above, the two branches are t1 -> k1, and k2.
    // pipelinePhase.subsetFrom() throws an exception if the new "source" is also a sink, since a Dag
    // cannot be a single node, so a single-stage phase is built by hand in that case.
    boolean sourceIsSink = pipelinePhase.getSinks().contains(stageName);
    PipelinePhase branch = sourceIsSink
      ? PipelinePhase.builder(pipelinePhase.getPluginTypes()).addStage(pipelinePhase.getStage(stageName)).build()
      : pipelinePhase.subsetFrom(Collections.singleton(stageName));

    try {
      branchExecutors.put(stageName, executorFactory.create(branch));
    } catch (Exception e) {
      String message = String.format("Unable to get subset of pipeline starting from stage %s. " + "This indicates a planning error. Please report this bug and turn off stage " + "consolidation by setting %s to false in the runtime arguments.", stageName, Constants.CONSOLIDATE_STAGES);
      throw new IllegalStateException(message, e);
    }

    /*
       Map every possible input to the group sources it feeds, so that incoming records can be routed
       to the right branch executor.

       For example, the pipeline may look like:

                      |port a --> k1
        s --> split --|
                      |port b --> k2

       Here k1 and k2 are in the same group, so the map contains:

         { stageName: split, port: a, type: output } -> [k1]
         { stageName: split, port: b, type: output } -> [k2]

       A slightly more complicated example:

                           |--> k1
        s1 --> transform --|
                    |      |--> k2
                    |
                    |--> error collector --> k3

       Here k1, k2, k3, and error collector are in the same group, so the map contains:

         { stageName: transform, type: output } -> [k1, k2]
         { stageName: transform, type: error } -> [k3]
     */
    RecordType recordType;
    if (ErrorTransform.PLUGIN_TYPE.equals(pipelinePhase.getStage(stageName).getPluginType())) {
      recordType = RecordType.ERROR;
    } else {
      recordType = RecordType.OUTPUT;
    }

    for (String upstreamStage : pipelinePhase.getStageInputs(stageName)) {
      StageSpec.Port outputPort = pipelinePhase.getStage(upstreamStage).getOutputPorts().get(stageName);
      InputInfo inputInfo = new InputInfo(upstreamStage, recordType, outputPort.getPort());
      inputConnections.computeIfAbsent(inputInfo, ignored -> new HashSet<>()).add(stageName);
    }
  }
}
Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.
From the class SparkStreamingPipelineDriver, method run.
/**
 * Records field lineage for the pipeline and then builds the streaming context that runs it.
 *
 * @param pipelineSpec spec of the streaming pipeline; supplies the batch interval and stage settings
 * @param pipelinePhase the phase handed to the pipeline runner
 * @param sec the Spark execution context
 * @param checkpointDir checkpoint directory; when {@code null} the context is always created fresh,
 *   otherwise it is obtained through {@code JavaStreamingContext.getOrCreate}
 * @param context an existing Spark context to reuse, or {@code null} to have one created
 * @return the streaming context for the pipeline
 * @throws Exception if the context cannot be created
 */
private JavaStreamingContext run(DataStreamsPipelineSpec pipelineSpec, PipelinePhase pipelinePhase, JavaSparkExecutionContext sec, @Nullable String checkpointDir, @Nullable JavaSparkContext context) throws Exception {
  PipelinePluginContext pluginContext = new PipelinePluginContext(sec.getPluginContext(), sec.getMetrics(), pipelineSpec.isStageLoggingEnabled(), pipelineSpec.isProcessTimingEnabled());
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(), sec.getNamespace());
  SparkStreamingPreparer preparer = new SparkStreamingPreparer(pluginContext, sec.getMetrics(), evaluator, pipelineRuntime, sec);

  // Lineage emission is best effort: a failure is logged and the pipeline still starts.
  try {
    SparkFieldLineageRecorder recorder = new SparkFieldLineageRecorder(sec, pipelinePhase, pipelineSpec, preparer);
    recorder.record();
  } catch (Exception e) {
    LOG.warn("Failed to emit field lineage operations for streaming pipeline", e);
  }
  Set<String> uncombinableSinks = preparer.getUncombinableSinks();

  // the content in the function might not run due to spark checkpointing, currently just have the lineage logic
  // before anything is run
  Function0<JavaStreamingContext> contextFunction = () -> {
    JavaSparkContext javaSparkContext = context == null ? new JavaSparkContext() : context;
    JavaStreamingContext jssc = new JavaStreamingContext(javaSparkContext, Durations.milliseconds(pipelineSpec.getBatchIntervalMillis()));
    SparkStreamingPipelineRunner runner = new SparkStreamingPipelineRunner(sec, jssc, pipelineSpec, pipelineSpec.isCheckpointsDisabled());
    // Seems like they should be set at configure time instead of runtime? but that requires an API change.
    try {
      PhaseSpec phaseSpec = new PhaseSpec(sec.getApplicationSpecification().getName(), pipelinePhase, Collections.emptyMap(), pipelineSpec.isStageLoggingEnabled(), pipelineSpec.isProcessTimingEnabled());
      // Both flags default to true when the runtime argument is absent.
      boolean shouldConsolidateStages = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CONSOLIDATE_STAGES, Boolean.TRUE.toString()));
      boolean shouldCacheFunctions = Boolean.parseBoolean(sec.getRuntimeArguments().getOrDefault(Constants.CACHE_FUNCTIONS, Boolean.TRUE.toString()));
      runner.runPipeline(phaseSpec, StreamingSource.PLUGIN_TYPE, sec, Collections.emptyMap(), pluginContext, Collections.emptyMap(), uncombinableSinks, shouldConsolidateStages, shouldCacheFunctions);
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
    if (checkpointDir != null) {
      jssc.checkpoint(checkpointDir);
      jssc.sparkContext().hadoopConfiguration().set("fs.defaultFS", checkpointDir);
    }
    return jssc;
  };

  if (checkpointDir == null) {
    return contextFunction.call();
  }
  // context is @Nullable, so it must not be dereferenced blindly here. Without a caller-supplied
  // context, use the getOrCreate overload that does not take an explicit Hadoop configuration.
  if (context == null) {
    return JavaStreamingContext.getOrCreate(checkpointDir, contextFunction);
  }
  return JavaStreamingContext.getOrCreate(checkpointDir, contextFunction, context.hadoopConfiguration());
}
Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.
From the class PipelineAction, method run.
/**
 * Instantiates the action plugin of the first stage in the phase, runs it unless the stage's data
 * tracer is enabled, and then copies every argument added to the pipeline runtime into the
 * workflow token.
 *
 * @throws IllegalStateException if the context has no workflow token
 * @throws Exception if the plugin cannot be instantiated or the action fails
 */
@Override
public void run() throws Exception {
  CustomActionContext context = getContext();
  Map<String, String> properties = context.getSpecification().getProperties();
  String serializedPhaseSpec = properties.get(Constants.PIPELINEID);
  BatchPhaseSpec phaseSpec = GSON.fromJson(serializedPhaseSpec, BatchPhaseSpec.class);

  // Only the first stage returned by the phase iterator is used.
  StageSpec stageSpec = phaseSpec.getPhase().iterator().next();
  String stageName = stageSpec.getName();

  PluginContext pluginContext = new PipelinePluginContext(
    context, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
  PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);
  DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
    pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
  Action action = pluginContext.newPluginInstance(stageName, macroEvaluator);
  ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);

  // The action itself is skipped whenever data tracing is enabled for this stage.
  boolean tracingEnabled = context.getDataTracer(stageName).isEnabled();
  if (!tracingEnabled) {
    action.run(actionContext);
  }

  WorkflowToken token = context.getWorkflowToken();
  if (token == null) {
    throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
  }
  // Publish the arguments added to the pipeline runtime through the workflow token.
  pipelineRuntime.getArguments().getAddedArguments().forEach((key, value) -> token.put(key, value));
}
Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.
From the class ETLMapReduce, method initialize.
/**
 * Configures the Hadoop job for this pipeline phase: copies the pipeline properties into the job
 * configuration, sets the mapper and (if the phase has one) the reducer, and prepares the plugins,
 * storing their finishers in the {@code finisher} field.
 *
 * @throws IllegalStateException if the phase contains more than one aggregator/joiner stage
 * @throws Exception if preparing the plugins fails
 */
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> properties = context.getSpecification().getProperties();

  boolean stageLoggingEnabled = Boolean.parseBoolean(properties.get(Constants.STAGE_LOGGING_ENABLED));
  if (stageLoggingEnabled) {
    LogStageInjector.start();
  }

  PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
  Job job = context.getHadoopJob();
  Configuration hConf = job.getConfiguration();

  String serializedPhaseSpec = properties.get(Constants.PIPELINEID);
  BatchPhaseSpec phaseSpec = GSON.fromJson(serializedPhaseSpec, BatchPhaseSpec.class);
  phaseSpec.getPipelineProperties().forEach((key, value) -> hConf.set(key, value));

  // More than one reducer should never happen if the planner is correct.
  Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
  if (reducers.size() > 1) {
    Iterator<StageSpec> remaining = reducers.iterator();
    StringBuilder reducerNames = new StringBuilder(remaining.next().getName());
    remaining.forEachRemaining(reducer -> reducerNames.append(",").append(reducer.getName()));
    throw new IllegalStateException("Found multiple reducers ( " + reducerNames + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
  }

  job.setMapperClass(ETLMapper.class);
  if (!reducers.isEmpty()) {
    job.setReducerClass(ETLReducer.class);
  } else {
    // Without an aggregator or joiner stage the job is map-only.
    job.setNumReduceTasks(0);
  }

  // Instantiate the plugins and call their prepare methods.
  Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
  MacroEvaluator evaluator = new DefaultMacroEvaluator(
    pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context, context.getNamespace());
  MapReducePreparer preparer = new MapReducePreparer(context, mrMetrics, evaluator, pipelineRuntime, connectorDatasets);
  finisher = new CompositeFinisher(preparer.prepare(phaseSpec, job));
}
Use of io.cdap.cdap.etl.common.DefaultMacroEvaluator in project cdap by cdapio.
From the class JavaSparkMainWrapper, method run.
/**
 * Loads the plugin class configured for this stage and runs it: as a {@code JavaSparkMain} when
 * the class implements that interface, otherwise by reflectively invoking its static
 * {@code main(String[])} method through the stage's caller.
 *
 * @param sec the Spark execution context
 * @throws Exception if the plugin cannot be loaded or its execution fails
 */
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  String stageName = sec.getSpecification().getProperty(ExternalSparkProgram.STAGE_NAME);
  String serializedPhaseSpec = sec.getSpecification().getProperty(Constants.PIPELINEID);
  BatchPhaseSpec batchPhaseSpec = GSON.fromJson(serializedPhaseSpec, BatchPhaseSpec.class);
  PipelinePluginContext pluginContext = new SparkPipelinePluginContext(
    sec.getPluginContext(), sec.getMetrics(), batchPhaseSpec.isStageLoggingEnabled(),
    batchPhaseSpec.isProcessTimingEnabled());
  Class<?> mainClass = pluginContext.loadPluginClass(stageName);

  // A CDAP JavaSparkMain is instantiated as a plugin and run directly.
  if (JavaSparkMain.class.isAssignableFrom(mainClass)) {
    MacroEvaluator macroEvaluator = new DefaultMacroEvaluator(
      new BasicArguments(sec), sec.getLogicalStartTime(), sec.getSecureStore(), sec.getServiceDiscoverer(),
      sec.getNamespace());
    JavaSparkMain javaSparkMain = pluginContext.newPluginInstance(stageName, macroEvaluator);
    javaSparkMain.run(sec);
    return;
  }

  // Anything else is assumed to expose a static 'main' method.
  String programArgs = getProgramArgs(sec, stageName);
  String[] args;
  if (programArgs != null) {
    args = programArgs.split(" ");
  } else {
    // No explicit program arguments: pass the runtime arguments in POSIX form instead.
    args = RuntimeArguments.toPosixArray(sec.getRuntimeArguments());
  }

  Method mainMethod = mainClass.getMethod("main", String[].class);
  // Wrap the String[] so that it is passed to main as its single argument.
  Object[] methodArgs = {args};
  Caller caller = pluginContext.getCaller(stageName);
  Callable<Void> invokeMain = () -> {
    mainMethod.invoke(null, methodArgs);
    return null;
  };
  caller.call(invokeMain);
}
Aggregations