use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class PipelinePlanner method populateActionPhases.
/**
* This method is responsible for populating phases and phaseConnections with the Action phases.
* An Action phase is a single-stage {@link PipelinePhase} which does not have any dag.
*
* @param pipelineSpec the overall spec for the pipeline
* @param specs the Map of stage specs
* @param actionNodes the Set of action nodes in the pipeline
* @param phases the Map of phases created so far
* @param phaseConnections the Set of connections between phases added so far
* @param outgoingActionConnections the Map that holds the set of stages to which
* there is an outgoing connection from an Action stage
* @param incomingActionConnections the Map that holds the set of stages from which
* there is an incoming connection to an Action stage
* @param subdags subdags created so far from the pipeline stages
*/
private void populateActionPhases(PipelineSpec pipelineSpec, Map<String, StageSpec> specs, Set<String> actionNodes,
                                  Map<String, PipelinePhase> phases, Set<Connection> phaseConnections,
                                  SetMultimap<String, String> outgoingActionConnections,
                                  SetMultimap<String, String> incomingActionConnections, Map<String, Dag> subdags) {
// Create single stage phases for the Action nodes
for (String node : actionNodes) {
StageSpec actionStageSpec = specs.get(node);
String type = specs.get(node).getPlugin().getType();
StageInfo actionStageInfo = StageInfo.builder(node, type)
  .addInputs(actionStageSpec.getInputs())
  .addInputSchemas(actionStageSpec.getInputSchemas())
  .addOutputs(actionStageSpec.getOutputs())
  .setOutputSchema(actionStageSpec.getOutputSchema())
  .setErrorSchema(actionStageSpec.getErrorSchema())
  .setErrorDatasetName(actionStageSpec.getErrorDatasetName())
  .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
  .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
  .build();
phases.put(node, PipelinePhase.builder(supportedPluginTypes).addStage(actionStageInfo).build());
}
// Build phaseConnections for the Action nodes
for (String sourceAction : outgoingActionConnections.keySet()) {
// Check if destination is one of the source stages in the pipeline
for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
if (Sets.intersection(outgoingActionConnections.get(sourceAction), subdagEntry.getValue().getSources()).size() > 0) {
phaseConnections.add(new Connection(sourceAction, subdagEntry.getKey()));
}
}
// Check if destination is another Action node
for (String destination : outgoingActionConnections.get(sourceAction)) {
if (actionNodes.contains(destination)) {
phaseConnections.add(new Connection(sourceAction, destination));
}
}
}
for (String destinationAction : incomingActionConnections.keySet()) {
// Check if source is one of the sink stages in the pipeline
for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
if (Sets.intersection(incomingActionConnections.get(destinationAction), subdagEntry.getValue().getSinks()).size() > 0) {
phaseConnections.add(new Connection(subdagEntry.getKey(), destinationAction));
}
}
}
}
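A hypothetical, minimal sketch of the matching rule used above: an Action stage gets a phase connection to a sub-dag exactly when its outgoing connections intersect that sub-dag's sources. The stage and phase names below are invented; only the Guava Sets usage mirrors the method.

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import java.util.Set;

public class ActionConnectionSketch {
  public static void main(String[] args) {
    // Made-up names: an Action stage "cleanup" with one outgoing connection, and the
    // sources of a sub-dag that was keyed elsewhere as "phase-1".
    Set<String> outgoingFromAction = ImmutableSet.of("source1");
    Set<String> subdagSources = ImmutableSet.of("source1", "source2");
    // Same non-empty-intersection test as above; when it passes, the method adds
    // new Connection("cleanup", "phase-1") to phaseConnections.
    boolean shouldConnect = !Sets.intersection(outgoingFromAction, subdagSources).isEmpty();
    System.out.println(shouldConnect); // true
  }
}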
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class ETLWorker method configure.
@Override
public void configure() {
setName(NAME);
setDescription("Worker Driver for Realtime ETL Pipelines");
int instances = config.getInstances();
if (instances < 1) {
throw new IllegalArgumentException("instances must be greater than 0.");
}
setInstances(instances);
if (config.getResources() != null) {
setResources(config.getResources());
}
PipelineSpecGenerator<ETLRealtimeConfig, PipelineSpec> specGenerator =
  new RealtimePipelineSpecGenerator(getConfigurer(), ImmutableSet.of(RealtimeSource.PLUGIN_TYPE),
                                    ImmutableSet.of(RealtimeSink.PLUGIN_TYPE), Table.class,
                                    TableProperties.builder().setSchema(ERROR_SCHEMA).build());
PipelineSpec spec = specGenerator.generateSpec(config);
int sourceCount = 0;
for (StageSpec stageSpec : spec.getStages()) {
if (RealtimeSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
sourceCount++;
}
}
if (sourceCount != 1) {
throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
}
PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(), ImmutableSet.<String>of(), ImmutableSet.<String>of());
PipelinePlan plan = planner.plan(spec);
if (plan.getPhases().size() != 1) {
// should never happen
throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
}
PipelinePhase pipeline = plan.getPhases().values().iterator().next();
Map<String, String> properties = new HashMap<>();
properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
properties.put(Constants.PIPELINEID, GSON.toJson(pipeline));
// Generate unique id for this app creation.
properties.put(UNIQUE_ID, String.valueOf(System.currentTimeMillis()));
properties.put(Constants.STAGE_LOGGING_ENABLED, String.valueOf(config.isStageLoggingEnabled()));
setProperties(properties);
}
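A hedged sketch, not code from the project: configure() above stores the pipeline spec and the single phase as JSON properties, so a runtime component that can read the program specification properties could recover them with the same Gson instance. The 'properties' variable and GSON's ability to round-trip these classes are assumptions here.

// Assumption: 'properties' holds the program specification properties written in configure()
// above, and GSON is the same (suitably configured) Gson instance used to serialize them.
PipelineSpec deployedSpec = GSON.fromJson(properties.get(Constants.PIPELINE_SPEC_KEY), PipelineSpec.class);
PipelinePhase deployedPhase = GSON.fromJson(properties.get(Constants.PIPELINEID), PipelinePhase.class);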
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class SmartWorkflow method configure.
@Override
protected void configure() {
setName(NAME);
setDescription(DESCRIPTION);
// set the pipeline spec as a property in case somebody like the UI wants to read it
Map<String, String> properties = new HashMap<>();
properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
setProperties(properties);
stageSpecs = new HashMap<>();
useSpark = engine == Engine.SPARK;
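// Even if the configured engine is not Spark, the presence of a SparkCompute or SparkSink
// stage in the loop below forces the workflow to use Spark.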
for (StageSpec stageSpec : spec.getStages()) {
stageSpecs.put(stageSpec.getName(), stageSpec);
String pluginType = stageSpec.getPlugin().getType();
if (SparkCompute.PLUGIN_TYPE.equals(pluginType) || SparkSink.PLUGIN_TYPE.equals(pluginType)) {
useSpark = true;
}
}
PipelinePlanner planner;
Set<String> actionTypes = ImmutableSet.of(Action.PLUGIN_TYPE, Constants.SPARK_PROGRAM_PLUGIN_TYPE);
Set<String> multiPortTypes = ImmutableSet.of(SplitterTransform.PLUGIN_TYPE);
if (useSpark) {
// if the pipeline uses spark, we don't need to break the pipeline up into phases; we can just have
// a single phase.
planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.<String>of(), ImmutableSet.<String>of(), actionTypes, multiPortTypes);
} else {
planner = new PipelinePlanner(supportedPluginTypes,
                              ImmutableSet.of(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE),
                              ImmutableSet.of(SparkCompute.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE),
                              actionTypes, multiPortTypes);
}
plan = planner.plan(spec);
WorkflowProgramAdder programAdder = new TrunkProgramAdder(getConfigurer());
// single phase, just add the program directly
if (plan.getPhases().size() == 1) {
addProgram(plan.getPhases().keySet().iterator().next(), programAdder);
return;
}
// Dag classes don't allow a 'dag' without connections
if (plan.getPhaseConnections().isEmpty()) {
WorkflowProgramAdder fork = programAdder.fork();
for (String phaseName : plan.getPhases().keySet()) {
addProgram(phaseName, fork);
}
fork.join();
return;
}
dag = new ControlDag(plan.getPhaseConnections());
boolean dummyNodeAdded = false;
Map<String, ConditionBranches> conditionBranches = plan.getConditionPhaseBranches();
if (conditionBranches.isEmpty()) {
// after flattening, there is guaranteed to be just one source
dag.flatten();
} else if (!conditionBranches.keySet().containsAll(dag.getSources())) {
// Continue only if the condition node is not the source of the dag; otherwise the dag is already in the
// required form
Set<String> conditions = conditionBranches.keySet();
// flatten only the part of the dag starting from sources and ending in conditions/sinks.
Set<String> dagNodes = dag.accessibleFrom(dag.getSources(), Sets.union(dag.getSinks(), conditions));
Set<String> dagNodesWithoutCondition = Sets.difference(dagNodes, conditions);
Set<Connection> connections = new HashSet<>();
Deque<String> bfs = new LinkedList<>();
Set<String> sinks = new HashSet<>();
// If it's a single phase without a condition then there is no need to flatten
if (dagNodesWithoutCondition.size() > 1) {
Dag subDag;
try {
subDag = dag.createSubDag(dagNodesWithoutCondition);
} catch (IllegalArgumentException | DisjointConnectionsException e) {
// DisjointConnectionsException is thrown when islands are created from the dagNodesWithoutCondition
// IllegalArgumentException is thrown when the connections are empty
// In both cases we need to add a dummy node and create a connected Dag
String dummyNode = "dummy";
dummyNodeAdded = true;
Set<Connection> subDagConnections = new HashSet<>();
for (String source : dag.getSources()) {
subDagConnections.add(new Connection(dummyNode, source));
}
Deque<String> subDagBFS = new LinkedList<>();
subDagBFS.addAll(dag.getSources());
while (subDagBFS.peek() != null) {
String node = subDagBFS.poll();
for (String output : dag.getNodeOutputs(node)) {
if (dagNodesWithoutCondition.contains(output)) {
subDagConnections.add(new Connection(node, output));
subDagBFS.add(output);
}
}
}
subDag = new Dag(subDagConnections);
}
ControlDag cdag = new ControlDag(subDag);
cdag.flatten();
// Add all connections from cdag
bfs.addAll(cdag.getSources());
while (bfs.peek() != null) {
String node = bfs.poll();
for (String output : cdag.getNodeOutputs(node)) {
connections.add(new Connection(node, output));
bfs.add(output);
}
}
sinks.addAll(cdag.getSinks());
} else {
sinks.addAll(dagNodesWithoutCondition);
}
// Add back the existing condition nodes and corresponding conditions
Set<String> conditionsFromDag = Sets.intersection(dagNodes, conditions);
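// The loop below attaches each condition node found in the original dag to the single sink of the
// flattened (or single-node) portion, so control flow continues from those phases into the conditions.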
for (String condition : conditionsFromDag) {
connections.add(new Connection(sinks.iterator().next(), condition));
}
bfs.addAll(Sets.intersection(dagNodes, conditions));
while (bfs.peek() != null) {
String node = bfs.poll();
ConditionBranches branches = conditionBranches.get(node);
if (branches == null) {
// not a condition node. add outputs
for (String output : dag.getNodeOutputs(node)) {
connections.add(new Connection(node, output));
bfs.add(output);
}
} else {
// condition node
for (Boolean condition : Arrays.asList(true, false)) {
String phase = condition ? branches.getTrueOutput() : branches.getFalseOutput();
if (phase == null) {
continue;
}
connections.add(new Connection(node, phase, condition));
bfs.add(phase);
}
}
}
dag = new ControlDag(connections);
}
if (dummyNodeAdded) {
WorkflowProgramAdder fork = programAdder.fork();
String dummyNode = dag.getSources().iterator().next();
for (String output : dag.getNodeOutputs(dummyNode)) {
// need to make sure we don't call also() if this is the final branch
if (!addBranchPrograms(output, fork)) {
fork = fork.also();
}
}
} else {
String start = dag.getSources().iterator().next();
addPrograms(start, programAdder);
}
}
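A small hypothetical illustration of the dummy-node workaround described in the catch block above: two disjoint phase chains are tied together under a synthetic source so that a single connected Dag can be built and flattened. The phase names are invented; only the Connection and Dag usage mirrors the code.

// Two disjoint chains a -> b and c -> d cannot form one Dag on their own, so a
// synthetic "dummy" source is wired to the head of each chain, as in the catch block above.
Set<Connection> connections = new HashSet<>();
connections.add(new Connection("dummy", "a"));
connections.add(new Connection("dummy", "c"));
connections.add(new Connection("a", "b"));
connections.add(new Connection("c", "d"));
Dag connected = new Dag(connections);
// connected.getSources() is {dummy} and connected.getSinks() is {b, d}; a later pass can
// fork on the dummy node's outputs exactly as the dummyNodeAdded branch above does.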
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class ETLMapReduce method initialize.
@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
final MapReduceContext context = getContext();
Map<String, String> properties = context.getSpecification().getProperties();
if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
LogStageInjector.start();
}
PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
List<Finisher> finishers = new ArrayList<>();
final Job job = context.getHadoopJob();
final Configuration hConf = job.getConfiguration();
hConf.setBoolean("mapreduce.map.speculative", false);
hConf.setBoolean("mapreduce.reduce.speculative", false);
// plugin name -> runtime args for that plugin
MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context.getNamespace());
BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
}
final PipelinePhase phase = phaseSpec.getPhase();
PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec, new MultiConnectorFactory());
// should never happen if planner is correct
Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
if (reducers.size() > 1) {
Iterator<StageSpec> reducerIter = reducers.iterator();
StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
while (reducerIter.hasNext()) {
reducersStr.append(",");
reducersStr.append(reducerIter.next().getName());
}
throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
}
job.setMapperClass(ETLMapper.class);
if (reducers.isEmpty()) {
job.setNumReduceTasks(0);
} else {
job.setReducerClass(ETLReducer.class);
}
final Map<String, SinkOutput> sinkOutputs = new HashMap<>();
final Map<String, String> inputAliasToStage = new HashMap<>();
// call prepareRun on each stage in order so that any arguments set by a stage will be visible to subsequent stages
for (final String stageName : phase.getDag().getTopologicalOrder()) {
final StageSpec stageSpec = phase.getStage(stageName);
String pluginType = stageSpec.getPluginType();
boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
SubmitterPlugin submitterPlugin = null;
if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSource, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
@Override
public void act(MapReduceBatchContext sourceContext) {
for (String inputAlias : sourceContext.getInputNames()) {
inputAliasToStage.put(inputAlias, stageName);
}
}
});
} else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || AlertPublisher.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSink, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {
@Override
public void act(MapReduceBatchContext sinkContext) {
sinkOutputs.put(stageName, new SinkOutput(sinkContext.getOutputNames()));
}
});
} else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
Transform<?, ?> transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
submitterPlugin = new SubmitterPlugin<>(stageName, context, transform, contextProvider);
} else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
final BatchAggregator<?, ?, ?> aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
submitterPlugin = new SubmitterPlugin<>(stageName, context, aggregator, contextProvider, new SubmitterPlugin.PrepareAction<DefaultAggregatorContext>() {
@Override
public void act(DefaultAggregatorContext aggregatorContext) {
if (aggregatorContext.getNumPartitions() != null) {
job.setNumReduceTasks(aggregatorContext.getNumPartitions());
}
Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
Class<?> outputValClass = aggregatorContext.getGroupValueClass();
if (outputKeyClass == null) {
outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
}
if (outputValClass == null) {
outputValClass = TypeChecker.getGroupValueClass(aggregator);
}
hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
hConf.set(MAP_VAL_CLASS, outputValClass.getName());
job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
job.setMapOutputValueClass(getOutputValClass(stageName, outputValClass));
}
});
} else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
final BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
submitterPlugin = new SubmitterPlugin<>(stageName, context, batchJoiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {
@Override
public void act(DefaultJoinerContext joinerContext) {
if (joinerContext.getNumPartitions() != null) {
job.setNumReduceTasks(joinerContext.getNumPartitions());
}
Class<?> outputKeyClass = joinerContext.getJoinKeyClass();
Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
if (outputKeyClass == null) {
outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
}
if (inputRecordClass == null) {
inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
}
hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
getOutputValClass(stageName, inputRecordClass);
// for the joiner plugin, the map output is tagged with the stageName
job.setMapOutputValueClass(TaggedWritable.class);
}
});
}
if (submitterPlugin != null) {
submitterPlugin.prepareRun();
finishers.add(submitterPlugin);
}
}
hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
finisher = new CompositeFinisher(finishers);
job.setMapperClass(ETLMapper.class);
WorkflowToken token = context.getWorkflowToken();
if (token != null) {
for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
token.put(entry.getKey(), entry.getValue());
}
}
// token is null when just the mapreduce job is run but not the entire workflow
// we still want things to work in that case.
hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(pipelineRuntime.getArguments().asMap()));
}
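For orientation, a hedged sketch of reading back the values the code above writes into the Hadoop Configuration; the keys and the Gson usage come from the snippet, while the task-side 'conf' variable and the type tokens are assumptions.

// Assumption: 'conf' is the task-side Hadoop Configuration and GSON matches the instance above.
Type sinkOutputsType = new TypeToken<Map<String, SinkOutput>>() { }.getType();
Type stringMapType = new TypeToken<Map<String, String>>() { }.getType();
Map<String, SinkOutput> sinkOutputs = GSON.fromJson(conf.get(SINK_OUTPUTS_KEY), sinkOutputsType);
Map<String, String> inputAliasToStage = GSON.fromJson(conf.get(INPUT_ALIAS_KEY), stringMapType);
Map<String, String> runtimeArgs = GSON.fromJson(conf.get(RUNTIME_ARGS_KEY), stringMapType);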
use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.
the class MapReduceTransformExecutorFactory method getPipeStage.
private PipeStage getPipeStage(PipelinePhase pipeline, String stageName, Map<String, PipeStage> pipeStages, OutputWriter<?, ?> outputWriter) throws Exception {
StageSpec stageSpec = pipeline.getStage(stageName);
String pluginType = stageSpec.getPluginType();
// handle the ending stage case, which doesn't use a PipeEmitter
if (pipeline.getSinks().contains(stageName)) {
if (Constants.Connector.PLUGIN_TYPE.equals(pluginType) || BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
// connectors and joiners require getting the RecordInfo directly instead of unwrapping it
Transformation<RecordInfo<Object>, Object> sink = getTransformation(stageSpec);
return new DirectOutputPipeStage<>(stageName, sink, new SinkEmitter<>(stageName, outputWriter));
} else {
// others (batch sinks, aggregators, alert publishers) only require the value within the RecordInfo
return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec), new SinkEmitter<>(stageName, outputWriter));
}
}
// create PipeEmitter, which holds all output PipeStages it needs to write to and wraps any output it gets
// into a RecordInfo
// ConnectorSources require a special emitter since they need to build RecordInfo from the temporary dataset
PipeEmitter.Builder emitterBuilder =
  Constants.Connector.PLUGIN_TYPE.equals(pluginType) && pipeline.getSources().contains(stageName)
    ? ConnectorSourceEmitter.builder(stageName)
    : PipeEmitter.builder(stageName);
Map<String, StageSpec.Port> outputPorts = stageSpec.getOutputPorts();
for (String outputStageName : pipeline.getDag().getNodeOutputs(stageName)) {
StageSpec outputStageSpec = pipeline.getStage(outputStageName);
String outputStageType = outputStageSpec.getPluginType();
PipeStage outputPipeStage = pipeStages.get(outputStageName);
if (ErrorTransform.PLUGIN_TYPE.equals(outputStageType)) {
emitterBuilder.addErrorConsumer(outputPipeStage);
} else if (AlertPublisher.PLUGIN_TYPE.equals(outputStageType)) {
emitterBuilder.addAlertConsumer(outputPipeStage);
} else if (Constants.Connector.PLUGIN_TYPE.equals(pluginType)) {
// connectors only have a single output
emitterBuilder.addOutputConsumer(outputPipeStage);
} else {
// if the output is a connector like agg5.connector, the outputPorts will contain the original 'agg5' as
// a key, but not 'agg5.connector', so we need to look up the original stage from the connector's plugin spec
String originalOutputName = Constants.Connector.PLUGIN_TYPE.equals(outputStageType)
  ? outputStageSpec.getPlugin().getProperties().get(Constants.Connector.ORIGINAL_NAME)
  : outputStageName;
String port = outputPorts.containsKey(originalOutputName) ? outputPorts.get(originalOutputName).getPort() : null;
if (port != null) {
emitterBuilder.addOutputConsumer(outputPipeStage, port);
} else {
emitterBuilder.addOutputConsumer(outputPipeStage);
}
}
}
PipeEmitter pipeEmitter = emitterBuilder.build();
if (SplitterTransform.PLUGIN_TYPE.equals(pluginType)) {
// this is a SplitterTransform; it needs to emit records to the right outputs based on port
return new MultiOutputTransformPipeStage<>(stageName, getMultiOutputTransform(stageSpec), pipeEmitter);
} else {
return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec), pipeEmitter);
}
}
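A hedged sketch (an assumption about the caller, not the factory's actual code) of how getPipeStage could be driven: walking the dag in reverse topological order guarantees that every downstream PipeStage already exists in pipeStages when its upstream stage is built.

// Assumed to run inside a method that declares 'throws Exception', with 'pipeline' and
// 'outputWriter' available as in getPipeStage above.
Map<String, PipeStage> pipeStages = new HashMap<>();
List<String> traversalOrder = new ArrayList<>(pipeline.getDag().getTopologicalOrder());
Collections.reverse(traversalOrder);
for (String stageName : traversalOrder) {
  pipeStages.put(stageName, getPipeStage(pipeline, stageName, pipeStages, outputWriter));
}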