Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.
In the class PipelineSpecGenerator, method configureStage.
/**
 * Configures a single stage and packages the result as a {@link ConfiguredStage}.
 *
 * <p>Once the plugin itself has been configured, the outputs of the stage are registered on the
 * spec builder:
 * <ul>
 *   <li>a splitter stage registers every output with its port and the schema recorded for that port;</li>
 *   <li>a condition stage registers every output with the schema of its inputs, which must all
 *       match (record names are ignored in the comparison);</li>
 *   <li>any other stage registers every output with the output schema held by the stage configurer.</li>
 * </ul>
 *
 * @param stage the user provided configuration for the stage
 * @param validatedPipeline the validated pipeline config
 * @param pluginConfigurer configurer used to configure the stage
 * @return the built stage spec together with the pipeline properties held by the configurer
 * @throws ValidationException if the plugin threw an exception during configuration
 * @throws IllegalArgumentException if a splitter output does not name a port, or if a condition
 *     stage receives inputs whose schemas differ
 */
protected ConfiguredStage configureStage(ETLStage stage, ValidatedPipeline validatedPipeline, DefaultPipelineConfigurer pluginConfigurer) throws ValidationException {
  final String name = stage.getName();
  final ETLPlugin plugin = stage.getPlugin();
  final StageSpec.Builder builder = configureStage(name, plugin, pluginConfigurer);
  final DefaultStageConfigurer configurer = pluginConfigurer.getStageConfigurer();
  final String type = stage.getPlugin().getType();

  if (!type.equals(SplitterTransform.PLUGIN_TYPE)) {
    Schema schemaForOutputs = configurer.getOutputSchema();
    if (Condition.PLUGIN_TYPE.equals(type)) {
      // For conditions the configurer's output schema is discarded; the schema shared by the
      // (non-null) input schemas is forwarded instead.
      // todo: fix this cleanly and fully
      Schema agreed = null;
      for (Schema candidate : configurer.getInputSchemas().values()) {
        if (candidate == null) {
          continue;
        }
        if (agreed != null && !Schemas.equalsIgnoringRecordName(agreed, candidate)) {
          throw new IllegalArgumentException("Cannot have different input schemas going into stage " + name);
        }
        agreed = candidate;
      }
      schemaForOutputs = agreed;
    }
    // every output of a single-port stage sees the same schema, and no port is recorded
    for (String target : validatedPipeline.getOutputs(name)) {
      builder.addOutput(target, null, schemaForOutputs);
    }
  } else {
    // splitters: each outgoing connection must name the port it reads from
    Map<String, Schema> schemasByPort = configurer.getOutputPortSchemas();
    for (Map.Entry<String, String> link : validatedPipeline.getOutputPorts(name).entrySet()) {
      String target = link.getKey();
      String port = link.getValue();
      if (port == null) {
        throw new IllegalArgumentException(String.format("Connection from Splitter '%s' to '%s' must specify a port.", name, target));
      }
      builder.addOutput(target, port, schemasByPort.get(port));
    }
  }

  // copy the pipeline-wide switches onto the stage before building it
  StageSpec built = builder
    .setProcessTimingEnabled(validatedPipeline.isProcessTimingEnabled())
    .setStageLoggingEnabled(validatedPipeline.isStageLoggingEnabled())
    .setMaxPreviewRecords(validatedPipeline.getMaxPreviewRecords())
    .build();
  return new ConfiguredStage(built, pluginConfigurer.getPipelineProperties());
}
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.
In the class PipelinePlanner, method dagToPipeline.
/**
 * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
 * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
 *
 * <p>Nodes are visited in topological order. A node is emitted as a connector stage when it has an
 * entry in {@code connectors} or appears as a value in {@code conditionConnectors}; every other
 * node is emitted with its spec from {@code specs}.
 *
 * @param dag the dag to convert
 * @param connectors connector nodes across all dags, mapping a connector's stage name to its original name
 * @param specs specifications for every stage, keyed by stage name
 * @param conditionConnectors map whose values are the names of stages that must also be treated as connectors
 * @return the converted dag
 */
private PipelinePhase dagToPipeline(Dag dag, Map<String, String> connectors, Map<String, StageSpec> specs, Map<String, String> conditionConnectors) {
  PipelinePhase.Builder builder = PipelinePhase.builder(supportedPluginTypes);

  for (String node : dag.getTopologicalOrder()) {
    Set<String> downstream = dag.getNodeOutputs(node);
    if (!downstream.isEmpty()) {
      builder.addConnections(node, downstream);
    }

    String mappedName = connectors.get(node);
    boolean isConnector = mappedName != null || conditionConnectors.containsValue(node);
    if (!isConnector) {
      // regular plugin: reuse the spec that was generated for it
      builder.addStage(specs.get(node));
      continue;
    }

    // a connector that starts the dag reads data back; anywhere else it writes data out
    String connectorType;
    if (dag.getSources().contains(node)) {
      connectorType = Constants.Connector.SOURCE_TYPE;
    } else {
      connectorType = Constants.Connector.SINK_TYPE;
    }
    // connectors that come only from conditionConnectors have no mapping, so they keep their own name
    String originalName = mappedName == null ? node : mappedName;
    PluginSpec connectorSpec = new PluginSpec(
      Constants.Connector.PLUGIN_TYPE,
      "connector",
      ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, originalName, Constants.Connector.TYPE, connectorType),
      null);
    builder.addStage(StageSpec.builder(node, connectorSpec).build());
  }

  return builder.build();
}
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.
In the class PipelinePlanner, method plan.
/**
* Create an execution plan for the given logical pipeline. This is used for batch pipelines.
* Though it may eventually be useful to mark windowing points for realtime pipelines.
*
* A plan consists of one or more phases, with connections between phases.
* A connection between a phase indicates control flow, and not necessarily
* data flow. This class assumes that it receives a valid pipeline spec.
* That is, the pipeline has no cycles, all its nodes have unique names,
* sources don't have any input, sinks don't have any output,
* everything else has both an input and an output, etc.
*
* We start by inserting connector nodes into the logical dag,
* which are used to mark boundaries between mapreduce jobs.
* Each connector represents a node where we will need to write to a local dataset.
*
* Next, the logical pipeline is broken up into phases,
* using the connectors as sinks in one phase, and a source in another.
* After this point, connections between phases do not indicate data flow, but control flow.
*
* Control nodes (actions and conditions) always get a phase of their own, named after the node.
*
* @param spec the pipeline spec, representing a logical pipeline
* @return the execution plan
* @throws IllegalStateException if the spec has no connections but contains a stage that is not an action
*/
public PipelinePlan plan(PipelineSpec spec) {
// go through the stages and examine their plugin type to determine which stages are reduce stages
Set<String> reduceNodes = new HashSet<>();
Set<String> isolationNodes = new HashSet<>();
Set<String> actionNodes = new HashSet<>();
Set<String> multiPortNodes = new HashSet<>();
Set<String> allNodes = new HashSet<>();
// Map to hold the connection information from condition nodes to the first stage
// they connect to. Condition information also includes whether the stage is connected
// on the 'true' branch or the 'false' branch
Map<String, ConditionBranches> conditionBranches = new HashMap<>();
// direct outputs and direct inputs of every condition stage, keyed by the condition's name
Map<String, Set<String>> conditionOutputs = new HashMap<>();
Map<String, Set<String>> conditionInputs = new HashMap<>();
// stage name -> stage spec, for every stage in the pipeline
Map<String, StageSpec> specs = new HashMap<>();
// classify every stage by plugin type; a stage may land in more than one of the sets
for (StageSpec stage : spec.getStages()) {
String pluginType = stage.getPlugin().getType();
allNodes.add(stage.getName());
if (reduceTypes.contains(pluginType)) {
reduceNodes.add(stage.getName());
}
if (isolationTypes.contains(pluginType)) {
isolationNodes.add(stage.getName());
}
if (actionTypes.contains(pluginType)) {
// Collect all Action nodes from spec
actionNodes.add(stage.getName());
}
if (multiPortTypes.contains(pluginType)) {
multiPortNodes.add(stage.getName());
}
if (Condition.PLUGIN_TYPE.equals(pluginType)) {
// branches start out unknown; they are filled in while walking the connections below
conditionBranches.put(stage.getName(), new ConditionBranches(null, null));
conditionOutputs.put(stage.getName(), new HashSet<String>());
conditionInputs.put(stage.getName(), new HashSet<String>());
}
specs.put(stage.getName(), stage);
}
// Special case for action nodes when there is no connection between them
if (spec.getConnections().isEmpty()) {
// All nodes should be actions
if (!actionNodes.containsAll(allNodes)) {
throw new IllegalStateException("No connections are specified.");
}
// every action becomes its own single-stage phase; the plan has no phase connections
Map<String, PipelinePhase> phases = new HashMap<>();
for (String actionNode : actionNodes) {
PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
PipelinePhase actionPhase = phaseBuilder.addStage(specs.get(actionNode)).build();
phases.put(actionNode, actionPhase);
}
return new PipelinePlan(phases, new HashSet<Connection>());
}
// Set representing control nodes (Conditions and Actions)
Set<String> controlNodes = Sets.union(actionNodes, conditionBranches.keySet());
// for chained conditions: downstream condition -> the condition directly feeding it
Map<String, String> conditionChildToParent = new HashMap<>();
// walk every connection once to record, for each condition stage, its direct outputs and inputs,
// its parent condition (when conditions are chained) and its true/false branch outputs
for (Connection connection : spec.getConnections()) {
if (conditionBranches.containsKey(connection.getFrom())) {
conditionOutputs.get(connection.getFrom()).add(connection.getTo());
}
if (conditionBranches.containsKey(connection.getTo())) {
conditionInputs.get(connection.getTo()).add(connection.getFrom());
}
if (conditionBranches.containsKey(connection.getFrom())) {
if (conditionBranches.containsKey(connection.getTo())) {
// conditions are chained
conditionChildToParent.put(connection.getTo(), connection.getFrom());
}
// Outgoing connection from condition
ConditionBranches branches = conditionBranches.get(connection.getFrom());
String trueOutput;
String falseOutput;
// set the branch this connection is on and carry over the other branch recorded so far
// NOTE(review): if getCondition() returns a boxed Boolean, a null value would fail on unboxing
// here - confirm that validation guarantees a condition on every connection leaving a condition stage
if (connection.getCondition()) {
trueOutput = connection.getTo();
falseOutput = branches.getFalseOutput();
} else {
trueOutput = branches.getTrueOutput();
falseOutput = connection.getTo();
}
conditionBranches.put(connection.getFrom(), new ConditionBranches(trueOutput, falseOutput));
}
}
// NOTE(review): connectorNodes starts empty and is handed to split(); presumably split() fills it
// with the connectors it inserts - confirm against split()
Map<String, String> connectorNodes = new HashMap<>();
// now split the logical pipeline into pipeline phases, using the connectors as split points
Set<Dag> splittedDag = split(spec.getConnections(), conditionBranches.keySet(), reduceNodes, isolationNodes, actionNodes, multiPortNodes, connectorNodes);
Map<String, String> controlConnectors = getConnectorsAssociatedWithConditions(conditionBranches.keySet(), conditionChildToParent, conditionInputs, conditionOutputs, actionNodes);
// index the subdags by their phase name
Map<String, Dag> subdags = new HashMap<>();
for (Dag subdag : splittedDag) {
subdags.put(getPhaseName(subdag), subdag);
}
// build connections between phases and convert dags to PipelinePhase.
Set<Connection> phaseConnections = new HashSet<>();
Map<String, PipelinePhase> phases = new HashMap<>();
for (Map.Entry<String, Dag> dagEntry1 : subdags.entrySet()) {
String dag1Name = dagEntry1.getKey();
Dag dag1 = dagEntry1.getValue();
// convert the dag to a PipelinePhase
// add a separate pipeline phase for each control node in the subdag
Set<String> dag1ControlNodes = Sets.intersection(controlNodes, dag1.getNodes());
for (String dag1ControlNode : dag1ControlNodes) {
// a control node can show up in several subdags; only create its phase once
if (!phases.containsKey(dag1ControlNode)) {
phases.put(dag1ControlNode, PipelinePhase.builder(supportedPluginTypes).addStage(specs.get(dag1ControlNode)).build());
}
}
// if there are non-control nodes in the subdag, add a pipeline phase for it
if (!controlNodes.containsAll(dag1.getNodes())) {
// the updated dag replaces conditions with the corresponding connector if applicable.
Dag updatedDag = getUpdatedDag(dag1, controlConnectors);
// Remove any control nodes from this dag
if (!Sets.intersection(updatedDag.getNodes(), controlNodes).isEmpty()) {
Set<String> nodes = Sets.difference(updatedDag.getNodes(), controlNodes);
updatedDag = updatedDag.createSubDag(nodes);
}
phases.put(dag1Name, dagToPipeline(updatedDag, connectorNodes, specs, controlConnectors));
}
// from the control phase to this dag (or to another control phase)
for (String controlSource : Sets.intersection(controlNodes, dag1.getSources())) {
ConditionBranches branches = conditionBranches.get(controlSource);
// null when the control source is not a condition; otherwise true exactly when this subdag
// contains the condition's true output
Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
for (String output : dag1.getNodeOutputs(controlSource)) {
if (controlNodes.contains(output)) {
// control source -> control node, add a phase connection between the control phases
phaseConnections.add(new Connection(controlSource, output, condition));
} else {
// control source -> non-control nodes, add a phase connection from the control phase to this dag
phaseConnections.add(new Connection(controlSource, dag1Name, condition));
}
}
}
// from this dag to the control phase
for (String controlSink : Sets.intersection(controlNodes, dag1.getSinks())) {
for (String input : dag1.getNodeInputs(controlSink)) {
if (controlNodes.contains(input)) {
// control node -> control-sink, add a phase connection between the control phases
ConditionBranches branches = conditionBranches.get(input);
// same rule as above: null unless the input is a condition
Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
phaseConnections.add(new Connection(input, controlSink, condition));
} else {
// non-control node -> control-sink, add a phase connection from this dag to the control phase
phaseConnections.add(new Connection(dag1Name, controlSink));
}
}
}
// find connected subdags (they have a source that is a sink in dag1)
Set<String> nonControlSinks = Sets.difference(dag1.getSinks(), controlNodes);
for (Map.Entry<String, Dag> dagEntry2 : subdags.entrySet()) {
String dag2Name = dagEntry2.getKey();
Dag dag2 = dagEntry2.getValue();
// a subdag is never connected to itself
if (dag1Name.equals(dag2Name)) {
continue;
}
if (!Sets.intersection(nonControlSinks, dag2.getSources()).isEmpty()) {
phaseConnections.add(new Connection(dag1Name, dag2Name));
}
}
}
return new PipelinePlan(phases, phaseConnections);
}
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.
In the class SparkStreamingPipelineRunner, method handleJoin.
/**
 * Resolves the joiner for a join stage, initializes it and delegates to the joiner-based
 * {@code handleJoin} overload.
 *
 * <p>A {@link BatchAutoJoiner} is asked for its join definition, using the output schemas of the
 * stage's inputs, and is then wrapped in a {@link JoinerBridge}; a {@link BatchJoiner} is used as is.
 * The stage name is added to {@code shufflers} before delegating.
 *
 * @throws IllegalStateException if an auto-joiner returns no join definition, or if the plugin is
 *     neither kind of joiner
 */
@Override
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections, PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext, StageSpec stageSpec, FunctionCache.Factory functionCacheFactory, Object plugin, Integer numPartitions, StageStatisticsCollector collector, Set<String> shufflers) throws Exception {
  final String name = stageSpec.getName();
  final BatchJoiner<?, ?, ?> resolvedJoiner;

  if (plugin instanceof BatchAutoJoiner) {
    BatchAutoJoiner auto = (BatchAutoJoiner) plugin;

    // the auto-joiner needs to know what each of its inputs produces
    Map<String, Schema> schemasByInput = new HashMap<>();
    for (String upstream : pipelinePhase.getStageInputs(name)) {
      schemasByInput.put(upstream, pipelinePhase.getStage(upstream).getOutputSchema());
    }

    FailureCollector failures = new LoggingFailureCollector(name, schemasByInput);
    AutoJoinerContext joinContext = DefaultAutoJoinerContext.from(schemasByInput, failures);
    // NOTE(review): failures are checked after the context is built but before define() runs, so
    // anything define() reports to the collector is not rethrown here - confirm this is intended
    failures.getOrThrowException();

    JoinDefinition definition = auto.define(joinContext);
    if (definition == null) {
      throw new IllegalStateException(String.format("Joiner stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", name));
    }
    resolvedJoiner = new JoinerBridge(name, auto, definition);
  } else if (plugin instanceof BatchJoiner) {
    resolvedJoiner = (BatchJoiner<?, ?, ?>) plugin;
  } else {
    // should never happen unless there is a bug in the code. should have failed during deployment
    throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", name, plugin.getClass().getName()));
  }

  BatchJoinerRuntimeContext runtimeContext = pluginFunctionContext.createBatchRuntimeContext();
  resolvedJoiner.initialize(runtimeContext);
  shufflers.add(name);
  return handleJoin(resolvedJoiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
}
Use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.
In the class PipelineAction, method run.
/**
 * Runs the action plugin of this pipeline phase and copies the arguments it added into the
 * workflow token.
 *
 * <p>The phase spec is read as JSON from the custom action's properties, and the first stage of
 * the phase is taken as the action to run. The action is skipped when the data tracer for the
 * stage is enabled. The workflow token is only looked up after the action has (possibly) run.
 *
 * @throws IllegalStateException if no workflow token is available
 * @throws Exception if instantiating or running the action fails
 */
@Override
public void run() throws Exception {
  final CustomActionContext ctx = getContext();
  final Map<String, String> specProperties = ctx.getSpecification().getProperties();
  final BatchPhaseSpec batchSpec = GSON.fromJson(specProperties.get(Constants.PIPELINEID), BatchPhaseSpec.class);

  // an action phase is expected to hold exactly the action stage; take the first one
  final StageSpec actionStage = batchSpec.getPhase().iterator().next();
  final String actionName = actionStage.getName();

  PluginContext plugins = new PipelinePluginContext(ctx, metrics, batchSpec.isStageLoggingEnabled(), batchSpec.isProcessTimingEnabled());
  PipelineRuntime runtime = new PipelineRuntime(ctx, metrics);

  DefaultMacroEvaluator macroEvaluator = new DefaultMacroEvaluator(runtime.getArguments(), ctx.getLogicalStartTime(), ctx, ctx, ctx.getNamespace());
  Action action = plugins.newPluginInstance(actionName, macroEvaluator);
  ActionContext actionContext = new BasicActionContext(ctx, runtime, actionStage);

  boolean tracerEnabled = ctx.getDataTracer(actionName).isEnabled();
  if (!tracerEnabled) {
    action.run(actionContext);
  }

  WorkflowToken workflowToken = ctx.getWorkflowToken();
  if (workflowToken == null) {
    throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
  }

  // publish whatever arguments were added to the runtime arguments
  Map<String, String> added = runtime.getArguments().getAddedArguments();
  for (Map.Entry<String, String> argument : added.entrySet()) {
    workflowToken.put(argument.getKey(), argument.getValue());
  }
}
Aggregations