Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
The class PipelinePlanner, method plan.
/**
* Create an execution plan for the given logical pipeline. This is used for batch pipelines,
* though it may eventually be useful to mark windowing points for realtime pipelines.
*
* A plan consists of one or more phases, with connections between phases.
* A connection between phases indicates control flow, not necessarily
* data flow. This class assumes that it receives a valid pipeline spec.
* That is, the pipeline has no cycles, all its nodes have unique names,
* sources don't have any input, sinks don't have any output,
* and everything else has both an input and an output.
*
* We start by inserting connector nodes into the logical dag,
* which are used to mark boundaries between mapreduce jobs.
* Each connector represents a node where we will need to write to a local dataset.
*
* Next, the logical pipeline is broken up into phases,
* using each connector as a sink in one phase and as a source in another.
* After this point, connections between phases indicate control flow, not data flow.
*
* @param spec the pipeline spec, representing a logical pipeline
* @return the execution plan
*/
public PipelinePlan plan(PipelineSpec spec) {
// go through the stages and examine their plugin type to determine which stages are reduce stages
Set<String> reduceNodes = new HashSet<>();
Set<String> isolationNodes = new HashSet<>();
Set<String> actionNodes = new HashSet<>();
Set<String> multiPortNodes = new HashSet<>();
Set<String> allNodes = new HashSet<>();
// Map to hold the connection information from condition nodes to the first stage
// they connect to. Condition information also includes whether the stage is connected
// on the 'true' branch or the 'false' branch
Map<String, ConditionBranches> conditionBranches = new HashMap<>();
Map<String, Set<String>> conditionOutputs = new HashMap<>();
Map<String, Set<String>> conditionInputs = new HashMap<>();
Map<String, StageSpec> specs = new HashMap<>();
for (StageSpec stage : spec.getStages()) {
String pluginType = stage.getPlugin().getType();
allNodes.add(stage.getName());
if (reduceTypes.contains(pluginType)) {
reduceNodes.add(stage.getName());
}
if (isolationTypes.contains(pluginType)) {
isolationNodes.add(stage.getName());
}
if (actionTypes.contains(pluginType)) {
// Collect all Action nodes from spec
actionNodes.add(stage.getName());
}
if (multiPortTypes.contains(pluginType)) {
multiPortNodes.add(stage.getName());
}
if (Condition.PLUGIN_TYPE.equals(pluginType)) {
conditionBranches.put(stage.getName(), new ConditionBranches(null, null));
conditionOutputs.put(stage.getName(), new HashSet<String>());
conditionInputs.put(stage.getName(), new HashSet<String>());
}
specs.put(stage.getName(), stage);
}
// Special case for action nodes when there is no connection between them
if (spec.getConnections().isEmpty()) {
// All nodes should be actions
if (!actionNodes.containsAll(allNodes)) {
throw new IllegalStateException("No connections are specified, but the pipeline contains non-action nodes.");
}
Map<String, PipelinePhase> phases = new HashMap<>();
for (String actionNode : actionNodes) {
PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
PipelinePhase actionPhase = phaseBuilder.addStage(specs.get(actionNode)).build();
phases.put(actionNode, actionPhase);
}
return new PipelinePlan(phases, new HashSet<Connection>());
}
// Set representing control nodes (Conditions and Actions)
Set<String> controlNodes = Sets.union(actionNodes, conditionBranches.keySet());
Map<String, String> conditionChildToParent = new HashMap<>();
for (Connection connection : spec.getConnections()) {
if (conditionBranches.containsKey(connection.getFrom())) {
conditionOutputs.get(connection.getFrom()).add(connection.getTo());
}
if (conditionBranches.containsKey(connection.getTo())) {
conditionInputs.get(connection.getTo()).add(connection.getFrom());
}
if (conditionBranches.containsKey(connection.getFrom())) {
if (conditionBranches.containsKey(connection.getTo())) {
// conditions are chained
conditionChildToParent.put(connection.getTo(), connection.getFrom());
}
// Outgoing connection from condition
ConditionBranches branches = conditionBranches.get(connection.getFrom());
String trueOutput;
String falseOutput;
if (connection.getCondition()) {
trueOutput = connection.getTo();
falseOutput = branches.getFalseOutput();
} else {
trueOutput = branches.getTrueOutput();
falseOutput = connection.getTo();
}
conditionBranches.put(connection.getFrom(), new ConditionBranches(trueOutput, falseOutput));
}
}
Map<String, String> connectorNodes = new HashMap<>();
// now split the logical pipeline into pipeline phases, using the connectors as split points
Set<Dag> splittedDag = split(spec.getConnections(), conditionBranches.keySet(), reduceNodes,
                             isolationNodes, actionNodes, multiPortNodes, connectorNodes);
Map<String, String> controlConnectors = getConnectorsAssociatedWithConditions(
    conditionBranches.keySet(), conditionChildToParent, conditionInputs, conditionOutputs, actionNodes);
Map<String, Dag> subdags = new HashMap<>();
for (Dag subdag : splittedDag) {
subdags.put(getPhaseName(subdag), subdag);
}
// build connections between phases and convert dags to PipelinePhase.
Set<Connection> phaseConnections = new HashSet<>();
Map<String, PipelinePhase> phases = new HashMap<>();
for (Map.Entry<String, Dag> dagEntry1 : subdags.entrySet()) {
String dag1Name = dagEntry1.getKey();
Dag dag1 = dagEntry1.getValue();
// convert the dag to a PipelinePhase
// add a separate pipeline phase for each control node in the subdag
Set<String> dag1ControlNodes = Sets.intersection(controlNodes, dag1.getNodes());
for (String dag1ControlNode : dag1ControlNodes) {
if (!phases.containsKey(dag1ControlNode)) {
phases.put(dag1ControlNode,
           PipelinePhase.builder(supportedPluginTypes).addStage(specs.get(dag1ControlNode)).build());
}
}
// if there are non-control nodes in the subdag, add a pipeline phase for it
if (!controlNodes.containsAll(dag1.getNodes())) {
// the updated dag replaces conditions with the corresponding connector if applicable.
Dag updatedDag = getUpdatedDag(dag1, controlConnectors);
// Remove any control nodes from this dag
if (!Sets.intersection(updatedDag.getNodes(), controlNodes).isEmpty()) {
Set<String> nodes = Sets.difference(updatedDag.getNodes(), controlNodes);
updatedDag = updatedDag.createSubDag(nodes);
}
phases.put(dag1Name, dagToPipeline(updatedDag, connectorNodes, specs, controlConnectors));
}
for (String controlSource : Sets.intersection(controlNodes, dag1.getSources())) {
ConditionBranches branches = conditionBranches.get(controlSource);
Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
for (String output : dag1.getNodeOutputs(controlSource)) {
if (controlNodes.contains(output)) {
// control source -> control node, add a phase connection between the control phases
phaseConnections.add(new Connection(controlSource, output, condition));
} else {
// control source -> non-control nodes, add a phase connection from the control phase to this dag
phaseConnections.add(new Connection(controlSource, dag1Name, condition));
}
}
}
// add phase connections from this dag (or its control nodes) to each control sink's phase
for (String controlSink : Sets.intersection(controlNodes, dag1.getSinks())) {
for (String input : dag1.getNodeInputs(controlSink)) {
if (controlNodes.contains(input)) {
// control node -> control-sink, add a phase connection between the control phases
ConditionBranches branches = conditionBranches.get(input);
Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
phaseConnections.add(new Connection(input, controlSink, condition));
} else {
// non-control node -> control-sink, add a phase connection from this dag to the control phase
phaseConnections.add(new Connection(dag1Name, controlSink));
}
}
}
// find connected subdags (they have a source that is a sink in dag1)
Set<String> nonControlSinks = Sets.difference(dag1.getSinks(), controlNodes);
for (Map.Entry<String, Dag> dagEntry2 : subdags.entrySet()) {
String dag2Name = dagEntry2.getKey();
Dag dag2 = dagEntry2.getValue();
if (dag1Name.equals(dag2Name)) {
continue;
}
if (!Sets.intersection(nonControlSinks, dag2.getSources()).isEmpty()) {
phaseConnections.add(new Connection(dag1Name, dag2Name));
}
}
}
return new PipelinePlan(phases, phaseConnections);
}
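For orientation, a minimal usage sketch of the planner follows. The constructor parameters and the PipelinePlan accessors shown here are assumptions inferred from how the fields are used above, not confirmed signatures:
// Hedged sketch: construct the planner with the plugin-type sets referenced in plan()
// (the exact constructor parameter order is an assumption).
PipelinePlanner planner = new PipelinePlanner(supportedPluginTypes, reduceTypes,
                                              isolationTypes, actionTypes, multiPortTypes);
PipelinePlan plan = planner.plan(pipelineSpec);
// Each phase runs as its own job; connections between phases carry control flow only.
for (Connection phaseConnection : plan.getPhaseConnections()) {
  System.out.println(phaseConnection.getFrom() + " -> " + phaseConnection.getTo());
}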
Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
The class Dag, method subsetFrom.
/**
* Return a subset of this dag starting from the specified stage, without going past any node in stopNodes.
* This is equivalent to taking the nodes from {@link #accessibleFrom(Set, Set)} and building a dag from them.
*
* @param stages the stages to start at
* @param stopNodes set of nodes to stop traversal on
* @return a dag created from the nodes accessible from the specified stage
*/
public Dag subsetFrom(Set<String> stages, Set<String> stopNodes) {
Set<String> nodes = accessibleFrom(stages, stopNodes);
Set<Connection> connections = new HashSet<>();
for (String node : nodes) {
for (String outputNode : outgoingConnections.get(node)) {
if (nodes.contains(outputNode)) {
connections.add(new Connection(node, outputNode));
}
}
}
return new Dag(connections);
}
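As a quick illustration of the semantics, a hedged example on a small linear dag (stage names are hypothetical; this assumes, consistent with accessibleFrom, that a stop node is included in the subset but not traversed past):
// dag: n1 --> n2 --> n3 --> n4
Dag dag = new Dag(ImmutableSet.of(
    new Connection("n1", "n2"), new Connection("n2", "n3"), new Connection("n3", "n4")));
// Starting at n2 and stopping at n4 should keep n2, n3, and n4,
// along with the connections n2 --> n3 and n3 --> n4.
Dag subset = dag.subsetFrom(ImmutableSet.of("n2"), ImmutableSet.of("n4"));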
Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
The class ETLRealtimeConfigTest, method testUpgrade.
@Test
public void testUpgrade() throws Exception {
ETLStage source = new ETLStage("DataGenerator", ImmutableMap.of("p1", "v1"), null);
co.cask.cdap.etl.proto.v1.ETLStage sourceNew = new co.cask.cdap.etl.proto.v1.ETLStage(
    "DataGenerator.1", new Plugin(source.getName(), source.getProperties()), source.getErrorDatasetName());
ETLStage transform1 = new ETLStage("Script", ImmutableMap.of("script", "something"), null);
co.cask.cdap.etl.proto.v1.ETLStage transform1New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Script.2", new Plugin(transform1.getName(), transform1.getProperties()), transform1.getErrorDatasetName());
ETLStage transform2 = new ETLStage("Script", null, null);
co.cask.cdap.etl.proto.v1.ETLStage transform2New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Script.3", new Plugin(transform2.getName(), transform2.getProperties()), transform2.getErrorDatasetName());
ETLStage transform3 = new ETLStage("Validator", ImmutableMap.of("p1", "v1", "p2", "v2"), "errorDS");
co.cask.cdap.etl.proto.v1.ETLStage transform3New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Validator.4", new Plugin(transform3.getName(), transform3.getProperties()), transform3.getErrorDatasetName());
ETLStage sink1 = new ETLStage("Table", ImmutableMap.of("rowkey", "xyz"), null);
co.cask.cdap.etl.proto.v1.ETLStage sink1New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Table.5", new Plugin(sink1.getName(), sink1.getProperties()), sink1.getErrorDatasetName());
ETLStage sink2 = new ETLStage("HDFS", ImmutableMap.of("name", "abc"), null);
co.cask.cdap.etl.proto.v1.ETLStage sink2New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "HDFS.6", new Plugin(sink2.getName(), sink2.getProperties()), sink2.getErrorDatasetName());
List<Connection> connections = new ArrayList<>();
connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
connections.add(new Connection(transform1New.getName(), transform2New.getName()));
connections.add(new Connection(transform2New.getName(), transform3New.getName()));
connections.add(new Connection(transform3New.getName(), sink1New.getName()));
connections.add(new Connection(transform3New.getName(), sink2New.getName()));
Resources resources = new Resources(1024, 1);
ETLRealtimeConfig config = new ETLRealtimeConfig(1, source, ImmutableList.of(sink1, sink2),
                                                 ImmutableList.of(transform1, transform2, transform3), resources);
co.cask.cdap.etl.proto.v1.ETLRealtimeConfig configNew = co.cask.cdap.etl.proto.v1.ETLRealtimeConfig.builder()
    .setInstances(1)
    .setSource(sourceNew)
    .addSink(sink1New)
    .addSink(sink2New)
    .addTransform(transform1New)
    .addTransform(transform2New)
    .addTransform(transform3New)
    .addConnections(connections)
    .setResources(resources)
    .build();
Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {
@Nullable
@Override
public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
return null;
}
}));
}
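The anonymous UpgradeContext above simply reports that no artifact information is available. Pulled out as a named constant, it would look like the following sketch (the name NO_ARTIFACT_CONTEXT is hypothetical):
// Hypothetical named version of the anonymous UpgradeContext used in the test:
// it resolves no artifacts, so upgraded plugins carry no artifact selector.
private static final UpgradeContext NO_ARTIFACT_CONTEXT = new UpgradeContext() {
  @Nullable
  @Override
  public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
    return null;
  }
};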
Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
The class ETLBatchConfigTest, method testUpgrade.
@Test
public void testUpgrade() throws Exception {
final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
ETLStage source = new ETLStage("DataGenerator", ImmutableMap.of("p1", "v1"), null);
co.cask.cdap.etl.proto.v1.ETLStage sourceNew = new co.cask.cdap.etl.proto.v1.ETLStage(
    "DataGenerator.1", new Plugin(source.getName(), source.getProperties(), artifact), source.getErrorDatasetName());
ETLStage transform1 = new ETLStage("Script", ImmutableMap.of("script", "something"), null);
co.cask.cdap.etl.proto.v1.ETLStage transform1New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Script.2", new Plugin(transform1.getName(), transform1.getProperties(), artifact), transform1.getErrorDatasetName());
ETLStage transform2 = new ETLStage("Script", null, null);
co.cask.cdap.etl.proto.v1.ETLStage transform2New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Script.3", new Plugin(transform2.getName(), transform2.getProperties(), artifact), transform2.getErrorDatasetName());
ETLStage transform3 = new ETLStage("Validator", ImmutableMap.of("p1", "v1", "p2", "v2"), "errorDS");
co.cask.cdap.etl.proto.v1.ETLStage transform3New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Validator.4", new Plugin(transform3.getName(), transform3.getProperties(), artifact), transform3.getErrorDatasetName());
ETLStage sink1 = new ETLStage("Table", ImmutableMap.of("rowkey", "xyz"), null);
co.cask.cdap.etl.proto.v1.ETLStage sink1New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Table.5", new Plugin(sink1.getName(), sink1.getProperties(), artifact), sink1.getErrorDatasetName());
ETLStage sink2 = new ETLStage("HDFS", ImmutableMap.of("name", "abc"), null);
co.cask.cdap.etl.proto.v1.ETLStage sink2New = new co.cask.cdap.etl.proto.v1.ETLStage(
    "HDFS.6", new Plugin(sink2.getName(), sink2.getProperties(), artifact), sink2.getErrorDatasetName());
ETLStage action = new ETLStage("Email", ImmutableMap.of("email", "slj@example.com"), null);
co.cask.cdap.etl.proto.v1.ETLStage actionNew = new co.cask.cdap.etl.proto.v1.ETLStage(
    "Email.1", new Plugin(action.getName(), action.getProperties(), artifact), action.getErrorDatasetName());
List<Connection> connections = new ArrayList<>();
connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
connections.add(new Connection(transform1New.getName(), transform2New.getName()));
connections.add(new Connection(transform2New.getName(), transform3New.getName()));
connections.add(new Connection(transform3New.getName(), sink1New.getName()));
connections.add(new Connection(transform3New.getName(), sink2New.getName()));
String schedule = "*/5 * * * *";
Resources resources = new Resources(1024, 1);
ETLBatchConfig config = new ETLBatchConfig(schedule, source, ImmutableList.of(sink1, sink2),
                                           ImmutableList.of(transform1, transform2, transform3), resources,
                                           ImmutableList.of(action));
co.cask.cdap.etl.proto.v1.ETLBatchConfig configNew = co.cask.cdap.etl.proto.v1.ETLBatchConfig.builder(schedule)
    .setSource(sourceNew)
    .addSink(sink1New)
    .addSink(sink2New)
    .addTransform(transform1New)
    .addTransform(transform2New)
    .addTransform(transform3New)
    .addConnections(connections)
    .setResources(resources)
    .setDriverResources(resources)
    .addAction(actionNew)
    .build();
Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {
@Nullable
@Override
public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
return new ArtifactSelectorConfig(ArtifactScope.SYSTEM.name(), "universal", "1.0.0");
}
}));
}
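If only the generated wiring is of interest, the assertion can be narrowed to the connection list. A hedged sketch, assuming the v1 config exposes a getConnections() accessor:
// Hedged sketch: compare just the upgraded connections, ignoring order.
co.cask.cdap.etl.proto.v1.ETLBatchConfig upgraded = config.upgrade(new UpgradeContext() {
  @Nullable
  @Override
  public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
    return new ArtifactSelectorConfig(ArtifactScope.SYSTEM.name(), "universal", "1.0.0");
  }
});
Assert.assertEquals(ImmutableSet.copyOf(connections), ImmutableSet.copyOf(upgraded.getConnections()));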
Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
The class ConnectorDagTest, method testSplitDag.
@Test
public void testSplitDag() {
/*
         |--- n2(r) ----------|
         |                    |                                    |-- n10
    n1 --|--- n3(r) --- n5 ---|--- n6 --- n7(r) --- n8 --- n9(r) --|
         |                    |                                    |-- n11
         |--- n4(r) ----------|

   There should be a connector after n1, before n7, and before n9. This should result in subdags:

     n1 --> n1.out.connector
     n1.out.connector --> n2(r) --> n6 --> n7.connector
     n1.out.connector --> n3(r) --> n5 --> n6 --> n7.connector
     n1.out.connector --> n4(r) --> n6 --> n7.connector
     n7.connector --> n7 --> n8 --> n9.connector

                           |--> n10
     n9.connector --> n9 --|
                           |--> n11
 */
ConnectorDag cdag = ConnectorDag.builder()
    .addConnection("n1", "n2")
    .addConnection("n1", "n3")
    .addConnection("n1", "n4")
    .addConnection("n2", "n6")
    .addConnection("n3", "n5")
    .addConnection("n4", "n6")
    .addConnection("n5", "n6")
    .addConnection("n6", "n7")
    .addConnection("n7", "n8")
    .addConnection("n8", "n9")
    .addConnection("n9", "n10")
    .addConnection("n9", "n11")
    .addReduceNodes("n2", "n3", "n4", "n7", "n9")
    .build();
cdag.insertConnectors();
Set<Dag> actual = new HashSet<>(cdag.split());
Dag dag1 = new Dag(ImmutableSet.of(new Connection("n1", "n1.out.connector")));
Dag dag2 = new Dag(ImmutableSet.of(
    new Connection("n1.out.connector", "n2"), new Connection("n2", "n6"), new Connection("n6", "n7.connector")));
Dag dag3 = new Dag(ImmutableSet.of(
    new Connection("n1.out.connector", "n3"), new Connection("n3", "n5"),
    new Connection("n5", "n6"), new Connection("n6", "n7.connector")));
Dag dag4 = new Dag(ImmutableSet.of(
    new Connection("n1.out.connector", "n4"), new Connection("n4", "n6"), new Connection("n6", "n7.connector")));
Dag dag5 = new Dag(ImmutableSet.of(
    new Connection("n7.connector", "n7"), new Connection("n7", "n8"), new Connection("n8", "n9.connector")));
Dag dag6 = new Dag(ImmutableSet.of(
    new Connection("n9.connector", "n9"), new Connection("n9", "n10"), new Connection("n9", "n11")));
Set<Dag> expected = ImmutableSet.of(dag1, dag2, dag3, dag4, dag5, dag6);
Assert.assertEquals(expected, actual);
/*
         |---> n2(r)
         |       |
    n1 --|       |
         |       v
         |---> n3(r) ---> n4

   n2 and n3 should have connectors inserted in front of them to become:

         |---> n2.connector ---> n2(r)
         |                         |
    n1 --|                         |
         |                         v
         |-------------------> n3.connector ---> n3(r) ---> n4
 */
cdag = ConnectorDag.builder()
    .addConnection("n1", "n2")
    .addConnection("n1", "n3")
    .addConnection("n2", "n3")
    .addConnection("n3", "n4")
    .addReduceNodes("n2", "n3")
    .build();
cdag.insertConnectors();
actual = new HashSet<>(cdag.split());
/*
         |--> n2.connector
    n1 --|
         |--> n3.connector
 */
dag1 = new Dag(ImmutableSet.of(new Connection("n1", "n2.connector"), new Connection("n1", "n3.connector")));
/*
n2.connector --> n2 --> n3.connector
*/
dag2 = new Dag(ImmutableSet.of(new Connection("n2.connector", "n2"), new Connection("n2", "n3.connector")));
/*
n3.connector --> n3 --> n4
*/
dag3 = new Dag(ImmutableSet.of(new Connection("n3.connector", "n3"), new Connection("n3", "n4")));
expected = ImmutableSet.of(dag1, dag2, dag3);
Assert.assertEquals(expected, actual);
/*
   n1 --> n2 --|
               |--> n3(r) --> n4 --|
   n7 --> n8 --|                   |--> n5(r) --> n6
                                   |
   n9 -----------------------------|

   only n5 should have a connector inserted in front of it to become:

   n1 --> n2 --|
               |--> n3(r) --> n4 --|
   n7 --> n8 --|                   |--> n5.connector --> n5(r) --> n6
                                   |
   n9 -----------------------------|
 */
cdag = ConnectorDag.builder()
    .addConnection("n1", "n2")
    .addConnection("n2", "n3")
    .addConnection("n3", "n4")
    .addConnection("n4", "n5")
    .addConnection("n5", "n6")
    .addConnection("n7", "n8")
    .addConnection("n8", "n3")
    .addConnection("n9", "n5")
    .addReduceNodes("n3", "n5")
    .build();
cdag.insertConnectors();
actual = new HashSet<>(cdag.split());
/*
   n1 --> n2 --|
               |--> n3(r) --> n4 --|
   n7 --> n8 --|                   |--> n5.connector
 */
dag1 = new Dag(ImmutableSet.of(
    new Connection("n1", "n2"), new Connection("n2", "n3"), new Connection("n3", "n4"),
    new Connection("n4", "n5.connector"), new Connection("n7", "n8"), new Connection("n8", "n3")));
/*
                                   |--> n5.connector
                                   |
   n9 -----------------------------|
 */
dag2 = new Dag(ImmutableSet.of(new Connection("n9", "n5.connector")));
/*
n5.connector --> n5(r) --> n6
*/
dag3 = new Dag(ImmutableSet.of(new Connection("n5.connector", "n5"), new Connection("n5", "n6")));
expected = ImmutableSet.of(dag1, dag2, dag3);
Assert.assertEquals(expected, actual);
}
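As a baseline for the cases above, a hedged sketch of the degenerate case: with no reduce nodes registered, insertConnectors() should add nothing and split() should return the whole dag as a single subdag.
// Hedged sketch: no reduce nodes means no connectors and a single subdag.
ConnectorDag simple = ConnectorDag.builder()
    .addConnection("n1", "n2")
    .addConnection("n2", "n3")
    .build();
simple.insertConnectors();
Set<Dag> subdags = new HashSet<>(simple.split());
Dag whole = new Dag(ImmutableSet.of(new Connection("n1", "n2"), new Connection("n2", "n3")));
Assert.assertEquals(ImmutableSet.of(whole), subdags);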