use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
the class PipelinePlanner method populateActionPhases.
/**
 * Populates phases and phaseConnections with the Action phases.
 * An Action phase is a single-stage {@link PipelinePhase} which does not have any dag.
 *
 * @param pipelineSpec the overall spec for the pipeline
 * @param specs the Map of stage specs
 * @param actionNodes the Set of action nodes in the pipeline
 * @param phases the Map of phases created so far
 * @param phaseConnections the Set of connections between phases added so far
 * @param outgoingActionConnections the Map that holds the set of stages to which
 *   there is an outgoing connection from an Action stage
 * @param incomingActionConnections the Map that holds the set of stages from which
 *   there is an incoming connection to an Action stage
 * @param subdags subdags created so far from the pipeline stages
 */
private void populateActionPhases(PipelineSpec pipelineSpec, Map<String, StageSpec> specs,
                                  Set<String> actionNodes, Map<String, PipelinePhase> phases,
                                  Set<Connection> phaseConnections,
                                  SetMultimap<String, String> outgoingActionConnections,
                                  SetMultimap<String, String> incomingActionConnections,
                                  Map<String, Dag> subdags) {
  // Create single stage phases for the Action nodes
  for (String node : actionNodes) {
    StageSpec actionStageSpec = specs.get(node);
    String type = actionStageSpec.getPlugin().getType();
    StageInfo actionStageInfo = StageInfo.builder(node, type)
      .addInputs(actionStageSpec.getInputs())
      .addInputSchemas(actionStageSpec.getInputSchemas())
      .addOutputs(actionStageSpec.getOutputs())
      .setOutputSchema(actionStageSpec.getOutputSchema())
      .setErrorSchema(actionStageSpec.getErrorSchema())
      .setErrorDatasetName(actionStageSpec.getErrorDatasetName())
      .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
      .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
      .build();
    phases.put(node, PipelinePhase.builder(supportedPluginTypes).addStage(actionStageInfo).build());
  }
  // Build phaseConnections for the Action nodes
  for (String sourceAction : outgoingActionConnections.keySet()) {
    // Check if the destination is one of the source stages in the pipeline
    for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
      if (!Sets.intersection(outgoingActionConnections.get(sourceAction),
                             subdagEntry.getValue().getSources()).isEmpty()) {
        phaseConnections.add(new Connection(sourceAction, subdagEntry.getKey()));
      }
    }
    // Check if the destination is another Action node
    for (String destination : outgoingActionConnections.get(sourceAction)) {
      if (actionNodes.contains(destination)) {
        phaseConnections.add(new Connection(sourceAction, destination));
      }
    }
  }
  for (String destinationAction : incomingActionConnections.keySet()) {
    // Check if the source is one of the sink stages in the pipeline
    for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
      if (!Sets.intersection(incomingActionConnections.get(destinationAction),
                             subdagEntry.getValue().getSinks()).isEmpty()) {
        phaseConnections.add(new Connection(subdagEntry.getKey(), destinationAction));
      }
    }
  }
}
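For orientation, the two multimaps this method consumes map an Action stage name to the stages it connects to (or from). Below is a minimal sketch of how they could be built from the pipeline's raw connection set; the helper name is hypothetical (not part of PipelinePlanner) and the Connection.getFrom()/getTo() accessors are assumed.

// Hypothetical helper: derive the two action-centric multimaps
// from the raw connection set of the pipeline.
private static void categorizeActionConnections(Set<Connection> connections, Set<String> actionNodes,
                                                SetMultimap<String, String> outgoingActionConnections,
                                                SetMultimap<String, String> incomingActionConnections) {
  for (Connection connection : connections) {
    // Record edges that leave an Action stage...
    if (actionNodes.contains(connection.getFrom())) {
      outgoingActionConnections.put(connection.getFrom(), connection.getTo());
    }
    // ...and edges that enter one.
    if (actionNodes.contains(connection.getTo())) {
      incomingActionConnections.put(connection.getTo(), connection.getFrom());
    }
  }
}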
use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
the class ETLRealtimeConfigTest method testUpgrade.
@Test
public void testUpgrade() throws Exception {
  final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
  ETLStage source = new ETLStage("source",
    new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), "errorDS");
  co.cask.cdap.etl.proto.v2.ETLStage sourceNew = from(source, RealtimeSource.PLUGIN_TYPE);
  ETLStage transform1 = new ETLStage("transform1",
    new Plugin("Script", ImmutableMap.of("script", "something"), null));
  co.cask.cdap.etl.proto.v2.ETLStage transform1New = from(transform1, Transform.PLUGIN_TYPE);
  ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
  co.cask.cdap.etl.proto.v2.ETLStage transform2New = from(transform2, Transform.PLUGIN_TYPE);
  ETLStage transform3 = new ETLStage("transform3",
    new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), "errorDS");
  co.cask.cdap.etl.proto.v2.ETLStage transform3New = from(transform3, Transform.PLUGIN_TYPE);
  ETLStage sink1 = new ETLStage("sink1",
    new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
  co.cask.cdap.etl.proto.v2.ETLStage sink1New = from(sink1, RealtimeSink.PLUGIN_TYPE);
  ETLStage sink2 = new ETLStage("sink2",
    new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);
  co.cask.cdap.etl.proto.v2.ETLStage sink2New = from(sink2, RealtimeSink.PLUGIN_TYPE);
  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
  connections.add(new Connection(transform1New.getName(), transform2New.getName()));
  connections.add(new Connection(transform2New.getName(), transform3New.getName()));
  connections.add(new Connection(transform3New.getName(), sink1New.getName()));
  connections.add(new Connection(transform3New.getName(), sink2New.getName()));
  Resources resources = new Resources(1024, 1);
  ETLRealtimeConfig config = ETLRealtimeConfig.builder()
    .setInstances(1)
    .setSource(source)
    .addSink(sink1)
    .addSink(sink2)
    .addTransform(transform1)
    .addTransform(transform2)
    .addTransform(transform3)
    .addConnections(connections)
    .setResources(resources)
    .build();
  co.cask.cdap.etl.proto.v2.ETLRealtimeConfig configNew = co.cask.cdap.etl.proto.v2.ETLRealtimeConfig.builder()
    .setInstances(1)
    .addStage(sourceNew)
    .addStage(sink1New)
    .addStage(sink2New)
    .addStage(transform1New)
    .addStage(transform2New)
    .addStage(transform3New)
    .addConnections(connections)
    .setResources(resources)
    .build();
  Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {

    @Nullable
    @Override
    public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
      return null;
    }
  }));
}
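The from(...) helper used above converts an old ETLStage into its v2 counterpart but is not shown on this page. A minimal sketch of what such a conversion could look like follows; the constructor shapes of the v2 ETLStage and ETLPlugin classes are assumptions here, not verified against the CDAP source.

// Hypothetical reconstruction of the from(...) helper referenced by the
// test; v2 constructor shapes are assumed, not verified.
private co.cask.cdap.etl.proto.v2.ETLStage from(ETLStage stage, String pluginType) {
  co.cask.cdap.etl.proto.v2.ETLPlugin plugin = new co.cask.cdap.etl.proto.v2.ETLPlugin(
    stage.getPlugin().getName(), pluginType,
    stage.getPlugin().getProperties(), stage.getPlugin().getArtifact());
  return new co.cask.cdap.etl.proto.v2.ETLStage(stage.getName(), plugin, stage.getErrorDatasetName());
}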
use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
the class SmartWorkflow method configure.
@Override
protected void configure() {
  setName(NAME);
  setDescription(DESCRIPTION);
  // set the pipeline spec as a property in case somebody like the UI wants to read it
  Map<String, String> properties = new HashMap<>();
  properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
  setProperties(properties);
  stageSpecs = new HashMap<>();
  useSpark = engine == Engine.SPARK;
  for (StageSpec stageSpec : spec.getStages()) {
    stageSpecs.put(stageSpec.getName(), stageSpec);
    String pluginType = stageSpec.getPlugin().getType();
    if (SparkCompute.PLUGIN_TYPE.equals(pluginType) || SparkSink.PLUGIN_TYPE.equals(pluginType)) {
      useSpark = true;
    }
  }
  PipelinePlanner planner;
  Set<String> actionTypes = ImmutableSet.of(Action.PLUGIN_TYPE, Constants.SPARK_PROGRAM_PLUGIN_TYPE);
  Set<String> multiPortTypes = ImmutableSet.of(SplitterTransform.PLUGIN_TYPE);
  if (useSpark) {
    // if the pipeline uses spark, we don't need to break the pipeline up into phases;
    // we can just have a single phase.
    planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.<String>of(),
                                  ImmutableSet.<String>of(), actionTypes, multiPortTypes);
  } else {
    planner = new PipelinePlanner(supportedPluginTypes,
                                  ImmutableSet.of(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE),
                                  ImmutableSet.of(SparkCompute.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE),
                                  actionTypes, multiPortTypes);
  }
  plan = planner.plan(spec);
  WorkflowProgramAdder programAdder = new TrunkProgramAdder(getConfigurer());
  // single phase, just add the program directly
  if (plan.getPhases().size() == 1) {
    addProgram(plan.getPhases().keySet().iterator().next(), programAdder);
    return;
  }
  // Dag classes don't allow a 'dag' without connections
  if (plan.getPhaseConnections().isEmpty()) {
    WorkflowProgramAdder fork = programAdder.fork();
    for (String phaseName : plan.getPhases().keySet()) {
      addProgram(phaseName, fork);
    }
    fork.join();
    return;
  }
  dag = new ControlDag(plan.getPhaseConnections());
  boolean dummyNodeAdded = false;
  Map<String, ConditionBranches> conditionBranches = plan.getConditionPhaseBranches();
  if (conditionBranches.isEmpty()) {
    // after flattening, there is guaranteed to be just one source
    dag.flatten();
  } else if (!conditionBranches.keySet().containsAll(dag.getSources())) {
    // Continue only if the condition node is not the source of the dag; otherwise the dag
    // is already in the required form
    Set<String> conditions = conditionBranches.keySet();
    // flatten only the part of the dag starting from sources and ending in conditions/sinks.
    Set<String> dagNodes = dag.accessibleFrom(dag.getSources(), Sets.union(dag.getSinks(), conditions));
    Set<String> dagNodesWithoutCondition = Sets.difference(dagNodes, conditions);
    Set<Connection> connections = new HashSet<>();
    Deque<String> bfs = new LinkedList<>();
    Set<String> sinks = new HashSet<>();
    // If it's a single phase without a condition then there is no need to flatten
    if (dagNodesWithoutCondition.size() > 1) {
      Dag subDag;
      try {
        subDag = dag.createSubDag(dagNodesWithoutCondition);
      } catch (IllegalArgumentException | DisjointConnectionsException e) {
        // DisjointConnectionsException thrown when islands are created from the dagNodesWithoutCondition
        // IllegalArgumentException thrown when connections are empty
        // In both cases we need to add dummy node and create connected Dag
        String dummyNode = "dummy";
        dummyNodeAdded = true;
        Set<Connection> subDagConnections = new HashSet<>();
        for (String source : dag.getSources()) {
          subDagConnections.add(new Connection(dummyNode, source));
        }
        Deque<String> subDagBFS = new LinkedList<>();
        subDagBFS.addAll(dag.getSources());
        while (subDagBFS.peek() != null) {
          String node = subDagBFS.poll();
          for (String output : dag.getNodeOutputs(node)) {
            if (dagNodesWithoutCondition.contains(output)) {
              subDagConnections.add(new Connection(node, output));
              subDagBFS.add(output);
            }
          }
        }
        subDag = new Dag(subDagConnections);
      }
      ControlDag cdag = new ControlDag(subDag);
      cdag.flatten();
      // Add all connections from cdag
      bfs.addAll(cdag.getSources());
      while (bfs.peek() != null) {
        String node = bfs.poll();
        for (String output : cdag.getNodeOutputs(node)) {
          connections.add(new Connection(node, output));
          bfs.add(output);
        }
      }
      sinks.addAll(cdag.getSinks());
    } else {
      sinks.addAll(dagNodesWithoutCondition);
    }
    // Add back the existing condition nodes and corresponding conditions
    Set<String> conditionsFromDag = Sets.intersection(dagNodes, conditions);
    for (String condition : conditionsFromDag) {
      connections.add(new Connection(sinks.iterator().next(), condition));
    }
    bfs.addAll(conditionsFromDag);
    while (bfs.peek() != null) {
      String node = bfs.poll();
      ConditionBranches branches = conditionBranches.get(node);
      if (branches == null) {
        // not a condition node. add outputs
        for (String output : dag.getNodeOutputs(node)) {
          connections.add(new Connection(node, output));
          bfs.add(output);
        }
      } else {
        // condition node
        for (Boolean condition : Arrays.asList(true, false)) {
          String phase = condition ? branches.getTrueOutput() : branches.getFalseOutput();
          if (phase == null) {
            continue;
          }
          connections.add(new Connection(node, phase, condition));
          bfs.add(phase);
        }
      }
    }
    dag = new ControlDag(connections);
  }
  if (dummyNodeAdded) {
    WorkflowProgramAdder fork = programAdder.fork();
    String dummyNode = dag.getSources().iterator().next();
    for (String output : dag.getNodeOutputs(dummyNode)) {
      // need to make sure we don't call also() if this is the final branch
      if (!addBranchPrograms(output, fork)) {
        fork = fork.also();
      }
    }
  } else {
    String start = dag.getSources().iterator().next();
    addPrograms(start, programAdder);
  }
}
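To make the dummy-node fallback above concrete, here is a minimal standalone sketch (with toy stage names, not taken from the CDAP source) of how two disjoint phase chains are stitched under a synthetic source so that a single connected Dag can be built and later realized as a fork:

// Two disjoint chains: a1 -> a2 and b1 -> b2. Building a dag over just
// these edges fails as disjoint, so wire a synthetic "dummy" source to
// every root first.
Set<Connection> chains = new HashSet<>();
chains.add(new Connection("a1", "a2"));
chains.add(new Connection("b1", "b2"));

Set<Connection> connected = new HashSet<>(chains);
for (String root : ImmutableSet.of("a1", "b1")) {
  connected.add(new Connection("dummy", root));
}
// One connected dag with a single source ("dummy"); when programs are
// added, the dummy source becomes a workflow fork over its outputs.
Dag dag = new Dag(connected);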
use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
the class ETLBatchConfigTest method testUpgrade.
@Test
public void testUpgrade() throws Exception {
  final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
  ETLStage source = new ETLStage("source",
    new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), null);
  co.cask.cdap.etl.proto.v2.ETLStage sourceNew = from(source, BatchSource.PLUGIN_TYPE);
  ETLStage transform1 = new ETLStage("transform1",
    new Plugin("Script", ImmutableMap.of("script", "something"), null));
  co.cask.cdap.etl.proto.v2.ETLStage transform1New = from(transform1, Transform.PLUGIN_TYPE);
  ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
  co.cask.cdap.etl.proto.v2.ETLStage transform2New = from(transform2, Transform.PLUGIN_TYPE);
  ETLStage transform3 = new ETLStage("transform3",
    new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), null);
  co.cask.cdap.etl.proto.v2.ETLStage transform3New = from(transform3, Transform.PLUGIN_TYPE);
  ETLStage sink1 = new ETLStage("sink1",
    new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
  co.cask.cdap.etl.proto.v2.ETLStage sink1New = from(sink1, BatchSink.PLUGIN_TYPE);
  ETLStage sink2 = new ETLStage("sink2",
    new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);
  co.cask.cdap.etl.proto.v2.ETLStage sink2New = from(sink2, BatchSink.PLUGIN_TYPE);
  Set<Connection> connections = new HashSet<>();
  connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
  connections.add(new Connection(transform1New.getName(), transform2New.getName()));
  connections.add(new Connection(transform2New.getName(), transform3New.getName()));
  connections.add(new Connection(transform3New.getName(), sink1New.getName()));
  connections.add(new Connection(transform3New.getName(), sink2New.getName()));
  String schedule = "*/5 * * * *";
  Resources resources = new Resources(1024, 1);
  ETLBatchConfig config = ETLBatchConfig.builder(schedule)
    .setSource(source)
    .addSink(sink1)
    .addSink(sink2)
    .addTransform(transform1)
    .addTransform(transform2)
    .addTransform(transform3)
    .addConnections(connections)
    .setResources(resources)
    .setDriverResources(resources)
    .build();
  co.cask.cdap.etl.proto.v2.ETLBatchConfig configNew = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder(schedule)
    .addStage(sourceNew)
    .addStage(sink1New)
    .addStage(sink2New)
    .addStage(transform1New)
    .addStage(transform2New)
    .addStage(transform3New)
    .addConnections(connections)
    .setResources(resources)
    .setDriverResources(resources)
    .build();
  Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {

    @Nullable
    @Override
    public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
      return null;
    }
  }));
}
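Both upgrade tests pass a no-op UpgradeContext, so plugin artifacts are left unresolved during the upgrade. For contrast, here is a sketch of a context that pins every plugin to a fixed artifact; the artifact coordinates below are invented for illustration only.

// Illustrative only: a real implementation would consult an artifact
// repository instead of hardcoding coordinates.
UpgradeContext pinningContext = new UpgradeContext() {

  @Nullable
  @Override
  public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
    // Same (scope, name, version) constructor shape used by the tests above.
    return new ArtifactSelectorConfig("SYSTEM", "core-plugins", "1.6.0");
  }
};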
use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.
the class ConnectorDagTest method testConditionDag.
@Test
public void testConditionDag() throws Exception {
  /*
      file - csv - c1 - t1---agg1--agg2---sink1
                   |
                   ----c2 - sink2
                       |
                       ------c3 - sink3
   */
  Set<Connection> connections = ImmutableSet.of(
    new Connection("file", "csv"),
    new Connection("csv", "c1"),
    new Connection("c1", "t1"),
    new Connection("t1", "agg1"),
    new Connection("agg1", "agg2"),
    new Connection("agg2", "sink1"),
    new Connection("c1", "c2"),
    new Connection("c2", "sink2"),
    new Connection("c2", "c3"),
    new Connection("c3", "sink3"));
  Set<String> conditions = new HashSet<>(Arrays.asList("c1", "c2", "c3"));
  Set<String> reduceNodes = new HashSet<>(Arrays.asList("agg1", "agg2"));
  Set<String> isolationNodes = new HashSet<>();
  Set<String> multiPortNodes = new HashSet<>();
  Set<Dag> actual = PipelinePlanner.split(connections, conditions, reduceNodes, isolationNodes,
                                          EMPTY_ACTIONS, multiPortNodes, EMPTY_CONNECTORS);
  Dag dag1 = new Dag(ImmutableSet.of(
    new Connection("file", "csv"),
    new Connection("csv", "c1")));
  Dag dag2 = new Dag(ImmutableSet.of(
    new Connection("c1", "t1"),
    new Connection("t1", "agg1"),
    new Connection("agg1", "agg2.connector")));
  Dag dag3 = new Dag(ImmutableSet.of(
    new Connection("agg2.connector", "agg2"),
    new Connection("agg2", "sink1")));
  Dag dag4 = new Dag(ImmutableSet.of(new Connection("c1", "c2")));
  Dag dag5 = new Dag(ImmutableSet.of(new Connection("c2", "sink2")));
  Dag dag6 = new Dag(ImmutableSet.of(new Connection("c2", "c3")));
  Dag dag7 = new Dag(ImmutableSet.of(new Connection("c3", "sink3")));
  Set<Dag> expected = ImmutableSet.of(dag1, dag2, dag3, dag4, dag5, dag6, dag7);
  Assert.assertEquals(expected, actual);
}
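As a quick sanity check on the expected split, the boundary of the phase holding the aggregation can be inspected with the same getSources()/getSinks() accessors used elsewhere on this page:

// Rebuild dag3 from the test above and inspect its boundary: the
// connector node is the lone entry point of the phase that performs the
// aggregation, and sink1 is its only exit.
Dag dag3 = new Dag(ImmutableSet.of(
  new Connection("agg2.connector", "agg2"),
  new Connection("agg2", "sink1")));
Assert.assertEquals(ImmutableSet.of("agg2.connector"), dag3.getSources());
Assert.assertEquals(ImmutableSet.of("sink1"), dag3.getSinks());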