Example 1 with Connection

Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

The class PipelinePlanner, method populateActionPhases.

/**
   * This method is responsible for populating phases and phaseConnections with the Action phases.
   * An Action phase is a single-stage {@link PipelinePhase} that does not contain a dag.
   *
   * @param pipelineSpec the overall spec for the pipeline
   * @param specs the Map of stage specs
   * @param actionNodes the Set of action nodes in the pipeline
   * @param phases the Map of phases created so far
   * @param phaseConnections the Set of connections between phases added so far
   * @param outgoingActionConnections the Map that holds the set of stages to which
   *                                  there is an outgoing connection from an Action stage
   * @param incomingActionConnections the Map that holds the set of stages from which
   *                                  there is an incoming connection to an Action stage
   * @param subdags subdags created so far from the pipeline stages
   */
private void populateActionPhases(PipelineSpec pipelineSpec, Map<String, StageSpec> specs,
                                  Set<String> actionNodes, Map<String, PipelinePhase> phases,
                                  Set<Connection> phaseConnections,
                                  SetMultimap<String, String> outgoingActionConnections,
                                  SetMultimap<String, String> incomingActionConnections,
                                  Map<String, Dag> subdags) {
    // Create single stage phases for the Action nodes
    for (String node : actionNodes) {
        StageSpec actionStageSpec = specs.get(node);
        String type = actionStageSpec.getPlugin().getType();
        StageInfo actionStageInfo = StageInfo.builder(node, type)
            .addInputs(actionStageSpec.getInputs())
            .addInputSchemas(actionStageSpec.getInputSchemas())
            .addOutputs(actionStageSpec.getOutputs())
            .setOutputSchema(actionStageSpec.getOutputSchema())
            .setErrorSchema(actionStageSpec.getErrorSchema())
            .setErrorDatasetName(actionStageSpec.getErrorDatasetName())
            .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
            .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
            .build();
        phases.put(node, PipelinePhase.builder(supportedPluginTypes).addStage(actionStageInfo).build());
    }
    // Build phaseConnections for the Action nodes
    for (String sourceAction : outgoingActionConnections.keySet()) {
        // Check if destination is one of the source stages in the pipeline
        for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
            if (Sets.intersection(outgoingActionConnections.get(sourceAction), subdagEntry.getValue().getSources()).size() > 0) {
                phaseConnections.add(new Connection(sourceAction, subdagEntry.getKey()));
            }
        }
        // Check if the destination is another Action node
        for (String destination : outgoingActionConnections.get(sourceAction)) {
            if (actionNodes.contains(destination)) {
                phaseConnections.add(new Connection(sourceAction, destination));
            }
        }
    }
    for (String destinationAction : incomingActionConnections.keySet()) {
        // Check if source is one of the sink stages in the pipeline
        for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
            if (Sets.intersection(incomingActionConnections.get(destinationAction), subdagEntry.getValue().getSinks()).size() > 0) {
                phaseConnections.add(new Connection(subdagEntry.getKey(), destinationAction));
            }
        }
    }
}
Also used: StageSpec(co.cask.cdap.etl.spec.StageSpec) Connection(co.cask.cdap.etl.proto.Connection) HashMap(java.util.HashMap) Map(java.util.Map)
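
To make the wiring rule above concrete, here is a minimal sketch; it is not taken from the CDAP sources, and the names "stageA", "sink1", "action1", and "subdag1" are illustrative. It uses only APIs visible in this example: an action phase gets a phase connection to a subdag whenever any of the action's outgoing targets is a source of that subdag.

import co.cask.cdap.etl.planner.Dag;
import co.cask.cdap.etl.proto.Connection;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import java.util.HashSet;
import java.util.Set;

public class ActionWiringSketch {
    public static void main(String[] args) {
        // a subdag whose single source is "stageA"
        Dag subdag = new Dag(ImmutableSet.of(new Connection("stageA", "sink1")));
        // the stages the action connects to via outgoing connections
        Set<String> actionTargets = ImmutableSet.of("stageA", "unrelated");
        Set<Connection> phaseConnections = new HashSet<>();
        // the same overlap check populateActionPhases performs against each subdag's sources
        if (!Sets.intersection(actionTargets, subdag.getSources()).isEmpty()) {
            phaseConnections.add(new Connection("action1", "subdag1"));
        }
        // phaseConnections now holds the single phase connection action1 -> subdag1
        System.out.println(phaseConnections);
    }
}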

Example 2 with Connection

Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

The class ETLRealtimeConfigTest, method testUpgrade.

@Test
public void testUpgrade() throws Exception {
    final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
    ETLStage source = new ETLStage("source", new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), "errorDS");
    co.cask.cdap.etl.proto.v2.ETLStage sourceNew = from(source, RealtimeSource.PLUGIN_TYPE);
    ETLStage transform1 = new ETLStage("transform1", new Plugin("Script", ImmutableMap.of("script", "something"), null));
    co.cask.cdap.etl.proto.v2.ETLStage transform1New = from(transform1, Transform.PLUGIN_TYPE);
    ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
    co.cask.cdap.etl.proto.v2.ETLStage transform2New = from(transform2, Transform.PLUGIN_TYPE);
    ETLStage transform3 = new ETLStage("transform3", new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), "errorDS");
    co.cask.cdap.etl.proto.v2.ETLStage transform3New = from(transform3, Transform.PLUGIN_TYPE);
    ETLStage sink1 = new ETLStage("sink1", new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sink1New = from(sink1, RealtimeSink.PLUGIN_TYPE);
    ETLStage sink2 = new ETLStage("sink2", new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sink2New = from(sink2, RealtimeSink.PLUGIN_TYPE);
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
    connections.add(new Connection(transform1New.getName(), transform2New.getName()));
    connections.add(new Connection(transform2New.getName(), transform3New.getName()));
    connections.add(new Connection(transform3New.getName(), sink1New.getName()));
    connections.add(new Connection(transform3New.getName(), sink2New.getName()));
    Resources resources = new Resources(1024, 1);
    ETLRealtimeConfig config = ETLRealtimeConfig.builder()
        .setInstances(1)
        .setSource(source)
        .addSink(sink1)
        .addSink(sink2)
        .addTransform(transform1)
        .addTransform(transform2)
        .addTransform(transform3)
        .addConnections(connections)
        .setResources(resources)
        .build();
    co.cask.cdap.etl.proto.v2.ETLRealtimeConfig configNew = co.cask.cdap.etl.proto.v2.ETLRealtimeConfig.builder()
        .setInstances(1)
        .addStage(sourceNew)
        .addStage(sink1New)
        .addStage(sink2New)
        .addStage(transform1New)
        .addStage(transform2New)
        .addStage(transform3New)
        .addConnections(connections)
        .setResources(resources)
        .build();
    Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {

        @Nullable
        @Override
        public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
            return null;
        }
    }));
}
Also used: ArtifactSelectorConfig (co.cask.cdap.etl.proto.ArtifactSelectorConfig) Connection (co.cask.cdap.etl.proto.Connection) UpgradeContext (co.cask.cdap.etl.proto.UpgradeContext) Resources (co.cask.cdap.api.Resources) ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin) HashSet (java.util.HashSet) Test (org.junit.Test)
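
The anonymous UpgradeContext above returns null, which leaves every stage's artifact as-is; the context exists so that upgrade(...) can look up a replacement artifact per plugin. Below is a hedged sketch of the opposite choice, assuming only the single method shown in this test (the holder class name and the javax.annotation.Nullable import are illustrative): a context that resolves every plugin to the fixed "universal" system artifact used elsewhere in the test.

import co.cask.cdap.etl.proto.ArtifactSelectorConfig;
import co.cask.cdap.etl.proto.UpgradeContext;
import javax.annotation.Nullable;

public class UpgradeContexts {
    // illustrative constant, not part of CDAP: resolves every plugin artifact
    public static final UpgradeContext PIN_TO_UNIVERSAL = new UpgradeContext() {
        @Nullable
        @Override
        public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
            // same constructor the test uses: scope, name, version
            return new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
        }
    };
}

With the null-returning context, as the assertion shows, the upgraded config matches configNew exactly.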

Example 3 with Connection

Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

The class SmartWorkflow, method configure.

@Override
protected void configure() {
    setName(NAME);
    setDescription(DESCRIPTION);
    // set the pipeline spec as a property in case somebody like the UI wants to read it
    Map<String, String> properties = new HashMap<>();
    properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
    setProperties(properties);
    stageSpecs = new HashMap<>();
    useSpark = engine == Engine.SPARK;
    for (StageSpec stageSpec : spec.getStages()) {
        stageSpecs.put(stageSpec.getName(), stageSpec);
        String pluginType = stageSpec.getPlugin().getType();
        if (SparkCompute.PLUGIN_TYPE.equals(pluginType) || SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            useSpark = true;
        }
    }
    PipelinePlanner planner;
    Set<String> actionTypes = ImmutableSet.of(Action.PLUGIN_TYPE, Constants.SPARK_PROGRAM_PLUGIN_TYPE);
    Set<String> multiPortTypes = ImmutableSet.of(SplitterTransform.PLUGIN_TYPE);
    if (useSpark) {
        // If the pipeline uses Spark, we don't need to break it up into phases;
        // a single phase is enough.
        planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.<String>of(), ImmutableSet.<String>of(), actionTypes, multiPortTypes);
    } else {
        planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.of(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE), ImmutableSet.of(SparkCompute.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE), actionTypes, multiPortTypes);
    }
    plan = planner.plan(spec);
    WorkflowProgramAdder programAdder = new TrunkProgramAdder(getConfigurer());
    // single phase, just add the program directly
    if (plan.getPhases().size() == 1) {
        addProgram(plan.getPhases().keySet().iterator().next(), programAdder);
        return;
    }
    // Dag classes don't allow a 'dag' without connections
    if (plan.getPhaseConnections().isEmpty()) {
        WorkflowProgramAdder fork = programAdder.fork();
        for (String phaseName : plan.getPhases().keySet()) {
            addProgram(phaseName, fork);
        }
        fork.join();
        return;
    }
    dag = new ControlDag(plan.getPhaseConnections());
    boolean dummyNodeAdded = false;
    Map<String, ConditionBranches> conditionBranches = plan.getConditionPhaseBranches();
    if (conditionBranches.isEmpty()) {
        // after flattening, there is guaranteed to be just one source
        dag.flatten();
    } else if (!conditionBranches.keySet().containsAll(dag.getSources())) {
        // Continue only if the condition nodes are not the sources of the dag; otherwise the dag
        // is already in the required form
        Set<String> conditions = conditionBranches.keySet();
        // flatten only the part of the dag starting from sources and ending in conditions/sinks.
        Set<String> dagNodes = dag.accessibleFrom(dag.getSources(), Sets.union(dag.getSinks(), conditions));
        Set<String> dagNodesWithoutCondition = Sets.difference(dagNodes, conditions);
        Set<Connection> connections = new HashSet<>();
        Deque<String> bfs = new LinkedList<>();
        Set<String> sinks = new HashSet<>();
        // If it's a single phase without a condition, there is no need to flatten
        if (dagNodesWithoutCondition.size() > 1) {
            Dag subDag;
            try {
                subDag = dag.createSubDag(dagNodesWithoutCondition);
            } catch (IllegalArgumentException | DisjointConnectionsException e) {
                // DisjointConnectionsException is thrown when dagNodesWithoutCondition forms islands;
                // IllegalArgumentException is thrown when the connections are empty.
                // In both cases we need to add a dummy node to create a connected Dag.
                String dummyNode = "dummy";
                dummyNodeAdded = true;
                Set<Connection> subDagConnections = new HashSet<>();
                for (String source : dag.getSources()) {
                    subDagConnections.add(new Connection(dummyNode, source));
                }
                Deque<String> subDagBFS = new LinkedList<>();
                subDagBFS.addAll(dag.getSources());
                while (subDagBFS.peek() != null) {
                    String node = subDagBFS.poll();
                    for (String output : dag.getNodeOutputs(node)) {
                        if (dagNodesWithoutCondition.contains(output)) {
                            subDagConnections.add(new Connection(node, output));
                            subDagBFS.add(output);
                        }
                    }
                }
                subDag = new Dag(subDagConnections);
            }
            ControlDag cdag = new ControlDag(subDag);
            cdag.flatten();
            // Add all connections from cdag
            bfs.addAll(cdag.getSources());
            while (bfs.peek() != null) {
                String node = bfs.poll();
                for (String output : cdag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            }
            sinks.addAll(cdag.getSinks());
        } else {
            sinks.addAll(dagNodesWithoutCondition);
        }
        // Add back the existing condition nodes and corresponding conditions
        Set<String> conditionsFromDag = Sets.intersection(dagNodes, conditions);
        for (String condition : conditionsFromDag) {
            connections.add(new Connection(sinks.iterator().next(), condition));
        }
        bfs.addAll(Sets.intersection(dagNodes, conditions));
        while (bfs.peek() != null) {
            String node = bfs.poll();
            ConditionBranches branches = conditionBranches.get(node);
            if (branches == null) {
                // not a condition node. add outputs
                for (String output : dag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            } else {
                // condition node
                for (Boolean condition : Arrays.asList(true, false)) {
                    String phase = condition ? branches.getTrueOutput() : branches.getFalseOutput();
                    if (phase == null) {
                        continue;
                    }
                    connections.add(new Connection(node, phase, condition));
                    bfs.add(phase);
                }
            }
        }
        dag = new ControlDag(connections);
    }
    if (dummyNodeAdded) {
        WorkflowProgramAdder fork = programAdder.fork();
        String dummyNode = dag.getSources().iterator().next();
        for (String output : dag.getNodeOutputs(dummyNode)) {
            // need to make sure we don't call also() if this is the final branch
            if (!addBranchPrograms(output, fork)) {
                fork = fork.also();
            }
        }
    } else {
        String start = dag.getSources().iterator().next();
        addPrograms(start, programAdder);
    }
}
Also used: ControlDag (co.cask.cdap.etl.planner.ControlDag) PipelinePlanner (co.cask.cdap.etl.planner.PipelinePlanner) ImmutableSet (com.google.common.collect.ImmutableSet) Set (java.util.Set) HashSet (java.util.HashSet) PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) HashMap (java.util.HashMap) LinkedHashMap (java.util.LinkedHashMap) Connection (co.cask.cdap.etl.proto.Connection) Dag (co.cask.cdap.etl.planner.Dag) Deque (java.util.Deque) LinkedList (java.util.LinkedList) ConditionBranches (co.cask.cdap.etl.planner.ConditionBranches) StageSpec (co.cask.cdap.etl.spec.StageSpec)
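
The dummy-node branch is the subtle part of configure(): when the phase connections would leave disjoint islands (or be empty), a synthetic source is prepended so that a single connected ControlDag can be built and forked. Below is a minimal sketch of that trick using only the ControlDag calls visible above; the phase names are hypothetical.

import co.cask.cdap.etl.planner.ControlDag;
import co.cask.cdap.etl.proto.Connection;
import java.util.HashSet;
import java.util.Set;

public class DummyNodeSketch {
    public static void main(String[] args) {
        // two phases with no edge between them cannot form a dag on their own,
        // so hang both off a synthetic "dummy" source
        Set<Connection> connections = new HashSet<>();
        connections.add(new Connection("dummy", "phaseA"));
        connections.add(new Connection("dummy", "phaseB"));
        ControlDag cdag = new ControlDag(connections);
        // the lone source is the dummy node; each of its outputs starts a fork
        // branch, mirroring the addBranchPrograms loop at the end of configure()
        String dummy = cdag.getSources().iterator().next();
        for (String output : cdag.getNodeOutputs(dummy)) {
            System.out.println("fork branch starting at: " + output);
        }
    }
}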

Example 4 with Connection

Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

The class ETLBatchConfigTest, method testUpgrade.

@Test
public void testUpgrade() throws Exception {
    final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");
    ETLStage source = new ETLStage("source", new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sourceNew = from(source, BatchSource.PLUGIN_TYPE);
    ETLStage transform1 = new ETLStage("transform1", new Plugin("Script", ImmutableMap.of("script", "something"), null));
    co.cask.cdap.etl.proto.v2.ETLStage transform1New = from(transform1, Transform.PLUGIN_TYPE);
    ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
    co.cask.cdap.etl.proto.v2.ETLStage transform2New = from(transform2, Transform.PLUGIN_TYPE);
    ETLStage transform3 = new ETLStage("transform3", new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), null);
    co.cask.cdap.etl.proto.v2.ETLStage transform3New = from(transform3, Transform.PLUGIN_TYPE);
    ETLStage sink1 = new ETLStage("sink1", new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sink1New = from(sink1, BatchSink.PLUGIN_TYPE);
    ETLStage sink2 = new ETLStage("sink2", new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);
    co.cask.cdap.etl.proto.v2.ETLStage sink2New = from(sink2, BatchSink.PLUGIN_TYPE);
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection(sourceNew.getName(), transform1New.getName()));
    connections.add(new Connection(transform1New.getName(), transform2New.getName()));
    connections.add(new Connection(transform2New.getName(), transform3New.getName()));
    connections.add(new Connection(transform3New.getName(), sink1New.getName()));
    connections.add(new Connection(transform3New.getName(), sink2New.getName()));
    String schedule = "*/5 * * * *";
    Resources resources = new Resources(1024, 1);
    ETLBatchConfig config = ETLBatchConfig.builder(schedule)
        .setSource(source)
        .addSink(sink1)
        .addSink(sink2)
        .addTransform(transform1)
        .addTransform(transform2)
        .addTransform(transform3)
        .addConnections(connections)
        .setResources(resources)
        .setDriverResources(resources)
        .build();
    co.cask.cdap.etl.proto.v2.ETLBatchConfig configNew = co.cask.cdap.etl.proto.v2.ETLBatchConfig.builder(schedule)
        .addStage(sourceNew)
        .addStage(sink1New)
        .addStage(sink2New)
        .addStage(transform1New)
        .addStage(transform2New)
        .addStage(transform3New)
        .addConnections(connections)
        .setResources(resources)
        .setDriverResources(resources)
        .build();
    Assert.assertEquals(configNew, config.upgrade(new UpgradeContext() {

        @Nullable
        @Override
        public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
            return null;
        }
    }));
}
Also used: ArtifactSelectorConfig (co.cask.cdap.etl.proto.ArtifactSelectorConfig) Connection (co.cask.cdap.etl.proto.Connection) UpgradeContext (co.cask.cdap.etl.proto.UpgradeContext) Resources (co.cask.cdap.api.Resources) ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin) HashSet (java.util.HashSet) Test (org.junit.Test)
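
Both upgrade tests rely on Connection having value semantics: the expected and upgraded configs are equal only because their connection sets compare element-wise. A small sketch of the property being relied on; the stage names are placeholders.

import co.cask.cdap.etl.proto.Connection;
import java.util.HashSet;
import java.util.Set;

public class ConnectionEqualitySketch {
    public static void main(String[] args) {
        Set<Connection> edges = new HashSet<>();
        edges.add(new Connection("source", "sink"));
        // a structurally identical edge collapses into the existing entry
        edges.add(new Connection("source", "sink"));
        System.out.println(edges.size()); // prints 1
    }
}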

Example 5 with Connection

Use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

The class ConnectorDagTest, method testConditionDag.

@Test
public void testConditionDag() throws Exception {
    /*
          file - csv - c1 - t1---agg1--agg2---sink1
                        |
                        ----c2 - sink2
                             |
                              ------c3 - sink3

     */
    Set<Connection> connections = ImmutableSet.of(
        new Connection("file", "csv"),
        new Connection("csv", "c1"),
        new Connection("c1", "t1"),
        new Connection("t1", "agg1"),
        new Connection("agg1", "agg2"),
        new Connection("agg2", "sink1"),
        new Connection("c1", "c2"),
        new Connection("c2", "sink2"),
        new Connection("c2", "c3"),
        new Connection("c3", "sink3"));
    Set<String> conditions = new HashSet<>(Arrays.asList("c1", "c2", "c3"));
    Set<String> reduceNodes = new HashSet<>(Arrays.asList("agg1", "agg2"));
    Set<String> isolationNodes = new HashSet<>();
    Set<String> multiPortNodes = new HashSet<>();
    Set<Dag> actual = PipelinePlanner.split(connections, conditions, reduceNodes, isolationNodes, EMPTY_ACTIONS, multiPortNodes, EMPTY_CONNECTORS);
    Dag dag1 = new Dag(ImmutableSet.of(new Connection("file", "csv"), new Connection("csv", "c1")));
    Dag dag2 = new Dag(ImmutableSet.of(new Connection("c1", "t1"), new Connection("t1", "agg1"), new Connection("agg1", "agg2.connector")));
    Dag dag3 = new Dag(ImmutableSet.of(new Connection("agg2.connector", "agg2"), new Connection("agg2", "sink1")));
    Dag dag4 = new Dag(ImmutableSet.of(new Connection("c1", "c2")));
    Dag dag5 = new Dag(ImmutableSet.of(new Connection("c2", "sink2")));
    Dag dag6 = new Dag(ImmutableSet.of(new Connection("c2", "c3")));
    Dag dag7 = new Dag(ImmutableSet.of(new Connection("c3", "sink3")));
    Set<Dag> expected = ImmutableSet.of(dag1, dag2, dag3, dag4, dag5, dag6, dag7);
    Assert.assertEquals(expected, actual);
}
Also used: Connection (co.cask.cdap.etl.proto.Connection) HashSet (java.util.HashSet) Test (org.junit.Test)
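
Reading the expectation: each condition (c1, c2, c3) ends the dag upstream of it and starts a dag per branch, while the back-to-back aggregators are separated at a generated agg2.connector node. The companion test below is a sketch in the same style, reusing the test class's EMPTY_ACTIONS and EMPTY_CONNECTORS constants; the expected split is inferred from the assertion above rather than taken from the CDAP sources.

@Test
public void testSingleConditionSplitSketch() throws Exception {
    /*
          src - c1 - sinkT      (c1 is a condition)
     */
    Set<Connection> connections = ImmutableSet.of(
        new Connection("src", "c1"),
        new Connection("c1", "sinkT"));
    Set<String> conditions = new HashSet<>(Arrays.asList("c1"));
    Set<Dag> actual = PipelinePlanner.split(connections, conditions, new HashSet<String>(),
                                            new HashSet<String>(), EMPTY_ACTIONS,
                                            new HashSet<String>(), EMPTY_CONNECTORS);
    // the condition is the sink of the upstream dag and the source of the branch dag
    Dag upstream = new Dag(ImmutableSet.of(new Connection("src", "c1")));
    Dag branch = new Dag(ImmutableSet.of(new Connection("c1", "sinkT")));
    Assert.assertEquals(ImmutableSet.of(upstream, branch), actual);
}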

Aggregations

Connection (co.cask.cdap.etl.proto.Connection): 36 uses
Test (org.junit.Test): 26 uses
HashSet (java.util.HashSet): 23 uses
HashMap (java.util.HashMap): 10 uses
StageSpec (co.cask.cdap.etl.spec.StageSpec): 8 uses
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 6 uses
PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec): 5 uses
Resources (co.cask.cdap.api.Resources): 4 uses
UpgradeContext (co.cask.cdap.etl.proto.UpgradeContext): 4 uses
ArrayList (java.util.ArrayList): 4 uses
ArtifactSelectorConfig (co.cask.cdap.etl.proto.ArtifactSelectorConfig): 3 uses
Dag (co.cask.cdap.etl.planner.Dag): 2 uses
Plugin (co.cask.cdap.etl.proto.v1.Plugin): 2 uses
ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin): 2 uses
ImmutableSet (com.google.common.collect.ImmutableSet): 2 uses
Map (java.util.Map): 2 uses
Set (java.util.Set): 2 uses
Schema (co.cask.cdap.api.data.schema.Schema): 1 use
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 1 use
ConditionBranches (co.cask.cdap.etl.planner.ConditionBranches): 1 use