Search in sources :

Example 71 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class DagTest method testConditionsOnBranches.

/**
 * Splits a dag that has a condition on each of two branches and checks that the result is
 * the subdag leading up to both conditions plus one single-edge subdag per condition output.
 */
@Test
public void testConditionsOnBranches() {
    /*
                        |--> n2
              |--> c1 --|
         n1 --|         |--> n3
              |
              |                |--> n5
              |--> n4 --> c2 --|
                               |--> n6
     */
    Dag dag = new Dag(ImmutableSet.of(
        new Connection("n1", "c1"),
        new Connection("n1", "n4"),
        new Connection("c1", "n2"),
        new Connection("c1", "n3"),
        new Connection("n4", "c2"),
        new Connection("c2", "n5"),
        new Connection("c2", "n6")));

    // everything upstream of (and including) the two conditions stays together
    Dag upstream = new Dag(ImmutableSet.of(
        new Connection("n1", "n4"),
        new Connection("n1", "c1"),
        new Connection("n4", "c2")));
    // each output of a condition becomes its own subdag
    Dag c1ToN2 = new Dag(ImmutableSet.of(new Connection("c1", "n2")));
    Dag c1ToN3 = new Dag(ImmutableSet.of(new Connection("c1", "n3")));
    Dag c2ToN5 = new Dag(ImmutableSet.of(new Connection("c2", "n5")));
    Dag c2ToN6 = new Dag(ImmutableSet.of(new Connection("c2", "n6")));
    Set<Dag> expected = ImmutableSet.of(upstream, c1ToN2, c1ToN3, c2ToN5, c2ToN6);

    // c1 and c2 are the conditions; there are no action nodes in this dag
    Set<Dag> actual = dag.splitByControlNodes(ImmutableSet.of("c1", "c2"), Collections.<String>emptySet());

    Assert.assertEquals(expected, actual);
}
Also used : Connection(io.cdap.cdap.etl.proto.Connection) Test(org.junit.Test)

Example 72 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class DagTest method testTopologicalOrder.

/**
 * Exercises {@code getTopologicalOrder()} on four shapes: a linear chain, a diamond,
 * two sources joining into one node, and a densely connected dag.
 */
@Test
public void testTopologicalOrder() {
    // linear chain n1 -> n2 -> n3 -> n4: only one ordering is possible
    Dag chain = new Dag(ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n2", "n3"),
        new Connection("n3", "n4")));
    List<String> expectedChainOrder = ImmutableList.of("n1", "n2", "n3", "n4");
    Assert.assertEquals(expectedChainOrder, chain.getTopologicalOrder());

    /*
             |--- n2 ---|
        n1 --|          |-- n4
             |--- n3 ---|
     */
    Dag diamond = new Dag(ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n1", "n3"),
        new Connection("n2", "n4"),
        new Connection("n3", "n4")));
    // n2 and n3 may come out in either order, but n1 must be first and n4 last
    List<String> diamondOrder = diamond.getTopologicalOrder();
    Assert.assertEquals("n1", diamondOrder.get(0));
    Assert.assertEquals("n4", diamondOrder.get(3));
    assertBefore(diamondOrder, "n1", "n2");
    assertBefore(diamondOrder, "n1", "n3");

    /*
        n1 --|
             |--- n3
        n2 --|
     */
    Dag join = new Dag(ImmutableSet.of(
        new Connection("n1", "n3"),
        new Connection("n2", "n3")));
    // the two sources may come out in either order, but n3 must be last
    List<String> joinOrder = join.getTopologicalOrder();
    Assert.assertEquals("n3", joinOrder.get(2));
    assertBefore(joinOrder, "n1", "n3");
    assertBefore(joinOrder, "n2", "n3");

    /*
                                     |--- n3
             |--- n2 ----------------|
        n1 --|       |               |--- n5
             |--------- n4 ----------|
             |              |        |
             |---------------- n6 ---|

        vertical arrows are pointing down
     */
    Dag mesh = new Dag(ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n1", "n4"),
        new Connection("n1", "n6"),
        new Connection("n2", "n3"),
        new Connection("n2", "n4"),
        new Connection("n2", "n5"),
        new Connection("n4", "n3"),
        new Connection("n4", "n5"),
        new Connection("n4", "n6"),
        new Connection("n6", "n3"),
        new Connection("n6", "n5")));
    // the first four positions are forced (n1, n2, n4, n6); n3 and n5 follow in either order
    List<String> meshOrder = mesh.getTopologicalOrder();
    Assert.assertEquals("n1", meshOrder.get(0));
    Assert.assertEquals("n2", meshOrder.get(1));
    Assert.assertEquals("n4", meshOrder.get(2));
    Assert.assertEquals("n6", meshOrder.get(3));
    assertBefore(meshOrder, "n6", "n3");
    assertBefore(meshOrder, "n6", "n5");
}
Also used : Connection(io.cdap.cdap.etl.proto.Connection) Test(org.junit.Test)

Example 73 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class DagTest method testSplitByControlNodes.

/**
 * Checks {@code splitByControlNodes} on dags that mix conditions and actions.
 * Naming convention used below: action nodes are a0, a1, ...; condition nodes are c0, c1, ...;
 * every other node is n0, n1, ...
 */
@Test
public void testSplitByControlNodes() {
    // Scenario 1: the condition is the source, and one of its branches leads straight into an action.
    // c1 --> a1 --> n1 --> n2
    // |
    // | --> n3 --> n4 --> a2
    Dag conditionFirst = new Dag(ImmutableSet.of(
        new Connection("c1", "a1"),
        new Connection("a1", "n1"),
        new Connection("n1", "n2"),
        new Connection("c1", "n3"),
        new Connection("n3", "n4"),
        new Connection("n4", "a2")));
    Set<Dag> expectedConditionFirst = new HashSet<>();
    expectedConditionFirst.add(new Dag(ImmutableSet.of(
        new Connection("c1", "a1"))));
    expectedConditionFirst.add(new Dag(ImmutableSet.of(
        new Connection("a1", "n1"),
        new Connection("n1", "n2"))));
    expectedConditionFirst.add(new Dag(ImmutableSet.of(
        new Connection("c1", "n3"),
        new Connection("n3", "n4"),
        new Connection("n4", "a2"))));
    Set<Dag> actualConditionFirst =
        conditionFirst.splitByControlNodes(ImmutableSet.of("c1"), ImmutableSet.of("a1", "a2"));
    Assert.assertEquals(expectedConditionFirst, actualConditionFirst);

    // Scenario 2: the last condition is at the end, and both of its branches are actions.
    // n0-->n1--c0-->n2-->c1-->a1
    // |
    // |-->a2
    Dag conditionLast = new Dag(ImmutableSet.of(
        new Connection("n0", "n1"),
        new Connection("n1", "c0"),
        new Connection("c0", "n2"),
        new Connection("n2", "c1"),
        new Connection("c1", "a1"),
        new Connection("c1", "a2")));
    Set<Dag> expectedConditionLast = new HashSet<>();
    expectedConditionLast.add(new Dag(ImmutableSet.of(
        new Connection("n0", "n1"),
        new Connection("n1", "c0"))));
    expectedConditionLast.add(new Dag(ImmutableSet.of(
        new Connection("c0", "n2"),
        new Connection("n2", "c1"))));
    expectedConditionLast.add(new Dag(ImmutableSet.of(
        new Connection("c1", "a2"))));
    expectedConditionLast.add(new Dag(ImmutableSet.of(
        new Connection("c1", "a1"))));
    Set<Dag> actualConditionLast =
        conditionLast.splitByControlNodes(ImmutableSet.of("c0", "c1"), ImmutableSet.of("a1", "a2"));
    Assert.assertEquals(expectedConditionLast, actualConditionLast);

    // Scenario 3: actions are the sources and an action feeds the condition directly.
    // a1 - a2 - c1 - n0 - n1
    // |
    // a0 --
    Dag actionsIntoCondition = new Dag(ImmutableSet.of(
        new Connection("a0", "a2"),
        new Connection("a1", "a2"),
        new Connection("a2", "c1"),
        new Connection("c1", "n0"),
        new Connection("n0", "n1")));
    Set<Dag> expectedActionsIntoCondition = new HashSet<>();
    expectedActionsIntoCondition.add(new Dag(ImmutableSet.of(
        new Connection("a0", "a2"),
        new Connection("a1", "a2"))));
    expectedActionsIntoCondition.add(new Dag(ImmutableSet.of(
        new Connection("a2", "c1"))));
    expectedActionsIntoCondition.add(new Dag(ImmutableSet.of(
        new Connection("c1", "n0"),
        new Connection("n0", "n1"))));
    Set<Dag> actualActionsIntoCondition =
        actionsIntoCondition.splitByControlNodes(ImmutableSet.of("c1"), ImmutableSet.of("a0", "a1", "a2"));
    Assert.assertEquals(expectedActionsIntoCondition, actualActionsIntoCondition);

    // Scenario 4: actions are the sources and reach the condition through a non-control node.
    // a1 - n0 - c1 - n1
    // |
    // a0 --
    Dag actionsViaNode = new Dag(ImmutableSet.of(
        new Connection("a0", "n0"),
        new Connection("a1", "n0"),
        new Connection("n0", "c1"),
        new Connection("c1", "n1")));
    Set<Dag> expectedActionsViaNode = new HashSet<>();
    expectedActionsViaNode.add(new Dag(ImmutableSet.of(
        new Connection("a0", "n0"),
        new Connection("a1", "n0"),
        new Connection("n0", "c1"))));
    expectedActionsViaNode.add(new Dag(ImmutableSet.of(
        new Connection("c1", "n1"))));
    Set<Dag> actualActionsViaNode =
        actionsViaNode.splitByControlNodes(ImmutableSet.of("c1"), ImmutableSet.of("a0", "a1"));
    Assert.assertEquals(expectedActionsViaNode, actualActionsViaNode);
}
Also used : Connection(io.cdap.cdap.etl.proto.Connection) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 74 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class ControlDag method trim.

/**
 * Drops control connections that are already implied by a longer path.
 *
 * For example:
 *   n1 ------> n2
 *       |      |
 *       |      v
 *       |----> n3
 * The edge n1 -> n3 is redundant here, since n1 -> n2 -> n3 already forces n1 to come before n3.
 * Every node is visited as a target; for each of its direct inputs, the direct edge is removed
 * whenever more than one path exists from that input to the target.
 *
 * @return number of connections removed.
 */
public int trim() {
    int removedCount = 0;
    for (String target : nodes) {
        // Gather the redundant inputs first and remove them in a second pass, so no connection
        // is removed while the inputs of the target are still being iterated.
        Set<String> redundantInputs = new HashSet<>();
        for (String source : getNodeInputs(target)) {
            boolean reachableAnotherWay = numPaths(source, target) > 1;
            if (reachableAnotherWay) {
                redundantInputs.add(source);
            }
        }
        for (String source : redundantInputs) {
            removeConnection(source, target);
        }
        removedCount += redundantInputs.size();
    }
    return removedCount;
}
Also used : Connection(io.cdap.cdap.etl.proto.Connection) HashSet(java.util.HashSet)

Example 75 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class PipelinePlanner method plan.

/**
 * Create an execution plan for the given logical pipeline. This is used for batch pipelines.
 * Though it may eventually be useful to mark windowing points for realtime pipelines.
 *
 * A plan consists of one or more phases, with connections between phases.
 * A connection between a phase indicates control flow, and not necessarily
 * data flow. This class assumes that it receives a valid pipeline spec.
 * That is, the pipeline has no cycles, all its nodes have unique names,
 * sources don't have any input, sinks don't have any output,
 * everything else has both an input and an output, etc.
 *
 * We start by inserting connector nodes into the logical dag,
 * which are used to mark boundaries between mapreduce jobs.
 * Each connector represents a node where we will need to write to a local dataset.
 *
 * Next, the logical pipeline is broken up into phases,
 * using the connectors as sinks in one phase, and a source in another.
 * After this point, connections between phases do not indicate data flow, but control flow.
 *
 * Control nodes (actions and conditions) each get a phase of their own, keyed by the stage name.
 * Subdags that contain at least one non-control node get a phase keyed by the subdag's phase name.
 *
 * If the spec has no connections at all, every stage must be an action; in that case the plan
 * holds one phase per action and no phase connections.
 *
 * @param spec the pipeline spec, representing a logical pipeline
 * @return the execution plan
 * @throws IllegalStateException if the spec has no connections but contains a stage that is not an action
 */
public PipelinePlan plan(PipelineSpec spec) {
    // go through the stages and examine their plugin type to determine which stages are reduce stages
    Set<String> reduceNodes = new HashSet<>();
    Set<String> isolationNodes = new HashSet<>();
    Set<String> actionNodes = new HashSet<>();
    Set<String> multiPortNodes = new HashSet<>();
    Set<String> allNodes = new HashSet<>();
    // Map to hold the connection information from condition nodes to the first stage
    // they connect to. Condition information also includes whether the stage is connected
    // on the 'true' branch or the 'false' branch
    Map<String, ConditionBranches> conditionBranches = new HashMap<>();
    // for each condition stage: the stages it connects to, and the stages that connect to it
    Map<String, Set<String>> conditionOutputs = new HashMap<>();
    Map<String, Set<String>> conditionInputs = new HashMap<>();
    // stage name -> stage spec, for every stage in the pipeline
    Map<String, StageSpec> specs = new HashMap<>();
    for (StageSpec stage : spec.getStages()) {
        String pluginType = stage.getPlugin().getType();
        allNodes.add(stage.getName());
        // the checks below are independent; a stage is added to every category its plugin type matches
        if (reduceTypes.contains(pluginType)) {
            reduceNodes.add(stage.getName());
        }
        if (isolationTypes.contains(pluginType)) {
            isolationNodes.add(stage.getName());
        }
        if (actionTypes.contains(pluginType)) {
            // Collect all Action nodes from spec
            actionNodes.add(stage.getName());
        }
        if (multiPortTypes.contains(pluginType)) {
            multiPortNodes.add(stage.getName());
        }
        if (Condition.PLUGIN_TYPE.equals(pluginType)) {
            // branches start out unknown (null, null) and are filled in while walking the connections below
            conditionBranches.put(stage.getName(), new ConditionBranches(null, null));
            conditionOutputs.put(stage.getName(), new HashSet<String>());
            conditionInputs.put(stage.getName(), new HashSet<String>());
        }
        specs.put(stage.getName(), stage);
    }
    // Special case for action nodes when there is no connection between them
    if (spec.getConnections().isEmpty()) {
        // All nodes should be actions
        if (!actionNodes.containsAll(allNodes)) {
            throw new IllegalStateException("No connections are specified.");
        }
        // one single-stage phase per action, and no connections between the phases
        Map<String, PipelinePhase> phases = new HashMap<>();
        for (String actionNode : actionNodes) {
            PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
            PipelinePhase actionPhase = phaseBuilder.addStage(specs.get(actionNode)).build();
            phases.put(actionNode, actionPhase);
        }
        return new PipelinePlan(phases, new HashSet<Connection>());
    }
    // Set representing control nodes (Conditions and Actions)
    Set<String> controlNodes = Sets.union(actionNodes, conditionBranches.keySet());
    // for chained conditions (condition -> condition): child condition name -> parent condition name
    Map<String, String> conditionChildToParent = new HashMap<>();
    for (Connection connection : spec.getConnections()) {
        // record the connection in the per-condition output/input sets
        if (conditionBranches.containsKey(connection.getFrom())) {
            conditionOutputs.get(connection.getFrom()).add(connection.getTo());
        }
        if (conditionBranches.containsKey(connection.getTo())) {
            conditionInputs.get(connection.getTo()).add(connection.getFrom());
        }
        // same test as the first check above; this part records chaining and the true/false branch target
        if (conditionBranches.containsKey(connection.getFrom())) {
            if (conditionBranches.containsKey(connection.getTo())) {
                // conditions are chained
                conditionChildToParent.put(connection.getTo(), connection.getFrom());
            }
            // Outgoing connection from condition
            ConditionBranches branches = conditionBranches.get(connection.getFrom());
            String trueOutput;
            String falseOutput;
            // NOTE(review): Connection is constructed with a nullable Boolean condition further down;
            // if getCondition() returns a null Boolean here, this unboxing would throw — confirm that
            // a valid spec always sets the condition on outputs of a condition stage.
            if (connection.getCondition()) {
                // this connection is the 'true' branch; keep whatever 'false' branch was recorded so far
                trueOutput = connection.getTo();
                falseOutput = branches.getFalseOutput();
            } else {
                // this connection is the 'false' branch; keep whatever 'true' branch was recorded so far
                trueOutput = branches.getTrueOutput();
                falseOutput = connection.getTo();
            }
            conditionBranches.put(connection.getFrom(), new ConditionBranches(trueOutput, falseOutput));
        }
    }
    // passed to split() and later to dagToPipeline(); presumably populated by split() — verify against split()
    Map<String, String> connectorNodes = new HashMap<>();
    // now split the logical pipeline into pipeline phases, using the connectors as split points
    Set<Dag> splittedDag = split(spec.getConnections(), conditionBranches.keySet(), reduceNodes, isolationNodes, actionNodes, multiPortNodes, connectorNodes);
    Map<String, String> controlConnectors = getConnectorsAssociatedWithConditions(conditionBranches.keySet(), conditionChildToParent, conditionInputs, conditionOutputs, actionNodes);
    // index the subdags by their phase name
    Map<String, Dag> subdags = new HashMap<>();
    for (Dag subdag : splittedDag) {
        subdags.put(getPhaseName(subdag), subdag);
    }
    // build connections between phases and convert dags to PipelinePhase.
    Set<Connection> phaseConnections = new HashSet<>();
    Map<String, PipelinePhase> phases = new HashMap<>();
    for (Map.Entry<String, Dag> dagEntry1 : subdags.entrySet()) {
        String dag1Name = dagEntry1.getKey();
        Dag dag1 = dagEntry1.getValue();
        // convert the dag to a PipelinePhase
        // add a separate pipeline phase for each control node in the subdag
        Set<String> dag1ControlNodes = Sets.intersection(controlNodes, dag1.getNodes());
        for (String dag1ControlNode : dag1ControlNodes) {
            // a control node can appear in more than one subdag; only create its phase once
            if (!phases.containsKey(dag1ControlNode)) {
                phases.put(dag1ControlNode, PipelinePhase.builder(supportedPluginTypes).addStage(specs.get(dag1ControlNode)).build());
            }
        }
        // if there are non-control nodes in the subdag, add a pipeline phase for it
        if (!controlNodes.containsAll(dag1.getNodes())) {
            // the updated dag replaces conditions with the corresponding connector if applicable.
            Dag updatedDag = getUpdatedDag(dag1, controlConnectors);
            // Remove any control nodes from this dag
            if (!Sets.intersection(updatedDag.getNodes(), controlNodes).isEmpty()) {
                Set<String> nodes = Sets.difference(updatedDag.getNodes(), controlNodes);
                updatedDag = updatedDag.createSubDag(nodes);
            }
            phases.put(dag1Name, dagToPipeline(updatedDag, connectorNodes, specs, controlConnectors));
        }
        // from the control phase to this dag (or to another control phase)
        for (String controlSource : Sets.intersection(controlNodes, dag1.getSources())) {
            // branches is null when the control source is an action rather than a condition
            ConditionBranches branches = conditionBranches.get(controlSource);
            // null for actions; for conditions, true if this subdag contains the condition's 'true' output
            Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
            for (String output : dag1.getNodeOutputs(controlSource)) {
                if (controlNodes.contains(output)) {
                    // control source -> control node, add a phase connection between the control phases
                    phaseConnections.add(new Connection(controlSource, output, condition));
                } else {
                    // control source -> non-control nodes, add a phase connection from the control phase to this dag
                    phaseConnections.add(new Connection(controlSource, dag1Name, condition));
                }
            }
        }
        // from this dag to the control phase
        for (String controlSink : Sets.intersection(controlNodes, dag1.getSinks())) {
            for (String input : dag1.getNodeInputs(controlSink)) {
                if (controlNodes.contains(input)) {
                    // control node -> control-sink, add a phase connection between the control phases
                    ConditionBranches branches = conditionBranches.get(input);
                    // null when the input is an action; otherwise true if this subdag holds the 'true' output
                    Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
                    phaseConnections.add(new Connection(input, controlSink, condition));
                } else {
                    // non-control node -> control-sink, add a phase connection from this dag to the control phase
                    phaseConnections.add(new Connection(dag1Name, controlSink));
                }
            }
        }
        // find connected subdags (they have a source that is a sink in dag1)
        Set<String> nonControlSinks = Sets.difference(dag1.getSinks(), controlNodes);
        for (Map.Entry<String, Dag> dagEntry2 : subdags.entrySet()) {
            String dag2Name = dagEntry2.getKey();
            Dag dag2 = dagEntry2.getValue();
            // never connect a subdag to itself
            if (dag1Name.equals(dag2Name)) {
                continue;
            }
            // dag1 -> dag2 when some non-control sink of dag1 is also a source of dag2
            if (!Sets.intersection(nonControlSinks, dag2.getSources()).isEmpty()) {
                phaseConnections.add(new Connection(dag1Name, dag2Name));
            }
        }
    }
    return new PipelinePlan(phases, phaseConnections);
}
Also used : ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) HashMap(java.util.HashMap) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) HashSet(java.util.HashSet) Connection(io.cdap.cdap.etl.proto.Connection) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

Connection (io.cdap.cdap.etl.proto.Connection)96 Test (org.junit.Test)78 HashSet (java.util.HashSet)70 HashMap (java.util.HashMap)44 ArrayList (java.util.ArrayList)32 Operation (io.cdap.cdap.api.lineage.field.Operation)28 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)28 List (java.util.List)28 ImmutableList (com.google.common.collect.ImmutableList)26 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)26 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)26 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)26 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)26 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)26 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)24 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)20 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)18 PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase)16 PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec)14 FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)8