Search in sources :

Example 26 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by cdapio.

the class PipelinePlanner method plan.

/**
 * Create an execution plan for the given logical pipeline. This is used for batch pipelines.
 * Though it may eventually be useful to mark windowing points for realtime pipelines.
 *
 * A plan consists of one or more phases, with connections between phases.
 * A connection between phases indicates control flow, and not necessarily
 * data flow. This class assumes that it receives a valid pipeline spec.
 * That is, the pipeline has no cycles, all its nodes have unique names,
 * sources don't have any input, sinks don't have any output,
 * everything else has both an input and an output, etc.
 *
 * We start by inserting connector nodes into the logical dag,
 * which are used to mark boundaries between mapreduce jobs.
 * Each connector represents a node where we will need to write to a local dataset.
 *
 * Next, the logical pipeline is broken up into phases,
 * using the connectors as sinks in one phase, and a source in another.
 * After this point, connections between phases do not indicate data flow, but control flow.
 *
 * @param spec the pipeline spec, representing a logical pipeline
 * @return the execution plan
 */
public PipelinePlan plan(PipelineSpec spec) {
    // Go through the stages and classify each one by its plugin type. The checks below are
    // independent 'if' statements, so a single stage may end up in more than one of these sets.
    Set<String> reduceNodes = new HashSet<>();
    Set<String> isolationNodes = new HashSet<>();
    Set<String> actionNodes = new HashSet<>();
    Set<String> multiPortNodes = new HashSet<>();
    Set<String> allNodes = new HashSet<>();
    // Map to hold the connection information from condition nodes to the first stage
    // they connect to. Condition information also includes whether the stage is connected
    // on the 'true' branch or the 'false' branch
    Map<String, ConditionBranches> conditionBranches = new HashMap<>();
    // For every condition stage: the stages it connects to, and the stages that connect into it.
    Map<String, Set<String>> conditionOutputs = new HashMap<>();
    Map<String, Set<String>> conditionInputs = new HashMap<>();
    // Stage name -> stage spec, used later to build the phases.
    Map<String, StageSpec> specs = new HashMap<>();
    for (StageSpec stage : spec.getStages()) {
        String pluginType = stage.getPlugin().getType();
        allNodes.add(stage.getName());
        if (reduceTypes.contains(pluginType)) {
            reduceNodes.add(stage.getName());
        }
        if (isolationTypes.contains(pluginType)) {
            isolationNodes.add(stage.getName());
        }
        if (actionTypes.contains(pluginType)) {
            // Collect all Action nodes from spec
            actionNodes.add(stage.getName());
        }
        if (multiPortTypes.contains(pluginType)) {
            multiPortNodes.add(stage.getName());
        }
        if (Condition.PLUGIN_TYPE.equals(pluginType)) {
            // Conditions start out with both branches unknown and with empty input/output sets;
            // these are filled in while walking the connections further down.
            conditionBranches.put(stage.getName(), new ConditionBranches(null, null));
            conditionOutputs.put(stage.getName(), new HashSet<String>());
            conditionInputs.put(stage.getName(), new HashSet<String>());
        }
        specs.put(stage.getName(), stage);
    }
    // Special case for action nodes when there is no connection between them
    if (spec.getConnections().isEmpty()) {
        // All nodes should be actions
        if (!actionNodes.containsAll(allNodes)) {
            throw new IllegalStateException("No connections are specified.");
        }
        // Every action becomes its own single-stage phase, keyed by the action's name,
        // and the resulting plan has no connections between phases.
        Map<String, PipelinePhase> phases = new HashMap<>();
        for (String actionNode : actionNodes) {
            PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
            PipelinePhase actionPhase = phaseBuilder.addStage(specs.get(actionNode)).build();
            phases.put(actionNode, actionPhase);
        }
        return new PipelinePlan(phases, new HashSet<Connection>());
    }
    // Set representing control nodes (Conditions and Actions)
    // NOTE(review): Sets is presumably Guava's; if so this is a view over the two backing sets
    // rather than a copy - confirm.
    Set<String> controlNodes = Sets.union(actionNodes, conditionBranches.keySet());
    // For chained conditions (condition -> condition): downstream condition -> upstream condition.
    Map<String, String> conditionChildToParent = new HashMap<>();
    for (Connection connection : spec.getConnections()) {
        // Record the direct outputs and inputs of every condition stage.
        if (conditionBranches.containsKey(connection.getFrom())) {
            conditionOutputs.get(connection.getFrom()).add(connection.getTo());
        }
        if (conditionBranches.containsKey(connection.getTo())) {
            conditionInputs.get(connection.getTo()).add(connection.getFrom());
        }
        if (conditionBranches.containsKey(connection.getFrom())) {
            if (conditionBranches.containsKey(connection.getTo())) {
                // conditions are chained
                conditionChildToParent.put(connection.getTo(), connection.getFrom());
            }
            // Outgoing connection from condition
            ConditionBranches branches = conditionBranches.get(connection.getFrom());
            String trueOutput;
            String falseOutput;
            // Overwrite the branch this connection belongs to and carry the other branch over
            // from what has been recorded for this condition so far.
            // NOTE(review): getCondition() is used directly as the 'if' condition; if it returns a
            // boxed Boolean that is null for this connection, this would throw a
            // NullPointerException. Presumably ruled out by the "valid pipeline spec" assumption
            // - confirm.
            if (connection.getCondition()) {
                trueOutput = connection.getTo();
                falseOutput = branches.getFalseOutput();
            } else {
                trueOutput = branches.getTrueOutput();
                falseOutput = connection.getTo();
            }
            conditionBranches.put(connection.getFrom(), new ConditionBranches(trueOutput, falseOutput));
        }
    }
    // Passed into split() below and read afterwards by dagToPipeline().
    // NOTE(review): it is created empty here, so split() presumably populates it - confirm.
    Map<String, String> connectorNodes = new HashMap<>();
    // now split the logical pipeline into pipeline phases, using the connectors as split points
    Set<Dag> splittedDag = split(spec.getConnections(), conditionBranches.keySet(), reduceNodes, isolationNodes, actionNodes, multiPortNodes, connectorNodes);
    Map<String, String> controlConnectors = getConnectorsAssociatedWithConditions(conditionBranches.keySet(), conditionChildToParent, conditionInputs, conditionOutputs, actionNodes);
    // Index the sub-dags by their phase name.
    Map<String, Dag> subdags = new HashMap<>();
    for (Dag subdag : splittedDag) {
        subdags.put(getPhaseName(subdag), subdag);
    }
    // build connections between phases and convert dags to PipelinePhase.
    Set<Connection> phaseConnections = new HashSet<>();
    Map<String, PipelinePhase> phases = new HashMap<>();
    for (Map.Entry<String, Dag> dagEntry1 : subdags.entrySet()) {
        String dag1Name = dagEntry1.getKey();
        Dag dag1 = dagEntry1.getValue();
        // convert the dag to a PipelinePhase
        // add a separate pipeline phase for each control node in the subdag
        Set<String> dag1ControlNodes = Sets.intersection(controlNodes, dag1.getNodes());
        for (String dag1ControlNode : dag1ControlNodes) {
            // The same control node can show up in several sub-dags; only create its phase once.
            if (!phases.containsKey(dag1ControlNode)) {
                phases.put(dag1ControlNode, PipelinePhase.builder(supportedPluginTypes).addStage(specs.get(dag1ControlNode)).build());
            }
        }
        // if there are non-control nodes in the subdag, add a pipeline phase for it
        if (!controlNodes.containsAll(dag1.getNodes())) {
            // the updated dag replaces conditions with the corresponding connector if applicable.
            Dag updatedDag = getUpdatedDag(dag1, controlConnectors);
            // Remove any control nodes from this dag
            if (!Sets.intersection(updatedDag.getNodes(), controlNodes).isEmpty()) {
                Set<String> nodes = Sets.difference(updatedDag.getNodes(), controlNodes);
                updatedDag = updatedDag.createSubDag(nodes);
            }
            phases.put(dag1Name, dagToPipeline(updatedDag, connectorNodes, specs, controlConnectors));
        }
        // from the control phase to this dag (or to another control phase)
        for (String controlSource : Sets.intersection(controlNodes, dag1.getSources())) {
            // 'condition' stays null when the control source has no entry in conditionBranches
            // (i.e. it is not a condition stage). For a condition it is true when this sub-dag
            // contains the condition's 'true' output and false otherwise.
            ConditionBranches branches = conditionBranches.get(controlSource);
            Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
            for (String output : dag1.getNodeOutputs(controlSource)) {
                if (controlNodes.contains(output)) {
                    // control source -> control node, add a phase connection between the control phases
                    phaseConnections.add(new Connection(controlSource, output, condition));
                } else {
                    // control source -> non-control nodes, add a phase connection from the control phase to this dag
                    phaseConnections.add(new Connection(controlSource, dag1Name, condition));
                }
            }
        }
        // from this dag to the control phase
        for (String controlSink : Sets.intersection(controlNodes, dag1.getSinks())) {
            for (String input : dag1.getNodeInputs(controlSink)) {
                if (controlNodes.contains(input)) {
                    // control node -> control-sink, add a phase connection between the control phases
                    // (the branch flag is derived from the input node the same way as for control sources above)
                    ConditionBranches branches = conditionBranches.get(input);
                    Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
                    phaseConnections.add(new Connection(input, controlSink, condition));
                } else {
                    // non-control node -> control-sink, add a phase connection from this dag to the control phase
                    phaseConnections.add(new Connection(dag1Name, controlSink));
                }
            }
        }
        // find connected subdags (they have a source that is a sink in dag1)
        Set<String> nonControlSinks = Sets.difference(dag1.getSinks(), controlNodes);
        for (Map.Entry<String, Dag> dagEntry2 : subdags.entrySet()) {
            String dag2Name = dagEntry2.getKey();
            Dag dag2 = dagEntry2.getValue();
            // a dag is never connected to itself
            if (dag1Name.equals(dag2Name)) {
                continue;
            }
            if (!Sets.intersection(nonControlSinks, dag2.getSources()).isEmpty()) {
                phaseConnections.add(new Connection(dag1Name, dag2Name));
            }
        }
    }
    return new PipelinePlan(phases, phaseConnections);
}
Also used : ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) HashMap(java.util.HashMap) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) HashSet(java.util.HashSet) Connection(io.cdap.cdap.etl.proto.Connection) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) Map(java.util.Map)

Example 27 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by cdapio.

the class ControlDag method trim.

/**
 * Trims any redundant control connections.
 *
 * For example:
 *   n1 ------> n2
 *       |      |
 *       |      v
 *       |----> n3
 * has a redundant edge n1 -> n3, because the edge from n2 -> n3 already enforces n1 -> n3.
 * The approach is to look at each node (call it nodeB). For each input into nodeB (call it nodeA),
 * if there is another path from nodeA to nodeB besides the direct edge, we can remove the edge nodeA -> nodeB.
 *
 * @return number of connections removed.
 */
public int trim() {
    int removedCount = 0;
    for (String target : nodes) {
        // Gather the redundant incoming edges of this node first and delete them afterwards.
        // NOTE(review): presumably done in two steps so the collection returned by
        // getNodeInputs() is not modified while it is being iterated - confirm.
        Set<Connection> redundantEdges = new HashSet<>();
        for (String parent : getNodeInputs(target)) {
            if (numPaths(parent, target) <= 1) {
                // the direct edge is the only route from parent to target, so it must stay
                continue;
            }
            redundantEdges.add(new Connection(parent, target));
        }
        removedCount += redundantEdges.size();
        for (Connection edge : redundantEdges) {
            removeConnection(edge.getFrom(), edge.getTo());
        }
    }
    return removedCount;
}
Also used : Connection(io.cdap.cdap.etl.proto.Connection) HashSet(java.util.HashSet)

Example 28 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by cdapio.

the class Dag method subsetAround.

/**
 * Return a subset of this dag starting from the specified stage, without going past any node in the
 * child stop nodes and parent stop nodes. If the parent or child stop nodes contain the starting stage, it
 * will be ignored.
 * This is equivalent to taking the nodes from {@link #accessibleFrom(String, Set)}, {@link #parentsOf(String, Set)},
 * and building a dag from them.
 *
 * @param stage the stage to start at
 * @param childStopNodes set of nodes to stop traversing forwards on
 * @param parentStopNodes set of nodes to stop traversing backwards on
 * @return a dag created from the stages given and child nodes of those stages and parent nodes of those stages.
 */
public Dag subsetAround(String stage, Set<String> childStopNodes, Set<String> parentStopNodes) {
    // Nodes found going forwards from the stage, combined with the nodes found going backwards from it.
    Set<String> downstream = accessibleFrom(stage, childStopNodes);
    Set<String> upstream = parentsOf(stage, parentStopNodes);
    Set<String> keptNodes = Sets.union(downstream, upstream);

    // Keep only the edges whose two endpoints are both part of the subset.
    Set<Connection> keptEdges = new HashSet<>();
    for (String from : keptNodes) {
        for (String to : outgoingConnections.get(from)) {
            if (!keptNodes.contains(to)) {
                continue;
            }
            keptEdges.add(new Connection(from, to));
        }
    }
    return new Dag(keptEdges);
}
Also used : Connection(io.cdap.cdap.etl.proto.Connection) HashSet(java.util.HashSet)

Example 29 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by cdapio.

the class Dag method subsetFrom.

/**
 * Return a subset of this dag starting from the specified stage, without going past any node in stopNodes.
 * This is equivalent to taking the nodes from {@link #accessibleFrom(Set, Set)} and building a dag from them.
 *
 * @param stages the stages to start at
 * @param stopNodes set of nodes to stop traversal on
 * @return a dag created from the nodes accessible from the specified stages
 */
public Dag subsetFrom(Set<String> stages, Set<String> stopNodes) {
    Set<String> reachable = accessibleFrom(stages, stopNodes);

    // An edge is part of the subset only when both of its endpoints are reachable.
    Set<Connection> retained = new HashSet<>();
    for (String source : reachable) {
        for (String target : outgoingConnections.get(source)) {
            if (!reachable.contains(target)) {
                continue;
            }
            retained.add(new Connection(source, target));
        }
    }
    return new Dag(retained);
}
Also used : Connection(io.cdap.cdap.etl.proto.Connection) HashSet(java.util.HashSet)

Example 30 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by cdapio.

the class PipelinePlan method getConditionPhaseBranches.

/**
 * @return Conditions along with their phase connections
 */
public Map<String, ConditionBranches> getConditionPhaseBranches() {
    Map<String, ConditionBranches> branchesByCondition = new HashMap<>();
    for (Connection edge : phaseConnections) {
        Boolean branchFlag = edge.getCondition();
        if (branchFlag == null) {
            // connections without a true/false flag are skipped
            continue;
        }
        boolean onTrueBranch = branchFlag;
        String conditionName = edge.getFrom();

        // Start from what is already known for this condition, or from an empty pair.
        ConditionBranches known = branchesByCondition.get(conditionName);
        if (known == null) {
            known = new ConditionBranches(null, null);
        }

        // Replace the side this connection belongs to and carry the other side over unchanged.
        String trueTarget = onTrueBranch ? edge.getTo() : known.getTrueOutput();
        String falseTarget = onTrueBranch ? known.getFalseOutput() : edge.getTo();
        branchesByCondition.put(conditionName, new ConditionBranches(trueTarget, falseTarget));
    }
    return branchesByCondition;
}
Also used : HashMap(java.util.HashMap) Connection(io.cdap.cdap.etl.proto.Connection)

Aggregations

Connection (io.cdap.cdap.etl.proto.Connection)96 Test (org.junit.Test)78 HashSet (java.util.HashSet)70 HashMap (java.util.HashMap)44 ArrayList (java.util.ArrayList)32 Operation (io.cdap.cdap.api.lineage.field.Operation)28 FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation)28 List (java.util.List)28 ImmutableList (com.google.common.collect.ImmutableList)26 ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation)26 TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation)26 WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation)26 FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation)26 FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation)26 FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation)24 EndPoint (io.cdap.cdap.api.lineage.field.EndPoint)20 StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec)18 PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase)16 PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec)14 FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo)8