Search in sources :

Example 31 with Connection

use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

the class PipelinePlannerTest method testSimpleCondition.

@Test
public void testSimpleCondition() throws Exception {
    /*
      Pipeline under test:

      n1 - n2 - condition - n3
                      |
                      |---- n4
     */
    StageSpec n1 = StageSpec.builder("n1", NODE).build();
    StageSpec n2 = StageSpec.builder("n2", NODE).build();
    StageSpec condition = StageSpec.builder("condition", CONDITION).build();
    StageSpec n3 = StageSpec.builder("n3", NODE).build();
    StageSpec n4 = StageSpec.builder("n4", NODE).build();
    Set<StageSpec> stageSpecs = ImmutableSet.of(n1, n2, condition, n3, n4);
    Set<Connection> connections = ImmutableSet.of(
        new Connection("n1", "n2"),
        new Connection("n2", "condition"),
        new Connection("condition", "n3", true),
        new Connection("condition", "n4", false));
    PipelineSpec pipelineSpec = PipelineSpec.builder()
        .addStages(stageSpecs)
        .addConnections(connections)
        .build();

    Set<String> noTypes = ImmutableSet.of();
    Set<String> reduceTypes = ImmutableSet.of(REDUCE.getType());
    Set<String> pluginTypes = ImmutableSet.of(
        NODE.getType(), REDUCE.getType(), Constants.Connector.PLUGIN_TYPE, CONDITION.getType());
    PipelinePlanner planner = new PipelinePlanner(pluginTypes, reduceTypes, noTypes, noTypes, noTypes);

    // Expected phases, keyed by phase name.
    Map<String, PipelinePhase> phases = new HashMap<>();

    /*
      phase 1: n1--n2--condition.connector
     */
    String phase1Name = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("n1", "n2"), new Connection("n2", "condition"))));
    StageSpec connectorSink = StageSpec.builder(
        "condition.connector", connectorSpec("condition.connector", Constants.Connector.SINK_TYPE)).build();
    phases.put(phase1Name, PipelinePhase.builder(pluginTypes)
        .addStage(n1)
        .addStage(n2)
        .addStage(connectorSink)
        .addConnection("n1", "n2")
        .addConnection("n2", "condition.connector")
        .build());

    /*
      phase 2: condition
     */
    String phase2Name = "condition";
    phases.put(phase2Name, PipelinePhase.builder(pluginTypes).addStage(condition).build());

    // Both branches read from the same connector, this time acting as a source.
    StageSpec connectorSource = StageSpec.builder(
        "condition.connector", connectorSpec("condition.connector", Constants.Connector.SOURCE_TYPE)).build();

    /*
      phase 3: condition.connector -- n3
     */
    String phase3Name = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("condition", "n3"))));
    phases.put(phase3Name, PipelinePhase.builder(pluginTypes)
        .addStage(connectorSource)
        .addStage(n3)
        .addConnection("condition.connector", "n3")
        .build());

    /*
      phase 4: condition.connector -- n4
     */
    String phase4Name = PipelinePlanner.getPhaseName(
        new Dag(ImmutableSet.of(new Connection("condition", "n4"))));
    phases.put(phase4Name, PipelinePhase.builder(pluginTypes)
        .addStage(connectorSource)
        .addStage(n4)
        .addConnection("condition.connector", "n4")
        .build());

    // Expected connections between phases: the condition phase fans out to a true and a false branch.
    Set<Connection> phaseConnections = new HashSet<>();
    phaseConnections.add(new Connection(phase1Name, phase2Name));
    phaseConnections.add(new Connection(phase2Name, phase3Name, true));
    phaseConnections.add(new Connection(phase2Name, phase4Name, false));

    PipelinePlan expected = new PipelinePlan(phases, phaseConnections);
    Assert.assertEquals(expected, planner.plan(pipelineSpec));
}
Also used : HashMap(java.util.HashMap) Connection(co.cask.cdap.etl.proto.Connection) PipelineSpec(co.cask.cdap.etl.spec.PipelineSpec) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 32 with Connection

use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

the class PipelinePlan method getConditionPhaseBranches.

/**
 * Groups the conditional phase connections by the phase they originate from.
 * Connections whose condition is {@code null} are ignored.
 *
 * @return map from the originating phase name to its true/false output phases; a branch that has
 *         no connection is left as {@code null}
 */
public Map<String, ConditionBranches> getConditionPhaseBranches() {
    Map<String, ConditionBranches> branchesByCondition = new HashMap<>();
    for (Connection phaseConnection : phaseConnections) {
        Boolean condition = phaseConnection.getCondition();
        if (condition != null) {
            String from = phaseConnection.getFrom();
            String to = phaseConnection.getTo();
            // start from whatever has been recorded for this condition so far
            ConditionBranches known = branchesByCondition.get(from);
            if (known == null) {
                known = new ConditionBranches(null, null);
            }
            // overwrite only the branch this connection describes and keep the other one
            ConditionBranches updated = condition
                ? new ConditionBranches(to, known.getFalseOutput())
                : new ConditionBranches(known.getTrueOutput(), to);
            branchesByCondition.put(from, updated);
        }
    }
    return branchesByCondition;
}
Also used : HashMap(java.util.HashMap) Connection(co.cask.cdap.etl.proto.Connection)

Example 33 with Connection

use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

the class PipelineSpecGenerator method validateConfig.

/**
 * Validate that this is a valid pipeline. A valid pipeline has the following properties:
 *
 * All stages in the pipeline have a unique name.
 * Every connection refers to existing stages.
 * Every connection leaving a condition stage is labelled 'true' or 'false', and each label is used
 * at most once per condition stage.
 * Source stages have at least one output and no inputs (other than from action or condition stages).
 * Sink stages have at least one input and no outputs (other than to action or condition stages).
 * There are no cycles in the pipeline.
 * All inputs into a stage have the same schema.
 * ErrorTransforms only have BatchSource, Transform, or BatchAggregator as input stages.
 * AlertPublishers have at least one input and no outputs and don't have SparkSink or BatchSink as input.
 *
 * Returns the stages in the order they should be configured to ensure that all input stages are configured
 * before their output.
 *
 * @param config the user provided configuration
 * @return the order to configure the stages in
 * @throws IllegalArgumentException if the pipeline is invalid
 */
private ValidatedPipeline validateConfig(ETLConfig config) {
    config.validate();
    if (config.getStages().isEmpty()) {
        throw new IllegalArgumentException("A pipeline must contain at least one stage.");
    }
    Set<String> actionStages = new HashSet<>();
    Set<String> conditionStages = new HashSet<>();
    Map<String, String> stageTypes = new HashMap<>();
    // check stage name uniqueness
    Set<String> stageNames = new HashSet<>();
    for (ETLStage stage : config.getStages()) {
        if (!stageNames.add(stage.getName())) {
            throw new IllegalArgumentException(String.format("Invalid pipeline. Multiple stages are named %s. Please ensure all stage names are unique", stage.getName()));
        }
        // if stage is Action stage, add it to the Action stage set
        if (isAction(stage.getPlugin().getType())) {
            actionStages.add(stage.getName());
        }
        // if the stage is condition add it to the Condition stage set
        if (stage.getPlugin().getType().equals(Condition.PLUGIN_TYPE)) {
            conditionStages.add(stage.getName());
        }
        stageTypes.put(stage.getName(), stage.getPlugin().getType());
    }
    // check that the from and to are names of actual stages
    // also check that conditions have at most 2 outgoing connections, each labelled with true or
    // false, and that neither label is used more than once
    // For each condition stage, every branch label seen so far. Remembering only the most recent
    // label would let a sequence such as true, false, true go undetected.
    Map<String, Set<Boolean>> conditionBranches = new HashMap<>();
    for (Connection connection : config.getConnections()) {
        if (!stageNames.contains(connection.getFrom())) {
            throw new IllegalArgumentException(String.format("Invalid connection %s. %s is not a stage.", connection, connection.getFrom()));
        }
        if (!stageNames.contains(connection.getTo())) {
            throw new IllegalArgumentException(String.format("Invalid connection %s. %s is not a stage.", connection, connection.getTo()));
        }
        if (conditionStages.contains(connection.getFrom())) {
            if (connection.getCondition() == null) {
                String msg = String.format("For condition stage %s, the connection %s is not marked with either " + "'true' or 'false'.", connection.getFrom(), connection);
                throw new IllegalArgumentException(msg);
            }
            Set<Boolean> seenBranches = conditionBranches.get(connection.getFrom());
            if (seenBranches == null) {
                seenBranches = new HashSet<>();
                conditionBranches.put(connection.getFrom(), seenBranches);
            }
            // add() returns false when this label was already used by an earlier connection
            if (!seenBranches.add(connection.getCondition())) {
                String msg = String.format("For condition stage '%s', more than one outgoing connections are marked as %s.", connection.getFrom(), connection.getCondition());
                throw new IllegalArgumentException(msg);
            }
        }
    }
    List<ETLStage> traversalOrder = new ArrayList<>(stageNames.size());
    // can only have empty connections if the pipeline consists of a single action.
    if (config.getConnections().isEmpty()) {
        if (actionStages.size() == 1 && stageNames.size() == 1) {
            traversalOrder.add(config.getStages().iterator().next());
            return new ValidatedPipeline(traversalOrder, config);
        } else {
            throw new IllegalArgumentException("Invalid pipeline. There are no connections between stages. " + "This is only allowed if the pipeline consists of a single action plugin.");
        }
    }
    Dag dag = new Dag(config.getConnections());
    // action and condition stages are allowed to sit before sources and after sinks
    Set<String> controlStages = Sets.union(actionStages, conditionStages);
    Map<String, ETLStage> stages = new HashMap<>();
    for (ETLStage stage : config.getStages()) {
        String stageName = stage.getName();
        Set<String> stageInputs = dag.getNodeInputs(stageName);
        Set<String> stageOutputs = dag.getNodeOutputs(stageName);
        String stageType = stage.getPlugin().getType();
        boolean isSource = isSource(stageType);
        boolean isSink = isSink(stageType);
        // check source plugins are sources in the dag
        if (isSource) {
            if (!stageInputs.isEmpty() && !controlStages.containsAll(stageInputs)) {
                // argument order follows the placeholders: type, name, offending inputs, type
                throw new IllegalArgumentException(String.format("%s %s has incoming connections from %s. %s stages cannot have any incoming connections.", stageType, stageName, Joiner.on(',').join(stageInputs), stageType));
            }
        } else if (isSink) {
            // check sink plugins are sinks in the dag
            if (!stageOutputs.isEmpty() && !controlStages.containsAll(stageOutputs)) {
                // argument order follows the placeholders: type, name, offending outputs, type
                throw new IllegalArgumentException(String.format("%s %s has outgoing connections to %s. %s stages cannot have any outgoing connections.", stageType, stageName, Joiner.on(',').join(stageOutputs), stageType));
            }
        } else if (ErrorTransform.PLUGIN_TYPE.equals(stageType)) {
            // error transforms may only consume from stage types that are able to emit errors
            for (String inputStage : stageInputs) {
                String inputType = stageTypes.get(inputStage);
                if (!VALID_ERROR_INPUTS.contains(inputType)) {
                    throw new IllegalArgumentException(String.format("ErrorTransform %s cannot have stage %s of type %s as input. Only %s stages can emit errors.", stageName, inputStage, inputType, Joiner.on(',').join(VALID_ERROR_INPUTS)));
                }
            }
        }
        boolean isAction = isAction(stageType);
        // anything that is not an action, condition or source needs at least one input
        if (!isAction && !stageType.equals(Condition.PLUGIN_TYPE) && !isSource && stageInputs.isEmpty()) {
            throw new IllegalArgumentException(String.format("Stage %s is unreachable, it has no incoming connections.", stageName));
        }
        // anything that is not an action or sink needs at least one output
        if (!isAction && !isSink && stageOutputs.isEmpty()) {
            throw new IllegalArgumentException(String.format("Stage %s is a dead end, it has no outgoing connections.", stageName));
        }
        stages.put(stageName, stage);
    }
    validateConditionBranches(conditionStages, dag);
    for (String stageName : dag.getTopologicalOrder()) {
        traversalOrder.add(stages.get(stageName));
    }
    return new ValidatedPipeline(traversalOrder, config);
}
Also used : HashMap(java.util.HashMap) Connection(co.cask.cdap.etl.proto.Connection) ArrayList(java.util.ArrayList) Dag(co.cask.cdap.etl.planner.Dag) ETLStage(co.cask.cdap.etl.proto.v2.ETLStage) HashSet(java.util.HashSet)

Example 34 with Connection

use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

the class ETLConfig method getValidConnections.

/**
 * Returns the connections to use for this config. When none are given, a linear pipeline
 * (source -> transforms -> sinks) is appended to the list and that list is returned.
 *
 * @param connections the configured connections, may be null or empty
 * @return an empty list for the old format (source without a plugin), otherwise the given
 *         connections or the generated linear ones
 */
private List<Connection> getValidConnections(List<Connection> connections) {
    // TODO : this can be removed once UI changes are made and we don't have to support the old format
    if (source.getPlugin() == null) {
        // old format: there is nothing to connect
        return new ArrayList<>();
    }
    List<Connection> result = connections == null ? new ArrayList<Connection>() : connections;
    if (!result.isEmpty()) {
        // explicit connections were given, use them untouched
        return result;
    }
    // no connections given: chain source -> transforms -> sinks.
    // 'upstream' is always the name of the stage the next connection starts from.
    String upstream = source.getName();
    if (transforms != null) {
        for (int idx = 0; idx < transforms.size(); idx++) {
            String current = transforms.get(idx).getName();
            result.add(new Connection(upstream, current));
            upstream = current;
        }
    }
    // every sink hangs off the last transform, or off the source when there are no transforms
    for (ETLStage sink : sinks) {
        result.add(new Connection(upstream, sink.getName()));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Connection(co.cask.cdap.etl.proto.Connection)

Example 35 with Connection

use of co.cask.cdap.etl.proto.Connection in project cdap by caskdata.

the class ControlDag method trim.

/**
 * Removes control connections that are implied by a longer path.
 *
 * For example:
 *   n1 ------> n2
 *       |      |
 *       |      v
 *       |----> n3
 * Here the edge n1 -> n3 is redundant, since n1 -> n2 -> n3 already forces n1 to come before n3.
 * For every node, each of its direct inputs is examined: when more than one path leads from that
 * input to the node, the direct edge between the two is dropped.
 *
 * @return number of connections removed.
 */
public int trim() {
    int removedCount = 0;
    for (String target : nodes) {
        // Gather the redundant edges first and remove them afterwards
        // (presumably so the input set is not changed while it is being iterated).
        Set<Connection> redundantEdges = new HashSet<>();
        for (String input : getNodeInputs(target)) {
            if (numPaths(input, target) <= 1) {
                // the direct edge is the only path, it has to stay
                continue;
            }
            redundantEdges.add(new Connection(input, target));
        }
        for (Connection edge : redundantEdges) {
            removeConnection(edge.getFrom(), edge.getTo());
            removedCount++;
        }
    }
    return removedCount;
}
Also used : Connection(co.cask.cdap.etl.proto.Connection) HashSet(java.util.HashSet)

Aggregations

Connection (co.cask.cdap.etl.proto.Connection)36 Test (org.junit.Test)26 HashSet (java.util.HashSet)23 HashMap (java.util.HashMap)10 StageSpec (co.cask.cdap.etl.spec.StageSpec)8 PipelinePhase (co.cask.cdap.etl.common.PipelinePhase)6 PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec)5 Resources (co.cask.cdap.api.Resources)4 UpgradeContext (co.cask.cdap.etl.proto.UpgradeContext)4 ArrayList (java.util.ArrayList)4 ArtifactSelectorConfig (co.cask.cdap.etl.proto.ArtifactSelectorConfig)3 Dag (co.cask.cdap.etl.planner.Dag)2 Plugin (co.cask.cdap.etl.proto.v1.Plugin)2 ETLPlugin (co.cask.cdap.etl.proto.v2.ETLPlugin)2 ImmutableSet (com.google.common.collect.ImmutableSet)2 Map (java.util.Map)2 Set (java.util.Set)2 Schema (co.cask.cdap.api.data.schema.Schema)1 PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet)1 ConditionBranches (co.cask.cdap.etl.planner.ConditionBranches)1