Example 1 with Dag

Use of co.cask.cdap.etl.planner.Dag in project cdap by caskdata.

From the class SmartWorkflow, the method configure:

@Override
protected void configure() {
    setName(NAME);
    setDescription(DESCRIPTION);
    // set the pipeline spec as a property in case somebody like the UI wants to read it
    Map<String, String> properties = new HashMap<>();
    properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
    setProperties(properties);
    stageSpecs = new HashMap<>();
    useSpark = engine == Engine.SPARK;
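    // a single SparkCompute or SparkSink stage anywhere in the pipeline forces the whole pipeline to run with Spark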
    for (StageSpec stageSpec : spec.getStages()) {
        stageSpecs.put(stageSpec.getName(), stageSpec);
        String pluginType = stageSpec.getPlugin().getType();
        if (SparkCompute.PLUGIN_TYPE.equals(pluginType) || SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            useSpark = true;
        }
    }
    PipelinePlanner planner;
    Set<String> actionTypes = ImmutableSet.of(Action.PLUGIN_TYPE, Constants.SPARK_PROGRAM_PLUGIN_TYPE);
    Set<String> multiPortTypes = ImmutableSet.of(SplitterTransform.PLUGIN_TYPE);
    if (useSpark) {
        // if the pipeline uses spark, we don't need to break the pipeline up into phases;
        // we can just use a single phase.
        planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.<String>of(), ImmutableSet.<String>of(), actionTypes, multiPortTypes);
    } else {
        planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.of(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE), ImmutableSet.of(SparkCompute.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE), actionTypes, multiPortTypes);
    }
    plan = planner.plan(spec);
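    // each phase in the plan becomes one program in the workflow; the phase connections drive the control flow below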
    WorkflowProgramAdder programAdder = new TrunkProgramAdder(getConfigurer());
    // single phase, just add the program directly
    if (plan.getPhases().size() == 1) {
        addProgram(plan.getPhases().keySet().iterator().next(), programAdder);
        return;
    }
    // Dag classes don't allow a 'dag' without connections
    if (plan.getPhaseConnections().isEmpty()) {
        WorkflowProgramAdder fork = programAdder.fork();
        for (String phaseName : plan.getPhases().keySet()) {
            addProgram(phaseName, fork);
        }
        fork.join();
        return;
    }
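    // build a control-flow dag over the phases; ControlDag adds operations like flatten() on top of the plain Dag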
    dag = new ControlDag(plan.getPhaseConnections());
    boolean dummyNodeAdded = false;
    Map<String, ConditionBranches> conditionBranches = plan.getConditionPhaseBranches();
    if (conditionBranches.isEmpty()) {
        // after flattening, there is guaranteed to be just one source
        dag.flatten();
    } else if (!conditionBranches.keySet().containsAll(dag.getSources())) {
        // Continue only if the condition node is not the source of the dag; otherwise the dag is already in the
        // required form
        Set<String> conditions = conditionBranches.keySet();
        // flatten only the part of the dag starting from sources and ending in conditions/sinks.
        Set<String> dagNodes = dag.accessibleFrom(dag.getSources(), Sets.union(dag.getSinks(), conditions));
        Set<String> dagNodesWithoutCondition = Sets.difference(dagNodes, conditions);
        Set<Connection> connections = new HashSet<>();
        Deque<String> bfs = new LinkedList<>();
        Set<String> sinks = new HashSet<>();
        // If it's a single phase without conditions, there is no need to flatten
        if (dagNodesWithoutCondition.size() > 1) {
            Dag subDag;
            try {
                subDag = dag.createSubDag(dagNodesWithoutCondition);
            } catch (IllegalArgumentException | DisjointConnectionsException e) {
                // DisjointConnectionsException is thrown when islands are created from the dagNodesWithoutCondition
                // IllegalArgumentException is thrown when connections are empty
                // In both cases we need to add a dummy node and create a connected Dag
                String dummyNode = "dummy";
                dummyNodeAdded = true;
                Set<Connection> subDagConnections = new HashSet<>();
                for (String source : dag.getSources()) {
                    subDagConnections.add(new Connection(dummyNode, source));
                }
                Deque<String> subDagBFS = new LinkedList<>();
                subDagBFS.addAll(dag.getSources());
                while (subDagBFS.peek() != null) {
                    String node = subDagBFS.poll();
                    for (String output : dag.getNodeOutputs(node)) {
                        if (dagNodesWithoutCondition.contains(output)) {
                            subDagConnections.add(new Connection(node, output));
                            subDagBFS.add(output);
                        }
                    }
                }
                subDag = new Dag(subDagConnections);
            }
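            // flatten the condition-free sub-dag; as noted above, flattening guarantees a single source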
            ControlDag cdag = new ControlDag(subDag);
            cdag.flatten();
            // Add all connections from cdag
            bfs.addAll(cdag.getSources());
            while (bfs.peek() != null) {
                String node = bfs.poll();
                for (String output : cdag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            }
            sinks.addAll(cdag.getSinks());
        } else {
            sinks.addAll(dagNodesWithoutCondition);
        }
        // Add back the existing condition nodes and corresponding conditions
        Set<String> conditionsFromDag = Sets.intersection(dagNodes, conditions);
        for (String condition : conditionsFromDag) {
            connections.add(new Connection(sinks.iterator().next(), condition));
        }
        bfs.addAll(conditionsFromDag);
        while (bfs.peek() != null) {
            String node = bfs.poll();
            ConditionBranches branches = conditionBranches.get(node);
            if (branches == null) {
                // not a condition node. add outputs
                for (String output : dag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            } else {
                // condition node
                for (Boolean condition : Arrays.asList(true, false)) {
                    String phase = condition ? branches.getTrueOutput() : branches.getFalseOutput();
                    if (phase == null) {
                        continue;
                    }
                    connections.add(new Connection(node, phase, condition));
                    bfs.add(phase);
                }
            }
        }
        dag = new ControlDag(connections);
    }
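    // the dummy root is not a real phase, so it is never added as a program; fork directly on its children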
    if (dummyNodeAdded) {
        WorkflowProgramAdder fork = programAdder.fork();
        String dummyNode = dag.getSources().iterator().next();
        for (String output : dag.getNodeOutputs(dummyNode)) {
            // need to make sure we don't call also() if this is the final branch
            if (!addBranchPrograms(output, fork)) {
                fork = fork.also();
            }
        }
    } else {
        String start = dag.getSources().iterator().next();
        addPrograms(start, programAdder);
    }
}
Also used: ControlDag (co.cask.cdap.etl.planner.ControlDag), PipelinePlanner (co.cask.cdap.etl.planner.PipelinePlanner), ImmutableSet (com.google.common.collect.ImmutableSet), Set (java.util.Set), HashSet (java.util.HashSet), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), HashMap (java.util.HashMap), LinkedHashMap (java.util.LinkedHashMap), Connection (co.cask.cdap.etl.proto.Connection), Dag (co.cask.cdap.etl.planner.Dag), Deque (java.util.Deque), LinkedList (java.util.LinkedList), ConditionBranches (co.cask.cdap.etl.planner.ConditionBranches), StageSpec (co.cask.cdap.etl.spec.StageSpec)
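
The dummy-node trick above deserves a closer look: the Dag classes reject an empty connection set, and createSubDag throws DisjointConnectionsException when the chosen nodes form islands, so a synthetic root is wired to every source to make the graph connected. A minimal sketch of the same trick, using only the Connection and Dag constructors shown above (the phase names are hypothetical):

// "phase-1" and "phase-2" are two phases with no connection between them (two islands)
Set<Connection> connections = new HashSet<>();
connections.add(new Connection("dummy", "phase-1"));
connections.add(new Connection("dummy", "phase-2"));
// a shared dummy root makes the graph connected, so the Dag constructor accepts it
Dag connected = new Dag(connections);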

Example 2 with Dag

Use of co.cask.cdap.etl.planner.Dag in project cdap by caskdata.

From the class PipelineSpecGenerator, the method validateConfig:

/**
 * Validate that this is a valid pipeline. A valid pipeline has the following properties:
 *
 * - All stages in the pipeline have a unique name.
 * - Source stages have at least one output and no inputs.
 * - Sink stages have at least one input and no outputs.
 * - There are no cycles in the pipeline.
 * - All inputs into a stage have the same schema.
 * - ErrorTransforms only have BatchSource, Transform, or BatchAggregator as input stages.
 * - AlertPublishers have at least one input, no outputs, and no SparkSink or BatchSink as an input.
 *
 * Returns the stages in the order they should be configured, ensuring that every stage is configured
 * after all of its input stages.
 *
 * @param config the user provided configuration
 * @return the order to configure the stages in
 * @throws IllegalArgumentException if the pipeline is invalid
 */
private ValidatedPipeline validateConfig(ETLConfig config) {
    config.validate();
    if (config.getStages().isEmpty()) {
        throw new IllegalArgumentException("A pipeline must contain at least one stage.");
    }
    Set<String> actionStages = new HashSet<>();
    Set<String> conditionStages = new HashSet<>();
    Map<String, String> stageTypes = new HashMap<>();
    // check stage name uniqueness
    Set<String> stageNames = new HashSet<>();
    for (ETLStage stage : config.getStages()) {
        if (!stageNames.add(stage.getName())) {
            throw new IllegalArgumentException(String.format("Invalid pipeline. Multiple stages are named %s. Please ensure all stage names are unique", stage.getName()));
        }
        // if stage is Action stage, add it to the Action stage set
        if (isAction(stage.getPlugin().getType())) {
            actionStages.add(stage.getName());
        }
        // if the stage is condition add it to the Condition stage set
        if (stage.getPlugin().getType().equals(Condition.PLUGIN_TYPE)) {
            conditionStages.add(stage.getName());
        }
        stageTypes.put(stage.getName(), stage.getPlugin().getType());
    }
    // check that the from and to of each connection are names of actual stages.
    // also check that each condition has at most two outgoing connections, one labeled 'true' and
    // one labeled 'false', with neither label used more than once
    Map<String, Boolean> conditionBranch = new HashMap<>();
    for (Connection connection : config.getConnections()) {
        if (!stageNames.contains(connection.getFrom())) {
            throw new IllegalArgumentException(String.format("Invalid connection %s. %s is not a stage.", connection, connection.getFrom()));
        }
        if (!stageNames.contains(connection.getTo())) {
            throw new IllegalArgumentException(String.format("Invalid connection %s. %s is not a stage.", connection, connection.getTo()));
        }
        if (conditionStages.contains(connection.getFrom())) {
            if (connection.getCondition() == null) {
                String msg = String.format("For condition stage %s, the connection %s is not marked with either " + "'true' or 'false'.", connection.getFrom(), connection);
                throw new IllegalArgumentException(msg);
            }
            // check if connection from the condition node is marked as true or false multiple times
            if (conditionBranch.containsKey(connection.getFrom()) && connection.getCondition().equals(conditionBranch.get(connection.getFrom()))) {
                String msg = String.format("For condition stage '%s', more than one outgoing connections are marked as %s.", connection.getFrom(), connection.getCondition());
                throw new IllegalArgumentException(msg);
            }
            conditionBranch.put(connection.getFrom(), connection.getCondition());
        }
    }
    List<ETLStage> traversalOrder = new ArrayList<>(stageNames.size());
    // can only have empty connections if the pipeline consists of a single action.
    if (config.getConnections().isEmpty()) {
        if (actionStages.size() == 1 && stageNames.size() == 1) {
            traversalOrder.add(config.getStages().iterator().next());
            return new ValidatedPipeline(traversalOrder, config);
        } else {
            throw new IllegalArgumentException("Invalid pipeline. There are no connections between stages. " + "This is only allowed if the pipeline consists of a single action plugin.");
        }
    }
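    // build the dag once; the per-stage input/output lookups and the final topological ordering below all come from it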
    Dag dag = new Dag(config.getConnections());
    Set<String> controlStages = Sets.union(actionStages, conditionStages);
    Map<String, ETLStage> stages = new HashMap<>();
    for (ETLStage stage : config.getStages()) {
        String stageName = stage.getName();
        Set<String> stageInputs = dag.getNodeInputs(stageName);
        Set<String> stageOutputs = dag.getNodeOutputs(stageName);
        String stageType = stage.getPlugin().getType();
        boolean isSource = isSource(stageType);
        boolean isSink = isSink(stageType);
        // check source plugins are sources in the dag
        if (isSource) {
            if (!stageInputs.isEmpty() && !controlStages.containsAll(stageInputs)) {
                throw new IllegalArgumentException(String.format("%s %s has incoming connections from %s. %s stages cannot have any incoming connections.", stageType, stageName, stageType, Joiner.on(',').join(stageInputs)));
            }
        } else if (isSink) {
            if (!stageOutputs.isEmpty() && !controlStages.containsAll(stageOutputs)) {
                throw new IllegalArgumentException(String.format("%s %s has outgoing connections to %s. %s stages cannot have any outgoing connections.", stageType, stageName, stageType, Joiner.on(',').join(stageOutputs)));
            }
        } else if (ErrorTransform.PLUGIN_TYPE.equals(stageType)) {
            for (String inputStage : stageInputs) {
                String inputType = stageTypes.get(inputStage);
                if (!VALID_ERROR_INPUTS.contains(inputType)) {
                    throw new IllegalArgumentException(String.format("ErrorTransform %s cannot have stage %s of type %s as input. Only %s stages can emit errors.", stageName, inputStage, inputType, Joiner.on(',').join(VALID_ERROR_INPUTS)));
                }
            }
        }
        boolean isAction = isAction(stageType);
        if (!isAction && !stageType.equals(Condition.PLUGIN_TYPE) && !isSource && stageInputs.isEmpty()) {
            throw new IllegalArgumentException(String.format("Stage %s is unreachable, it has no incoming connections.", stageName));
        }
        if (!isAction && !isSink && stageOutputs.isEmpty()) {
            throw new IllegalArgumentException(String.format("Stage %s is a dead end, it has no outgoing connections.", stageName));
        }
        stages.put(stageName, stage);
    }
    validateConditionBranches(conditionStages, dag);
    for (String stageName : dag.getTopologicalOrder()) {
        traversalOrder.add(stages.get(stageName));
    }
    return new ValidatedPipeline(traversalOrder, config);
}
Also used: HashMap (java.util.HashMap), Connection (co.cask.cdap.etl.proto.Connection), ArrayList (java.util.ArrayList), Dag (co.cask.cdap.etl.planner.Dag), ETLStage (co.cask.cdap.etl.proto.v2.ETLStage), HashSet (java.util.HashSet)
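
For a sense of what the dag provides here, a minimal sketch with a hypothetical linear source -> transform -> sink pipeline (the stage names are invented for illustration), using only the Dag methods called above:

Set<Connection> connections = new HashSet<>();
connections.add(new Connection("source", "transform"));
connections.add(new Connection("transform", "sink"));
Dag dag = new Dag(connections);
// per-stage lookups used by the validation loop
dag.getNodeInputs("transform");   // {"source"}
dag.getNodeOutputs("transform");  // {"sink"}
// every stage appears after all of its inputs: source, transform, sink
for (String stageName : dag.getTopologicalOrder()) {
    System.out.println(stageName);
}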

Aggregations

Dag (co.cask.cdap.etl.planner.Dag): 2
Connection (co.cask.cdap.etl.proto.Connection): 2
HashMap (java.util.HashMap): 2
HashSet (java.util.HashSet): 2
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 1
ConditionBranches (co.cask.cdap.etl.planner.ConditionBranches): 1
ControlDag (co.cask.cdap.etl.planner.ControlDag): 1
PipelinePlanner (co.cask.cdap.etl.planner.PipelinePlanner): 1
ETLStage (co.cask.cdap.etl.proto.v2.ETLStage): 1
StageSpec (co.cask.cdap.etl.spec.StageSpec): 1
ImmutableSet (com.google.common.collect.ImmutableSet): 1
ArrayList (java.util.ArrayList): 1
Deque (java.util.Deque): 1
LinkedHashMap (java.util.LinkedHashMap): 1
LinkedList (java.util.LinkedList): 1
Set (java.util.Set): 1