Example 1 with ConditionBranches

use of co.cask.cdap.etl.planner.ConditionBranches in project cdap by caskdata.

the class SmartWorkflow method configure, which plans the pipeline into phases and wires the phases into the workflow as programs, forks, and conditions.

@Override
protected void configure() {
    setName(NAME);
    setDescription(DESCRIPTION);
    // set the pipeline spec as a property in case somebody like the UI wants to read it
    Map<String, String> properties = new HashMap<>();
    properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
    setProperties(properties);
    stageSpecs = new HashMap<>();
    useSpark = engine == Engine.SPARK;
    for (StageSpec stageSpec : spec.getStages()) {
        stageSpecs.put(stageSpec.getName(), stageSpec);
        String pluginType = stageSpec.getPlugin().getType();
        if (SparkCompute.PLUGIN_TYPE.equals(pluginType) || SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            useSpark = true;
        }
    }
    PipelinePlanner planner;
    Set<String> actionTypes = ImmutableSet.of(Action.PLUGIN_TYPE, Constants.SPARK_PROGRAM_PLUGIN_TYPE);
    Set<String> multiPortTypes = ImmutableSet.of(SplitterTransform.PLUGIN_TYPE);
    if (useSpark) {
        // if the pipeline uses spark, we don't need to break the pipeline up into phases, we can just have
        // a single phase.
        planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.<String>of(), ImmutableSet.<String>of(), actionTypes, multiPortTypes);
    } else {
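        // with MapReduce as the engine, aggregators and joiners mark reduce boundaries and spark
        // plugins must run in their own phases, so the planner splits the pipeline into multiple phases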
        planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.of(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE), ImmutableSet.of(SparkCompute.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE), actionTypes, multiPortTypes);
    }
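    // break the pipeline up into phases; each phase will run as one node of the workflow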
    plan = planner.plan(spec);
    WorkflowProgramAdder programAdder = new TrunkProgramAdder(getConfigurer());
    // single phase, just add the program directly
    if (plan.getPhases().size() == 1) {
        addProgram(plan.getPhases().keySet().iterator().next(), programAdder);
        return;
    }
    // the Dag classes don't allow a dag without connections, so handle that case here:
    // phases with no connections between them are independent and can all run in parallel in a fork
    if (plan.getPhaseConnections().isEmpty()) {
        WorkflowProgramAdder fork = programAdder.fork();
        for (String phaseName : plan.getPhases().keySet()) {
            addProgram(phaseName, fork);
        }
        fork.join();
        return;
    }
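    // multiple phases with dependencies between them: build a control dag over the phase
    // connections and, if needed, reshape it so that it has a single source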
    dag = new ControlDag(plan.getPhaseConnections());
    boolean dummyNodeAdded = false;
    Map<String, ConditionBranches> conditionBranches = plan.getConditionPhaseBranches();
    if (conditionBranches.isEmpty()) {
        // after flattening, there is guaranteed to be just one source
        dag.flatten();
    } else if (!conditionBranches.keySet().containsAll(dag.getSources())) {
        // Continue only if the condition node is not the source of the dag; otherwise the dag is
        // already in the required form
        Set<String> conditions = conditionBranches.keySet();
        // flatten only the part of the dag starting from sources and ending in conditions/sinks.
        Set<String> dagNodes = dag.accessibleFrom(dag.getSources(), Sets.union(dag.getSinks(), conditions));
        Set<String> dagNodesWithoutCondition = Sets.difference(dagNodes, conditions);
        Set<Connection> connections = new HashSet<>();
        Deque<String> bfs = new LinkedList<>();
        Set<String> sinks = new HashSet<>();
        // If it's a single phase without a condition then there is no need to flatten
        if (dagNodesWithoutCondition.size() > 1) {
            Dag subDag;
            try {
                subDag = dag.createSubDag(dagNodesWithoutCondition);
            } catch (IllegalArgumentException | DisjointConnectionsException e) {
                // DisjointConnectionsException is thrown when islands are created from the dagNodesWithoutCondition
                // IllegalArgumentException is thrown when the connections are empty
                // in both cases we need to add a dummy node and create a connected Dag
                String dummyNode = "dummy";
                dummyNodeAdded = true;
                Set<Connection> subDagConnections = new HashSet<>();
                for (String source : dag.getSources()) {
                    subDagConnections.add(new Connection(dummyNode, source));
                }
                Deque<String> subDagBFS = new LinkedList<>();
                subDagBFS.addAll(dag.getSources());
                while (subDagBFS.peek() != null) {
                    String node = subDagBFS.poll();
                    for (String output : dag.getNodeOutputs(node)) {
                        if (dagNodesWithoutCondition.contains(output)) {
                            subDagConnections.add(new Connection(node, output));
                            subDagBFS.add(output);
                        }
                    }
                }
                subDag = new Dag(subDagConnections);
            }
            ControlDag cdag = new ControlDag(subDag);
            cdag.flatten();
            // Add all connections from cdag
            bfs.addAll(cdag.getSources());
            while (bfs.peek() != null) {
                String node = bfs.poll();
                for (String output : cdag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            }
            sinks.addAll(cdag.getSinks());
        } else {
            sinks.addAll(dagNodesWithoutCondition);
        }
        // Add back the existing condition nodes and corresponding conditions
        Set<String> conditionsFromDag = Sets.intersection(dagNodes, conditions);
        for (String condition : conditionsFromDag) {
            connections.add(new Connection(sinks.iterator().next(), condition));
        }
        bfs.addAll(Sets.intersection(dagNodes, conditions));
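        // BFS from the condition nodes: ordinary nodes copy their outgoing connections as-is,
        // while condition nodes fan out to their true and false branch phases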
        while (bfs.peek() != null) {
            String node = bfs.poll();
            ConditionBranches branches = conditionBranches.get(node);
            if (branches == null) {
                // not a condition node. add outputs
                for (String output : dag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            } else {
                // condition node
                for (Boolean condition : Arrays.asList(true, false)) {
                    String phase = condition ? branches.getTrueOutput() : branches.getFalseOutput();
                    if (phase == null) {
                        continue;
                    }
                    connections.add(new Connection(node, phase, condition));
                    bfs.add(phase);
                }
            }
        }
        dag = new ControlDag(connections);
    }
    if (dummyNodeAdded) {
        WorkflowProgramAdder fork = programAdder.fork();
        String dummyNode = dag.getSources().iterator().next();
        for (String output : dag.getNodeOutputs(dummyNode)) {
            // need to make sure we don't call also() if this is the final branch
            if (!addBranchPrograms(output, fork)) {
                fork = fork.also();
            }
        }
    } else {
        String start = dag.getSources().iterator().next();
        addPrograms(start, programAdder);
    }
}
Also used : ControlDag(co.cask.cdap.etl.planner.ControlDag) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Connection(co.cask.cdap.etl.proto.Connection) Dag(co.cask.cdap.etl.planner.Dag) Deque(java.util.Deque) LinkedList(java.util.LinkedList) ConditionBranches(co.cask.cdap.etl.planner.ConditionBranches) StageSpec(co.cask.cdap.etl.spec.StageSpec)
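
The subtlest part of configure is the dummy-node fallback: when the non-condition phases form a disconnected dag (or yield no connections at all), a synthetic source is prepended so that flattening has a single entry point. Below is a minimal, self-contained sketch of that idea in plain Java; the Connection record and the withDummySource helper are illustrative stand-ins, not CDAP's co.cask.cdap.etl.proto.Connection or Dag classes.

import java.util.*;

public class DummySourceSketch {

    // simplified stand-in for co.cask.cdap.etl.proto.Connection
    record Connection(String from, String to) { }

    // prepend a synthetic "dummy" node in front of every existing source so the
    // resulting connection set describes a single-source, connected dag
    static Set<Connection> withDummySource(Set<Connection> connections) {
        Set<String> froms = new HashSet<>();
        Set<String> tos = new HashSet<>();
        for (Connection c : connections) {
            froms.add(c.from());
            tos.add(c.to());
        }
        // sources are nodes that never appear on the receiving end of a connection
        Set<String> sources = new HashSet<>(froms);
        sources.removeAll(tos);

        Set<Connection> result = new HashSet<>(connections);
        for (String source : sources) {
            result.add(new Connection("dummy", source));
        }
        return result;
    }

    public static void main(String[] args) {
        // two disconnected chains: phase1 -> phase2 and phase3 -> phase4
        Set<Connection> islands = Set.of(
                new Connection("phase1", "phase2"),
                new Connection("phase3", "phase4"));
        // after the fix, "dummy" is the single source of a connected dag
        withDummySource(islands).forEach(System.out::println);
    }
}

The real method does the same thing with a BFS over dag.getNodeOutputs(...) so that only connections between non-condition nodes are copied into the sub-dag.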

Example 2 with ConditionBranches

use of co.cask.cdap.etl.planner.ConditionBranches in project cdap by caskdata.

the class SmartWorkflow method addPrograms, which recursively adds one workflow program per phase, forking wherever a phase has more than one downstream phase.

private void addPrograms(String node, WorkflowProgramAdder programAdder) {
    programAdder = addProgram(node, programAdder);
    Set<String> outputs = dag.getNodeOutputs(node);
    if (outputs.isEmpty()) {
        return;
    }
    ConditionBranches branches = plan.getConditionPhaseBranches().get(node);
    if (branches != null) {
        // this is a condition node: add its true and false branches
        addCondition(programAdder, branches);
    } else {
        // if this is a fork
        if (outputs.size() > 1) {
            WorkflowProgramAdder fork = programAdder.fork();
            for (String output : outputs) {
                // need to make sure we don't call also() if this is the final branch
                if (!addBranchPrograms(output, fork)) {
                    fork = fork.also();
                }
            }
        } else {
            addPrograms(outputs.iterator().next(), programAdder);
        }
    }
}
Also used : ConditionBranches(co.cask.cdap.etl.planner.ConditionBranches)
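
To make the traversal shape of addPrograms concrete, here is a minimal sketch in plain Java; the adjacency map and the printed fork/also/join markers are illustrative stand-ins for CDAP's WorkflowProgramAdder, and the condition case (which the real method delegates to addCondition using the true and false outputs from ConditionBranches) is omitted.

import java.util.*;

public class ForkWalkSketch {

    // follow single outputs linearly; open a fork when a node has several outputs
    static void walk(String node, Map<String, List<String>> outputs) {
        System.out.println("addProgram(" + node + ")");
        List<String> next = outputs.getOrDefault(node, List.of());
        if (next.isEmpty()) {
            // sink: this branch of the workflow ends here
            return;
        }
        if (next.size() > 1) {
            System.out.println("fork");
            for (int i = 0; i < next.size(); i++) {
                walk(next.get(i), outputs);
                // as in the code above, don't emit also() after the final branch
                if (i < next.size() - 1) {
                    System.out.println("also");
                }
            }
            System.out.println("join");
        } else {
            walk(next.get(0), outputs);
        }
    }

    public static void main(String[] args) {
        // phase1 forks into phase2 (which continues to phase4) and phase3
        Map<String, List<String>> dag = Map.of(
                "phase1", List.of("phase2", "phase3"),
                "phase2", List.of("phase4"));
        walk("phase1", dag);
        // prints: addProgram(phase1), fork, addProgram(phase2), addProgram(phase4),
        //         also, addProgram(phase3), join
    }
}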

Aggregations

ConditionBranches (co.cask.cdap.etl.planner.ConditionBranches) 2
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet) 1
ControlDag (co.cask.cdap.etl.planner.ControlDag) 1
Dag (co.cask.cdap.etl.planner.Dag) 1
PipelinePlanner (co.cask.cdap.etl.planner.PipelinePlanner) 1
Connection (co.cask.cdap.etl.proto.Connection) 1
StageSpec (co.cask.cdap.etl.spec.StageSpec) 1
ImmutableSet (com.google.common.collect.ImmutableSet) 1
Deque (java.util.Deque) 1
HashMap (java.util.HashMap) 1
HashSet (java.util.HashSet) 1
LinkedHashMap (java.util.LinkedHashMap) 1
LinkedList (java.util.LinkedList) 1
Set (java.util.Set) 1