Examples with Connection - io.cdap.cdap.etl.proto.Connection

Example 21 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class ETLConfig method getValidConnections.

/**
 * Returns the connections to use for this config.
 *
 * <ul>
 *   <li>If the source has no plugin (old config format), an empty list is returned.</li>
 *   <li>If {@code connections} is non-null and non-empty, it is returned as is.</li>
 *   <li>Otherwise a linear pipeline is synthesized: source -> transforms (in order) -> every sink.</li>
 * </ul>
 *
 * The synthesized connections are written to a new list; the list passed in is never modified, so callers may
 * safely pass an immutable empty list.
 *
 * @param connections the explicitly configured connections, may be null or empty
 * @return the configured connections, or the synthesized linear connections when none were configured
 */
private List<Connection> getValidConnections(List<Connection> connections) {
    // TODO : this can be removed once UI changes are made and we don't have to support the old format
    if (source.getPlugin() == null) {
        // if its old format, we just return an empty list.
        return new ArrayList<>();
    }
    if (connections != null && !connections.isEmpty()) {
        // explicit connections were configured, use them untouched
        return connections;
    }
    // No connections were given, so create a linear pipeline, source -> transforms -> sinks.
    // Build it in a fresh list instead of adding to the argument: the argument may be immutable,
    // and mutating it would change the caller's state as a side effect of what looks like a read.
    List<Connection> linearConnections = new ArrayList<>();
    // name of the stage that every sink gets connected to: the last transform, or the source if there are none
    String toSink = source.getName();
    if (transforms != null && !transforms.isEmpty()) {
        linearConnections.add(new Connection(source.getName(), transforms.get(0).getName()));
        for (int i = 0; i < transforms.size() - 1; i++) {
            linearConnections.add(new Connection(transforms.get(i).getName(), transforms.get(i + 1).getName()));
        }
        toSink = transforms.get(transforms.size() - 1).getName();
    }
    for (ETLStage stage : sinks) {
        linearConnections.add(new Connection(toSink, stage.getName()));
    }
    return linearConnections;
}

Also used : ArrayList(java.util.ArrayList) Connection(io.cdap.cdap.etl.proto.Connection)

Example 22 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class Dag method subsetAround.

/**
 * Builds the sub-dag that surrounds a single stage.
 *
 * The stages kept are those returned by {@code accessibleFrom(stage, childStopNodes)} combined with those
 * returned by {@code parentsOf(stage, parentStopNodes)}. A connection of this dag is carried over only when both
 * of its endpoints were kept. If the parent or child stop nodes contain the starting stage, it will be ignored.
 *
 * @param stage the stage to start at
 * @param childStopNodes set of nodes to stop traversing forwards on
 * @param parentStopNodes set of nodes to stop traversing backwards on
 * @return a dag made of the starting stage, the stages reachable forwards and backwards from it, and the
 *   connections between those stages
 */
public Dag subsetAround(String stage, Set<String> childStopNodes, Set<String> parentStopNodes) {
    Set<String> downstream = accessibleFrom(stage, childStopNodes);
    Set<String> upstream = parentsOf(stage, parentStopNodes);
    Set<String> keptStages = Sets.union(downstream, upstream);

    Set<Connection> keptConnections = new HashSet<>();
    for (String from : keptStages) {
        for (String to : outgoingConnections.get(from)) {
            // drop connections that lead outside of the kept stages
            if (!keptStages.contains(to)) {
                continue;
            }
            keptConnections.add(new Connection(from, to));
        }
    }
    return new Dag(keptConnections);
}

Also used : Connection(io.cdap.cdap.etl.proto.Connection) HashSet(java.util.HashSet)

Example 23 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class PipelinePlanner method getUpdatedDag.

/**
 * Rewrites a dag so that every node that has an entry in {@code controlConnectors} is replaced by the connector
 * name it maps to.
 *
 * The dag is returned unchanged when none of its sources and none of its sinks appear as keys of
 * {@code controlConnectors}. Otherwise a new dag is built from every input and output connection of every node,
 * with both endpoints substituted where a non-null mapping exists.
 *
 * @param dag the dag to update
 * @param controlConnectors map from node name to the name of the connector that replaces it
 * @return the given dag if nothing had to be replaced, otherwise a new dag with the replacements applied
 */
private Dag getUpdatedDag(Dag dag, Map<String, String> controlConnectors) {
    Set<String> replaceable = controlConnectors.keySet();
    Set<String> replacedSources = Sets.intersection(replaceable, dag.getSources());
    Set<String> replacedSinks = Sets.intersection(replaceable, dag.getSinks());
    boolean anythingToReplace = !replacedSources.isEmpty() || !replacedSinks.isEmpty();
    if (!anythingToReplace) {
        return dag;
    }

    Set<Connection> rewired = new HashSet<>();
    for (String current : dag.getNodes()) {
        String currentConnector = controlConnectors.get(current);
        String currentName = currentConnector != null ? currentConnector : current;

        // connections coming into the current node
        for (String upstream : dag.getNodeInputs(current)) {
            String upstreamConnector = controlConnectors.get(upstream);
            String upstreamName = upstreamConnector != null ? upstreamConnector : upstream;
            rewired.add(new Connection(upstreamName, currentName));
        }

        // connections going out of the current node
        for (String downstream : dag.getNodeOutputs(current)) {
            String downstreamConnector = controlConnectors.get(downstream);
            String downstreamName = downstreamConnector != null ? downstreamConnector : downstream;
            rewired.add(new Connection(currentName, downstreamName));
        }
    }
    return new Dag(rewired);
}

Also used : Connection(io.cdap.cdap.etl.proto.Connection) HashSet(java.util.HashSet)

Example 24 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class SmartWorkflow method configure.

/**
 * Configures this workflow from the pipeline config.
 *
 * <p>In order, this method: sets the workflow name and description; generates the pipeline spec (a
 * {@link ValidationException} is rethrown as an {@link IllegalArgumentException} carrying the first failure's
 * full message, or the exception message when there are no failures); emits the underscore-prefixed names of the
 * connections used as metadata in the SYSTEM scope; indexes every stage spec by name and decides whether Spark is
 * needed; creates the plan; and finally adds the plan's phases to the workflow.
 *
 * <p>How the phases are added depends on the plan:
 * <ul>
 *   <li>exactly one phase: the phase is added directly;</li>
 *   <li>several phases but no phase connections: every phase is added to a single fork, which is then joined;</li>
 *   <li>otherwise: the phase connections are arranged through a {@link ControlDag} (flattened entirely when there
 *     are no condition phases, or only up to the first conditions when there are) and programs are added by
 *     walking that dag from its source.</li>
 * </ul>
 *
 * <p>Every path ends by calling {@code setWorkflowProperties()}.
 */
@Override
protected void configure() {
    setName(NAME);
    setDescription("Data Pipeline Workflow");
    // If plugins were registered only at the application level, CDAP would not be able to fail the run early.
    try {
        spec = new BatchPipelineSpecGenerator(applicationConfigurer.getDeployedNamespace(), getConfigurer(), applicationConfigurer.getRuntimeConfigurer(), ImmutableSet.of(BatchSource.PLUGIN_TYPE), ImmutableSet.of(BatchSink.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE, AlertPublisher.PLUGIN_TYPE), config.getEngine(), getConfigurer()).generateSpec(config);
    } catch (ValidationException e) {
        // surface the first validation failure if there is one, otherwise fall back to the exception message
        throw new IllegalArgumentException(String.format("Failed to configure pipeline: %s", e.getFailures().isEmpty() ? e.getMessage() : e.getFailures().iterator().next().getFullMessage()), e);
    }
    // append "_" to the connection name so it will not conflict with the system tag we add
    Set<String> connectionsUsed = spec.getConnectionsUsed().stream().map(s -> "_" + s).collect(Collectors.toSet());
    applicationConfigurer.emitMetadata(new Metadata(Collections.emptyMap(), connectionsUsed), MetadataScope.SYSTEM);
    stageSpecs = new HashMap<>();
    // Spark is used when the configured engine is SPARK, or when any stage is a SparkCompute or SparkSink plugin
    useSpark = config.getEngine() == Engine.SPARK;
    for (StageSpec stageSpec : spec.getStages()) {
        stageSpecs.put(stageSpec.getName(), stageSpec);
        String pluginType = stageSpec.getPlugin().getType();
        if (SparkCompute.PLUGIN_TYPE.equals(pluginType) || SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            useSpark = true;
        }
    }
    plan = createPlan();
    WorkflowProgramAdder programAdder = new TrunkProgramAdder(getConfigurer());
    // single phase, just add the program directly
    if (plan.getPhases().size() == 1) {
        addProgram(plan.getPhases().keySet().iterator().next(), programAdder);
        setWorkflowProperties();
        return;
    }
    // Dag classes don't allow a 'dag' without connections
    // so when there are several phases but no connections between them, add every phase to one fork and join it
    if (plan.getPhaseConnections().isEmpty()) {
        WorkflowProgramAdder fork = programAdder.fork();
        for (String phaseName : plan.getPhases().keySet()) {
            addProgram(phaseName, fork);
        }
        fork.join();
        setWorkflowProperties();
        return;
    }
    /*
       ControlDag is used to flatten the dag that represents connections between phases.
       Connections between phases represent a happens-before relationship, not the flow of data.
       As such, phases can be shifted around as long as every happens-before relationship is maintained.
       The exception is condition phases. Connection from a condition to another phase must be maintained as is.

       Flattening a ControlDag will transform a dag into a special fork-join dag by moving phases around.
       We therefore cannot blindly flatten the phase connections.
       However, we validated earlier that condition outputs have a special property, where every stage following a
       condition can only have a single input. This means we will never need to flatten anything after the first
       set of conditions. We will only have to flatten what comes before the first set of conditions.
     */
    dag = new ControlDag(plan.getPhaseConnections());
    // set to true only when the sub-dag before the conditions has to be rebuilt around an artificial first node
    boolean dummyNodeAdded = false;
    Map<String, ConditionBranches> conditionBranches = plan.getConditionPhaseBranches();
    if (conditionBranches.isEmpty()) {
        // after flattening, there is guaranteed to be just one source
        dag.flatten();
    } else if (!conditionBranches.keySet().containsAll(dag.getSources())) {
        // Continue only if the condition node is not the source of the dag, otherwise dag is already in the
        // required form
        Set<String> conditions = conditionBranches.keySet();
        // flatten only the part of the dag starting from sources and ending in conditions/sinks.
        Set<String> dagNodes = dag.accessibleFrom(dag.getSources(), Sets.union(dag.getSinks(), conditions));
        Set<String> dagNodesWithoutCondition = Sets.difference(dagNodes, conditions);
        // connections of the rebuilt dag; filled first with the flattened part, then with the condition part
        Set<Connection> connections = new HashSet<>();
        // work queue shared by the two traversals below
        Deque<String> bfs = new LinkedList<>();
        // sinks of the part before the conditions; the conditions get attached to one of them further down
        Set<String> sinks = new HashSet<>();
        // If its a single phase without condition then no need to flatten
        if (dagNodesWithoutCondition.size() < 2) {
            sinks.addAll(dagNodesWithoutCondition);
        } else {
            /*
           Create a subdag from dagNodesWithoutCondition.
           There are a couple situations where this is not immediately possible. For example:

             source1 --|
                       |--> condition -- ...
             source2 --|

           Here, dagNodesWithoutCondition = [source1, source2], which is an invalid dag. Similarly:

             source --> condition -- ...

           Here, dagNodesWithoutCondition = [source], which is also invalid. In order to ensure that we have a
           valid dag, we just insert a dummy node as the first node in the subdag, adding a connection from the
           dummy node to all the sources.
         */
            Dag subDag;
            try {
                subDag = dag.createSubDag(dagNodesWithoutCondition);
            } catch (IllegalArgumentException | DisjointConnectionsException e) {
                // DisjointConnectionsException thrown when islands are created from the dagNodesWithoutCondition
                // IllegalArgumentException thrown when connections are empty
                // In both cases we need to add dummy node and create connected Dag
                String dummyNode = "dummy";
                dummyNodeAdded = true;
                Set<Connection> subDagConnections = new HashSet<>();
                for (String source : dag.getSources()) {
                    subDagConnections.add(new Connection(dummyNode, source));
                }
                // walk forward from the sources, keeping only connections that lead to non-condition nodes
                Deque<String> subDagBFS = new LinkedList<>();
                subDagBFS.addAll(dag.getSources());
                while (subDagBFS.peek() != null) {
                    String node = subDagBFS.poll();
                    for (String output : dag.getNodeOutputs(node)) {
                        if (dagNodesWithoutCondition.contains(output)) {
                            subDagConnections.add(new Connection(node, output));
                            subDagBFS.add(output);
                        }
                    }
                }
                subDag = new Dag(subDagConnections);
            }
            ControlDag cdag = new ControlDag(subDag);
            cdag.flatten();
            // Add all connections from cdag
            bfs.addAll(cdag.getSources());
            while (bfs.peek() != null) {
                String node = bfs.poll();
                for (String output : cdag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            }
            sinks.addAll(cdag.getSinks());
        }
        // Add back the existing condition nodes and corresponding conditions
        Set<String> conditionsFromDag = Sets.intersection(dagNodes, conditions);
        for (String condition : conditionsFromDag) {
            // NOTE(review): only the first element of sinks is used; presumably there is exactly one sink at this
            // point (a single phase, or the sink of the flattened dag) - confirm
            connections.add(new Connection(sinks.iterator().next(), condition));
        }
        // continue from the conditions, following the original dag for everything that comes after them
        bfs.addAll(Sets.intersection(dagNodes, conditions));
        while (bfs.peek() != null) {
            String node = bfs.poll();
            ConditionBranches branches = conditionBranches.get(node);
            if (branches == null) {
                // not a condition node. add outputs
                for (String output : dag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            } else {
                // condition node
                // add one connection per branch that has an output, tagged with the branch's boolean value
                for (Boolean condition : Arrays.asList(true, false)) {
                    String phase = condition ? branches.getTrueOutput() : branches.getFalseOutput();
                    if (phase == null) {
                        continue;
                    }
                    connections.add(new Connection(node, phase, condition));
                    bfs.add(phase);
                }
            }
        }
        dag = new ControlDag(connections);
    }
    if (dummyNodeAdded) {
        // the dummy node is not a real phase: fork directly and add one branch per output of the dummy node
        WorkflowProgramAdder fork = programAdder.fork();
        // NOTE(review): assumes the first source of the rebuilt dag is the dummy node - confirm
        String dummyNode = dag.getSources().iterator().next();
        // need to make sure we don't call also() if this is the final branch
        Iterator<String> outputIter = dag.getNodeOutputs(dummyNode).iterator();
        addBranchPrograms(outputIter.next(), fork, false);
        while (outputIter.hasNext()) {
            fork = fork.also();
            addBranchPrograms(outputIter.next(), fork, !outputIter.hasNext());
        }
    } else {
        // start adding programs from the first source of the dag
        String start = dag.getSources().iterator().next();
        addPrograms(start, programAdder);
    }
    setWorkflowProperties();
}

Also used : AlertPublisher(io.cdap.cdap.etl.api.AlertPublisher) BatchSource(io.cdap.cdap.etl.api.batch.BatchSource) Engine(io.cdap.cdap.etl.api.Engine) ConnectorSource(io.cdap.cdap.etl.batch.connector.ConnectorSource) Arrays(java.util.Arrays) TypeToken(com.google.gson.reflect.TypeToken) MultiConnectorSource(io.cdap.cdap.etl.batch.connector.MultiConnectorSource) LoggerFactory(org.slf4j.LoggerFactory) AbstractWorkflow(io.cdap.cdap.api.workflow.AbstractWorkflow) SparkSink(io.cdap.cdap.etl.api.batch.SparkSink) GsonBuilder(com.google.gson.GsonBuilder) PipelineAction(io.cdap.cdap.etl.batch.customaction.PipelineAction) Constants(io.cdap.cdap.etl.common.Constants) FieldOperationTypeAdapter(io.cdap.cdap.etl.common.FieldOperationTypeAdapter) WorkflowContext(io.cdap.cdap.api.workflow.WorkflowContext) Gson(com.google.gson.Gson) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) Map(java.util.Map) Connection(io.cdap.cdap.etl.proto.Connection) ProgramStatus(io.cdap.cdap.api.ProgramStatus) Condition(io.cdap.cdap.etl.api.condition.Condition) TriggeringPropertyMapping(io.cdap.cdap.etl.proto.v2.TriggeringPropertyMapping) Action(io.cdap.cdap.etl.api.action.Action) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) ActionSpec(io.cdap.cdap.etl.batch.ActionSpec) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) ArgumentMapping(io.cdap.cdap.etl.proto.v2.ArgumentMapping) Metrics(io.cdap.cdap.api.metrics.Metrics) AlertPublisherContext(io.cdap.cdap.etl.api.AlertPublisherContext) PluginPropertyMapping(io.cdap.cdap.etl.proto.v2.PluginPropertyMapping) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) List(java.util.List) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) BasicArguments(io.cdap.cdap.etl.common.BasicArguments) Type(java.lang.reflect.Type) MetadataScope(io.cdap.cdap.api.metadata.MetadataScope) ETLSpark(io.cdap.cdap.etl.spark.batch.ETLSpark) 
FileSet(io.cdap.cdap.api.dataset.lib.FileSet) ApplicationConfigurer(io.cdap.cdap.api.app.ApplicationConfigurer) TrackedIterator(io.cdap.cdap.etl.common.TrackedIterator) ControlDag(io.cdap.cdap.etl.planner.ControlDag) WorkflowBackedActionContext(io.cdap.cdap.etl.batch.WorkflowBackedActionContext) TriggerInfo(io.cdap.cdap.api.schedule.TriggerInfo) Alert(io.cdap.cdap.etl.api.Alert) FieldOperation(io.cdap.cdap.etl.api.lineage.field.FieldOperation) SplitterTransform(io.cdap.cdap.etl.api.SplitterTransform) SchemaTypeAdapter(io.cdap.cdap.internal.io.SchemaTypeAdapter) BatchPipelineSpecGenerator(io.cdap.cdap.etl.batch.BatchPipelineSpecGenerator) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext) HashMap(java.util.HashMap) AlertPublisherSink(io.cdap.cdap.etl.batch.connector.AlertPublisherSink) Deque(java.util.Deque) AlertReader(io.cdap.cdap.etl.batch.connector.AlertReader) ArrayList(java.util.ArrayList) DisjointConnectionsException(io.cdap.cdap.etl.planner.DisjointConnectionsException) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) LocationAwareMDCWrapperLogger(io.cdap.cdap.etl.common.LocationAwareMDCWrapperLogger) PluginContext(io.cdap.cdap.api.plugin.PluginContext) PipelinePlan(io.cdap.cdap.etl.planner.PipelinePlan) BatchActionContext(io.cdap.cdap.etl.api.batch.BatchActionContext) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) PipelinePlanner(io.cdap.cdap.etl.planner.PipelinePlanner) FieldLineageProcessor(io.cdap.cdap.etl.lineage.FieldLineageProcessor) TriggeringScheduleInfo(io.cdap.cdap.api.schedule.TriggeringScheduleInfo) PipelineCondition(io.cdap.cdap.etl.batch.condition.PipelineCondition) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) LinkedList(java.util.LinkedList) Operation(io.cdap.cdap.api.lineage.field.Operation) ETLMapReduce(io.cdap.cdap.etl.batch.mapreduce.ETLMapReduce) 
Metadata(io.cdap.cdap.api.metadata.Metadata) Logger(org.slf4j.Logger) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) ConditionBranches(io.cdap.cdap.etl.planner.ConditionBranches) Iterator(java.util.Iterator) BatchAggregator(io.cdap.cdap.etl.api.batch.BatchAggregator) SparkCompute(io.cdap.cdap.etl.api.batch.SparkCompute) Dag(io.cdap.cdap.etl.planner.Dag) ETLBatchConfig(io.cdap.cdap.etl.proto.v2.ETLBatchConfig) ProgramStatusTriggerInfo(io.cdap.cdap.api.schedule.ProgramStatusTriggerInfo) Schema(io.cdap.cdap.api.data.schema.Schema) CloseableIterator(io.cdap.cdap.api.dataset.lib.CloseableIterator) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) PostAction(io.cdap.cdap.etl.api.batch.PostAction) NodeValue(io.cdap.cdap.api.workflow.NodeValue) BatchSink(io.cdap.cdap.etl.api.batch.BatchSink) DefaultAlertPublisherContext(io.cdap.cdap.etl.common.DefaultAlertPublisherContext) BatchPipelineSpec(io.cdap.cdap.etl.batch.BatchPipelineSpec) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) Collections(java.util.Collections) MacroEvaluator(io.cdap.cdap.api.macro.MacroEvaluator) ControlDag(io.cdap.cdap.etl.planner.ControlDag) ValidationException(io.cdap.cdap.etl.api.validation.ValidationException) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) FileSet(io.cdap.cdap.api.dataset.lib.FileSet) HashSet(java.util.HashSet) Metadata(io.cdap.cdap.api.metadata.Metadata) ConditionBranches(io.cdap.cdap.etl.planner.ConditionBranches) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) HashSet(java.util.HashSet) BatchPipelineSpecGenerator(io.cdap.cdap.etl.batch.BatchPipelineSpecGenerator) Connection(io.cdap.cdap.etl.proto.Connection) ControlDag(io.cdap.cdap.etl.planner.ControlDag) Dag(io.cdap.cdap.etl.planner.Dag) Deque(java.util.Deque) LinkedList(java.util.LinkedList)

Example 25 with Connection

use of io.cdap.cdap.etl.proto.Connection in project cdap by caskdata.

the class ETLBatchConfigTest method testUpgrade.

/**
 * Checks that upgrading an old-format batch config (one source, three transforms, two sinks, explicit
 * connections) produces the equivalent v2 config when the upgrade context resolves no plugin artifacts.
 */
@Test
public void testUpgrade() throws Exception {
    final ArtifactSelectorConfig artifact = new ArtifactSelectorConfig("SYSTEM", "universal", "1.0.0");

    // stages in the old format; the constructor overloads used here differ on purpose from stage to stage
    ETLStage source = new ETLStage("source", new Plugin("DataGenerator", ImmutableMap.of("p1", "v1"), artifact), null);
    ETLStage transform1 = new ETLStage("transform1", new Plugin("Script", ImmutableMap.of("script", "something"), null));
    ETLStage transform2 = new ETLStage("transform2", new Plugin("Script", null, null));
    ETLStage transform3 = new ETLStage("transform3", new Plugin("Validator", ImmutableMap.of("p1", "v1", "p2", "v2")), null);
    ETLStage sink1 = new ETLStage("sink1", new Plugin("Table", ImmutableMap.of("rowkey", "xyz"), artifact), null);
    ETLStage sink2 = new ETLStage("sink2", new Plugin("HDFS", ImmutableMap.of("name", "abc"), artifact), null);

    // the same stages converted to the v2 format, each with its plugin type
    io.cdap.cdap.etl.proto.v2.ETLStage sourceV2 = from(source, BatchSource.PLUGIN_TYPE);
    io.cdap.cdap.etl.proto.v2.ETLStage transform1V2 = from(transform1, Transform.PLUGIN_TYPE);
    io.cdap.cdap.etl.proto.v2.ETLStage transform2V2 = from(transform2, Transform.PLUGIN_TYPE);
    io.cdap.cdap.etl.proto.v2.ETLStage transform3V2 = from(transform3, Transform.PLUGIN_TYPE);
    io.cdap.cdap.etl.proto.v2.ETLStage sink1V2 = from(sink1, BatchSink.PLUGIN_TYPE);
    io.cdap.cdap.etl.proto.v2.ETLStage sink2V2 = from(sink2, BatchSink.PLUGIN_TYPE);

    // source -> transform1 -> transform2 -> transform3 -> {sink1, sink2}
    String lastTransformName = transform3V2.getName();
    Set<Connection> connections = new HashSet<>();
    connections.add(new Connection(sourceV2.getName(), transform1V2.getName()));
    connections.add(new Connection(transform1V2.getName(), transform2V2.getName()));
    connections.add(new Connection(transform2V2.getName(), lastTransformName));
    connections.add(new Connection(lastTransformName, sink1V2.getName()));
    connections.add(new Connection(lastTransformName, sink2V2.getName()));

    String schedule = "*/5 * * * *";
    Resources resources = new Resources(1024, 1);

    ETLBatchConfig oldConfig = ETLBatchConfig.builder(schedule)
        .setSource(source)
        .addSink(sink1)
        .addSink(sink2)
        .addTransform(transform1)
        .addTransform(transform2)
        .addTransform(transform3)
        .addConnections(connections)
        .setResources(resources)
        .setDriverResources(resources)
        .build();

    io.cdap.cdap.etl.proto.v2.ETLBatchConfig expected = io.cdap.cdap.etl.proto.v2.ETLBatchConfig.builder(schedule)
        .addStage(sourceV2)
        .addStage(sink1V2)
        .addStage(sink2V2)
        .addStage(transform1V2)
        .addStage(transform2V2)
        .addStage(transform3V2)
        .addConnections(connections)
        .setResources(resources)
        .setDriverResources(resources)
        .build();

    // upgrade context that never resolves an artifact for a plugin
    UpgradeContext noArtifactContext = new UpgradeContext() {

        @Nullable
        @Override
        public ArtifactSelectorConfig getPluginArtifact(String pluginType, String pluginName) {
            return null;
        }
    };

    Assert.assertEquals(expected, oldConfig.upgrade(noArtifactContext));
}

Also used : ArtifactSelectorConfig(io.cdap.cdap.etl.proto.ArtifactSelectorConfig) Connection(io.cdap.cdap.etl.proto.Connection) UpgradeContext(io.cdap.cdap.etl.proto.UpgradeContext) Resources(io.cdap.cdap.api.Resources) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

Connection (io.cdap.cdap.etl.proto.Connection): 48; Test (org.junit.Test): 39; HashSet (java.util.HashSet): 35; HashMap (java.util.HashMap): 22; ArrayList (java.util.ArrayList): 16; Operation (io.cdap.cdap.api.lineage.field.Operation): 14; FieldOperation (io.cdap.cdap.etl.api.lineage.field.FieldOperation): 14; List (java.util.List): 14; ImmutableList (com.google.common.collect.ImmutableList): 13; ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation): 13; TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation): 13; WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation): 13; FieldReadOperation (io.cdap.cdap.etl.api.lineage.field.FieldReadOperation): 13; FieldWriteOperation (io.cdap.cdap.etl.api.lineage.field.FieldWriteOperation): 13; FieldTransformOperation (io.cdap.cdap.etl.api.lineage.field.FieldTransformOperation): 12; EndPoint (io.cdap.cdap.api.lineage.field.EndPoint): 10; StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 9; PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase): 8; PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec): 7; FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo): 4