
Example 36 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.

the class PipelineSpecGenerator method configureStage.

/**
 * Configures a stage and returns the spec for it.
 *
 * @param stage the user provided configuration for the stage
 * @param validatedPipeline the validated pipeline config
 * @param pluginConfigurer configurer used to configure the stage
 * @return the spec for the stage
 * @throws ValidationException if the plugin threw an exception during configuration
 */
protected ConfiguredStage configureStage(ETLStage stage, ValidatedPipeline validatedPipeline, DefaultPipelineConfigurer pluginConfigurer) throws ValidationException {
    String stageName = stage.getName();
    ETLPlugin stagePlugin = stage.getPlugin();
    StageSpec.Builder specBuilder = configureStage(stageName, stagePlugin, pluginConfigurer);
    DefaultStageConfigurer stageConfigurer = pluginConfigurer.getStageConfigurer();
    String pluginType = stage.getPlugin().getType();
    if (pluginType.equals(SplitterTransform.PLUGIN_TYPE)) {
        Map<String, Schema> outputPortSchemas = stageConfigurer.getOutputPortSchemas();
        for (Map.Entry<String, String> outputEntry : validatedPipeline.getOutputPorts(stageName).entrySet()) {
            String outputStage = outputEntry.getKey();
            String outputPort = outputEntry.getValue();
            if (outputPort == null) {
                throw new IllegalArgumentException(String.format("Connection from Splitter '%s' to '%s' must specify a port.", stageName, outputStage));
            }
            specBuilder.addOutput(outputStage, outputPort, outputPortSchemas.get(outputPort));
        }
    } else {
        Schema outputSchema = stageConfigurer.getOutputSchema();
        // conditions pass their input schema through unchanged, so all inputs must share the same schema
        if (Condition.PLUGIN_TYPE.equals(pluginType)) {
            outputSchema = null;
            for (Schema schema : stageConfigurer.getInputSchemas().values()) {
                if (schema != null) {
                    // todo: fix this cleanly and fully
                    if (outputSchema != null && !Schemas.equalsIgnoringRecordName(outputSchema, schema)) {
                        throw new IllegalArgumentException("Cannot have different input schemas going into stage " + stageName);
                    }
                    outputSchema = schema;
                }
            }
        }
        for (String outputStage : validatedPipeline.getOutputs(stageName)) {
            specBuilder.addOutput(outputStage, null, outputSchema);
        }
    }
    StageSpec stageSpec = specBuilder
        .setProcessTimingEnabled(validatedPipeline.isProcessTimingEnabled())
        .setStageLoggingEnabled(validatedPipeline.isStageLoggingEnabled())
        .setMaxPreviewRecords(validatedPipeline.getMaxPreviewRecords())
        .build();
    return new ConfiguredStage(stageSpec, pluginConfigurer.getPipelineProperties());
}
Also used : StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) Schema(io.cdap.cdap.api.data.schema.Schema) ETLPlugin(io.cdap.cdap.etl.proto.v2.ETLPlugin) DefaultStageConfigurer(io.cdap.cdap.etl.common.DefaultStageConfigurer) Map(java.util.Map) HashMap(java.util.HashMap)
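
The most common pitfall in the splitter branch above is omitting the port on an outgoing connection. Below is a minimal standalone sketch of that validation rule; the class and stage names are invented for illustration, and it mirrors the null-port check in configureStage() without depending on CDAP classes.

import java.util.LinkedHashMap;
import java.util.Map;

public class SplitterPortCheck {

    // Mirrors the validation in configureStage(): every connection leaving
    // a splitter must name an output port, so a null port is rejected.
    static void validatePorts(String stageName, Map<String, String> outputPorts) {
        for (Map.Entry<String, String> entry : outputPorts.entrySet()) {
            String outputStage = entry.getKey();
            String outputPort = entry.getValue();
            if (outputPort == null) {
                throw new IllegalArgumentException(String.format(
                    "Connection from Splitter '%s' to '%s' must specify a port.", stageName, outputStage));
            }
        }
    }

    public static void main(String[] args) {
        Map<String, String> outputs = new LinkedHashMap<>();
        outputs.put("errorSink", "error");
        outputs.put("mainSink", null); // missing port: triggers the exception
        validatePorts("router", outputs);
    }
}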

Example 37 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.

the class PipelinePlanner method dagToPipeline.

/**
 * Converts a Dag into a PipelinePhase, using what we know about the plugin type of each node in the dag.
 * The PipelinePhase is what programs will take as input, and keeps track of sources, transforms, sinks, etc.
 *
 * @param dag the dag to convert
 * @param connectors connector nodes across all dags
 * @param specs specifications for every stage
 * @param conditionConnectors map from each condition stage to the connector stage that replaces it
 * @return the PipelinePhase converted from the dag
 */
private PipelinePhase dagToPipeline(Dag dag, Map<String, String> connectors, Map<String, StageSpec> specs, Map<String, String> conditionConnectors) {
    PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
    for (String stageName : dag.getTopologicalOrder()) {
        Set<String> outputs = dag.getNodeOutputs(stageName);
        if (!outputs.isEmpty()) {
            phaseBuilder.addConnections(stageName, outputs);
        }
        // add connectors
        String originalName = connectors.get(stageName);
        if (originalName != null || conditionConnectors.containsValue(stageName)) {
            String connectorType = dag.getSources().contains(stageName) ? Constants.Connector.SOURCE_TYPE : Constants.Connector.SINK_TYPE;
            PluginSpec connectorSpec = new PluginSpec(Constants.Connector.PLUGIN_TYPE, "connector",
                ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, originalName != null ? originalName : stageName,
                    Constants.Connector.TYPE, connectorType), null);
            phaseBuilder.addStage(StageSpec.builder(stageName, connectorSpec).build());
            continue;
        }
        // add other plugin types
        StageSpec spec = specs.get(stageName);
        phaseBuilder.addStage(spec);
    }
    return phaseBuilder.build();
}
Also used : PluginSpec(io.cdap.cdap.etl.proto.v2.spec.PluginSpec) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec)
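
For illustration, here is a hedged sketch of assembling such a connector stage by hand, using the same PluginSpec constructor and StageSpec.builder() calls as dagToPipeline() above. The stage name "orders.connector" is invented, and Constants is assumed to be the planner's io.cdap.cdap.etl.common.Constants.

import com.google.common.collect.ImmutableMap;
import io.cdap.cdap.etl.common.Constants;
import io.cdap.cdap.etl.proto.v2.spec.PluginSpec;
import io.cdap.cdap.etl.proto.v2.spec.StageSpec;

public class ConnectorStageSketch {

    // Builds a connector stage standing in for an original stage named "orders";
    // it acts as a local-dataset source at the boundary of its phase.
    static StageSpec sourceConnector() {
        PluginSpec connectorSpec = new PluginSpec(Constants.Connector.PLUGIN_TYPE, "connector",
            ImmutableMap.of(Constants.Connector.ORIGINAL_NAME, "orders",
                Constants.Connector.TYPE, Constants.Connector.SOURCE_TYPE),
            null); // no artifact pinned, matching the method above
        return StageSpec.builder("orders.connector", connectorSpec).build();
    }
}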

Example 38 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.

the class PipelinePlanner method plan.

/**
 * Creates an execution plan for the given logical pipeline. This is used for batch pipelines,
 * though it may eventually be useful for marking windowing points in realtime pipelines.
 *
 * A plan consists of one or more phases, with connections between phases.
 * A connection between phases indicates control flow, and not necessarily
 * data flow. This class assumes that it receives a valid pipeline spec:
 * the pipeline has no cycles, all of its nodes have unique names,
 * sources don't have any inputs, sinks don't have any outputs,
 * and everything else has both an input and an output.
 *
 * We start by inserting connector nodes into the logical dag,
 * which are used to mark boundaries between mapreduce jobs.
 * Each connector represents a node where we will need to write to a local dataset.
 *
 * Next, the logical pipeline is broken up into phases,
 * using the connectors as sinks in one phase and as sources in another.
 * After this point, connections between phases do not indicate data flow, but control flow.
 *
 * @param spec the pipeline spec, representing a logical pipeline
 * @return the execution plan
 */
public PipelinePlan plan(PipelineSpec spec) {
    // go through the stages and examine their plugin type to determine which stages are reduce stages
    Set<String> reduceNodes = new HashSet<>();
    Set<String> isolationNodes = new HashSet<>();
    Set<String> actionNodes = new HashSet<>();
    Set<String> multiPortNodes = new HashSet<>();
    Set<String> allNodes = new HashSet<>();
    // Map to hold the connection information from condition nodes to the first stage
    // they connect to. Condition information also includes whether the stage is connected
    // on the 'true' branch or the 'false' branch
    Map<String, ConditionBranches> conditionBranches = new HashMap<>();
    Map<String, Set<String>> conditionOutputs = new HashMap<>();
    Map<String, Set<String>> conditionInputs = new HashMap<>();
    Map<String, StageSpec> specs = new HashMap<>();
    for (StageSpec stage : spec.getStages()) {
        String pluginType = stage.getPlugin().getType();
        allNodes.add(stage.getName());
        if (reduceTypes.contains(pluginType)) {
            reduceNodes.add(stage.getName());
        }
        if (isolationTypes.contains(pluginType)) {
            isolationNodes.add(stage.getName());
        }
        if (actionTypes.contains(pluginType)) {
            // Collect all Action nodes from spec
            actionNodes.add(stage.getName());
        }
        if (multiPortTypes.contains(pluginType)) {
            multiPortNodes.add(stage.getName());
        }
        if (Condition.PLUGIN_TYPE.equals(pluginType)) {
            conditionBranches.put(stage.getName(), new ConditionBranches(null, null));
            conditionOutputs.put(stage.getName(), new HashSet<String>());
            conditionInputs.put(stage.getName(), new HashSet<String>());
        }
        specs.put(stage.getName(), stage);
    }
    // Special case for action nodes when there is no connection between them
    if (spec.getConnections().isEmpty()) {
        // when there are no connections, every node must be an action
        if (!actionNodes.containsAll(allNodes)) {
            throw new IllegalStateException("The pipeline has no connections, but contains stages that are not actions.");
        }
        Map<String, PipelinePhase> phases = new HashMap<>();
        for (String actionNode : actionNodes) {
            PipelinePhase.Builder phaseBuilder = PipelinePhase.builder(supportedPluginTypes);
            PipelinePhase actionPhase = phaseBuilder.addStage(specs.get(actionNode)).build();
            phases.put(actionNode, actionPhase);
        }
        return new PipelinePlan(phases, new HashSet<Connection>());
    }
    // Set representing control nodes (Conditions and Actions)
    Set<String> controlNodes = Sets.union(actionNodes, conditionBranches.keySet());
    Map<String, String> conditionChildToParent = new HashMap<>();
    for (Connection connection : spec.getConnections()) {
        if (conditionBranches.containsKey(connection.getFrom())) {
            conditionOutputs.get(connection.getFrom()).add(connection.getTo());
        }
        if (conditionBranches.containsKey(connection.getTo())) {
            conditionInputs.get(connection.getTo()).add(connection.getFrom());
        }
        if (conditionBranches.containsKey(connection.getFrom())) {
            if (conditionBranches.containsKey(connection.getTo())) {
                // conditions are chained
                conditionChildToParent.put(connection.getTo(), connection.getFrom());
            }
            // Outgoing connection from condition
            ConditionBranches branches = conditionBranches.get(connection.getFrom());
            String trueOutput;
            String falseOutput;
            if (connection.getCondition()) {
                trueOutput = connection.getTo();
                falseOutput = branches.getFalseOutput();
            } else {
                trueOutput = branches.getTrueOutput();
                falseOutput = connection.getTo();
            }
            conditionBranches.put(connection.getFrom(), new ConditionBranches(trueOutput, falseOutput));
        }
    }
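    // populated by split(): maps each inserted connector stage to the name of the original stage it replaces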
    Map<String, String> connectorNodes = new HashMap<>();
    // now split the logical pipeline into pipeline phases, using the connectors as split points
    Set<Dag> splitDags = split(spec.getConnections(), conditionBranches.keySet(), reduceNodes, isolationNodes, actionNodes, multiPortNodes, connectorNodes);
    Map<String, String> controlConnectors = getConnectorsAssociatedWithConditions(conditionBranches.keySet(), conditionChildToParent, conditionInputs, conditionOutputs, actionNodes);
    Map<String, Dag> subdags = new HashMap<>();
    for (Dag subdag : splitDags) {
        subdags.put(getPhaseName(subdag), subdag);
    }
    // build connections between phases and convert dags to PipelinePhase.
    Set<Connection> phaseConnections = new HashSet<>();
    Map<String, PipelinePhase> phases = new HashMap<>();
    for (Map.Entry<String, Dag> dagEntry1 : subdags.entrySet()) {
        String dag1Name = dagEntry1.getKey();
        Dag dag1 = dagEntry1.getValue();
        // convert the dag to a PipelinePhase
        // add a separate pipeline phase for each control node in the subdag
        Set<String> dag1ControlNodes = Sets.intersection(controlNodes, dag1.getNodes());
        for (String dag1ControlNode : dag1ControlNodes) {
            if (!phases.containsKey(dag1ControlNode)) {
                phases.put(dag1ControlNode, PipelinePhase.builder(supportedPluginTypes).addStage(specs.get(dag1ControlNode)).build());
            }
        }
        // if there are non-control nodes in the subdag, add a pipeline phase for them
        if (!controlNodes.containsAll(dag1.getNodes())) {
            // the updated dag replaces conditions with the corresponding connector if applicable.
            Dag updatedDag = getUpdatedDag(dag1, controlConnectors);
            // Remove any control nodes from this dag
            if (!Sets.intersection(updatedDag.getNodes(), controlNodes).isEmpty()) {
                Set<String> nodes = Sets.difference(updatedDag.getNodes(), controlNodes);
                updatedDag = updatedDag.createSubDag(nodes);
            }
            phases.put(dag1Name, dagToPipeline(updatedDag, connectorNodes, specs, controlConnectors));
        }
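        // for control nodes that are sources of this subdag, add a phase connection
        // from the control phase to this dag, or directly to the downstream control phase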
        for (String controlSource : Sets.intersection(controlNodes, dag1.getSources())) {
            ConditionBranches branches = conditionBranches.get(controlSource);
            Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
            for (String output : dag1.getNodeOutputs(controlSource)) {
                if (controlNodes.contains(output)) {
                    // control source -> control node, add a phase connection between the control phases
                    phaseConnections.add(new Connection(controlSource, output, condition));
                } else {
                    // control source -> non-control nodes, add a phase connection from the control phase to this dag
                    phaseConnections.add(new Connection(controlSource, dag1Name, condition));
                }
            }
        }
        // for control nodes that are sinks of this subdag, add a phase connection
        // from this dag (or from the preceding control phase) to the control phase
        for (String controlSink : Sets.intersection(controlNodes, dag1.getSinks())) {
            for (String input : dag1.getNodeInputs(controlSink)) {
                if (controlNodes.contains(input)) {
                    // control node -> control-sink, add a phase connection between the control phases
                    ConditionBranches branches = conditionBranches.get(input);
                    Boolean condition = branches == null ? null : dag1.getNodes().contains(branches.getTrueOutput());
                    phaseConnections.add(new Connection(input, controlSink, condition));
                } else {
                    // non-control node -> control-sink, add a phase connection from this dag to the control phase
                    phaseConnections.add(new Connection(dag1Name, controlSink));
                }
            }
        }
        // find connected subdags (they have a source that is a sink in dag1)
        Set<String> nonControlSinks = Sets.difference(dag1.getSinks(), controlNodes);
        for (Map.Entry<String, Dag> dagEntry2 : subdags.entrySet()) {
            String dag2Name = dagEntry2.getKey();
            Dag dag2 = dagEntry2.getValue();
            if (dag1Name.equals(dag2Name)) {
                continue;
            }
            if (!Sets.intersection(nonControlSinks, dag2.getSources()).isEmpty()) {
                phaseConnections.add(new Connection(dag1Name, dag2Name));
            }
        }
    }
    return new PipelinePlan(phases, phaseConnections);
}
Also used : ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) HashMap(java.util.HashMap) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) HashSet(java.util.HashSet) Connection(io.cdap.cdap.etl.proto.Connection) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) Map(java.util.Map)
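
The subtlest bookkeeping in plan() is the loop that records, for each condition, which downstream stage sits on the true branch and which on the false branch. The standalone sketch below reproduces just that logic with invented stage names ("c1", "sink1", "sink2") and a two-element array standing in for ConditionBranches.

import java.util.HashMap;
import java.util.Map;

public class ConditionBranchSketch {

    public static void main(String[] args) {
        // index 0 holds the true output, index 1 the false output
        Map<String, String[]> branches = new HashMap<>();
        branches.put("c1", new String[] { null, null });

        // each row is { from, to, condition }; only condition sources matter here
        String[][] connections = {
            { "c1", "sink1", "true" },
            { "c1", "sink2", "false" }
        };

        for (String[] conn : connections) {
            String[] outputs = branches.get(conn[0]);
            if (outputs == null) {
                continue; // the source is not a condition node
            }
            if (Boolean.parseBoolean(conn[2])) {
                outputs[0] = conn[1];
            } else {
                outputs[1] = conn[1];
            }
        }
        // prints: true -> sink1, false -> sink2
        System.out.println("true -> " + branches.get("c1")[0] + ", false -> " + branches.get("c1")[1]);
    }
}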

Example 39 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.

the class SparkStreamingPipelineRunner method handleJoin.

@Override
protected SparkCollection<Object> handleJoin(Map<String, SparkCollection<Object>> inputDataCollections,
                                             PipelinePhase pipelinePhase, PluginFunctionContext pluginFunctionContext,
                                             StageSpec stageSpec, FunctionCache.Factory functionCacheFactory,
                                             Object plugin, Integer numPartitions, StageStatisticsCollector collector,
                                             Set<String> shufflers) throws Exception {
    String stageName = stageSpec.getName();
    BatchJoiner<?, ?, ?> joiner;
    if (plugin instanceof BatchAutoJoiner) {
        BatchAutoJoiner autoJoiner = (BatchAutoJoiner) plugin;
        Map<String, Schema> inputSchemas = new HashMap<>();
        for (String inputStageName : pipelinePhase.getStageInputs(stageName)) {
            StageSpec inputStageSpec = pipelinePhase.getStage(inputStageName);
            inputSchemas.put(inputStageName, inputStageSpec.getOutputSchema());
        }
        FailureCollector failureCollector = new LoggingFailureCollector(stageName, inputSchemas);
        AutoJoinerContext autoJoinerContext = DefaultAutoJoinerContext.from(inputSchemas, failureCollector);
        failureCollector.getOrThrowException();
        JoinDefinition joinDefinition = autoJoiner.define(autoJoinerContext);
        if (joinDefinition == null) {
            throw new IllegalStateException(String.format("Joiner stage '%s' did not specify a join definition. " + "Check with the plugin developer to ensure it is implemented correctly.", stageName));
        }
        joiner = new JoinerBridge(stageName, autoJoiner, joinDefinition);
    } else if (plugin instanceof BatchJoiner) {
        joiner = (BatchJoiner) plugin;
    } else {
        // should never happen unless there is a bug in the code; this should have failed during deployment
        throw new IllegalStateException(String.format("Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
    }
    BatchJoinerRuntimeContext joinerRuntimeContext = pluginFunctionContext.createBatchRuntimeContext();
    joiner.initialize(joinerRuntimeContext);
    shufflers.add(stageName);
    return handleJoin(joiner, inputDataCollections, stageSpec, functionCacheFactory, numPartitions, collector);
}
Also used : BatchJoinerRuntimeContext(io.cdap.cdap.etl.api.batch.BatchJoinerRuntimeContext) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) HashMap(java.util.HashMap) Schema(io.cdap.cdap.api.data.schema.Schema) BatchJoiner(io.cdap.cdap.etl.api.batch.BatchJoiner) BatchAutoJoiner(io.cdap.cdap.etl.api.batch.BatchAutoJoiner) DefaultAutoJoinerContext(io.cdap.cdap.etl.common.DefaultAutoJoinerContext) AutoJoinerContext(io.cdap.cdap.etl.api.join.AutoJoinerContext) JoinDefinition(io.cdap.cdap.etl.api.join.JoinDefinition) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) LoggingFailureCollector(io.cdap.cdap.etl.validation.LoggingFailureCollector) FailureCollector(io.cdap.cdap.etl.api.FailureCollector) JoinerBridge(io.cdap.cdap.etl.common.plugin.JoinerBridge)
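
The method above follows a dispatch-and-bridge pattern: an auto-joiner is asked for its join definition up front, a missing definition fails fast, and the result is adapted to the common joiner interface. Here is a simplified, self-contained sketch of that pattern with invented interfaces, not the actual CDAP API.

// Simplified stand-ins for BatchJoiner/BatchAutoJoiner; not the CDAP interfaces.
interface Joiner {
    String join(String left, String right);
}

interface AutoJoiner {
    // returns null to simulate a plugin that forgot to define its join
    String defineSeparator();
}

// Adapts an AutoJoiner to the Joiner interface, failing fast on a null
// definition, the same way handleJoin() rejects a null JoinDefinition.
class BridgeSketch implements Joiner {
    private final String separator;

    BridgeSketch(String stageName, AutoJoiner auto) {
        this.separator = auto.defineSeparator();
        if (separator == null) {
            throw new IllegalStateException(String.format(
                "Joiner stage '%s' did not specify a join definition.", stageName));
        }
    }

    @Override
    public String join(String left, String right) {
        return left + separator + right;
    }
}

class DispatchSketch {
    static Joiner toJoiner(String stageName, Object plugin) {
        if (plugin instanceof AutoJoiner) {
            return new BridgeSketch(stageName, (AutoJoiner) plugin);
        } else if (plugin instanceof Joiner) {
            return (Joiner) plugin;
        }
        // mirrors the "unknown joiner type" failure above
        throw new IllegalStateException(String.format(
            "Stage '%s' is an unknown joiner type %s", stageName, plugin.getClass().getName()));
    }

    public static void main(String[] args) {
        Joiner joiner = toJoiner("joinStage", (AutoJoiner) () -> ",");
        System.out.println(joiner.join("left", "right")); // left,right
    }
}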

Example 40 with StageSpec

use of io.cdap.cdap.etl.proto.v2.spec.StageSpec in project cdap by cdapio.

the class PipelineAction method run.

@Override
public void run() throws Exception {
    CustomActionContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    PipelinePhase phase = phaseSpec.getPhase();
    StageSpec stageSpec = phase.iterator().next();
    PluginContext pluginContext = new PipelinePluginContext(context, metrics, phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, metrics);
    Action action = pluginContext.newPluginInstance(stageSpec.getName(),
        new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(),
            context, context, context.getNamespace()));
    ActionContext actionContext = new BasicActionContext(context, pipelineRuntime, stageSpec);
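    // the data tracer is only enabled during preview runs, where actions are skipped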
    if (!context.getDataTracer(stageSpec.getName()).isEnabled()) {
        action.run(actionContext);
    }
    WorkflowToken token = context.getWorkflowToken();
    if (token == null) {
        throw new IllegalStateException("WorkflowToken cannot be null when action is executed through Workflow.");
    }
    for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
        token.put(entry.getKey(), entry.getValue());
    }
}
Also used : Action(io.cdap.cdap.etl.api.action.Action) AbstractCustomAction(io.cdap.cdap.api.customaction.AbstractCustomAction) CustomAction(io.cdap.cdap.api.customaction.CustomAction) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext) PluginContext(io.cdap.cdap.api.plugin.PluginContext) WorkflowToken(io.cdap.cdap.api.workflow.WorkflowToken) CustomActionContext(io.cdap.cdap.api.customaction.CustomActionContext) ActionContext(io.cdap.cdap.etl.api.action.ActionContext) PipelinePhase(io.cdap.cdap.etl.common.PipelinePhase) StageSpec(io.cdap.cdap.etl.proto.v2.spec.StageSpec) DefaultMacroEvaluator(io.cdap.cdap.etl.common.DefaultMacroEvaluator) CustomActionContext(io.cdap.cdap.api.customaction.CustomActionContext) BatchPhaseSpec(io.cdap.cdap.etl.batch.BatchPhaseSpec) HashMap(java.util.HashMap) Map(java.util.Map) PipelinePluginContext(io.cdap.cdap.etl.common.plugin.PipelinePluginContext)
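
PipelineAction recovers its BatchPhaseSpec by deserializing JSON that was stored in the program's properties at deployment time. Below is a minimal round-trip sketch of that pattern; PhaseSpecStub and the "pipeline.id" key are invented stand-ins for BatchPhaseSpec and the Constants.PIPELINEID value.

import com.google.gson.Gson;
import java.util.HashMap;
import java.util.Map;

public class PhaseSpecRoundTrip {

    private static final Gson GSON = new Gson();

    // invented stand-in for BatchPhaseSpec
    static class PhaseSpecStub {
        String phaseName;
        boolean stageLoggingEnabled;
    }

    public static void main(String[] args) {
        PhaseSpecStub spec = new PhaseSpecStub();
        spec.phaseName = "phase-1";
        spec.stageLoggingEnabled = true;

        // at deployment time, the spec is serialized into the program properties
        Map<String, String> properties = new HashMap<>();
        properties.put("pipeline.id", GSON.toJson(spec));

        // at runtime, run() reads it back the same way as above
        PhaseSpecStub restored = GSON.fromJson(properties.get("pipeline.id"), PhaseSpecStub.class);
        System.out.println(restored.phaseName + " logging=" + restored.stageLoggingEnabled);
    }
}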

Aggregations

StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec): 74 uses
HashMap (java.util.HashMap): 42 uses
PipelinePhase (io.cdap.cdap.etl.common.PipelinePhase): 30 uses
HashSet (java.util.HashSet): 24 uses
Map (java.util.Map): 24 uses
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator): 20 uses
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator): 20 uses
Connection (io.cdap.cdap.etl.proto.Connection): 18 uses
Schema (io.cdap.cdap.api.data.schema.Schema): 16 uses
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime): 16 uses
ArrayList (java.util.ArrayList): 16 uses
BatchPhaseSpec (io.cdap.cdap.etl.batch.BatchPhaseSpec): 14 uses
PipelineSpec (io.cdap.cdap.etl.proto.v2.spec.PipelineSpec): 14 uses
Test (org.junit.Test): 14 uses
PipelinePluginContext (io.cdap.cdap.etl.common.plugin.PipelinePluginContext): 12 uses
PluginContext (io.cdap.cdap.api.plugin.PluginContext): 10 uses
BasicArguments (io.cdap.cdap.etl.common.BasicArguments): 10 uses
List (java.util.List): 10 uses
WorkflowToken (io.cdap.cdap.api.workflow.WorkflowToken): 8 uses
BatchJoiner (io.cdap.cdap.etl.api.batch.BatchJoiner): 8 uses