
Example 1 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class PipelinePlanner method populateActionPhases.

/**
   * This method is responsible for populating phases and phaseConnections with the Action phases.
   * An Action phase is a single-stage {@link PipelinePhase} which does not have any dag.
   *
   * @param pipelineSpec the overall spec for the pipeline
   * @param specs the Map of stage specs
   * @param actionNodes the Set of action nodes in the pipeline
   * @param phases the Map of phases created so far
   * @param phaseConnections the Set of connections between phases added so far
   * @param outgoingActionConnections the Map that holds the set of stages to which
   *                                  there is an outgoing connection from an Action stage
   * @param incomingActionConnections the Map that holds the set of stages from which
   *                                  there is an incoming connection to an Action stage
   * @param subdags subdags created so far from the pipeline stages
   */
private void populateActionPhases(PipelineSpec pipelineSpec, Map<String, StageSpec> specs, Set<String> actionNodes,
                                  Map<String, PipelinePhase> phases, Set<Connection> phaseConnections,
                                  SetMultimap<String, String> outgoingActionConnections,
                                  SetMultimap<String, String> incomingActionConnections, Map<String, Dag> subdags) {
    // Create single stage phases for the Action nodes
    for (String node : actionNodes) {
        StageSpec actionStageSpec = specs.get(node);
        String type = actionStageSpec.getPlugin().getType();
        StageInfo actionStageInfo = StageInfo.builder(node, type)
            .addInputs(actionStageSpec.getInputs())
            .addInputSchemas(actionStageSpec.getInputSchemas())
            .addOutputs(actionStageSpec.getOutputs())
            .setOutputSchema(actionStageSpec.getOutputSchema())
            .setErrorSchema(actionStageSpec.getErrorSchema())
            .setErrorDatasetName(actionStageSpec.getErrorDatasetName())
            .setStageLoggingEnabled(pipelineSpec.isStageLoggingEnabled())
            .setProcessTimingEnabled(pipelineSpec.isProcessTimingEnabled())
            .build();
        phases.put(node, PipelinePhase.builder(supportedPluginTypes).addStage(actionStageInfo).build());
    }
    // Build phaseConnections for the Action nodes
    for (String sourceAction : outgoingActionConnections.keySet()) {
        // Check if destination is one of the source stages in the pipeline
        for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
            if (!Sets.intersection(outgoingActionConnections.get(sourceAction), subdagEntry.getValue().getSources()).isEmpty()) {
                phaseConnections.add(new Connection(sourceAction, subdagEntry.getKey()));
            }
        }
        // Check if destination is other Action node
        for (String destination : outgoingActionConnections.get(sourceAction)) {
            if (actionNodes.contains(destination)) {
                phaseConnections.add(new Connection(sourceAction, destination));
            }
        }
    }
    for (String destinationAction : incomingActionConnections.keySet()) {
        // Check if source is one of the sink stages in the pipeline
        for (Map.Entry<String, Dag> subdagEntry : subdags.entrySet()) {
            if (!Sets.intersection(incomingActionConnections.get(destinationAction), subdagEntry.getValue().getSinks()).isEmpty()) {
                phaseConnections.add(new Connection(subdagEntry.getKey(), destinationAction));
            }
        }
    }
}
Also used : StageSpec(co.cask.cdap.etl.spec.StageSpec) Connection(co.cask.cdap.etl.proto.Connection) HashMap(java.util.HashMap) Map(java.util.Map)
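
The method above only wires Action phases into an existing plan. As a hedged illustration of the same builder calls, here is a minimal sketch (not taken from the CDAP sources) of turning one action node into its own single-stage phase and connecting it to a downstream phase. The stage name "run-script", the plugin type "action", the downstream phase name "phase-1", and the supportedPluginTypes set are hypothetical placeholders; the StageInfo, PipelinePhase, and Connection calls are the ones used in Example 1.

Set<String> supportedPluginTypes = ImmutableSet.of("action");
// a single-stage phase for a hypothetical action named "run-script"
StageInfo actionStage = StageInfo.builder("run-script", "action").build();
Map<String, PipelinePhase> phases = new HashMap<>();
phases.put("run-script", PipelinePhase.builder(supportedPluginTypes).addStage(actionStage).build());
// connect the action phase to a (hypothetical) downstream subdag phase keyed as "phase-1"
Set<Connection> phaseConnections = new HashSet<>();
phaseConnections.add(new Connection("run-script", "phase-1"));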

Example 2 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class ETLWorker method configure.

@Override
public void configure() {
    setName(NAME);
    setDescription("Worker Driver for Realtime ETL Pipelines");
    int instances = config.getInstances();
    if (instances < 1) {
        throw new IllegalArgumentException("instances must be greater than 0.");
    }
    setInstances(instances);
    if (config.getResources() != null) {
        setResources(config.getResources());
    }
    PipelineSpecGenerator<ETLRealtimeConfig, PipelineSpec> specGenerator =
        new RealtimePipelineSpecGenerator(getConfigurer(), ImmutableSet.of(RealtimeSource.PLUGIN_TYPE),
                                          ImmutableSet.of(RealtimeSink.PLUGIN_TYPE), Table.class,
                                          TableProperties.builder().setSchema(ERROR_SCHEMA).build());
    PipelineSpec spec = specGenerator.generateSpec(config);
    int sourceCount = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        if (RealtimeSource.PLUGIN_TYPE.equals(stageSpec.getPlugin().getType())) {
            sourceCount++;
        }
    }
    if (sourceCount != 1) {
        throw new IllegalArgumentException("Invalid pipeline. There must only be one source.");
    }
    PipelinePlanner planner = new PipelinePlanner(SUPPORTED_PLUGIN_TYPES, ImmutableSet.<String>of(), ImmutableSet.<String>of(), ImmutableSet.<String>of());
    PipelinePlan plan = planner.plan(spec);
    if (plan.getPhases().size() != 1) {
        // should never happen
        throw new IllegalArgumentException("There was an error planning the pipeline. There should only be one phase.");
    }
    PipelinePhase pipeline = plan.getPhases().values().iterator().next();
    Map<String, String> properties = new HashMap<>();
    properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
    properties.put(Constants.PIPELINEID, GSON.toJson(pipeline));
    // Generate unique id for this app creation.
    properties.put(UNIQUE_ID, String.valueOf(System.currentTimeMillis()));
    properties.put(Constants.STAGE_LOGGING_ENABLED, String.valueOf(config.isStageLoggingEnabled()));
    setProperties(properties);
}
Also used : PipelinePlan(co.cask.cdap.etl.planner.PipelinePlan) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) HashMap(java.util.HashMap) ETLRealtimeConfig(co.cask.cdap.etl.proto.v2.ETLRealtimeConfig) PipelineSpec(co.cask.cdap.etl.spec.PipelineSpec) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) StageSpec(co.cask.cdap.etl.spec.StageSpec)
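
Example 2 rejects any realtime pipeline that does not have exactly one source. As a hedged sketch (not part of the CDAP sources), the counting loop generalizes into a small helper that relies only on PipelineSpec.getStages() and StageSpec.getPlugin().getType(), both of which appear above:

private static int countStagesOfType(PipelineSpec spec, String pluginType) {
    int count = 0;
    for (StageSpec stageSpec : spec.getStages()) {
        // compare against the plugin type recorded in each stage's spec
        if (pluginType.equals(stageSpec.getPlugin().getType())) {
            count++;
        }
    }
    return count;
}

With such a helper, the check above would read: if (countStagesOfType(spec, RealtimeSource.PLUGIN_TYPE) != 1) { throw new IllegalArgumentException("Invalid pipeline. There must only be one source."); }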

Example 3 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class SmartWorkflow method configure.

@Override
protected void configure() {
    setName(NAME);
    setDescription(DESCRIPTION);
    // set the pipeline spec as a property in case somebody like the UI wants to read it
    Map<String, String> properties = new HashMap<>();
    properties.put(Constants.PIPELINE_SPEC_KEY, GSON.toJson(spec));
    setProperties(properties);
    stageSpecs = new HashMap<>();
    useSpark = engine == Engine.SPARK;
    for (StageSpec stageSpec : spec.getStages()) {
        stageSpecs.put(stageSpec.getName(), stageSpec);
        String pluginType = stageSpec.getPlugin().getType();
        if (SparkCompute.PLUGIN_TYPE.equals(pluginType) || SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            useSpark = true;
        }
    }
    PipelinePlanner planner;
    Set<String> actionTypes = ImmutableSet.of(Action.PLUGIN_TYPE, Constants.SPARK_PROGRAM_PLUGIN_TYPE);
    Set<String> multiPortTypes = ImmutableSet.of(SplitterTransform.PLUGIN_TYPE);
    if (useSpark) {
        // if the pipeline uses spark, we don't need to break the pipeline up into phases, we can just have
        // a single phase.
        planner = new PipelinePlanner(supportedPluginTypes, ImmutableSet.<String>of(), ImmutableSet.<String>of(),
                                      actionTypes, multiPortTypes);
    } else {
        planner = new PipelinePlanner(supportedPluginTypes,
                                      ImmutableSet.of(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE),
                                      ImmutableSet.of(SparkCompute.PLUGIN_TYPE, SparkSink.PLUGIN_TYPE),
                                      actionTypes, multiPortTypes);
    }
    plan = planner.plan(spec);
    WorkflowProgramAdder programAdder = new TrunkProgramAdder(getConfigurer());
    // single phase, just add the program directly
    if (plan.getPhases().size() == 1) {
        addProgram(plan.getPhases().keySet().iterator().next(), programAdder);
        return;
    }
    // Dag classes don't allow a 'dag' without connections
    if (plan.getPhaseConnections().isEmpty()) {
        WorkflowProgramAdder fork = programAdder.fork();
        for (String phaseName : plan.getPhases().keySet()) {
            addProgram(phaseName, fork);
        }
        fork.join();
        return;
    }
    dag = new ControlDag(plan.getPhaseConnections());
    boolean dummyNodeAdded = false;
    Map<String, ConditionBranches> conditionBranches = plan.getConditionPhaseBranches();
    if (conditionBranches.isEmpty()) {
        // after flattening, there is guaranteed to be just one source
        dag.flatten();
    } else if (!conditionBranches.keySet().containsAll(dag.getSources())) {
        // Continue only if the condition node is not the source of the dag, otherwise the dag is already in the
        // required form
        Set<String> conditions = conditionBranches.keySet();
        // flatten only the part of the dag starting from sources and ending in conditions/sinks.
        Set<String> dagNodes = dag.accessibleFrom(dag.getSources(), Sets.union(dag.getSinks(), conditions));
        Set<String> dagNodesWithoutCondition = Sets.difference(dagNodes, conditions);
        Set<Connection> connections = new HashSet<>();
        Deque<String> bfs = new LinkedList<>();
        Set<String> sinks = new HashSet<>();
        // If it's a single phase without a condition then there is no need to flatten
        if (dagNodesWithoutCondition.size() > 1) {
            Dag subDag;
            try {
                subDag = dag.createSubDag(dagNodesWithoutCondition);
            } catch (IllegalArgumentException | DisjointConnectionsException e) {
                // DisjointConnectionsException thrown when islands are created from the dagNodesWithoutCondition
                // IllegalArgumentException thrown when connections are empty
                // In both cases we need to add a dummy node and create a connected Dag
                String dummyNode = "dummy";
                dummyNodeAdded = true;
                Set<Connection> subDagConnections = new HashSet<>();
                for (String source : dag.getSources()) {
                    subDagConnections.add(new Connection(dummyNode, source));
                }
                Deque<String> subDagBFS = new LinkedList<>();
                subDagBFS.addAll(dag.getSources());
                while (subDagBFS.peek() != null) {
                    String node = subDagBFS.poll();
                    for (String output : dag.getNodeOutputs(node)) {
                        if (dagNodesWithoutCondition.contains(output)) {
                            subDagConnections.add(new Connection(node, output));
                            subDagBFS.add(output);
                        }
                    }
                }
                subDag = new Dag(subDagConnections);
            }
            ControlDag cdag = new ControlDag(subDag);
            cdag.flatten();
            // Add all connections from cdag
            bfs.addAll(cdag.getSources());
            while (bfs.peek() != null) {
                String node = bfs.poll();
                for (String output : cdag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            }
            sinks.addAll(cdag.getSinks());
        } else {
            sinks.addAll(dagNodesWithoutCondition);
        }
        // Add back the existing condition nodes and corresponding conditions
        Set<String> conditionsFromDag = Sets.intersection(dagNodes, conditions);
        for (String condition : conditionsFromDag) {
            connections.add(new Connection(sinks.iterator().next(), condition));
        }
        bfs.addAll(Sets.intersection(dagNodes, conditions));
        while (bfs.peek() != null) {
            String node = bfs.poll();
            ConditionBranches branches = conditionBranches.get(node);
            if (branches == null) {
                // not a condition node. add outputs
                for (String output : dag.getNodeOutputs(node)) {
                    connections.add(new Connection(node, output));
                    bfs.add(output);
                }
            } else {
                // condition node
                for (Boolean condition : Arrays.asList(true, false)) {
                    String phase = condition ? branches.getTrueOutput() : branches.getFalseOutput();
                    if (phase == null) {
                        continue;
                    }
                    connections.add(new Connection(node, phase, condition));
                    bfs.add(phase);
                }
            }
        }
        dag = new ControlDag(connections);
    }
    if (dummyNodeAdded) {
        WorkflowProgramAdder fork = programAdder.fork();
        String dummyNode = dag.getSources().iterator().next();
        for (String output : dag.getNodeOutputs(dummyNode)) {
            // need to make sure we don't call also() if this is the final branch
            if (!addBranchPrograms(output, fork)) {
                fork = fork.also();
            }
        }
    } else {
        String start = dag.getSources().iterator().next();
        addPrograms(start, programAdder);
    }
}
Also used : ControlDag(co.cask.cdap.etl.planner.ControlDag) PipelinePlanner(co.cask.cdap.etl.planner.PipelinePlanner) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) PartitionedFileSet(co.cask.cdap.api.dataset.lib.PartitionedFileSet) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Connection(co.cask.cdap.etl.proto.Connection) Dag(co.cask.cdap.etl.planner.Dag) Deque(java.util.Deque) LinkedList(java.util.LinkedList) ConditionBranches(co.cask.cdap.etl.planner.ConditionBranches) StageSpec(co.cask.cdap.etl.spec.StageSpec)
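
Example 3 walks the control dag breadth-first twice to rebuild its connection set. As a hedged illustration (not code from the CDAP sources), that traversal can be factored into a helper that assumes only Dag.getSources(), Dag.getNodeOutputs(String), and the two-argument Connection constructor, all of which appear in the example:

private static Set<Connection> collectConnections(Dag dag) {
    Set<Connection> connections = new HashSet<>();
    // breadth-first walk from the dag's sources, mirroring the loops in Example 3
    Deque<String> bfs = new LinkedList<>(dag.getSources());
    while (bfs.peek() != null) {
        String node = bfs.poll();
        for (String output : dag.getNodeOutputs(node)) {
            connections.add(new Connection(node, output));
            bfs.add(output);
        }
    }
    return connections;
}

Like the original loops, this may visit a node once per incoming edge; that is harmless here because connections is a Set and a dag has no cycles.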

Example 4 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class ETLMapReduce method initialize.

@Override
@TransactionPolicy(TransactionControl.EXPLICIT)
public void initialize() throws Exception {
    final MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    PipelineRuntime pipelineRuntime = new PipelineRuntime(context, mrMetrics);
    List<Finisher> finishers = new ArrayList<>();
    final Job job = context.getHadoopJob();
    final Configuration hConf = job.getConfiguration();
    hConf.setBoolean("mapreduce.map.speculative", false);
    hConf.setBoolean("mapreduce.reduce.speculative", false);
    // plugin name -> runtime args for that plugin
    MacroEvaluator evaluator = new DefaultMacroEvaluator(pipelineRuntime.getArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    Set<String> connectorDatasets = GSON.fromJson(properties.get(Constants.CONNECTOR_DATASETS), CONNECTOR_DATASETS_TYPE);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    final PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec, new MultiConnectorFactory());
    // should never happen if planner is correct
    Set<StageSpec> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (reducers.size() > 1) {
        Iterator<StageSpec> reducerIter = reducers.iterator();
        StringBuilder reducersStr = new StringBuilder(reducerIter.next().getName());
        while (reducerIter.hasNext()) {
            reducersStr.append(",");
            reducersStr.append(reducerIter.next().getName());
        }
        throw new IllegalStateException("Found multiple reducers ( " + reducersStr + " ) in the same pipeline phase. " + "This means there was a bug in planning the pipeline when it was deployed. ");
    }
    job.setMapperClass(ETLMapper.class);
    if (reducers.isEmpty()) {
        job.setNumReduceTasks(0);
    } else {
        job.setReducerClass(ETLReducer.class);
    }
    final Map<String, SinkOutput> sinkOutputs = new HashMap<>();
    final Map<String, String> inputAliasToStage = new HashMap<>();
    // call prepareRun on each stage in order so that any arguments set by a stage will be visible to subsequent stages
    for (final String stageName : phase.getDag().getTopologicalOrder()) {
        final StageSpec stageSpec = phase.getStage(stageName);
        String pluginType = stageSpec.getPluginType();
        boolean isConnectorSource = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSources().contains(stageName);
        boolean isConnectorSink = Constants.Connector.PLUGIN_TYPE.equals(pluginType) && phase.getSinks().contains(stageName);
        SubmitterPlugin submitterPlugin = null;
        if (BatchSource.PLUGIN_TYPE.equals(pluginType) || isConnectorSource) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSource, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {

                @Override
                public void act(MapReduceBatchContext sourceContext) {
                    for (String inputAlias : sourceContext.getInputNames()) {
                        inputAliasToStage.put(inputAlias, stageName);
                    }
                }
            });
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType) || AlertPublisher.PLUGIN_TYPE.equals(pluginType) || isConnectorSink) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, batchSink, contextProvider, new SubmitterPlugin.PrepareAction<MapReduceBatchContext>() {

                @Override
                public void act(MapReduceBatchContext sinkContext) {
                    sinkOutputs.put(stageName, new SinkOutput(sinkContext.getOutputNames()));
                }
            });
        } else if (Transform.PLUGIN_TYPE.equals(pluginType)) {
            Transform<?, ?> transform = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<MapReduceBatchContext> contextProvider = new MapReduceBatchContextProvider(context, pipelineRuntime, stageSpec, connectorDatasets);
            submitterPlugin = new SubmitterPlugin<>(stageName, context, transform, contextProvider);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            final BatchAggregator<?, ?, ?> aggregator = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultAggregatorContext> contextProvider = new AggregatorContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
            submitterPlugin = new SubmitterPlugin<>(stageName, context, aggregator, contextProvider, new SubmitterPlugin.PrepareAction<DefaultAggregatorContext>() {

                @Override
                public void act(DefaultAggregatorContext aggregatorContext) {
                    if (aggregatorContext.getNumPartitions() != null) {
                        job.setNumReduceTasks(aggregatorContext.getNumPartitions());
                    }
                    Class<?> outputKeyClass = aggregatorContext.getGroupKeyClass();
                    Class<?> outputValClass = aggregatorContext.getGroupValueClass();
                    if (outputKeyClass == null) {
                        outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
                    }
                    if (outputValClass == null) {
                        outputValClass = TypeChecker.getGroupValueClass(aggregator);
                    }
                    hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                    hConf.set(MAP_VAL_CLASS, outputValClass.getName());
                    job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
                    job.setMapOutputValueClass(getOutputValClass(stageName, outputValClass));
                }
            });
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            final BatchJoiner<?, ?, ?> batchJoiner = pluginInstantiator.newPluginInstance(stageName, evaluator);
            ContextProvider<DefaultJoinerContext> contextProvider = new JoinerContextProvider(pipelineRuntime, stageSpec, context.getAdmin());
            submitterPlugin = new SubmitterPlugin<>(stageName, context, batchJoiner, contextProvider, new SubmitterPlugin.PrepareAction<DefaultJoinerContext>() {

                @Override
                public void act(DefaultJoinerContext joinerContext) {
                    if (joinerContext.getNumPartitions() != null) {
                        job.setNumReduceTasks(joinerContext.getNumPartitions());
                    }
                    Class<?> outputKeyClass = joinerContext.getJoinKeyClass();
                    Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
                    if (outputKeyClass == null) {
                        outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
                    }
                    if (inputRecordClass == null) {
                        inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
                    }
                    hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                    hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
                    job.setMapOutputKeyClass(getOutputKeyClass(stageName, outputKeyClass));
                    getOutputValClass(stageName, inputRecordClass);
                    // for the joiner plugin, the map output is tagged with stageName
                    job.setMapOutputValueClass(TaggedWritable.class);
                }
            });
        }
        if (submitterPlugin != null) {
            submitterPlugin.prepareRun();
            finishers.add(submitterPlugin);
        }
    }
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
    hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
    finisher = new CompositeFinisher(finishers);
    job.setMapperClass(ETLMapper.class);
    WorkflowToken token = context.getWorkflowToken();
    if (token != null) {
        for (Map.Entry<String, String> entry : pipelineRuntime.getArguments().getAddedArguments().entrySet()) {
            token.put(entry.getKey(), entry.getValue());
        }
    }
    // token is null when just the mapreduce job is run but not the entire workflow
    // we still want things to work in that case.
    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(pipelineRuntime.getArguments().asMap()));
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CompositeFinisher(co.cask.cdap.etl.common.submit.CompositeFinisher) SubmitterPlugin(co.cask.cdap.etl.common.submit.SubmitterPlugin) Finisher(co.cask.cdap.etl.common.submit.Finisher) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) Job(org.apache.hadoop.mapreduce.Job) JoinerContextProvider(co.cask.cdap.etl.common.submit.JoinerContextProvider) ContextProvider(co.cask.cdap.etl.common.submit.ContextProvider) AggregatorContextProvider(co.cask.cdap.etl.common.submit.AggregatorContextProvider) MapReduceContext(co.cask.cdap.api.mapreduce.MapReduceContext) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) Map(java.util.Map) PipelineRuntime(co.cask.cdap.etl.common.PipelineRuntime) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) WorkflowToken(co.cask.cdap.api.workflow.WorkflowToken) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) StageSpec(co.cask.cdap.etl.spec.StageSpec) MultiConnectorFactory(co.cask.cdap.etl.batch.connector.MultiConnectorFactory) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) BatchConfigurable(co.cask.cdap.etl.api.batch.BatchConfigurable) TransactionPolicy(co.cask.cdap.api.annotation.TransactionPolicy)
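
Example 4 applies the same pattern to every stage type: build a SubmitterPlugin with a stage-specific ContextProvider (and optionally a PrepareAction), call prepareRun(), and keep the plugin as a Finisher, combining them all into a CompositeFinisher at the end. A condensed, hedged sketch of that skeleton follows; createSubmitterPlugin is a hypothetical helper standing in for the long if/else-if chain, while the remaining calls all appear in the example:

List<Finisher> finishers = new ArrayList<>();
for (String stageName : phase.getDag().getTopologicalOrder()) {
    StageSpec stageSpec = phase.getStage(stageName);
    // hypothetical helper: picks the plugin instance, context provider, and prepare action for this stage
    SubmitterPlugin submitterPlugin = createSubmitterPlugin(stageSpec);
    if (submitterPlugin != null) {
        submitterPlugin.prepareRun();
        finishers.add(submitterPlugin);
    }
}
Finisher finisher = new CompositeFinisher(finishers);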

Example 5 with StageSpec

use of co.cask.cdap.etl.spec.StageSpec in project cdap by caskdata.

the class MapReduceTransformExecutorFactory method getPipeStage.

private PipeStage getPipeStage(PipelinePhase pipeline, String stageName, Map<String, PipeStage> pipeStages, OutputWriter<?, ?> outputWriter) throws Exception {
    StageSpec stageSpec = pipeline.getStage(stageName);
    String pluginType = stageSpec.getPluginType();
    // handle the ending stage case, which doesn't use a PipeEmitter
    if (pipeline.getSinks().contains(stageName)) {
        if (Constants.Connector.PLUGIN_TYPE.equals(pluginType) || BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            // connectors and joiners require getting the RecordInfo class directly instead of unwrapping it
            Transformation<RecordInfo<Object>, Object> sink = getTransformation(stageSpec);
            return new DirectOutputPipeStage<>(stageName, sink, new SinkEmitter<>(stageName, outputWriter));
        } else {
            // others (batch sinks, aggregators, alert publishers) only require the value within the RecordInfo
            return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec), new SinkEmitter<>(stageName, outputWriter));
        }
    }
    // create PipeEmitter, which holds all output PipeStages it needs to write to and wraps any output it gets
    // into a RecordInfo
    // ConnectorSources require a special emitter since they need to build RecordInfo from the temporary dataset
    PipeEmitter.Builder emitterBuilder =
        Constants.Connector.PLUGIN_TYPE.equals(pluginType) && pipeline.getSources().contains(stageName)
            ? ConnectorSourceEmitter.builder(stageName)
            : PipeEmitter.builder(stageName);
    Map<String, StageSpec.Port> outputPorts = stageSpec.getOutputPorts();
    for (String outputStageName : pipeline.getDag().getNodeOutputs(stageName)) {
        StageSpec outputStageSpec = pipeline.getStage(outputStageName);
        String outputStageType = outputStageSpec.getPluginType();
        PipeStage outputPipeStage = pipeStages.get(outputStageName);
        if (ErrorTransform.PLUGIN_TYPE.equals(outputStageType)) {
            emitterBuilder.addErrorConsumer(outputPipeStage);
        } else if (AlertPublisher.PLUGIN_TYPE.equals(outputStageType)) {
            emitterBuilder.addAlertConsumer(outputPipeStage);
        } else if (Constants.Connector.PLUGIN_TYPE.equals(pluginType)) {
            // connectors only have a single output
            emitterBuilder.addOutputConsumer(outputPipeStage);
        } else {
            // if the output is a connector like agg5.connector, the outputPorts will contain the original 'agg5' as
            // a key, but not 'agg5.connector' so we need to lookup the original stage from the connector's plugin spec
            String originalOutputName = Constants.Connector.PLUGIN_TYPE.equals(outputStageType)
                ? outputStageSpec.getPlugin().getProperties().get(Constants.Connector.ORIGINAL_NAME)
                : outputStageName;
            String port = outputPorts.containsKey(originalOutputName)
                ? outputPorts.get(originalOutputName).getPort()
                : null;
            if (port != null) {
                emitterBuilder.addOutputConsumer(outputPipeStage, port);
            } else {
                emitterBuilder.addOutputConsumer(outputPipeStage);
            }
        }
    }
    PipeEmitter pipeEmitter = emitterBuilder.build();
    if (SplitterTransform.PLUGIN_TYPE.equals(pluginType)) {
        // this is a SplitterTransform, needs to emit records to the right outputs based on port
        return new MultiOutputTransformPipeStage<>(stageName, getMultiOutputTransform(stageSpec), pipeEmitter);
    } else {
        return new UnwrapPipeStage<>(stageName, getTransformation(stageSpec), pipeEmitter);
    }
}
Also used : UnwrapPipeStage(co.cask.cdap.etl.batch.UnwrapPipeStage) RecordInfo(co.cask.cdap.etl.common.RecordInfo) MultiOutputTransformPipeStage(co.cask.cdap.etl.batch.MultiOutputTransformPipeStage) PipeStage(co.cask.cdap.etl.batch.PipeStage) DirectOutputPipeStage(co.cask.cdap.etl.batch.DirectOutputPipeStage) PipeEmitter(co.cask.cdap.etl.batch.PipeEmitter) StageSpec(co.cask.cdap.etl.spec.StageSpec)
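
Because getPipeStage() looks up its downstream stages in the pipeStages map, those downstream PipeStages must already exist when it runs. A hedged sketch of a possible caller (an assumption about the surrounding code, not taken from the CDAP sources) would therefore fill the map in reverse topological order, reusing pipeline.getDag().getTopologicalOrder() from Example 4, Guava's Lists.newArrayList, and the getPipeStage signature above; outputWriter is assumed to be in scope:

Map<String, PipeStage> pipeStages = new HashMap<>();
List<String> traversalOrder = Lists.newArrayList(pipeline.getDag().getTopologicalOrder());
// walk from the sinks back to the sources so every output PipeStage exists before the stages feeding it
Collections.reverse(traversalOrder);
for (String stageName : traversalOrder) {
    pipeStages.put(stageName, getPipeStage(pipeline, stageName, pipeStages, outputWriter));
}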

Aggregations

StageSpec (co.cask.cdap.etl.spec.StageSpec): 27
HashMap (java.util.HashMap): 20
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 15
Map (java.util.Map): 10
PipelineRuntime (co.cask.cdap.etl.common.PipelineRuntime): 8
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 7
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 7
Connection (co.cask.cdap.etl.proto.Connection): 7
HashSet (java.util.HashSet): 7
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 6
PipelinePluginContext (co.cask.cdap.etl.common.plugin.PipelinePluginContext): 5
PipelineSpec (co.cask.cdap.etl.spec.PipelineSpec): 5
TransactionPolicy (co.cask.cdap.api.annotation.TransactionPolicy): 4
PluginContext (co.cask.cdap.api.plugin.PluginContext): 4
WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken): 4
LinkedHashMap (java.util.LinkedHashMap): 4
Test (org.junit.Test): 4
DatasetContext (co.cask.cdap.api.data.DatasetContext): 2
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 2
SparkClientContext (co.cask.cdap.api.spark.SparkClientContext): 2