Example 11 with StageInfo

use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

the class TransformExecutorFactory method setPipeTransformDetail.

private <KEY_OUT, VAL_OUT> void setPipeTransformDetail(PipelinePhase pipeline, String stageName,
                                                       Map<String, PipeTransformDetail> transformations,
                                                       Map<String, ErrorOutputWriter<Object, Object>> transformErrorSinkMap,
                                                       OutputWriter<KEY_OUT, VAL_OUT> outputWriter) throws Exception {
    if (pipeline.getSinks().contains(stageName)) {
        StageInfo stageInfo = pipeline.getStage(stageName);
        // If a connector sink or joiner is at the end of the pipeline, do not remove the stage name: the connector
        // sink saves the stage name along with each record, and a joiner needs the stage name of each of its inputs.
        String pluginType = stageInfo.getPluginType();
        boolean removeStageName = !(pluginType.equals(Constants.CONNECTOR_TYPE) || pluginType.equals(BatchJoiner.PLUGIN_TYPE));
        boolean isErrorConsumer = pluginType.equals(ErrorTransform.PLUGIN_TYPE);
        transformations.put(stageName,
                            new PipeTransformDetail(stageName, removeStageName, isErrorConsumer,
                                                    getTransformation(stageInfo),
                                                    new SinkEmitter<>(stageName, outputWriter)));
        return;
    }
    try {
        addTransformation(pipeline, stageName, transformations, transformErrorSinkMap);
    } catch (Exception e) {
        // Catch the Exception to generate a User Error Log for the Pipeline
        PIPELINE_LOG.error("Failed to start pipeline stage '{}' with the error: {}. Please review your pipeline " +
                             "configuration and check the system logs for more details.",
                           stageName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
        throw e;
    }
    for (String output : pipeline.getDag().getNodeOutputs(stageName)) {
        setPipeTransformDetail(pipeline, output, transformations, transformErrorSinkMap, outputWriter);
        transformations.get(stageName).addTransformation(output, transformations.get(output));
    }
}
Also used : StageInfo(co.cask.cdap.etl.planner.StageInfo), SinkEmitter(co.cask.cdap.etl.batch.mapreduce.SinkEmitter)
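
setPipeTransformDetail walks the pipeline DAG depth-first: it wires the sink case directly, and otherwise recurses into each output stage before linking it back to the current stage. The toy sketch below restates that wiring pattern with plain Java collections; every name here is hypothetical (not the CDAP classes), and the duplicate-visit guard is a simplification of what the original achieves through the transformations map.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

// Toy restatement of the depth-first wiring in setPipeTransformDetail.
// DAG maps each stage to its output stages; all names are hypothetical.
class DagWiringSketch {
    static final Map<String, List<String>> DAG = Map.of(
        "source", List.of("transform"),
        "transform", List.of("sink"),
        "sink", List.of());

    // wired: stage name -> downstream stage names already linked to it
    static void wire(String stage, Map<String, List<String>> wired) {
        if (wired.containsKey(stage)) {
            return; // already wired; a stage can be reachable from several parents
        }
        wired.put(stage, new ArrayList<>());
        for (String output : DAG.get(stage)) {
            wire(output, wired);           // recurse into the child first, ...
            wired.get(stage).add(output);  // ... then link it, mirroring the loop above
        }
    }

    public static void main(String[] args) {
        Map<String, List<String>> wired = new LinkedHashMap<>();
        wire("source", wired);
        System.out.println(wired); // {source=[transform], transform=[sink], sink=[]}
    }
}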

Example 12 with StageInfo

use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

the class SmartWorkflow method destroy.

@Override
public void destroy() {
    WorkflowContext workflowContext = getContext();
    // Execute the post actions only if the pipeline is not running in preview mode.
    if (!workflowContext.getDataTracer(PostAction.PLUGIN_TYPE).isEnabled()) {
        BasicArguments arguments = new BasicArguments(workflowContext.getToken(), workflowContext.getRuntimeArguments());
        for (Map.Entry<String, PostAction> endingActionEntry : postActions.entrySet()) {
            String name = endingActionEntry.getKey();
            PostAction action = endingActionEntry.getValue();
            StageInfo stageInfo = StageInfo.builder(name, PostAction.PLUGIN_TYPE)
              .setStageLoggingEnabled(spec.isStageLoggingEnabled())
              .setProcessTimingEnabled(spec.isProcessTimingEnabled())
              .build();
            BatchActionContext context = new WorkflowBackedActionContext(workflowContext, workflowMetrics,
                                                                         stageInfo, arguments);
            try {
                action.run(context);
            } catch (Throwable t) {
                LOG.error("Error while running post action {}.", name, t);
            }
        }
    }
    ProgramStatus status = getContext().getState().getStatus();
    if (status == ProgramStatus.FAILED) {
        WRAPPERLOGGER.error("Pipeline '{}' failed.", getContext().getApplicationSpecification().getName());
    } else {
        WRAPPERLOGGER.info("Pipeline '{}' {}.", getContext().getApplicationSpecification().getName(),
                           status == ProgramStatus.COMPLETED ? "succeeded" : status.name().toLowerCase());
    }
}
Also used : BatchActionContext(co.cask.cdap.etl.api.batch.BatchActionContext), WorkflowBackedActionContext(co.cask.cdap.etl.batch.WorkflowBackedActionContext), StageInfo(co.cask.cdap.etl.planner.StageInfo), WorkflowContext(co.cask.cdap.api.workflow.WorkflowContext), BasicArguments(co.cask.cdap.etl.common.BasicArguments), PostAction(co.cask.cdap.etl.api.batch.PostAction), Map(java.util.Map), HashMap(java.util.HashMap), LinkedHashMap(java.util.LinkedHashMap), ProgramStatus(co.cask.cdap.api.ProgramStatus)
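
destroy() builds a StageInfo and a WorkflowBackedActionContext for each configured PostAction and runs it, catching Throwable per action so one failing action cannot prevent the others from running. A minimal sketch of a post action that destroy() could invoke, assuming PostAction exposes configurePipeline and run(BatchActionContext) as the action.run(context) call above suggests:

import co.cask.cdap.etl.api.PipelineConfigurer;
import co.cask.cdap.etl.api.batch.BatchActionContext;
import co.cask.cdap.etl.api.batch.PostAction;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Minimal sketch of a post action like the ones destroy() iterates over.
public class LoggingPostAction extends PostAction {
    private static final Logger LOG = LoggerFactory.getLogger(LoggingPostAction.class);

    @Override
    public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
        // nothing to configure in this sketch
    }

    @Override
    public void run(BatchActionContext context) throws Exception {
        // destroy() catches Throwable per action, so an exception here is logged
        // against this stage but cannot fail the remaining post actions
        LOG.info("Pipeline finished; running post action.");
    }
}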

Example 13 with StageInfo

use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

the class SmartWorkflow method addProgram.

private void addProgram(String phaseName, WorkflowProgramAdder programAdder) {
    PipelinePhase phase = plan.getPhase(phaseName);
    // if the original plan didn't have this phase, it was added artificially by the control dag
    // flattening process, so there is nothing to add; skip it
    if (phase == null) {
        return;
    }
    // can't use phase name as a program name because it might contain invalid characters
    String programName = "phase-" + phaseNum;
    phaseNum++;
    // if this phase uses connectors, add the local dataset for that connector if we haven't already
    for (StageInfo connectorInfo : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
        String connectorName = connectorInfo.getName();
        String datasetName = connectorDatasets.get(connectorName);
        if (datasetName == null) {
            datasetName = "conn-" + connectorNum++;
            connectorDatasets.put(connectorName, datasetName);
            // add the local dataset
            ConnectorSource connectorSource = new ConnectorSource(datasetName, null);
            connectorSource.configure(getConfigurer());
        }
    }
    Map<String, String> phaseConnectorDatasets = new HashMap<>();
    for (StageInfo connectorStage : phase.getStagesOfType(Constants.CONNECTOR_TYPE)) {
        phaseConnectorDatasets.put(connectorStage.getName(), connectorDatasets.get(connectorStage.getName()));
    }
    BatchPhaseSpec batchPhaseSpec = new BatchPhaseSpec(programName, phase, spec.getResources(),
                                                       spec.getDriverResources(), spec.getClientResources(),
                                                       spec.isStageLoggingEnabled(), spec.isProcessTimingEnabled(),
                                                       phaseConnectorDatasets, spec.getNumOfRecordsPreview(),
                                                       spec.getProperties());
    Set<String> pluginTypes = batchPhaseSpec.getPhase().getPluginTypes();
    if (pluginTypes.contains(Action.PLUGIN_TYPE)) {
        // actions will be all by themselves in a phase
        programAdder.addAction(new PipelineAction(batchPhaseSpec));
    } else if (pluginTypes.contains(Constants.SPARK_PROGRAM_PLUGIN_TYPE)) {
        // spark programs will be all by themselves in a phase
        String stageName = phase.getStagesOfType(Constants.SPARK_PROGRAM_PLUGIN_TYPE).iterator().next().getName();
        StageSpec stageSpec = stageSpecs.get(stageName);
        applicationConfigurer.addSpark(new ExternalSparkProgram(batchPhaseSpec, stageSpec));
        programAdder.addSpark(programName);
    } else if (useSpark) {
        applicationConfigurer.addSpark(new ETLSpark(batchPhaseSpec));
        programAdder.addSpark(programName);
    } else {
        applicationConfigurer.addMapReduce(new ETLMapReduce(batchPhaseSpec));
        programAdder.addMapReduce(programName);
    }
}
Also used : ETLMapReduce(co.cask.cdap.etl.batch.mapreduce.ETLMapReduce), HashMap(java.util.HashMap), LinkedHashMap(java.util.LinkedHashMap), StageInfo(co.cask.cdap.etl.planner.StageInfo), PipelineAction(co.cask.cdap.etl.batch.customaction.PipelineAction), ConnectorSource(co.cask.cdap.etl.batch.connector.ConnectorSource), ETLSpark(co.cask.cdap.etl.spark.batch.ETLSpark), PipelinePhase(co.cask.cdap.etl.common.PipelinePhase), StageSpec(co.cask.cdap.etl.spec.StageSpec), BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec)
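
The tail of addProgram dispatches each phase to exactly one program type: a custom action phase, a Spark program phase, or a generic phase run on the chosen engine. The sketch below restates that rule; the plugin-type strings are placeholders, not the real Constants values.

import java.util.Set;

// Toy restatement of the dispatch rule at the end of addProgram.
class PhaseDispatchSketch {
    enum ProgramKind { CUSTOM_ACTION, SPARK, MAPREDUCE }

    static ProgramKind choose(Set<String> pluginTypes, boolean useSpark) {
        if (pluginTypes.contains("action")) {
            return ProgramKind.CUSTOM_ACTION; // actions are all by themselves in a phase
        }
        if (pluginTypes.contains("sparkprogram") || useSpark) {
            return ProgramKind.SPARK; // spark programs run alone; otherwise the engine choice decides
        }
        return ProgramKind.MAPREDUCE;
    }

    public static void main(String[] args) {
        System.out.println(choose(Set.of("batchsource", "batchsink"), false)); // MAPREDUCE
        System.out.println(choose(Set.of("action"), true));                    // CUSTOM_ACTION
    }
}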

Example 14 with StageInfo

use of co.cask.cdap.etl.planner.StageInfo in project cdap by caskdata.

the class TransformRunner method getSinkWriter.

// needed because writing to the context differs depending on the number of outputs
private OutputWriter<Object, Object> getSinkWriter(MapReduceTaskContext<Object, Object> context, PipelinePhase pipelinePhase, Configuration hConf) {
    Set<StageInfo> reducers = pipelinePhase.getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    JobContext hadoopContext = context.getHadoopContext();
    if (!reducers.isEmpty() && hadoopContext instanceof Mapper.Context) {
        return new SingleOutputWriter<>(context);
    }
    String sinkOutputsStr = hConf.get(ETLMapReduce.SINK_OUTPUTS_KEY);
    // should never happen, this is set in initialize
    Preconditions.checkNotNull(sinkOutputsStr, "Sink outputs not found in Hadoop conf.");
    Map<String, SinkOutput> sinkOutputs = GSON.fromJson(sinkOutputsStr, ETLMapReduce.SINK_OUTPUTS_TYPE);
    return hasSingleOutput(pipelinePhase.getStagesOfType(Transform.PLUGIN_TYPE), sinkOutputs)
      ? new SingleOutputWriter<>(context)
      : new MultiOutputWriter<>(context, sinkOutputs);
}
Also used : Mapper(org.apache.hadoop.mapreduce.Mapper), StageInfo(co.cask.cdap.etl.planner.StageInfo), JobContext(org.apache.hadoop.mapreduce.JobContext)
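
getSinkWriter picks a SingleOutputWriter while the map task is still feeding a reducer, and otherwise chooses between single and multi output based on the configured sinks. The sketch below restates the single-output test under the assumption that a phase has one output only when exactly one sink is configured and no transform writes an error dataset; the types are placeholders, not the real StageInfo/SinkOutput classes.

import java.util.List;
import java.util.Map;

// Toy restatement of the single-vs-multi output decision in getSinkWriter.
class WriterChoiceSketch {
    static boolean hasSingleOutput(List<String> transformErrorDatasets, Map<String, String> sinkOutputs) {
        // any error dataset means at least one sink plus one error output -> multi
        return transformErrorDatasets.isEmpty() && sinkOutputs.size() == 1;
    }

    public static void main(String[] args) {
        // one sink, no error datasets -> SingleOutputWriter
        System.out.println(hasSingleOutput(List.of(), Map.of("sink1", "dataset1")));
        // two sinks -> MultiOutputWriter
        System.out.println(hasSingleOutput(List.of(), Map.of("s1", "d1", "s2", "d2")));
    }
}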

Aggregations

StageInfo (co.cask.cdap.etl.planner.StageInfo): 14
HashMap (java.util.HashMap): 8
Map (java.util.Map): 6
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 4
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 3
BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner): 3
BasicArguments (co.cask.cdap.etl.common.BasicArguments): 3
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 3
PluginContext (co.cask.cdap.api.plugin.PluginContext): 2
WorkflowContext (co.cask.cdap.api.workflow.WorkflowContext): 2
BatchActionContext (co.cask.cdap.etl.api.batch.BatchActionContext): 2
BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator): 2
BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext): 2
BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext): 2
PostAction (co.cask.cdap.etl.api.batch.PostAction): 2
DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext): 2
DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext): 2
CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher): 2
DefaultStageMetrics (co.cask.cdap.etl.common.DefaultStageMetrics): 2
PipelinePhase (co.cask.cdap.etl.common.PipelinePhase): 2