Search in sources :

Example 1 with PipelinePluginInstantiator

Use of co.cask.cdap.etl.batch.PipelinePluginInstantiator in project cdap by caskdata.

The initialize method of the class ETLMapReduce.

/**
 * Prepares the Hadoop job for one batch pipeline phase.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>optionally starts stage logging and disables speculative execution;</li>
 *   <li>copies the pipeline properties of the phase into the job configuration;</li>
 *   <li>instantiates every source and sink of the phase, calls {@code prepareRun} on it and
 *       records its runtime arguments, input aliases and outputs;</li>
 *   <li>adds a time-partitioned output for every transform/sink stage that has an error dataset;</li>
 *   <li>configures the mapper and, if the phase contains an aggregator or joiner, the reducer and
 *       the map output key/value classes; otherwise the job is made map-only;</li>
 *   <li>stores the collected runtime arguments in the job configuration and builds the composite
 *       finisher from every plugin that was prepared.</li>
 * </ol>
 *
 * @throws Exception if any plugin fails to instantiate or prepare; the failure is logged to the
 *                   pipeline log with its root cause and then rethrown unchanged
 */
@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    // parseBoolean treats a missing (null) property as false, same as Boolean.valueOf, without boxing
    if (Boolean.parseBoolean(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    hConf.setBoolean("mapreduce.map.speculative", false);
    hConf.setBoolean("mapreduce.reduce.speculative", false);
    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();
    MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(),
                                                         context.getLogicalStartTime(), context, context.getNamespace());
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec);

    // input alias -> name of the source stage that registered it
    Map<String, String> inputAliasToStage = new HashMap<>();
    for (String sourceName : phase.getSources()) {
        try {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName, evaluator);
            StageInfo stageInfo = phase.getStage(sourceName);
            MapReduceBatchContext sourceContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
            batchSource.prepareRun(sourceContext);
            runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
            for (String inputAlias : sourceContext.getInputNames()) {
                inputAliasToStage.put(inputAlias, sourceName);
            }
            finishers.add(batchSource, sourceContext);
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            Throwable rootCause = Throwables.getRootCause(e);
            PIPELINE_LOG.error("Failed to initialize batch source '{}' with the error: {}. Please review your pipeline "
                                 + "configuration and check the system logs for more details.",
                               sourceName, rootCause.getMessage(), rootCause);
            throw e;
        }
    }
    hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));

    // sink name -> outputs (and error dataset) registered by that sink
    Map<String, SinkOutput> sinkOutputs = new HashMap<>();
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
                                          phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }
        try {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName, evaluator);
            MapReduceBatchContext sinkContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
            batchSink.prepareRun(sinkContext);
            runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
            finishers.add(batchSink, sinkContext);
            sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            Throwable rootCause = Throwables.getRootCause(e);
            PIPELINE_LOG.error("Failed to initialize batch sink '{}' with the error: {}. Please review your pipeline "
                                 + "configuration and check the system logs for more details.",
                               sinkName, rootCause.getMessage(), rootCause);
            throw e;
        }
    }
    // Publish the source/sink finishers now so that they are already available if a later step
    // (error dataset or reducer setup) throws. The finisher is rebuilt at the end of this method
    // once the reducer plugin has registered as well.
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
                                          phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key", Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }

    job.setMapperClass(ETLMapper.class);
    Set<StageInfo> reducers = phase.getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (!reducers.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        // only the first aggregator/joiner stage found is configured as the reducer
        String reducerName = reducers.iterator().next().getName();
        StageInfo stageInfo = phase.getStage(reducerName);
        Class<?> outputKeyClass;
        Class<?> outputValClass;
        try {
            if (!phase.getStagesOfType(BatchAggregator.PLUGIN_TYPE).isEmpty()) {
                BatchAggregator aggregator = pluginInstantiator.newPluginInstance(reducerName, evaluator);
                DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, mrMetrics, stageInfo);
                aggregator.prepareRun(aggregatorContext);
                finishers.add(aggregator, aggregatorContext);
                if (aggregatorContext.getNumPartitions() != null) {
                    job.setNumReduceTasks(aggregatorContext.getNumPartitions());
                }
                // prefer classes set explicitly on the context; otherwise derive them from the plugin
                outputKeyClass = aggregatorContext.getGroupKeyClass();
                outputValClass = aggregatorContext.getGroupValueClass();
                if (outputKeyClass == null) {
                    outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
                }
                if (outputValClass == null) {
                    outputValClass = TypeChecker.getGroupValueClass(aggregator);
                }
                hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                hConf.set(MAP_VAL_CLASS, outputValClass.getName());
                job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
                job.setMapOutputValueClass(getOutputValClass(reducerName, outputValClass));
            } else {
                // reducer type is joiner
                BatchJoiner batchJoiner = pluginInstantiator.newPluginInstance(reducerName, evaluator);
                DefaultJoinerContext joinerContext = new DefaultJoinerContext(context, mrMetrics, stageInfo);
                batchJoiner.prepareRun(joinerContext);
                finishers.add(batchJoiner, joinerContext);
                if (joinerContext.getNumPartitions() != null) {
                    job.setNumReduceTasks(joinerContext.getNumPartitions());
                }
                // prefer classes set explicitly on the context; otherwise derive them from the plugin
                outputKeyClass = joinerContext.getJoinKeyClass();
                Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
                if (outputKeyClass == null) {
                    outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
                }
                if (inputRecordClass == null) {
                    inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
                }
                hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
                job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
                // NOTE(review): the result is deliberately unused; presumably this call only validates
                // that the input record class is supported -- confirm against getOutputValClass.
                getOutputValClass(reducerName, inputRecordClass);
                // for joiner plugin map output is tagged with stageName
                job.setMapOutputValueClass(TaggedWritable.class);
            }
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            Throwable rootCause = Throwables.getRootCause(e);
            PIPELINE_LOG.error("Failed to initialize pipeline stage '{}' with the error: {}. Please review your pipeline "
                                 + "configuration and check the system logs for more details.",
                               reducerName, rootCause.getMessage(), rootCause);
            throw e;
        }
    } else {
        // no aggregator or joiner in this phase: run as a map-only job
        job.setNumReduceTasks(0);
    }
    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
    // Rebuild now that the aggregator/joiner (if any) has been added to the builder. The earlier
    // build happened before that registration, so if build() snapshots the builder's contents the
    // reducer plugin would otherwise never be notified when the run finishes.
    finisher = finishers.build();
}
Also used : DefaultAggregatorContext(co.cask.cdap.etl.batch.DefaultAggregatorContext) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) MacroEvaluator(co.cask.cdap.api.macro.MacroEvaluator) Configuration(org.apache.hadoop.conf.Configuration) HashMap(java.util.HashMap) StageInfo(co.cask.cdap.etl.planner.StageInfo) CompositeFinisher(co.cask.cdap.etl.common.CompositeFinisher) DefaultJoinerContext(co.cask.cdap.etl.batch.DefaultJoinerContext) BatchAggregator(co.cask.cdap.etl.api.batch.BatchAggregator) DefaultMacroEvaluator(co.cask.cdap.etl.common.DefaultMacroEvaluator) Job(org.apache.hadoop.mapreduce.Job) PipelinePluginInstantiator(co.cask.cdap.etl.batch.PipelinePluginInstantiator) BatchSourceContext(co.cask.cdap.etl.api.batch.BatchSourceContext) BatchSinkContext(co.cask.cdap.etl.api.batch.BatchSinkContext) BatchJoiner(co.cask.cdap.etl.api.batch.BatchJoiner) StageFailureException(co.cask.cdap.etl.batch.StageFailureException) IOException(java.io.IOException) MapReduceContext(co.cask.cdap.api.mapreduce.MapReduceContext) PipelinePhase(co.cask.cdap.etl.common.PipelinePhase) BatchPhaseSpec(co.cask.cdap.etl.batch.BatchPhaseSpec) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator)1 MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext)1 BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator)1 BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner)1 BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext)1 BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext)1 BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec)1 DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext)1 DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext)1 PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator)1 StageFailureException (co.cask.cdap.etl.batch.StageFailureException)1 CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher)1 DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator)1 PipelinePhase (co.cask.cdap.etl.common.PipelinePhase)1 StageInfo (co.cask.cdap.etl.planner.StageInfo)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 Configuration (org.apache.hadoop.conf.Configuration)1 Job (org.apache.hadoop.mapreduce.Job)1