Search in sources:

Example 1 with BatchSourceContext

Use of co.cask.cdap.etl.api.batch.BatchSourceContext in project cdap by caskdata.

From the class ETLMapReduce, method initialize:

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
    hConf.setBoolean("mapreduce.map.speculative", false);
    hConf.setBoolean("mapreduce.reduce.speculative", false);
    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();
    MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec);
    Map<String, String> inputAliasToStage = new HashMap<>();
    for (String sourceName : phaseSpec.getPhase().getSources()) {
        try {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName, evaluator);
            StageInfo stageInfo = phaseSpec.getPhase().getStage(sourceName);
            MapReduceBatchContext sourceContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
            batchSource.prepareRun(sourceContext);
            runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
            for (String inputAlias : sourceContext.getInputNames()) {
                inputAliasToStage.put(inputAlias, sourceName);
            }
            finishers.add(batchSource, sourceContext);
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            PIPELINE_LOG.error("Failed to initialize batch source '{}' with the error: {}. Please review your pipeline " + "configuration and check the system logs for more details.", sourceName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
            throw e;
        }
    }
    hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
    Map<String, SinkOutput> sinkOutputs = new HashMap<>();
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE), phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }
        try {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName, evaluator);
            MapReduceBatchContext sinkContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
            batchSink.prepareRun(sinkContext);
            runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
            finishers.add(batchSink, sinkContext);
            sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            PIPELINE_LOG.error("Failed to initialize batch sink '{}' with the error: {}. Please review your pipeline " + "configuration and check the system logs for more details.", sinkName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
            throw e;
        }
    }
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE), phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key", Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }
    job.setMapperClass(ETLMapper.class);
    Set<StageInfo> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (!reducers.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        String reducerName = reducers.iterator().next().getName();
        StageInfo stageInfo = phase.getStage(reducerName);
        Class<?> outputKeyClass;
        Class<?> outputValClass;
        try {
            if (!phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE).isEmpty()) {
                BatchAggregator aggregator = pluginInstantiator.newPluginInstance(reducerName, evaluator);
                DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, mrMetrics, stageInfo);
                aggregator.prepareRun(aggregatorContext);
                finishers.add(aggregator, aggregatorContext);
                if (aggregatorContext.getNumPartitions() != null) {
                    job.setNumReduceTasks(aggregatorContext.getNumPartitions());
                }
                outputKeyClass = aggregatorContext.getGroupKeyClass();
                outputValClass = aggregatorContext.getGroupValueClass();
                if (outputKeyClass == null) {
                    outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
                }
                if (outputValClass == null) {
                    outputValClass = TypeChecker.getGroupValueClass(aggregator);
                }
                hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                hConf.set(MAP_VAL_CLASS, outputValClass.getName());
                job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
                job.setMapOutputValueClass(getOutputValClass(reducerName, outputValClass));
            } else {
                // reducer type is joiner
                BatchJoiner batchJoiner = pluginInstantiator.newPluginInstance(reducerName, evaluator);
                DefaultJoinerContext joinerContext = new DefaultJoinerContext(context, mrMetrics, stageInfo);
                batchJoiner.prepareRun(joinerContext);
                finishers.add(batchJoiner, joinerContext);
                if (joinerContext.getNumPartitions() != null) {
                    job.setNumReduceTasks(joinerContext.getNumPartitions());
                }
                outputKeyClass = joinerContext.getJoinKeyClass();
                Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
                if (outputKeyClass == null) {
                    outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
                }
                if (inputRecordClass == null) {
                    inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
                }
                hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
                job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
                getOutputValClass(reducerName, inputRecordClass);
                // for joiner plugin map output is tagged with stageName
                job.setMapOutputValueClass(TaggedWritable.class);
            }
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            PIPELINE_LOG.error("Failed to initialize pipeline stage '{}' with the error: {}. Please review your pipeline " + "configuration and check the system logs for more details.", reducerName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
            throw e;
        }
    } else {
        job.setNumReduceTasks(0);
    }
    // build the composite finisher only after every prepareRun call (including any
    // aggregator or joiner) has registered its finisher
    finisher = finishers.build();
    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}
Also used: DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext), DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator), MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator), Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), StageInfo (co.cask.cdap.etl.planner.StageInfo), CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher), DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext), BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator), Job (org.apache.hadoop.mapreduce.Job), PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator), BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext), BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext), BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner), StageFailureException (co.cask.cdap.etl.batch.StageFailureException), IOException (java.io.IOException), MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), PipelinePhase (co.cask.cdap.etl.common.PipelinePhase), BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec), Map (java.util.Map)
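
For orientation, the prepareRun(sourceContext) calls above are made by the framework on each source plugin. A minimal sketch of the plugin side is shown below; it is not taken from the cdap sources on this page and assumes the plugin-facing API of this CDAP version (BatchSource, Input.ofDataset, BatchSourceContext.setInput). ExampleBatchSource and the "events" dataset name are hypothetical placeholders.

import co.cask.cdap.api.data.batch.Input;
import co.cask.cdap.api.data.format.StructuredRecord;
import co.cask.cdap.api.dataset.lib.KeyValue;
import co.cask.cdap.etl.api.Emitter;
import co.cask.cdap.etl.api.batch.BatchSource;
import co.cask.cdap.etl.api.batch.BatchSourceContext;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

// Sketch only: a source plugin whose prepareRun receives the BatchSourceContext
// constructed by ETLMapReduce.initialize() above. "events" is a hypothetical dataset.
public class ExampleBatchSource extends BatchSource<LongWritable, Text, StructuredRecord> {

    @Override
    public void prepareRun(BatchSourceContext context) throws Exception {
        // Register the input to read; the resulting input alias is what
        // ETLMapReduce maps back to this stage via sourceContext.getInputNames().
        context.setInput(Input.ofDataset("events"));
    }

    @Override
    public void transform(KeyValue<LongWritable, Text> input, Emitter<StructuredRecord> emitter) throws Exception {
        // Convert each raw key/value into a StructuredRecord for downstream stages (omitted here).
    }
}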

Example 2 with BatchSourceContext

Use of co.cask.cdap.etl.api.batch.BatchSourceContext in project cdap by caskdata.

From the class ETLSpark, method initialize:

@Override
public void initialize() throws Exception {
    SparkClientContext context = getContext();
    cleanupFiles = new ArrayList<>();
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.driver.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.executor.extraJavaOptions", "-XX:MaxPermSize=256m");
    sparkConf.set("spark.speculation", "false");
    context.setSparkConf(sparkConf);
    Map<String, String> properties = context.getSpecification().getProperties();
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        sparkConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    SparkBatchSourceFactory sourceFactory = new SparkBatchSourceFactory();
    SparkBatchSinkFactory sinkFactory = new SparkBatchSinkFactory();
    Map<String, Integer> stagePartitions = new HashMap<>();
    PluginContext pluginContext = new SparkPipelinePluginContext(context, context.getMetrics(), phaseSpec.isStageLoggingEnabled(), phaseSpec.isProcessTimingEnabled());
    for (StageInfo stageInfo : phaseSpec.getPhase()) {
        String stageName = stageInfo.getName();
        String pluginType = stageInfo.getPluginType();
        if (BatchSource.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSourceContext> batchSource = pluginContext.newPluginInstance(stageName, evaluator);
            BatchSourceContext sourceContext = new SparkBatchSourceContext(sourceFactory, context, stageInfo);
            batchSource.prepareRun(sourceContext);
            finishers.add(batchSource, sourceContext);
        } else if (BatchSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<BatchSinkContext> batchSink = pluginContext.newPluginInstance(stageName, evaluator);
            BatchSinkContext sinkContext = new SparkBatchSinkContext(sinkFactory, context, null, stageInfo);
            batchSink.prepareRun(sinkContext);
            finishers.add(batchSink, sinkContext);
        } else if (SparkSink.PLUGIN_TYPE.equals(pluginType)) {
            BatchConfigurable<SparkPluginContext> sparkSink = pluginContext.newPluginInstance(stageName, evaluator);
            SparkPluginContext sparkPluginContext = new BasicSparkPluginContext(context, stageInfo);
            sparkSink.prepareRun(sparkPluginContext);
            finishers.add(sparkSink, sparkPluginContext);
        } else if (BatchAggregator.PLUGIN_TYPE.equals(pluginType)) {
            BatchAggregator aggregator = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, stageInfo);
            aggregator.prepareRun(aggregatorContext);
            finishers.add(aggregator, aggregatorContext);
            stagePartitions.put(stageName, aggregatorContext.getNumPartitions());
        } else if (BatchJoiner.PLUGIN_TYPE.equals(pluginType)) {
            BatchJoiner joiner = pluginContext.newPluginInstance(stageName, evaluator);
            DefaultJoinerContext sparkJoinerContext = new DefaultJoinerContext(context, stageInfo);
            joiner.prepareRun(sparkJoinerContext);
            finishers.add(joiner, sparkJoinerContext);
            stagePartitions.put(stageName, sparkJoinerContext.getNumPartitions());
        }
    }
    File configFile = File.createTempFile("HydratorSpark", ".config");
    cleanupFiles.add(configFile);
    try (Writer writer = Files.newBufferedWriter(configFile.toPath(), StandardCharsets.UTF_8)) {
        SparkBatchSourceSinkFactoryInfo sourceSinkInfo = new SparkBatchSourceSinkFactoryInfo(sourceFactory, sinkFactory, stagePartitions);
        writer.write(GSON.toJson(sourceSinkInfo));
    }
    finisher = finishers.build();
    context.localize("HydratorSpark.config", configFile.toURI());
}
Also used: DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext), MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator), DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator), HashMap (java.util.HashMap), StageInfo (co.cask.cdap.etl.planner.StageInfo), SparkClientContext (co.cask.cdap.api.spark.SparkClientContext), CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher), SparkPipelinePluginContext (co.cask.cdap.etl.spark.plugin.SparkPipelinePluginContext), DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext), BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator), PluginContext (co.cask.cdap.api.plugin.PluginContext), SparkPluginContext (co.cask.cdap.etl.api.batch.SparkPluginContext), BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext), BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext), BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner), BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec), SparkConf (org.apache.spark.SparkConf), Map (java.util.Map), BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable), File (java.io.File), Writer (java.io.Writer)
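
The Spark program that consumes this configuration is not shown on this page. As a rough sketch, the JSON localized as HydratorSpark.config could be read back with Gson as below. The local file path and the plain new Gson() instance are assumptions for illustration (the project may use a preconfigured GSON with custom type adapters); only the class name SparkBatchSourceSinkFactoryInfo comes from the code above.

import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import com.google.gson.Gson;

// import for SparkBatchSourceSinkFactoryInfo omitted; its package is not shown in the snippet
public class ReadHydratorSparkConfig {
    public static void main(String[] args) throws Exception {
        // Sketch only: deserialize the file written by ETLSpark.initialize() above.
        try (Reader reader = Files.newBufferedReader(Paths.get("HydratorSpark.config"), StandardCharsets.UTF_8)) {
            SparkBatchSourceSinkFactoryInfo sourceSinkInfo =
                new Gson().fromJson(reader, SparkBatchSourceSinkFactoryInfo.class);
            // sourceSinkInfo now carries the serialized source/sink factories and the
            // per-stage partition counts recorded during initialize().
        }
    }
}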

Aggregations

MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 2
BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator): 2
BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner): 2
BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext): 2
BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext): 2
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 2
DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext): 2
DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext): 2
CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher): 2
DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator): 2
StageInfo (co.cask.cdap.etl.planner.StageInfo): 2
HashMap (java.util.HashMap): 2
Map (java.util.Map): 2
MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext): 1
PluginContext (co.cask.cdap.api.plugin.PluginContext): 1
SparkClientContext (co.cask.cdap.api.spark.SparkClientContext): 1
BatchConfigurable (co.cask.cdap.etl.api.batch.BatchConfigurable): 1
SparkPluginContext (co.cask.cdap.etl.api.batch.SparkPluginContext): 1
PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator): 1
StageFailureException (co.cask.cdap.etl.batch.StageFailureException): 1