
Example 1 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

The class ClicksAndViewsMapReduce, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    context.addInput(Input.ofStream(ClicksAndViews.CLICKS));
    context.addInput(Input.ofStream(ClicksAndViews.VIEWS));
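    // determine the output partition key from the runtime arguments, falling back to the logical start time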
    PartitionedFileSet joinedPFS = context.getDataset(ClicksAndViews.JOINED);
    PartitionKey outputPartitionKey = PartitionedFileSetArguments.getOutputPartitionKey(context.getRuntimeArguments(), joinedPFS.getPartitioning());
    if (outputPartitionKey == null) {
        outputPartitionKey = PartitionKey.builder().addLongField("runtime", context.getLogicalStartTime()).build();
    }
    Map<String, String> outputArgs = new HashMap<>();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputPartitionKey);
    context.addOutput(Output.ofDataset(ClicksAndViews.JOINED, outputArgs));
    Job job = context.getHadoopJob();
    job.setMapperClass(ImpressionKeyingMapper.class);
    job.setReducerClass(JoiningReducer.class);
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), HashMap (java.util.HashMap), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), Job (org.apache.hadoop.mapreduce.Job)
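
A minimal sketch of a hypothetical caller for this example, assuming the output partition key is supplied through the program's runtime arguments instead of relying on the logical-start-time fallback; the helper class name and the timestamp value are placeholders:

import java.util.HashMap;
import java.util.Map;
import co.cask.cdap.api.dataset.lib.PartitionKey;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;

public class ClicksAndViewsRuntimeArgs {
    // Encode the desired output partition key into the program's runtime arguments;
    // ClicksAndViewsMapReduce.initialize() reads it back with getOutputPartitionKey(...).
    public static Map<String, String> forRuntime(long runtimeMillis) {
        Map<String, String> args = new HashMap<>();
        PartitionedFileSetArguments.setOutputPartitionKey(
            args, PartitionKey.builder().addLongField("runtime", runtimeMillis).build());
        return args;
    }
}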

Example 2 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

The class ETLMapReduce, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> properties = context.getSpecification().getProperties();
    if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
        LogStageInjector.start();
    }
    CompositeFinisher.Builder finishers = CompositeFinisher.builder();
    Job job = context.getHadoopJob();
    Configuration hConf = job.getConfiguration();
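    // speculative (duplicate) task attempts could re-run plugin sources and sinks that have side effects, so turn speculation off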
    hConf.setBoolean("mapreduce.map.speculative", false);
    hConf.setBoolean("mapreduce.reduce.speculative", false);
    // plugin name -> runtime args for that plugin
    Map<String, Map<String, String>> runtimeArgs = new HashMap<>();
    MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(), context.getLogicalStartTime(), context, context.getNamespace());
    BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
    for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
        hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
    }
    PipelinePhase phase = phaseSpec.getPhase();
    PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec);
    Map<String, String> inputAliasToStage = new HashMap<>();
    for (String sourceName : phaseSpec.getPhase().getSources()) {
        try {
            BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName, evaluator);
            StageInfo stageInfo = phaseSpec.getPhase().getStage(sourceName);
            MapReduceBatchContext sourceContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
            batchSource.prepareRun(sourceContext);
            runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
            for (String inputAlias : sourceContext.getInputNames()) {
                inputAliasToStage.put(inputAlias, sourceName);
            }
            finishers.add(batchSource, sourceContext);
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            PIPELINE_LOG.error("Failed to initialize batch source '{}' with the error: {}. Please review your pipeline " + "configuration and check the system logs for more details.", sourceName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
            throw e;
        }
    }
    hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));
    Map<String, SinkOutput> sinkOutputs = new HashMap<>();
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE), phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        String sinkName = stageInfo.getName();
        // todo: add a better way to get info for all sinks
        if (!phase.getSinks().contains(sinkName)) {
            continue;
        }
        try {
            BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName, evaluator);
            MapReduceBatchContext sinkContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
            batchSink.prepareRun(sinkContext);
            runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
            finishers.add(batchSink, sinkContext);
            sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            PIPELINE_LOG.error("Failed to initialize batch sink '{}' with the error: {}. Please review your pipeline " + "configuration and check the system logs for more details.", sinkName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
            throw e;
        }
    }
    finisher = finishers.build();
    hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));
    // setup time partition for each error dataset
    for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE), phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
        if (stageInfo.getErrorDatasetName() != null) {
            Map<String, String> args = new HashMap<>();
            args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key", Constants.ERROR_SCHEMA.toString());
            TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
            context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
        }
    }
    job.setMapperClass(ETLMapper.class);
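    // a reduce phase is configured only if the pipeline contains an aggregator or joiner stage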
    Set<StageInfo> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
    if (!reducers.isEmpty()) {
        job.setReducerClass(ETLReducer.class);
        String reducerName = reducers.iterator().next().getName();
        StageInfo stageInfo = phase.getStage(reducerName);
        Class<?> outputKeyClass;
        Class<?> outputValClass;
        try {
            if (!phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE).isEmpty()) {
                BatchAggregator aggregator = pluginInstantiator.newPluginInstance(reducerName, evaluator);
                DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, mrMetrics, stageInfo);
                aggregator.prepareRun(aggregatorContext);
                finishers.add(aggregator, aggregatorContext);
                if (aggregatorContext.getNumPartitions() != null) {
                    job.setNumReduceTasks(aggregatorContext.getNumPartitions());
                }
                outputKeyClass = aggregatorContext.getGroupKeyClass();
                outputValClass = aggregatorContext.getGroupValueClass();
                if (outputKeyClass == null) {
                    outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
                }
                if (outputValClass == null) {
                    outputValClass = TypeChecker.getGroupValueClass(aggregator);
                }
                hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                hConf.set(MAP_VAL_CLASS, outputValClass.getName());
                job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
                job.setMapOutputValueClass(getOutputValClass(reducerName, outputValClass));
            } else {
                // reducer type is joiner
                BatchJoiner batchJoiner = pluginInstantiator.newPluginInstance(reducerName, evaluator);
                DefaultJoinerContext joinerContext = new DefaultJoinerContext(context, mrMetrics, stageInfo);
                batchJoiner.prepareRun(joinerContext);
                finishers.add(batchJoiner, joinerContext);
                if (joinerContext.getNumPartitions() != null) {
                    job.setNumReduceTasks(joinerContext.getNumPartitions());
                }
                outputKeyClass = joinerContext.getJoinKeyClass();
                Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
                if (outputKeyClass == null) {
                    outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
                }
                if (inputRecordClass == null) {
                    inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
                }
                hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
                hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
                job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
                getOutputValClass(reducerName, inputRecordClass);
                // for joiner plugin map output is tagged with stageName
                job.setMapOutputValueClass(TaggedWritable.class);
            }
        } catch (Exception e) {
            // Catch the Exception to generate a User Error Log for the Pipeline
            PIPELINE_LOG.error("Failed to initialize pipeline stage '{}' with the error: {}. Please review your pipeline " + "configuration and check the system logs for more details.", reducerName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
            throw e;
        }
    } else {
        job.setNumReduceTasks(0);
    }
    hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}
Also used: DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext), DefaultMacroEvaluator (co.cask.cdap.etl.common.DefaultMacroEvaluator), MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator), Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), StageInfo (co.cask.cdap.etl.planner.StageInfo), CompositeFinisher (co.cask.cdap.etl.common.CompositeFinisher), DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext), BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator), Job (org.apache.hadoop.mapreduce.Job), PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator), BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext), BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext), BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner), StageFailureException (co.cask.cdap.etl.batch.StageFailureException), IOException (java.io.IOException), MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), PipelinePhase (co.cask.cdap.etl.common.PipelinePhase), BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec), Map (java.util.Map)
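
A minimal sketch of the JSON round-trip this initializer relies on: per-stage runtime arguments are serialized into the Hadoop Configuration with Gson on the driver side and deserialized again on the task side. The class name and the configuration key shown here are hypothetical stand-ins for ETLMapReduce's own constants:

import java.util.Map;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.apache.hadoop.conf.Configuration;

public class ConfJsonRoundTrip {
    private static final Gson GSON = new Gson();
    // hypothetical key; ETLMapReduce defines its own RUNTIME_ARGS_KEY constant
    private static final String RUNTIME_ARGS_KEY = "example.pipeline.runtime.args";

    // driver side: stage name -> runtime arguments for that stage
    public static void write(Configuration hConf, Map<String, Map<String, String>> runtimeArgs) {
        hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
    }

    // task side: recover the same nested map inside the mapper or reducer
    public static Map<String, Map<String, String>> read(Configuration hConf) {
        return GSON.fromJson(hConf.get(RUNTIME_ARGS_KEY),
                             new TypeToken<Map<String, Map<String, String>>>() { }.getType());
    }
}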

Example 3 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

The class ScoreCounter, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setMapperClass(ResultsMapper.class);
    job.setReducerClass(TeamCounter.class);
    job.setNumReduceTasks(1);
    String league = context.getRuntimeArguments().get("league");
    Preconditions.checkNotNull(league);
    // Configure the input to read all seasons for the league
    Map<String, String> inputArgs = Maps.newHashMap();
    PartitionedFileSetArguments.setInputPartitionFilter(inputArgs, PartitionFilter.builder().addValueCondition("league", league).build());
    context.addInput(Input.ofDataset("results", inputArgs));
    // Each run writes its output to a partition for the league
    Map<String, String> outputArgs = Maps.newHashMap();
    PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
    PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
    context.addOutput(Output.ofDataset("totals", outputArgs));
    // used only for logging:
    PartitionedFileSet input = context.getDataset("results", inputArgs);
    PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
    String outputPath = FileSetArguments.getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
    LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey), PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet), Job (org.apache.hadoop.mapreduce.Job)
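
A minimal sketch extending the input selection shown above, assuming the "results" dataset is partitioned by a string "league" field and an int "season" field; the helper class is hypothetical:

import java.util.HashMap;
import java.util.Map;
import co.cask.cdap.api.dataset.lib.PartitionFilter;
import co.cask.cdap.api.dataset.lib.PartitionedFileSetArguments;

public class SeasonRangeInputArgs {
    // restrict the "results" input to one league and a range of seasons
    public static Map<String, String> create(String league, int fromSeason, int toSeason) {
        Map<String, String> inputArgs = new HashMap<>();
        PartitionedFileSetArguments.setInputPartitionFilter(
            inputArgs,
            PartitionFilter.builder()
                .addValueCondition("league", league)
                .addRangeCondition("season", fromSeason, toSeason)
                .build());
        return inputArgs;
    }
}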

Example 4 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

The class WikipediaDataDownloader, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setMapperClass(WikipediaDataDownloaderMapper.class);
    job.setNumReduceTasks(0);
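    // resolve the namespace holding the input and output datasets, defaulting to the program's own namespace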
    String dataNamespace = context.getRuntimeArguments().get(WikipediaPipelineApp.NAMESPACE_ARG);
    dataNamespace = dataNamespace == null ? getContext().getNamespace() : dataNamespace;
    context.addInput(Input.ofDataset(WikipediaPipelineApp.PAGE_TITLES_DATASET).fromNamespace(dataNamespace));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET).fromNamespace(dataNamespace));
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), Job (org.apache.hadoop.mapreduce.Job)
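
A minimal usage sketch, assuming the job is started with a runtime argument that redirects the datasets to another namespace; the namespace value and helper class name are placeholders:

import java.util.HashMap;
import java.util.Map;

public class WikipediaDownloaderArgs {
    // point the page-titles input and raw-wikipedia output at a different namespace;
    // assumes WikipediaPipelineApp (see initialize() above) is importable from the example's package
    public static Map<String, String> crossNamespace() {
        Map<String, String> args = new HashMap<>();
        args.put(WikipediaPipelineApp.NAMESPACE_ARG, "analytics"); // "analytics" is a placeholder namespace name
        return args;
    }
}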

Example 5 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

The class StreamConversionMapReduce, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setMapperClass(StreamConversionMapper.class);
    job.setNumReduceTasks(0);
    job.setMapOutputKeyClass(AvroKey.class);
    job.setMapOutputValueClass(NullWritable.class);
    AvroJob.setOutputKeySchema(job, SCHEMA);
    // read 5 minutes of events from the stream, ending at the logical start time of this run
    long logicalTime = context.getLogicalStartTime();
    context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));
    // each run writes its output to a partition with the logical start time.
    TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
    context.addOutput(Output.ofDataset("converted", dsArguments));
    TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
    LOG.info("Output location for new partition is: {}", partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), AvroJob (org.apache.avro.mapreduce.AvroJob), Job (org.apache.hadoop.mapreduce.Job), TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet)
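
A minimal sketch of a hypothetical downstream consumer, assuming it selects partitions of the "converted" TimePartitionedFileSet by time range; the class and method names are placeholders:

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import co.cask.cdap.api.dataset.lib.TimePartitionedFileSetArguments;

public class ConvertedInputArgs {
    // select the partitions written during the last hour of StreamConversionMapReduce runs
    public static Map<String, String> lastHour(long nowMillis) {
        Map<String, String> args = new HashMap<>();
        TimePartitionedFileSetArguments.setInputStartTime(args, nowMillis - TimeUnit.HOURS.toMillis(1));
        TimePartitionedFileSetArguments.setInputEndTime(args, nowMillis);
        return args;
    }
}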

Aggregations

MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext): 12 uses
Job (org.apache.hadoop.mapreduce.Job): 12 uses
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 3 uses
HashMap (java.util.HashMap): 3 uses
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 2 uses
WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken): 2 uses
Resources (co.cask.cdap.api.Resources): 1 use
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 1 use
KVTableStatePersistor (co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor): 1 use
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 1 use
Value (co.cask.cdap.api.workflow.Value): 1 use
BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator): 1 use
BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner): 1 use
BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext): 1 use
BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext): 1 use
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 1 use
DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext): 1 use
DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext): 1 use
PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator): 1 use
StageFailureException (co.cask.cdap.etl.batch.StageFailureException): 1 use