Example 6 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

Class StreamToDataset, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setNumReduceTasks(0);
    WorkflowToken workflowToken = context.getWorkflowToken();
    Class<? extends Mapper> mapper = PageTitleToDatasetMapper.class;
    String inputStream = WikipediaPipelineApp.PAGE_TITLES_STREAM;
    String outputDataset = WikipediaPipelineApp.PAGE_TITLES_DATASET;
    if (workflowToken != null) {
        Value likesToDatasetResult = workflowToken.get("result", WikipediaPipelineApp.LIKES_TO_DATASET_MR_NAME);
        if (likesToDatasetResult != null && likesToDatasetResult.getAsBoolean()) {
            // The "likes" stream to the dataset has already run and has been successful in this run so far.
            // Now run raw wikipedia stream to dataset.
            mapper = RawWikiDataToDatasetMapper.class;
            inputStream = WikipediaPipelineApp.RAW_WIKIPEDIA_STREAM;
            outputDataset = WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET;
        }
    }
    LOG.info("Using '{}' as the input stream and '{}' as the output dataset.", inputStream, outputDataset);
    job.setMapperClass(mapper);
    String dataNamespace = context.getRuntimeArguments().get(WikipediaPipelineApp.NAMESPACE_ARG);
    dataNamespace = dataNamespace == null ? getContext().getNamespace() : dataNamespace;
    context.addInput(Input.ofStream(inputStream).fromNamespace(dataNamespace));
    context.addOutput(Output.ofDataset(outputDataset).fromNamespace(dataNamespace));
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), Value (co.cask.cdap.api.workflow.Value), WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken), Job (org.apache.hadoop.mapreduce.Job)
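
The branch above keys off a boolean that an earlier node in the same workflow is expected to have written under the key "result". As a minimal sketch (not taken from this listing; it assumes MapReduceContext.getState() and co.cask.cdap.api.ProgramStatus are available, as in recent CDAP releases), the predecessor MapReduce could record that value from its destroy() method:

@Override
public void destroy() {
    // Record whether this run completed successfully so that downstream nodes,
    // such as the initialize() above, can branch on it via the workflow token.
    WorkflowToken workflowToken = getContext().getWorkflowToken();
    if (workflowToken != null) {
        boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
        workflowToken.put("result", Value.of(succeeded));
    }
}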

Example 7 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

Class TopNMapReduce, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Map<String, String> runtimeArguments = context.getRuntimeArguments();
    Job job = context.getHadoopJob();
    WorkflowToken workflowToken = context.getWorkflowToken();
    int topNRank = 10;
    if (runtimeArguments.containsKey("topn.rank")) {
        topNRank = Integer.parseInt(runtimeArguments.get("topn.rank"));
    }
    if (workflowToken != null) {
        workflowToken.put("topn.rank", Value.of(topNRank));
    }
    int numReduceTasks = 1;
    if (runtimeArguments.containsKey("num.reduce.tasks")) {
        numReduceTasks = Integer.parseInt(runtimeArguments.get("num.reduce.tasks"));
    }
    job.setNumReduceTasks(numReduceTasks);
    job.setMapperClass(TokenizerMapper.class);
    job.setReducerClass(TopNReducer.class);
    String dataNamespace = runtimeArguments.get(WikipediaPipelineApp.NAMESPACE_ARG);
    dataNamespace = dataNamespace == null ? getContext().getNamespace() : dataNamespace;
    context.addInput(Input.ofDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET).fromNamespace(dataNamespace));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.MAPREDUCE_TOPN_OUTPUT).fromNamespace(dataNamespace));
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken), Job (org.apache.hadoop.mapreduce.Job)
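
Because initialize() copies the effective rank into the workflow token, nodes that run later in the same workflow can read it back. A minimal sketch, assuming the program is registered under the node name "TopNMapReduce" and that Value.getAsLong() is used to recover the number (both are assumptions for illustration):

// Inside a later workflow node; returns null if TopNMapReduce has not written the key.
Value rank = getContext().getWorkflowToken().get("topn.rank", "TopNMapReduce");
if (rank != null) {
    long topNRank = rank.getAsLong();
    // ... use topNRank, e.g. to size a follow-up report
}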

Example 8 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

Class WikiContentValidatorAndNormalizer, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setMapperClass(FilterNormalizerMapper.class);
    job.setNumReduceTasks(0);
    String dataNamespace = context.getRuntimeArguments().get(WikipediaPipelineApp.NAMESPACE_ARG);
    dataNamespace = dataNamespace == null ? getContext().getNamespace() : dataNamespace;
    context.addInput(Input.ofDataset(WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET).fromNamespace(dataNamespace));
    context.addOutput(Output.ofDataset(WikipediaPipelineApp.NORMALIZED_WIKIPEDIA_DATASET).fromNamespace(dataNamespace));
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), Job (org.apache.hadoop.mapreduce.Job)
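
This is a mapper-only pass (zero reducers) that reads the raw Wikipedia dataset and writes the normalized one, honoring the same optional namespace override as the previous examples. A minimal sketch of supplying that override at start time from a unit test with the CDAP test framework; the ApplicationManager instance, the workflow name, and the namespace value are assumptions for illustration:

Map<String, String> args = new HashMap<>();
// Route the run's dataset and stream I/O to a separate data namespace.
args.put(WikipediaPipelineApp.NAMESPACE_ARG, "wiki");
applicationManager.getWorkflowManager("WikipediaPipelineWorkflow").start(args);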

Example 9 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

Class PurchaseHistoryBuilder, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setReducerClass(PerUserReducer.class);
    context.addInput(Input.ofDataset("purchases"), PurchaseMapper.class);
    context.addOutput(Output.ofDataset("history"));
    // override default memory usage if the corresponding runtime arguments are set.
    Map<String, String> runtimeArgs = context.getRuntimeArguments();
    String mapperMemoryMBStr = runtimeArgs.get(MAPPER_MEMORY_MB);
    if (mapperMemoryMBStr != null) {
        context.setMapperResources(new Resources(Integer.parseInt(mapperMemoryMBStr)));
    }
    String reducerMemoryMBStr = runtimeArgs.get(REDUCER_MEMORY_MB);
    if (reducerMemoryMBStr != null) {
        context.setReducerResources(new Resources(Integer.parseInt(reducerMemoryMBStr)));
    }
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), Resources (co.cask.cdap.api.Resources), Job (org.apache.hadoop.mapreduce.Job)
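
The memory overrides above are driven by plain string runtime-argument keys. As a sketch, the constants referenced there would typically be defined on the class roughly as follows (the exact key names are assumptions):

// Assumed definitions of the runtime-argument keys used in initialize().
public static final String MAPPER_MEMORY_MB = "mapper.memory.mb";
public static final String REDUCER_MEMORY_MB = "reducer.memory.mb";

Starting the MapReduce with, for example, mapper.memory.mb=1024 and reducer.memory.mb=2048 then makes initialize() call setMapperResources(new Resources(1024)) and setReducerResources(new Resources(2048)), resizing the task containers accordingly.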

Example 10 with MapReduceContext

Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.

Class WordCount, method initialize().

@Override
public void initialize() throws Exception {
    MapReduceContext context = getContext();
    Job job = context.getHadoopJob();
    job.setMapperClass(Tokenizer.class);
    job.setReducerClass(Counter.class);
    job.setNumReduceTasks(1);
    String inputDataset = context.getRuntimeArguments().get("input");
    inputDataset = inputDataset != null ? inputDataset : "lines";
    String outputDataset = context.getRuntimeArguments().get("output");
    outputDataset = outputDataset != null ? outputDataset : "counts";
    context.addInput(Input.ofDataset(inputDataset));
    context.addOutput(Output.ofDataset(outputDataset));
}
Also used: MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext), Job (org.apache.hadoop.mapreduce.Job)
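
The job wires a Tokenizer mapper and a Counter reducer over datasets whose names default to "lines" and "counts" but can be overridden through the "input" and "output" runtime arguments. A minimal sketch of what a matching mapper could look like, assuming the input dataset is a KeyValueTable exposing byte[] keys and values and that the usual imports (org.apache.hadoop.io.Text, IntWritable, java.nio.charset.StandardCharsets) are present; the real example's types may differ:

public static class Tokenizer extends Mapper<byte[], byte[], Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);

    @Override
    protected void map(byte[] key, byte[] value, Context context) throws IOException, InterruptedException {
        // Each value is one line of text; emit (word, 1) for every whitespace-separated token.
        for (String word : new String(value, StandardCharsets.UTF_8).split("\\s+")) {
            if (!word.isEmpty()) {
                context.write(new Text(word), ONE);
            }
        }
    }
}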

Aggregations

MapReduceContext (co.cask.cdap.api.mapreduce.MapReduceContext): 12 usages
Job (org.apache.hadoop.mapreduce.Job): 12 usages
PartitionKey (co.cask.cdap.api.dataset.lib.PartitionKey): 3 usages
HashMap (java.util.HashMap): 3 usages
PartitionedFileSet (co.cask.cdap.api.dataset.lib.PartitionedFileSet): 2 usages
WorkflowToken (co.cask.cdap.api.workflow.WorkflowToken): 2 usages
Resources (co.cask.cdap.api.Resources): 1 usage
TimePartitionedFileSet (co.cask.cdap.api.dataset.lib.TimePartitionedFileSet): 1 usage
KVTableStatePersistor (co.cask.cdap.api.dataset.lib.partitioned.KVTableStatePersistor): 1 usage
MacroEvaluator (co.cask.cdap.api.macro.MacroEvaluator): 1 usage
Value (co.cask.cdap.api.workflow.Value): 1 usage
BatchAggregator (co.cask.cdap.etl.api.batch.BatchAggregator): 1 usage
BatchJoiner (co.cask.cdap.etl.api.batch.BatchJoiner): 1 usage
BatchSinkContext (co.cask.cdap.etl.api.batch.BatchSinkContext): 1 usage
BatchSourceContext (co.cask.cdap.etl.api.batch.BatchSourceContext): 1 usage
BatchPhaseSpec (co.cask.cdap.etl.batch.BatchPhaseSpec): 1 usage
DefaultAggregatorContext (co.cask.cdap.etl.batch.DefaultAggregatorContext): 1 usage
DefaultJoinerContext (co.cask.cdap.etl.batch.DefaultJoinerContext): 1 usage
PipelinePluginInstantiator (co.cask.cdap.etl.batch.PipelinePluginInstantiator): 1 usage
StageFailureException (co.cask.cdap.etl.batch.StageFailureException): 1 usage