Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.
The class ClicksAndViewsMapReduce, method initialize().
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();

  // Read both the clicks and the views streams as input
  context.addInput(Input.ofStream(ClicksAndViews.CLICKS));
  context.addInput(Input.ofStream(ClicksAndViews.VIEWS));

  // Use the output partition key from the runtime arguments if one was supplied,
  // otherwise default to a partition keyed by the logical start time of this run
  PartitionedFileSet joinedPFS = context.getDataset(ClicksAndViews.JOINED);
  PartitionKey outputPartitionKey =
    PartitionedFileSetArguments.getOutputPartitionKey(context.getRuntimeArguments(), joinedPFS.getPartitioning());
  if (outputPartitionKey == null) {
    outputPartitionKey = PartitionKey.builder().addLongField("runtime", context.getLogicalStartTime()).build();
  }

  Map<String, String> outputArgs = new HashMap<>();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputPartitionKey);
  context.addOutput(Output.ofDataset(ClicksAndViews.JOINED, outputArgs));

  Job job = context.getHadoopJob();
  job.setMapperClass(ImpressionKeyingMapper.class);
  job.setReducerClass(JoiningReducer.class);
}
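
For reference, a minimal caller-side sketch of how an explicit output partition key could be supplied through runtime arguments instead of falling back to the logical-start-time default. The partitioning field name and the way the arguments are passed to the program are assumptions, not something this snippet prescribes.

// Hypothetical caller-side code, not part of ClicksAndViewsMapReduce. It assumes the JOINED
// PartitionedFileSet is partitioned on a single long field named "runtime", matching the
// default key built in initialize() above.
Map<String, String> runtimeArguments = new HashMap<>();
PartitionKey explicitKey = PartitionKey.builder()
  .addLongField("runtime", 1530000000000L)   // an arbitrary example timestamp
  .build();
PartitionedFileSetArguments.setOutputPartitionKey(runtimeArguments, explicitKey);
// 'runtimeArguments' would then be supplied when starting the MapReduce program, so that
// PartitionedFileSetArguments.getOutputPartitionKey(...) returns this key in initialize().
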
Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.
The class ETLMapReduce, method initialize().
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Map<String, String> properties = context.getSpecification().getProperties();
  if (Boolean.valueOf(properties.get(Constants.STAGE_LOGGING_ENABLED))) {
    LogStageInjector.start();
  }
  CompositeFinisher.Builder finishers = CompositeFinisher.builder();

  Job job = context.getHadoopJob();
  Configuration hConf = job.getConfiguration();
  // Disable speculative execution for both map and reduce tasks
  hConf.setBoolean("mapreduce.map.speculative", false);
  hConf.setBoolean("mapreduce.reduce.speculative", false);

  // plugin name -> runtime args for that plugin
  Map<String, Map<String, String>> runtimeArgs = new HashMap<>();
  MacroEvaluator evaluator = new DefaultMacroEvaluator(context.getWorkflowToken(), context.getRuntimeArguments(),
                                                       context.getLogicalStartTime(), context, context.getNamespace());
  BatchPhaseSpec phaseSpec = GSON.fromJson(properties.get(Constants.PIPELINEID), BatchPhaseSpec.class);
  for (Map.Entry<String, String> pipelineProperty : phaseSpec.getPipelineProperties().entrySet()) {
    hConf.set(pipelineProperty.getKey(), pipelineProperty.getValue());
  }

  PipelinePhase phase = phaseSpec.getPhase();
  PipelinePluginInstantiator pluginInstantiator = new PipelinePluginInstantiator(context, mrMetrics, phaseSpec);

  // Instantiate each source plugin, let it prepare the run, and remember its input aliases
  Map<String, String> inputAliasToStage = new HashMap<>();
  for (String sourceName : phaseSpec.getPhase().getSources()) {
    try {
      BatchConfigurable<BatchSourceContext> batchSource = pluginInstantiator.newPluginInstance(sourceName, evaluator);
      StageInfo stageInfo = phaseSpec.getPhase().getStage(sourceName);
      MapReduceBatchContext sourceContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
      batchSource.prepareRun(sourceContext);
      runtimeArgs.put(sourceName, sourceContext.getRuntimeArguments());
      for (String inputAlias : sourceContext.getInputNames()) {
        inputAliasToStage.put(inputAlias, sourceName);
      }
      finishers.add(batchSource, sourceContext);
    } catch (Exception e) {
      // Catch the exception to generate a user error log for the pipeline
      PIPELINE_LOG.error("Failed to initialize batch source '{}' with the error: {}. Please review your pipeline "
                           + "configuration and check the system logs for more details.",
                         sourceName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
      throw e;
    }
  }
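
  // Record which input alias belongs to which source stage; the mapping is serialized into the
  // job configuration so it is available to the tasks at runtime.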
  hConf.set(INPUT_ALIAS_KEY, GSON.toJson(inputAliasToStage));

  // Instantiate each sink (and connector acting as a sink), let it prepare the run,
  // and remember the outputs it registered
  Map<String, SinkOutput> sinkOutputs = new HashMap<>();
  for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Constants.CONNECTOR_TYPE),
                                        phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
    String sinkName = stageInfo.getName();
    // todo: add a better way to get info for all sinks
    if (!phase.getSinks().contains(sinkName)) {
      continue;
    }
    try {
      BatchConfigurable<BatchSinkContext> batchSink = pluginInstantiator.newPluginInstance(sinkName, evaluator);
      MapReduceBatchContext sinkContext = new MapReduceBatchContext(context, mrMetrics, stageInfo);
      batchSink.prepareRun(sinkContext);
      runtimeArgs.put(sinkName, sinkContext.getRuntimeArguments());
      finishers.add(batchSink, sinkContext);
      sinkOutputs.put(sinkName, new SinkOutput(sinkContext.getOutputNames(), stageInfo.getErrorDatasetName()));
    } catch (Exception e) {
      // Catch the exception to generate a user error log for the pipeline
      PIPELINE_LOG.error("Failed to initialize batch sink '{}' with the error: {}. Please review your pipeline "
                           + "configuration and check the system logs for more details.",
                         sinkName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
      throw e;
    }
  }
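
  // Build the composite finisher from all of the plugins prepared so far, and serialize the
  // sink outputs into the job configuration for the tasks.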
  finisher = finishers.build();
  hConf.set(SINK_OUTPUTS_KEY, GSON.toJson(sinkOutputs));

  // setup time partition for each error dataset
  for (StageInfo stageInfo : Sets.union(phase.getStagesOfType(Transform.PLUGIN_TYPE),
                                        phase.getStagesOfType(BatchSink.PLUGIN_TYPE))) {
    if (stageInfo.getErrorDatasetName() != null) {
      Map<String, String> args = new HashMap<>();
      args.put(FileSetProperties.OUTPUT_PROPERTIES_PREFIX + "avro.schema.output.key", Constants.ERROR_SCHEMA.toString());
      TimePartitionedFileSetArguments.setOutputPartitionTime(args, context.getLogicalStartTime());
      context.addOutput(Output.ofDataset(stageInfo.getErrorDatasetName(), args));
    }
  }
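
  // The generic ETLMapper drives every pipeline phase; a reducer is only configured when the
  // phase contains an aggregator or joiner stage.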
  job.setMapperClass(ETLMapper.class);
  Set<StageInfo> reducers = phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE, BatchJoiner.PLUGIN_TYPE);
  if (!reducers.isEmpty()) {
    job.setReducerClass(ETLReducer.class);
    String reducerName = reducers.iterator().next().getName();
    StageInfo stageInfo = phase.getStage(reducerName);
    Class<?> outputKeyClass;
    Class<?> outputValClass;
    try {
      if (!phaseSpec.getPhase().getStagesOfType(BatchAggregator.PLUGIN_TYPE).isEmpty()) {
        // reducer type is aggregator
        BatchAggregator aggregator = pluginInstantiator.newPluginInstance(reducerName, evaluator);
        DefaultAggregatorContext aggregatorContext = new DefaultAggregatorContext(context, mrMetrics, stageInfo);
        aggregator.prepareRun(aggregatorContext);
        finishers.add(aggregator, aggregatorContext);
        if (aggregatorContext.getNumPartitions() != null) {
          job.setNumReduceTasks(aggregatorContext.getNumPartitions());
        }
        outputKeyClass = aggregatorContext.getGroupKeyClass();
        outputValClass = aggregatorContext.getGroupValueClass();
        if (outputKeyClass == null) {
          outputKeyClass = TypeChecker.getGroupKeyClass(aggregator);
        }
        if (outputValClass == null) {
          outputValClass = TypeChecker.getGroupValueClass(aggregator);
        }
        hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(MAP_VAL_CLASS, outputValClass.getName());
        job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
        job.setMapOutputValueClass(getOutputValClass(reducerName, outputValClass));
      } else {
        // reducer type is joiner
        BatchJoiner batchJoiner = pluginInstantiator.newPluginInstance(reducerName, evaluator);
        DefaultJoinerContext joinerContext = new DefaultJoinerContext(context, mrMetrics, stageInfo);
        batchJoiner.prepareRun(joinerContext);
        finishers.add(batchJoiner, joinerContext);
        if (joinerContext.getNumPartitions() != null) {
          job.setNumReduceTasks(joinerContext.getNumPartitions());
        }
        outputKeyClass = joinerContext.getJoinKeyClass();
        Class<?> inputRecordClass = joinerContext.getJoinInputRecordClass();
        if (outputKeyClass == null) {
          outputKeyClass = TypeChecker.getJoinKeyClass(batchJoiner);
        }
        if (inputRecordClass == null) {
          inputRecordClass = TypeChecker.getJoinInputRecordClass(batchJoiner);
        }
        hConf.set(MAP_KEY_CLASS, outputKeyClass.getName());
        hConf.set(MAP_VAL_CLASS, inputRecordClass.getName());
        job.setMapOutputKeyClass(getOutputKeyClass(reducerName, outputKeyClass));
        getOutputValClass(reducerName, inputRecordClass);
        // for the joiner plugin, the map output is tagged with the stage name
        job.setMapOutputValueClass(TaggedWritable.class);
      }
    } catch (Exception e) {
      // Catch the exception to generate a user error log for the pipeline
      PIPELINE_LOG.error("Failed to initialize pipeline stage '{}' with the error: {}. Please review your pipeline "
                           + "configuration and check the system logs for more details.",
                         reducerName, Throwables.getRootCause(e).getMessage(), Throwables.getRootCause(e));
      throw e;
    }
  } else {
    // no aggregator or joiner in the phase, so this is a map-only job
    job.setNumReduceTasks(0);
  }
  hConf.set(RUNTIME_ARGS_KEY, GSON.toJson(runtimeArgs));
}
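
The mappings written to the job configuration above (input aliases, sink outputs, per-plugin runtime arguments) are read back on the task side. Below is a minimal sketch of how such a JSON-serialized map could be looked up with Gson from the Hadoop Configuration; the helper method is illustrative and not the actual ETLMapper code.

// Illustrative only: reads back a Map<String, String> stored as JSON in the job configuration,
// as done with INPUT_ALIAS_KEY above. Requires com.google.gson.Gson, com.google.gson.reflect.TypeToken,
// java.lang.reflect.Type and org.apache.hadoop.conf.Configuration.
private Map<String, String> readInputAliases(Configuration hConf) {
  String json = hConf.get(INPUT_ALIAS_KEY);
  if (json == null) {
    return new HashMap<>();
  }
  Type mapType = new TypeToken<Map<String, String>>() { }.getType();
  return new Gson().fromJson(json, mapType);
}
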
Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.
The class ScoreCounter, method initialize().
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Job job = context.getHadoopJob();
  job.setMapperClass(ResultsMapper.class);
  job.setReducerClass(TeamCounter.class);
  job.setNumReduceTasks(1);

  String league = context.getRuntimeArguments().get("league");
  Preconditions.checkNotNull(league);

  // Configure the input to read all seasons for the league
  Map<String, String> inputArgs = Maps.newHashMap();
  PartitionedFileSetArguments.setInputPartitionFilter(
    inputArgs, PartitionFilter.builder().addValueCondition("league", league).build());
  context.addInput(Input.ofDataset("results", inputArgs));

  // Each run writes its output to a partition for the league
  Map<String, String> outputArgs = Maps.newHashMap();
  PartitionKey outputKey = PartitionKey.builder().addStringField("league", league).build();
  PartitionedFileSetArguments.setOutputPartitionKey(outputArgs, outputKey);
  context.addOutput(Output.ofDataset("totals", outputArgs));

  // used only for logging:
  PartitionedFileSet input = context.getDataset("results", inputArgs);
  PartitionedFileSet outputFileSet = context.getDataset("totals", outputArgs);
  String outputPath = FileSetArguments.getOutputPath(outputFileSet.getEmbeddedFileSet().getRuntimeArguments());
  LOG.info("input: {}, output: {}", input.getEmbeddedFileSet().getInputLocations(), outputPath);
}
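
As a companion, here is a hedged sketch of how the partition written by a run could be located afterwards. It assumes the "league" runtime argument was supplied when the program was started and that the 'totals' dataset is partitioned on a single string field "league", matching the output key built in initialize() above; the helper itself is hypothetical and not part of ScoreCounter.

// Hypothetical helper: given the 'totals' PartitionedFileSet (obtained from a dataset-aware
// context, not shown), locate the partition written for a league.
@Nullable
private Location findTotalsPartition(PartitionedFileSet totals, String league) {
  PartitionKey key = PartitionKey.builder().addStringField("league", league).build();
  PartitionDetail partition = totals.getPartition(key);
  return partition == null ? null : partition.getLocation();
}
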
Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.
The class WikipediaDataDownloader, method initialize().
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Job job = context.getHadoopJob();
  job.setMapperClass(WikipediaDataDownloaderMapper.class);
  job.setNumReduceTasks(0);

  // Read the input from (and write the output to) the namespace given as a runtime argument,
  // falling back to the namespace the program itself runs in
  String dataNamespace = context.getRuntimeArguments().get(WikipediaPipelineApp.NAMESPACE_ARG);
  dataNamespace = dataNamespace == null ? getContext().getNamespace() : dataNamespace;
  context.addInput(Input.ofDataset(WikipediaPipelineApp.PAGE_TITLES_DATASET).fromNamespace(dataNamespace));
  context.addOutput(Output.ofDataset(WikipediaPipelineApp.RAW_WIKIPEDIA_DATASET).fromNamespace(dataNamespace));
}
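
A brief caller-side illustration of pointing this MapReduce at datasets in another namespace. The namespace name and the way the arguments reach the program are assumptions for the example.

// Hypothetical caller-side code: "analytics" is an example namespace name, and the
// argument key comes from the snippet above.
Map<String, String> args = new HashMap<>();
args.put(WikipediaPipelineApp.NAMESPACE_ARG, "analytics");
// 'args' would then be passed as runtime arguments when starting WikipediaDataDownloader;
// if the argument is omitted, the program reads and writes in its own namespace.
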
Use of co.cask.cdap.api.mapreduce.MapReduceContext in project cdap by caskdata.
The class StreamConversionMapReduce, method initialize().
@Override
public void initialize() throws Exception {
  MapReduceContext context = getContext();
  Job job = context.getHadoopJob();
  job.setMapperClass(StreamConversionMapper.class);
  job.setNumReduceTasks(0);
  job.setMapOutputKeyClass(AvroKey.class);
  job.setMapOutputValueClass(NullWritable.class);
  AvroJob.setOutputKeySchema(job, SCHEMA);

  // read 5 minutes of events from the stream, ending at the logical start time of this run
  long logicalTime = context.getLogicalStartTime();
  context.addInput(Input.ofStream("events", logicalTime - TimeUnit.MINUTES.toMillis(5), logicalTime));

  // each run writes its output to a partition with the logical start time
  TimePartitionedFileSetArguments.setOutputPartitionTime(dsArguments, logicalTime);
  context.addOutput(Output.ofDataset("converted", dsArguments));

  TimePartitionedFileSet partitionedFileSet = context.getDataset("converted", dsArguments);
  LOG.info("Output location for new partition is: {}", partitionedFileSet.getEmbeddedFileSet().getOutputLocation());
}
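
The method above relies on class-level members that are not shown on this page (dsArguments, SCHEMA, LOG). A hedged sketch of what such declarations could look like follows; the Avro record fields are purely illustrative assumptions, and the actual definitions in StreamConversionMapReduce may differ.

// Assumed declarations for the snippet above; only dsArguments, SCHEMA and LOG are referenced
// by initialize().
private static final Logger LOG = LoggerFactory.getLogger(StreamConversionMapReduce.class);

// runtime arguments for the 'converted' TimePartitionedFileSet, filled in initialize()
private final Map<String, String> dsArguments = new HashMap<>();

// an illustrative Avro schema for the converted stream events (field names are assumptions)
private static final Schema SCHEMA = SchemaBuilder.record("streamEvent")
  .fields()
  .requiredLong("ts")
  .requiredString("body")
  .endRecord();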