Use of org.apache.hadoop.mapreduce.Reducer in project hadoop by apache.
The class Chain, method addReducer.
/**
 * Add a reducer that reads from the given context and writes to a queue.
 */
@SuppressWarnings("unchecked")
void addReducer(TaskInputOutputContext inputContext,
    ChainBlockingQueue<KeyValuePair<?, ?>> outputQueue) throws IOException,
    InterruptedException {
  Class<?> keyOutClass = rConf.getClass(REDUCER_OUTPUT_KEY_CLASS, Object.class);
  Class<?> valueOutClass = rConf.getClass(REDUCER_OUTPUT_VALUE_CLASS, Object.class);
  RecordWriter rw = new ChainRecordWriter(keyOutClass, valueOutClass, outputQueue, rConf);
  Reducer.Context reducerContext = createReduceContext(rw, (ReduceContext) inputContext, rConf);
  ReduceRunner runner = new ReduceRunner(reducerContext, reducer, rw);
  threads.add(runner);
}
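This addReducer method is internal plumbing of the chain framework; it is typically driven by the public ChainReducer API when a user configures a chained reduce phase. A minimal sketch of such a configuration is shown below; MyReducer and PostProcessMapper are hypothetical user classes, while the setReducer/addMapper signatures are the standard ones from org.apache.hadoop.mapreduce.lib.chain.

// Sketch: configure a chained reduce phase (a reducer followed by a post-processing mapper).
// Assumes imports from org.apache.hadoop.mapreduce.lib.chain and org.apache.hadoop.io;
// MyReducer and PostProcessMapper are hypothetical user-supplied classes.
Configuration reduceConf = new Configuration(false);
ChainReducer.setReducer(job, MyReducer.class,
    Text.class, IntWritable.class,   // reducer input key/value
    Text.class, IntWritable.class,   // reducer output key/value
    reduceConf);
ChainReducer.addMapper(job, PostProcessMapper.class,
    Text.class, IntWritable.class,   // mapper input key/value
    Text.class, Text.class,          // mapper output key/value
    new Configuration(false));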
Use of org.apache.hadoop.mapreduce.Reducer in project nutch by apache.
The class TestCrawlDbStates, method testCrawlDbStateTransitionMatrix.
/**
 * Test the matrix of state transitions:
 * <ul>
 * <li>for all available {@link FetchSchedule} implementations</li>
 * <li>for every possible status in CrawlDb (including "not in CrawlDb")</li>
 * <li>for every possible fetch status</li>
 * <li>and zero or more (0-2) additional in-links</li>
 * </ul>
 * call {@literal updatedb} and check whether the resulting CrawlDb status is
 * the expected one.
 */
@Test
public void testCrawlDbStateTransitionMatrix() {
  LOG.info("Test CrawlDatum state transitions");
  Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context = CrawlDBTestUtil.createContext();
  Configuration conf = context.getConfiguration();
  CrawlDbUpdateUtil updateDb = null;
  try {
    updateDb = new CrawlDbUpdateUtil(new CrawlDbReducer(), context);
  } catch (IOException e) {
    e.printStackTrace();
  }
  int retryMax = conf.getInt("db.fetch.retry.max", 3);
  for (String sched : schedules) {
    LOG.info("Testing state transitions with " + sched);
    conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
    FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(conf);
    for (int i = 0; i < fetchDbStatusPairs.length; i++) {
      byte fromDbStatus = fetchDbStatusPairs[i][1];
      for (int j = 0; j < fetchDbStatusPairs.length; j++) {
        byte fetchStatus = fetchDbStatusPairs[j][0];
        CrawlDatum fromDb = null;
        if (fromDbStatus == -1) {
          // nothing yet in CrawlDb
          // CrawlDatum added by FreeGenerator or via outlink
        } else {
          fromDb = new CrawlDatum();
          fromDb.setStatus(fromDbStatus);
          // initialize fetchInterval:
          schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
        }
        // expected db status
        byte toDbStatus = fetchDbStatusPairs[j][1];
        if (fetchStatus == -1) {
          if (fromDbStatus == -1) {
            // nothing fetched yet: new document detected via outlink
            toDbStatus = STATUS_DB_UNFETCHED;
          } else {
            // nothing fetched but new inlinks detected: status is unchanged
            toDbStatus = fromDbStatus;
          }
        } else if (fetchStatus == STATUS_FETCH_RETRY) {
          // a simple test of fetch_retry (without retries)
          if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
            toDbStatus = STATUS_DB_UNFETCHED;
          } else {
            toDbStatus = STATUS_DB_GONE;
          }
        }
        String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>"
            : getStatusName(fromDbStatus));
        String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>"
            : CrawlDatum.getStatusName(fetchStatus));
        LOG.info(fromDbStatusName + " + " + fetchStatusName + " => "
            + getStatusName(toDbStatus));
        List<CrawlDatum> values = new ArrayList<CrawlDatum>();
        for (int l = 0; l <= 2; l++) {
          // number of additional in-links
          CrawlDatum fetch = null;
          if (fetchStatus == -1) {
            // nothing fetched, need at least one in-link
            if (l == 0)
              continue;
          } else {
            fetch = new CrawlDatum();
            if (fromDb != null) {
              fetch.set(fromDb);
            } else {
              // not yet in CrawlDb: added by FreeGenerator
              schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
            }
            fetch.setStatus(fetchStatus);
            fetch.setFetchTime(System.currentTimeMillis());
          }
          if (fromDb != null)
            values.add(fromDb);
          if (fetch != null)
            values.add(fetch);
          for (int n = 0; n < l; n++) {
            values.add(linked);
          }
          List<CrawlDatum> res = updateDb.update(values);
          if (res.size() != 1) {
            fail("CrawlDb update didn't result in one single CrawlDatum per URL");
            continue;
          }
          byte status = res.get(0).getStatus();
          if (status != toDbStatus) {
            fail("CrawlDb update for " + fromDbStatusName + " and "
                + fetchStatusName + " and " + l + " inlinks results in "
                + getStatusName(status) + " (expected: "
                + getStatusName(toDbStatus) + ")");
          }
          values.clear();
        }
      }
    }
  }
}
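The matrix above is exhaustive; for a single, concrete transition the same test utilities (CrawlDBTestUtil and CrawlDbUpdateUtil from the snippet above) can be used directly. The sketch below is illustrative and assumes it lives in the same test class with the usual JUnit imports; the expected outcome (db_unfetched plus fetch_success yields db_fetched) is the standard CrawlDb behavior, not a value read from the matrix.

// Sketch: checking one state transition with the utilities shown above.
// Assumes the same test class and imports as TestCrawlDbStates (JUnit, Nutch test utils).
@Test
public void testUnfetchedPlusFetchSuccess() throws Exception {
  Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context = CrawlDBTestUtil.createContext();
  CrawlDbUpdateUtil updateDb = new CrawlDbUpdateUtil(new CrawlDbReducer(), context);
  CrawlDatum fromDb = new CrawlDatum();
  fromDb.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
  CrawlDatum fetch = new CrawlDatum();
  fetch.setStatus(CrawlDatum.STATUS_FETCH_SUCCESS);
  fetch.setFetchTime(System.currentTimeMillis());
  List<CrawlDatum> values = new ArrayList<>();
  values.add(fromDb);
  values.add(fetch);
  List<CrawlDatum> res = updateDb.update(values);
  assertEquals(1, res.size());
  assertEquals(CrawlDatum.STATUS_DB_FETCHED, res.get(0).getStatus());
}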
Use of org.apache.hadoop.mapreduce.Reducer in project ignite by apache.
The class HadoopV2ReduceTask, method run0.
/**
 * {@inheritDoc}
 */
@SuppressWarnings({ "ConstantConditions", "unchecked" })
@Override
public void run0(HadoopV2TaskContext taskCtx) throws IgniteCheckedException {
  OutputFormat outputFormat = null;
  Exception err = null;
  JobContextImpl jobCtx = taskCtx.jobContext();
  // Set mapper index for combiner tasks
  if (!reduce && taskCtx.taskInfo().hasMapperIndex())
    HadoopMapperUtils.mapperIndex(taskCtx.taskInfo().mapperIndex());
  else
    HadoopMapperUtils.clearMapperIndex();
  try {
    outputFormat = reduce || !taskCtx.job().info().hasReducer() ? prepareWriter(jobCtx) : null;
    Reducer reducer;
    if (reduce)
      reducer = ReflectionUtils.newInstance(jobCtx.getReducerClass(), jobCtx.getConfiguration());
    else
      reducer = ReflectionUtils.newInstance(jobCtx.getCombinerClass(), jobCtx.getConfiguration());
    try {
      reducer.run(new WrappedReducer().getReducerContext(hadoopContext()));
      if (!reduce)
        taskCtx.onMapperFinished();
    } finally {
      closeWriter();
    }
    commit(outputFormat);
  } catch (InterruptedException e) {
    err = e;
    Thread.currentThread().interrupt();
    throw new IgniteInterruptedCheckedException(e);
  } catch (Exception e) {
    err = e;
    throw new IgniteCheckedException(e);
  } finally {
    if (!reduce)
      HadoopMapperUtils.clearMapperIndex();
    if (err != null)
      abort(outputFormat);
  }
}
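The Reducer instance resolved via ReflectionUtils above is whatever class the job registered through setReducerClass (or setCombinerClass for the combine phase). For reference, a minimal sketch of such a user-supplied reducer, using only the standard Hadoop API, might look like the following; the class name is illustrative.

// Sketch of a user-supplied reducer that run0 would instantiate and run.
// The class name is illustrative; the Reducer API is the standard Hadoop one.
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  private final IntWritable result = new IntWritable();

  @Override
  protected void reduce(Text key, Iterable<IntWritable> values, Context context)
      throws IOException, InterruptedException {
    int sum = 0;
    for (IntWritable value : values)
      sum += value.get();
    result.set(sum);
    context.write(key, result);
  }
}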
Use of org.apache.hadoop.mapreduce.Reducer in project cdap by caskdata.
The class MapReduceRuntimeService, method getInputValueType.
/**
 * Returns the input value type of the MR job based on the job's Mapper/Reducer type.
 * It does so by inspecting the Mapper/Reducer type parameters to figure out what the input type is.
 * If the job has a Mapper, it is the Mapper IN_VALUE type; otherwise it is the Reducer IN_VALUE type.
 * If the input value type cannot be determined, the given default type is returned.
 *
 * @param hConf the Configuration to use to resolve the class TypeToken
 * @param defaultType the defaultType to return
 * @param mapperTypeToken the mapper type token for the configured input (not resolved by the job's mapper class)
 * @return the input value type, or {@code defaultType} if it cannot be determined
 */
@VisibleForTesting
static Type getInputValueType(Configuration hConf, Type defaultType, @Nullable TypeToken<?> mapperTypeToken) {
  TypeToken<?> type;
  if (mapperTypeToken == null) {
    // If the input's mapper is null, first try resolving the mapper class from the job configuration.
    mapperTypeToken = resolveClass(hConf, MRJobConfig.MAP_CLASS_ATTR, Mapper.class);
  }
  if (mapperTypeToken == null) {
    // If there is no Mapper, it's a Reducer-only job, hence get the value type from the Reducer class.
    type = resolveClass(hConf, MRJobConfig.REDUCE_CLASS_ATTR, Reducer.class);
  } else {
    type = mapperTypeToken;
  }
  Preconditions.checkArgument(type != null, "Neither a Mapper nor a Reducer is configured for the MapReduce job.");
  if (!(type.getType() instanceof ParameterizedType)) {
    return defaultType;
  }
  // The super type Mapper/Reducer must be a parameterized type with <IN_KEY, IN_VALUE, OUT_KEY, OUT_VALUE>.
  Type inputValueType = ((ParameterizedType) type.getType()).getActualTypeArguments()[1];
  // This avoids the case where a subclass such as "class InvalidMapper<I, O> extends Mapper<I, O>"
  // leaves the type argument as an unresolved type variable.
  if (inputValueType instanceof TypeVariable && inputValueType.equals(type.getRawType().getTypeParameters()[1])) {
    inputValueType = defaultType;
  }
  return inputValueType;
}
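The type-argument lookup above relies on the generic superclass information that a concrete Mapper/Reducer subclass carries at runtime. A self-contained sketch of that inspection using plain JDK reflection (not CDAP's resolveClass helper; the nested class is illustrative) is shown below.

// Sketch: reading the IN_VALUE type argument of a concrete Reducer subclass
// with plain JDK reflection. The nested SumReducer class is illustrative.
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class ReducerTypeInspection {
  static class SumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
  }

  public static void main(String[] args) {
    // Reducer<IN_KEY, IN_VALUE, OUT_KEY, OUT_VALUE>: index 1 is IN_VALUE.
    ParameterizedType superType =
        (ParameterizedType) SumReducer.class.getGenericSuperclass();
    Type inValueType = superType.getActualTypeArguments()[1];
    System.out.println(inValueType); // class org.apache.hadoop.io.IntWritable
  }
}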
Use of org.apache.hadoop.mapreduce.Reducer in project cdap by caskdata.
The class MapReduceRuntimeService, method setMapOutputClassesIfNeeded.
/**
 * Sets the map output key and value classes in the job configuration by inspecting the {@link Mapper}
 * if they are not set by the user.
 *
 * @param job the MapReduce job
 * @param mapperTypeToken TypeToken of a configured mapper (may not be configured on the job). Has already been
 *                        resolved from the job's mapper class.
 */
private void setMapOutputClassesIfNeeded(Job job, @Nullable TypeToken<?> mapperTypeToken) {
  Configuration conf = job.getConfiguration();
  TypeToken<?> type = mapperTypeToken;
  int keyIdx = 2;
  int valueIdx = 3;
  if (type == null) {
    // Reducer-only job. Use the Reducer input types as the key/value classes.
    type = resolveClass(conf, MRJobConfig.REDUCE_CLASS_ATTR, Reducer.class);
    keyIdx = 0;
    valueIdx = 1;
  }
  // If the type cannot be detected, there is nothing to set.
  if (type == null || !(type.getType() instanceof ParameterizedType)) {
    return;
  }
  Type[] typeArgs = ((ParameterizedType) type.getType()).getActualTypeArguments();
  // For a Mapper the map output key/value are the 3rd and 4th type parameters;
  // for a Reducer-only job they are the 1st and 2nd (indices chosen above).
  if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS)) {
    Class<?> cls = TypeToken.of(typeArgs[keyIdx]).getRawType();
    LOG.debug("Set map output key class to {}", cls);
    job.setMapOutputKeyClass(cls);
  }
  if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS)) {
    Class<?> cls = TypeToken.of(typeArgs[valueIdx]).getRawType();
    LOG.debug("Set map output value class to {}", cls);
    job.setMapOutputValueClass(cls);
  }
}
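The inspection above only applies when the map output classes were not set programmatically. For comparison, a minimal sketch of the explicit configuration this method respects (standard Hadoop Job API; the mapper/reducer class names are illustrative, and Job.getInstance may throw IOException, so this belongs in a method that declares it) could look like this:

// Sketch: setting the map output classes explicitly on the Job, which
// setMapOutputClassesIfNeeded would then leave untouched.
// TokenizerMapper and IntSumReducer are illustrative user classes.
Configuration conf = new Configuration();
Job job = Job.getInstance(conf, "word count");
job.setMapperClass(TokenizerMapper.class);
job.setReducerClass(IntSumReducer.class);
job.setMapOutputKeyClass(Text.class);          // otherwise inferred from the Mapper's 3rd type argument
job.setMapOutputValueClass(IntWritable.class); // otherwise inferred from the Mapper's 4th type argument
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);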