
Example 1 with Reducer

use of org.apache.hadoop.mapreduce.Reducer in project hadoop by apache.

the class Chain method addReducer.

/**
   * Add a reducer that reads from the task's input context and writes its
   * output to a queue consumed by the next element of the chain
   */
@SuppressWarnings("unchecked")
void addReducer(TaskInputOutputContext inputContext, ChainBlockingQueue<KeyValuePair<?, ?>> outputQueue) throws IOException, InterruptedException {
    // Output key/value classes for this chain element, falling back to Object if unset.
    Class<?> keyOutClass = rConf.getClass(REDUCER_OUTPUT_KEY_CLASS, Object.class);
    Class<?> valueOutClass = rConf.getClass(REDUCER_OUTPUT_VALUE_CLASS, Object.class);
    // Writer that places the reducer's output on the queue for the next chain element.
    RecordWriter rw = new ChainRecordWriter(keyOutClass, valueOutClass, outputQueue, rConf);
    Reducer.Context reducerContext = createReduceContext(rw, (ReduceContext) inputContext, rConf);
    ReduceRunner runner = new ReduceRunner(reducerContext, reducer, rw);
    // The runner is started later along with the other chain threads.
    threads.add(runner);
}
Also used : RecordWriter(org.apache.hadoop.mapreduce.RecordWriter) WrappedReducer(org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer) Reducer(org.apache.hadoop.mapreduce.Reducer)
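
For orientation: Chain.addReducer is internal plumbing; user code reaches it through the ChainMapper/ChainReducer setup API, which wires a [MAP+ / REDUCE MAP*] pipeline into a single job. The following is a minimal job-setup sketch, not taken from the excerpt above: TokenizerMapper and UppercaseMapper are hypothetical user mappers, IntSumReducer is Hadoop's stock org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer, and input/output path setup is omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.chain.ChainMapper;
import org.apache.hadoop.mapreduce.lib.chain.ChainReducer;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class ChainJobSetup {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "chain example");
        // First (and here only) mapper of the map phase.
        ChainMapper.addMapper(job, TokenizerMapper.class, LongWritable.class, Text.class, Text.class, IntWritable.class, new Configuration(false));
        // The single reducer of the chain; its output is handed to the queue read by the next element.
        ChainReducer.setReducer(job, IntSumReducer.class, Text.class, IntWritable.class, Text.class, IntWritable.class, new Configuration(false));
        // Mappers added after setReducer run inside the reduce task, consuming the reducer's output.
        ChainReducer.addMapper(job, UppercaseMapper.class, Text.class, IntWritable.class, Text.class, IntWritable.class, new Configuration(false));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}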

Example 2 with Reducer

use of org.apache.hadoop.mapreduce.Reducer in project nutch by apache.

the class TestCrawlDbStates method testCrawlDbStateTransitionMatrix.

/**
 * Test the matrix of state transitions:
 * <ul>
 * <li>for all available {@link FetchSchedule} implementations</li>
 * <li>for every possible status in CrawlDb (including "not in CrawlDb")</li>
 * <li>for every possible fetch status</li>
 * <li>and zero or more (0-2) additional in-links</li>
 * </ul>
 * call {@literal updatedb} and check whether the resulting CrawlDb status is
 * the expected one.
 */
@Test
public void testCrawlDbStateTransitionMatrix() {
    LOG.info("Test CrawlDatum state transitions");
    Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context = CrawlDBTestUtil.createContext();
    Configuration conf = context.getConfiguration();
    CrawlDbUpdateUtil updateDb = null;
    try {
        updateDb = new CrawlDbUpdateUtil(new CrawlDbReducer(), context);
    } catch (IOException e) {
        e.printStackTrace();
    }
    int retryMax = conf.getInt("db.fetch.retry.max", 3);
    for (String sched : schedules) {
        LOG.info("Testing state transitions with " + sched);
        conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
        FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(conf);
        for (int i = 0; i < fetchDbStatusPairs.length; i++) {
            byte fromDbStatus = fetchDbStatusPairs[i][1];
            for (int j = 0; j < fetchDbStatusPairs.length; j++) {
                byte fetchStatus = fetchDbStatusPairs[j][0];
                CrawlDatum fromDb = null;
                if (fromDbStatus == -1) {
                // nothing yet in CrawlDb
                // CrawlDatum added by FreeGenerator or via outlink
                } else {
                    fromDb = new CrawlDatum();
                    fromDb.setStatus(fromDbStatus);
                    // initialize fetchInterval:
                    schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
                }
                // expected db status
                byte toDbStatus = fetchDbStatusPairs[j][1];
                if (fetchStatus == -1) {
                    if (fromDbStatus == -1) {
                        // nothing fetched yet: new document detected via outlink
                        toDbStatus = STATUS_DB_UNFETCHED;
                    } else {
                        // nothing fetched but new inlinks detected: status is unchanged
                        toDbStatus = fromDbStatus;
                    }
                } else if (fetchStatus == STATUS_FETCH_RETRY) {
                    // a simple test of fetch_retry (without retries)
                    if (fromDb == null || fromDb.getRetriesSinceFetch() < retryMax) {
                        toDbStatus = STATUS_DB_UNFETCHED;
                    } else {
                        toDbStatus = STATUS_DB_GONE;
                    }
                }
                String fromDbStatusName = (fromDbStatus == -1 ? "<not in CrawlDb>" : getStatusName(fromDbStatus));
                String fetchStatusName = (fetchStatus == -1 ? "<only inlinks>" : CrawlDatum.getStatusName(fetchStatus));
                LOG.info(fromDbStatusName + " + " + fetchStatusName + " => " + getStatusName(toDbStatus));
                List<CrawlDatum> values = new ArrayList<CrawlDatum>();
                for (int l = 0; l <= 2; l++) {
                    // number of additional in-links
                    CrawlDatum fetch = null;
                    if (fetchStatus == -1) {
                        // nothing fetched, need at least one in-link
                        if (l == 0)
                            continue;
                    } else {
                        fetch = new CrawlDatum();
                        if (fromDb != null) {
                            fetch.set(fromDb);
                        } else {
                            // not yet in CrawlDb: added by FreeGenerator
                            schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fetch);
                        }
                        fetch.setStatus(fetchStatus);
                        fetch.setFetchTime(System.currentTimeMillis());
                    }
                    if (fromDb != null)
                        values.add(fromDb);
                    if (fetch != null)
                        values.add(fetch);
                    for (int n = 0; n < l; n++) {
                        values.add(linked);
                    }
                    List<CrawlDatum> res = updateDb.update(values);
                    if (res.size() != 1) {
                        fail("CrawlDb update didn't result in one single CrawlDatum per URL");
                        continue;
                    }
                    byte status = res.get(0).getStatus();
                    if (status != toDbStatus) {
                        fail("CrawlDb update for " + fromDbStatusName + " and " + fetchStatusName + " and " + l + " inlinks results in " + getStatusName(status) + " (expected: " + getStatusName(toDbStatus) + ")");
                    }
                    values.clear();
                }
            }
        }
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) IOException(java.io.IOException) Reducer(org.apache.hadoop.mapreduce.Reducer) Test(org.junit.Test)
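
As an aside, the test above drives CrawlDbReducer through a hand-built Reducer.Context (CrawlDBTestUtil.createContext()). For reducers with simpler contracts, Apache MRUnit (a separate, now-retired Apache project) can build that context for you via ReduceDriver. A minimal sketch under that assumption, using Hadoop's stock IntSumReducer as the reducer under test:

import java.util.Arrays;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;
import org.apache.hadoop.mrunit.mapreduce.ReduceDriver;
import org.junit.Test;

public class IntSumReducerTest {
    @Test
    public void sumsValuesForAKey() throws Exception {
        // The driver supplies the Reducer.Context, feeds one key with its values,
        // and asserts on the records the reducer emits.
        ReduceDriver.newReduceDriver(new IntSumReducer<Text>())
            .withInput(new Text("http://example.com/"), Arrays.asList(new IntWritable(1), new IntWritable(2)))
            .withOutput(new Text("http://example.com/"), new IntWritable(3))
            .runTest();
    }
}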

Example 3 with Reducer

use of org.apache.hadoop.mapreduce.Reducer in project ignite by apache.

the class HadoopV2ReduceTask method run0.

/**
 * {@inheritDoc}
 */
@SuppressWarnings({ "ConstantConditions", "unchecked" })
@Override
public void run0(HadoopV2TaskContext taskCtx) throws IgniteCheckedException {
    OutputFormat outputFormat = null;
    Exception err = null;
    JobContextImpl jobCtx = taskCtx.jobContext();
    // Set mapper index for combiner tasks
    if (!reduce && taskCtx.taskInfo().hasMapperIndex())
        HadoopMapperUtils.mapperIndex(taskCtx.taskInfo().mapperIndex());
    else
        HadoopMapperUtils.clearMapperIndex();
    try {
        outputFormat = reduce || !taskCtx.job().info().hasReducer() ? prepareWriter(jobCtx) : null;
        Reducer reducer;
        if (reduce)
            reducer = ReflectionUtils.newInstance(jobCtx.getReducerClass(), jobCtx.getConfiguration());
        else
            reducer = ReflectionUtils.newInstance(jobCtx.getCombinerClass(), jobCtx.getConfiguration());
        try {
            reducer.run(new WrappedReducer().getReducerContext(hadoopContext()));
            if (!reduce)
                taskCtx.onMapperFinished();
        } finally {
            closeWriter();
        }
        commit(outputFormat);
    } catch (InterruptedException e) {
        err = e;
        Thread.currentThread().interrupt();
        throw new IgniteInterruptedCheckedException(e);
    } catch (Exception e) {
        err = e;
        throw new IgniteCheckedException(e);
    } finally {
        if (!reduce)
            HadoopMapperUtils.clearMapperIndex();
        if (err != null)
            abort(outputFormat);
    }
}
Also used : IgniteInterruptedCheckedException(org.apache.ignite.internal.IgniteInterruptedCheckedException) JobContextImpl(org.apache.hadoop.mapred.JobContextImpl) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) OutputFormat(org.apache.hadoop.mapreduce.OutputFormat) WrappedReducer(org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer) Reducer(org.apache.hadoop.mapreduce.Reducer)
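
For reference, the reducer.run(...) call above transfers control to Reducer's run loop, which is what actually iterates over the grouped keys. Hadoop's default implementation is essentially the following (simplified; the shipped version additionally resets the value iterator's backup store between keys):

// Simplified paraphrase of org.apache.hadoop.mapreduce.Reducer#run.
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    try {
        while (context.nextKey()) {
            // One reduce() call per distinct key, with an Iterable over that key's values.
            reduce(context.getCurrentKey(), context.getValues(), context);
        }
    } finally {
        cleanup(context);
    }
}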

Example 4 with Reducer

use of org.apache.hadoop.mapreduce.Reducer in project cdap by caskdata.

the class MapReduceRuntimeService method getInputValueType.

/**
 * Returns the input value type of the MR job based on the job Mapper/Reducer type.
 * It does so by inspecting the Mapper/Reducer type parameters to figure out what the input type is.
 * If the job has a Mapper, then it is the Mapper IN_VALUE type; otherwise it is the Reducer IN_VALUE type.
 * If the input value type cannot be determined, the given default type is returned.
 *
 * @param hConf the Configuration to use to resolve the class TypeToken
 * @param defaultType the defaultType to return
 * @param mapperTypeToken the mapper type token for the configured input (not resolved by the job's mapper class)
 */
@VisibleForTesting
static Type getInputValueType(Configuration hConf, Type defaultType, @Nullable TypeToken<?> mapperTypeToken) {
    TypeToken<?> type;
    if (mapperTypeToken == null) {
        // if the input's mapper is null, first try resolving one from the job configuration
        mapperTypeToken = resolveClass(hConf, MRJobConfig.MAP_CLASS_ATTR, Mapper.class);
    }
    if (mapperTypeToken == null) {
        // If there is no Mapper, it's a Reducer only job, hence get the value type from Reducer class
        type = resolveClass(hConf, MRJobConfig.REDUCE_CLASS_ATTR, Reducer.class);
    } else {
        type = mapperTypeToken;
    }
    Preconditions.checkArgument(type != null, "Neither a Mapper nor a Reducer is configured for the MapReduce job.");
    if (!(type.getType() instanceof ParameterizedType)) {
        return defaultType;
    }
    // The super type Mapper/Reducer must be a parametrized type with <IN_KEY, IN_VALUE, OUT_KEY, OUT_VALUE>
    Type inputValueType = ((ParameterizedType) type.getType()).getActualTypeArguments()[1];
    // This avoids the case where a subclass declared as "class InvalidMapper<I, O> extends Mapper<I, O>" leaves IN_VALUE as an unresolved type variable
    if (inputValueType instanceof TypeVariable && inputValueType.equals(type.getRawType().getTypeParameters()[1])) {
        inputValueType = defaultType;
    }
    return inputValueType;
}
Also used : ParameterizedType(java.lang.reflect.ParameterizedType) Mapper(org.apache.hadoop.mapreduce.Mapper) ProgramType(co.cask.cdap.proto.ProgramType) AccessType(co.cask.cdap.data2.metadata.lineage.AccessType) Type(java.lang.reflect.Type) TypeVariable(java.lang.reflect.TypeVariable) Reducer(org.apache.hadoop.mapreduce.Reducer) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
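
The resolution step can be seen in isolation with Guava's TypeToken, akin to what the method above does. A minimal sketch, with InputValueTypeDemo and TokenizerMapper as hypothetical names: resolving the Mapper supertype binds its type variables to the subclass's concrete arguments, so the IN_VALUE slot (index 1) comes back as a concrete class.

import com.google.common.reflect.TypeToken;
import java.lang.reflect.ParameterizedType;
import java.lang.reflect.Type;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class InputValueTypeDemo {
    // Mapper with concrete <IN_KEY, IN_VALUE, OUT_KEY, OUT_VALUE> arguments.
    static class TokenizerMapper extends Mapper<LongWritable, Text, Text, LongWritable> { }

    public static void main(String[] args) {
        // Resolve the generic Mapper supertype of the concrete subclass.
        TypeToken<?> mapperType = TypeToken.of(TokenizerMapper.class).getSupertype(Mapper.class);
        Type inValue = ((ParameterizedType) mapperType.getType()).getActualTypeArguments()[1];
        System.out.println(inValue); // class org.apache.hadoop.io.Text
    }
}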

Example 5 with Reducer

use of org.apache.hadoop.mapreduce.Reducer in project cdap by caskdata.

the class MapReduceRuntimeService method setMapOutputClassesIfNeeded.

/**
 * Sets the map output key and value classes in the job configuration by inspecting the {@link Mapper}
 * if it is not set by the user.
 *
 * @param job the MapReduce job
 * @param mapperTypeToken TypeToken of a configured mapper (may not be configured on the job). Has already been
 *                        resolved from the job's mapper class.
 */
private void setMapOutputClassesIfNeeded(Job job, @Nullable TypeToken<?> mapperTypeToken) {
    Configuration conf = job.getConfiguration();
    TypeToken<?> type = mapperTypeToken;
    int keyIdx = 2;
    int valueIdx = 3;
    if (type == null) {
        // Reducer only job. Use the Reducer input types as the key/value classes.
        type = resolveClass(conf, MRJobConfig.REDUCE_CLASS_ATTR, Reducer.class);
        keyIdx = 0;
        valueIdx = 1;
    }
    // If not able to detect type, nothing to set.
    if (type == null || !(type.getType() instanceof ParameterizedType)) {
        return;
    }
    Type[] typeArgs = ((ParameterizedType) type.getType()).getActualTypeArguments();
    // For a Mapper, the key and value types are the 3rd and 4th type parameters; for a reducer-only job they are the 1st and 2nd (keyIdx/valueIdx above)
    if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS)) {
        Class<?> cls = TypeToken.of(typeArgs[keyIdx]).getRawType();
        LOG.debug("Set map output key class to {}", cls);
        job.setMapOutputKeyClass(cls);
    }
    if (!isProgrammaticConfig(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS)) {
        Class<?> cls = TypeToken.of(typeArgs[valueIdx]).getRawType();
        LOG.debug("Set map output value class to {}", cls);
        job.setMapOutputValueClass(cls);
    }
}
Also used : ParameterizedType(java.lang.reflect.ParameterizedType) ProgramType(co.cask.cdap.proto.ProgramType) AccessType(co.cask.cdap.data2.metadata.lineage.AccessType) Type(java.lang.reflect.Type) CConfiguration(co.cask.cdap.common.conf.CConfiguration) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Reducer(org.apache.hadoop.mapreduce.Reducer)
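
This inference only runs when the user has not set the classes programmatically (the isProgrammaticConfig checks). For comparison, a minimal sketch of the explicit route on a plain Hadoop job; TokenizerMapper is a hypothetical user mapper, IntSumReducer is Hadoop's stock org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer, and path setup is omitted.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "word count");
        job.setMapperClass(TokenizerMapper.class); // hypothetical user mapper
        job.setReducerClass(IntSumReducer.class);
        // Set explicitly, so no inspection of the Mapper's type parameters is needed.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Final output classes of the reduce phase.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
    }
}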

Aggregations

Reducer (org.apache.hadoop.mapreduce.Reducer): 7
Configuration (org.apache.hadoop.conf.Configuration): 3
WrappedReducer (org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer): 3
AccessType (co.cask.cdap.data2.metadata.lineage.AccessType): 2
ProgramType (co.cask.cdap.proto.ProgramType): 2
IOException (java.io.IOException): 2
ParameterizedType (java.lang.reflect.ParameterizedType): 2
Type (java.lang.reflect.Type): 2
ProgramLifecycle (co.cask.cdap.api.ProgramLifecycle): 1
RuntimeContext (co.cask.cdap.api.RuntimeContext): 1
CConfiguration (co.cask.cdap.common.conf.CConfiguration): 1
PropertyFieldSetter (co.cask.cdap.common.lang.PropertyFieldSetter): 1
WeakReferenceDelegatorClassLoader (co.cask.cdap.common.lang.WeakReferenceDelegatorClassLoader): 1
DataSetFieldSetter (co.cask.cdap.internal.app.runtime.DataSetFieldSetter): 1
MetricsFieldSetter (co.cask.cdap.internal.app.runtime.MetricsFieldSetter): 1
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1
TypeVariable (java.lang.reflect.TypeVariable): 1
ArrayList (java.util.ArrayList): 1
ByteSequence (org.apache.accumulo.core.data.ByteSequence): 1
Key (org.apache.accumulo.core.data.Key): 1