use of org.apache.hadoop.mapred.RawKeyValueIterator in project hadoop by apache.
the class TestGridMixClasses method testSleepReducer.
/*
 * test SleepReducer
 */
@Test(timeout = 3000)
public void testSleepReducer() throws Exception {
  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(FileOutputFormat.COMPRESS, true);
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);

  TaskAttemptID taskId = new TaskAttemptID();
  RawKeyValueIterator input = new FakeRawKeyValueReducerIterator();
  Counter counter = new GenericCounter();
  Counter inputValueCounter = new GenericCounter();
  RecordWriter<NullWritable, NullWritable> output = new LoadRecordReduceWriter();
  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new DummyReporter();
  RawComparator<GridmixKey> comparator = new FakeRawComparator();

  ReduceContext<GridmixKey, NullWritable, NullWritable, NullWritable> reduceContext =
      new ReduceContextImpl<GridmixKey, NullWritable, NullWritable, NullWritable>(
          conf, taskId, input, counter, inputValueCounter, output, committer,
          reporter, comparator, GridmixKey.class, NullWritable.class);
  org.apache.hadoop.mapreduce.Reducer<GridmixKey, NullWritable, NullWritable, NullWritable>.Context context =
      new WrappedReducer<GridmixKey, NullWritable, NullWritable, NullWritable>()
          .getReducerContext(reduceContext);

  SleepReducer test = new SleepReducer();
  long start = System.currentTimeMillis();
  test.setup(context);
  long sleeper = context.getCurrentKey().getReduceOutputBytes();
  // status has been changed
  assertEquals("Sleeping... " + sleeper + " ms left", context.getStatus());
  // should sleep 0.9 sec
  assertTrue(System.currentTimeMillis() >= (start + sleeper));
  test.cleanup(context);
  // status has been changed again
  assertEquals("Slept for " + sleeper, context.getStatus());
}
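The FakeRawKeyValueReducerIterator used above is a test stub defined elsewhere in TestGridMixClasses. As a rough, hypothetical sketch (not that actual stub), a minimal RawKeyValueIterator that yields no records only has to cover the interface's five methods:

  // Hypothetical minimal stub for illustration; the real FakeRawKeyValueReducerIterator differs.
  class EmptyRawKeyValueIterator implements RawKeyValueIterator {

    @Override
    public DataInputBuffer getKey() throws IOException {
      return new DataInputBuffer();
    }

    @Override
    public DataInputBuffer getValue() throws IOException {
      return new DataInputBuffer();
    }

    @Override
    public boolean next() throws IOException {
      // no records to deliver
      return false;
    }

    @Override
    public void close() throws IOException {
      // nothing to release
    }

    @Override
    public Progress getProgress() {
      return new Progress();
    }
  }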
use of org.apache.hadoop.mapred.RawKeyValueIterator in project hadoop by apache.
the class TestGridMixClasses method testLoadJobLoadReducer.
/*
 * test LoadReducer
 */
@Test(timeout = 3000)
public void testLoadJobLoadReducer() throws Exception {
  LoadJob.LoadReducer test = new LoadJob.LoadReducer();

  Configuration conf = new Configuration();
  conf.setInt(JobContext.NUM_REDUCES, 2);
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(FileOutputFormat.COMPRESS, true);
  CompressionEmulationUtil.setCompressionEmulationEnabled(conf, true);
  conf.setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);

  TaskAttemptID taskid = new TaskAttemptID();
  RawKeyValueIterator input = new FakeRawKeyValueIterator();
  Counter counter = new GenericCounter();
  Counter inputValueCounter = new GenericCounter();
  LoadRecordWriter output = new LoadRecordWriter();
  OutputCommitter committer = new CustomOutputCommitter();
  StatusReporter reporter = new DummyReporter();
  RawComparator<GridmixKey> comparator = new FakeRawComparator();

  ReduceContext<GridmixKey, GridmixRecord, NullWritable, GridmixRecord> reduceContext =
      new ReduceContextImpl<GridmixKey, GridmixRecord, NullWritable, GridmixRecord>(
          conf, taskid, input, counter, inputValueCounter, output, committer,
          reporter, comparator, GridmixKey.class, GridmixRecord.class);
  // read one key/value pair up front (the "previous" data)
  reduceContext.nextKeyValue();
  org.apache.hadoop.mapreduce.Reducer<GridmixKey, GridmixRecord, NullWritable, GridmixRecord>.Context context =
      new WrappedReducer<GridmixKey, GridmixRecord, NullWritable, GridmixRecord>()
          .getReducerContext(reduceContext);

  // test.setup(context);
  test.run(context);
  // 9 records have been read (-1 for the previous one)
  assertEquals(9, counter.getValue());
  assertEquals(10, inputValueCounter.getValue());
  assertEquals(1, output.getData().size());
  GridmixRecord record = output.getData().values().iterator().next();
  assertEquals(1593, record.getSize());
}
use of org.apache.hadoop.mapred.RawKeyValueIterator in project hadoop by apache.
the class MergeManagerImpl method finalMerge.
private RawKeyValueIterator finalMerge(JobConf job, FileSystem fs,
    List<InMemoryMapOutput<K, V>> inMemoryMapOutputs,
    List<CompressAwarePath> onDiskMapOutputs) throws IOException {
  LOG.info("finalMerge called with " + inMemoryMapOutputs.size()
      + " in-memory map-outputs and " + onDiskMapOutputs.size()
      + " on-disk map-outputs");
  final long maxInMemReduce = getMaxInMemReduceLimit();
  // merge config params
  Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
  Class<V> valueClass = (Class<V>) job.getMapOutputValueClass();
  boolean keepInputs = job.getKeepFailedTaskFiles();
  final Path tmpDir = new Path(reduceId.toString());
  final RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
  // segments required to vacate memory
  List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>();
  long inMemToDiskBytes = 0;
  boolean mergePhaseFinished = false;
  if (inMemoryMapOutputs.size() > 0) {
    TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
    inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs,
        memDiskSegments, maxInMemReduce);
    final int numMemDiskSegments = memDiskSegments.size();
    if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
      // If we reach here, it implies that we have less than io.sort.factor
      // disk segments and this will be incremented by 1 (result of the
      // memory segments merge). Since this total would still be
      // <= io.sort.factor, we will not do any more intermediate merges,
      // the merge of all these disk segments would be directly fed to the
      // reduce method
      mergePhaseFinished = true;
      // must spill to disk, but can't retain in-mem for intermediate merge
      final Path outputPath = mapOutputFile.getInputFileForWrite(mapId,
          inMemToDiskBytes).suffix(Task.MERGED_OUTPUT_PREFIX);
      final RawKeyValueIterator rIter = Merger.merge(job, fs, keyClass,
          valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator,
          reporter, spilledRecordsCounter, null, mergePhase);
      FSDataOutputStream out = CryptoUtils.wrapIfNecessary(job, fs.create(outputPath));
      Writer<K, V> writer = new Writer<K, V>(job, out, keyClass, valueClass,
          codec, null, true);
      try {
        Merger.writeFile(rIter, writer, reporter, job);
        writer.close();
        onDiskMapOutputs.add(new CompressAwarePath(outputPath,
            writer.getRawLength(), writer.getCompressedLength()));
        writer = null;
        // add to list of final disk outputs.
      } catch (IOException e) {
        if (null != outputPath) {
          try {
            fs.delete(outputPath, true);
          } catch (IOException ie) {
            // NOTHING
          }
        }
        throw e;
      } finally {
        if (null != writer) {
          writer.close();
        }
      }
      LOG.info("Merged " + numMemDiskSegments + " segments, "
          + inMemToDiskBytes + " bytes to disk to satisfy "
          + "reduce memory limit");
      inMemToDiskBytes = 0;
      memDiskSegments.clear();
    } else if (inMemToDiskBytes != 0) {
      LOG.info("Keeping " + numMemDiskSegments + " segments, "
          + inMemToDiskBytes + " bytes in memory for "
          + "intermediate, on-disk merge");
    }
  }
  // segments on disk
  List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>();
  long onDiskBytes = inMemToDiskBytes;
  long rawBytes = inMemToDiskBytes;
  CompressAwarePath[] onDisk = onDiskMapOutputs.toArray(
      new CompressAwarePath[onDiskMapOutputs.size()]);
  for (CompressAwarePath file : onDisk) {
    long fileLength = fs.getFileStatus(file).getLen();
    onDiskBytes += fileLength;
    rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength() : fileLength;
    LOG.debug("Disk file: " + file + " Length is " + fileLength);
    diskSegments.add(new Segment<K, V>(job, fs, file, codec, keepInputs,
        (file.toString().endsWith(Task.MERGED_OUTPUT_PREFIX)
            ? null : mergedMapOutputsCounter), file.getRawDataLength()));
  }
  LOG.info("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
  Collections.sort(diskSegments, new Comparator<Segment<K, V>>() {
    public int compare(Segment<K, V> o1, Segment<K, V> o2) {
      if (o1.getLength() == o2.getLength()) {
        return 0;
      }
      return o1.getLength() < o2.getLength() ? -1 : 1;
    }
  });
  // build final list of segments from merged backed by disk + in-mem segments
  List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>();
  long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
  LOG.info("Merging " + finalSegments.size() + " segments, " + inMemBytes
      + " bytes from memory into reduce");
  if (0 != onDiskBytes) {
    final int numInMemSegments = memDiskSegments.size();
    diskSegments.addAll(0, memDiskSegments);
    memDiskSegments.clear();
    // Pass mergePhase only if there is going to be intermediate
    // merges. See comment where mergePhaseFinished is being set
    Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
    RawKeyValueIterator diskMerge = Merger.merge(job, fs, keyClass, valueClass,
        codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator,
        reporter, false, spilledRecordsCounter, null, thisPhase);
    diskSegments.clear();
    if (0 == finalSegments.size()) {
      return diskMerge;
    }
    finalSegments.add(new Segment<K, V>(
        new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
  }
  return Merger.merge(job, fs, keyClass, valueClass, finalSegments,
      finalSegments.size(), tmpDir, comparator, reporter, spilledRecordsCounter,
      null, null);
}
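Merger.writeFile above drains the merged RawKeyValueIterator into the IFile-style Writer. Conceptually it boils down to the loop below (a rough sketch only, assuming the "rIter" and "writer" from the snippet; the real method additionally does periodic progress reporting):

  // Rough sketch of draining a RawKeyValueIterator into a Writer; not the actual Merger.writeFile.
  while (rIter.next()) {
    writer.append(rIter.getKey(), rIter.getValue());
  }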
use of org.apache.hadoop.mapred.RawKeyValueIterator in project tez by apache.
the class MRCombiner method createReduceContext.
private static <KEYIN, VALUEIN, KEYOUT, VALUEOUT> org.apache.hadoop.mapreduce.Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context createReduceContext(
    Configuration conf, TaskAttemptID mrTaskAttemptID,
    final TezRawKeyValueIterator rawIter, Counter combineInputRecordsCounter,
    Counter combineOutputRecordsCounter, RecordWriter<KEYOUT, VALUEOUT> recordWriter,
    MRTaskReporter reporter, RawComparator<KEYIN> comparator,
    Class<KEYIN> keyClass, Class<VALUEIN> valClass) throws InterruptedException, IOException {
  // adapt the Tez iterator to the MapReduce RawKeyValueIterator interface
  RawKeyValueIterator r = new RawKeyValueIterator() {

    @Override
    public boolean next() throws IOException {
      return rawIter.next();
    }

    @Override
    public DataInputBuffer getValue() throws IOException {
      return rawIter.getValue();
    }

    @Override
    public Progress getProgress() {
      return rawIter.getProgress();
    }

    @Override
    public DataInputBuffer getKey() throws IOException {
      return rawIter.getKey();
    }

    @Override
    public void close() throws IOException {
      rawIter.close();
    }
  };
  ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> rContext =
      new ReduceContextImpl<KEYIN, VALUEIN, KEYOUT, VALUEOUT>(conf, mrTaskAttemptID,
          r, null, combineInputRecordsCounter, recordWriter, null, reporter,
          comparator, keyClass, valClass);
  org.apache.hadoop.mapreduce.Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context reducerContext =
      new WrappedReducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>().getReducerContext(rContext);
  return reducerContext;
}
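A hypothetical sketch of how a caller might drive the returned context (the actual call site lives elsewhere in MRCombiner): the user's combiner is instantiated as a new-API Reducer and run against the wrapped iterator, which calls reduce() once per key group. Here "combinerClass" and "conf" are assumed to be available.

  // Hypothetical usage sketch; raw Reducer type used for brevity.
  org.apache.hadoop.mapreduce.Reducer combiner =
      ReflectionUtils.newInstance(combinerClass, conf);
  combiner.run(reducerContext);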
use of org.apache.hadoop.mapred.RawKeyValueIterator in project hadoop by apache.
the class Shuffle method run.
@Override
public RawKeyValueIterator run() throws IOException, InterruptedException {
  // Scale the maximum events we fetch per RPC call to mitigate OOM issues
  // on the ApplicationMaster when a thundering herd of reducers fetch events
  // TODO: This should not be necessary after HADOOP-8942
  int eventsPerReducer = Math.max(MIN_EVENTS_TO_FETCH,
      MAX_RPC_OUTSTANDING_EVENTS / jobConf.getNumReduceTasks());
  int maxEventsToFetch = Math.min(MAX_EVENTS_TO_FETCH, eventsPerReducer);

  // Start the map-completion events fetcher thread
  final EventFetcher<K, V> eventFetcher = new EventFetcher<K, V>(reduceId,
      umbilical, scheduler, this, maxEventsToFetch);
  eventFetcher.start();

  // Start the map-output fetcher threads
  boolean isLocal = localMapFiles != null;
  final int numFetchers = isLocal ? 1
      : jobConf.getInt(MRJobConfig.SHUFFLE_PARALLEL_COPIES, 5);
  Fetcher<K, V>[] fetchers = new Fetcher[numFetchers];
  if (isLocal) {
    fetchers[0] = new LocalFetcher<K, V>(jobConf, reduceId, scheduler, merger,
        reporter, metrics, this, reduceTask.getShuffleSecret(), localMapFiles);
    fetchers[0].start();
  } else {
    for (int i = 0; i < numFetchers; ++i) {
      fetchers[i] = new Fetcher<K, V>(jobConf, reduceId, scheduler, merger,
          reporter, metrics, this, reduceTask.getShuffleSecret());
      fetchers[i].start();
    }
  }

  // Wait for shuffle to complete successfully
  while (!scheduler.waitUntilDone(PROGRESS_FREQUENCY)) {
    reporter.progress();
    synchronized (this) {
      if (throwable != null) {
        throw new ShuffleError("error in shuffle in " + throwingThreadName, throwable);
      }
    }
  }

  // Stop the event-fetcher thread
  eventFetcher.shutDown();

  // Stop the map-output fetcher threads
  for (Fetcher<K, V> fetcher : fetchers) {
    fetcher.shutDown();
  }

  // stop the scheduler
  scheduler.close();

  // copy is already complete
  copyPhase.complete();
  taskStatus.setPhase(TaskStatus.Phase.SORT);
  reduceTask.statusUpdate(umbilical);

  // Finish the on-going merges...
  RawKeyValueIterator kvIter = null;
  try {
    kvIter = merger.close();
  } catch (Throwable e) {
    throw new ShuffleError("Error while doing final merge ", e);
  }

  // Sanity check
  synchronized (this) {
    if (throwable != null) {
      throw new ShuffleError("error in shuffle in " + throwingThreadName, throwable);
    }
  }
  return kvIter;
}