
Example 6 with RawComparator

Use of org.apache.hadoop.io.RawComparator in project tez by apache.

From the class TestTezMerger, method testWithCustomComparator_mixedFiles.

@Test(timeout = 5000)
public void testWithCustomComparator_mixedFiles() throws Exception {
    List<Path> pathList = new LinkedList<Path>();
    List<String> data = Lists.newLinkedList();
    LOG.info("Test with custom comparator with mixed set of segments (empty, non-empty etc)");
    // Test with 4 files (one empty, one with only an empty key), where runs
    // of identical (RLE) keys can span across files
    // First file
    data.clear();
    data.add("0");
    pathList.add(createIFileWithTextData(data));
    // Second file; empty file
    data.clear();
    pathList.add(createIFileWithTextData(data));
    // Third file with empty key
    data.clear();
    data.add("");
    pathList.add(createIFileWithTextData(data));
    // Fourth file with repeated keys
    data.clear();
    data.add("0");
    data.add("0");
    data.add("0");
    pathList.add(createIFileWithTextData(data));
    // Merge datasets with custom comparator
    RawComparator rc = new CustomComparator();
    TezRawKeyValueIterator records = merge(pathList, rc);
    // expected result
    String[][] expectedResult = {
        { "", DIFF_KEY },
        { "0", DIFF_KEY },
        { "0", SAME_KEY },
        { "0", SAME_KEY },
        { "0", SAME_KEY }
    };
    verify(records, expectedResult);
    pathList.clear();
    data.clear();
}
Also used: Path (org.apache.hadoop.fs.Path), RawComparator (org.apache.hadoop.io.RawComparator), LinkedList (java.util.LinkedList), Test (org.junit.Test)
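
The CustomComparator instantiated above is a helper defined inside TestTezMerger, and its body is not shown here. As a rough illustration, a raw comparator over Text keys might look like the following sketch; it assumes the VInt length header that Text writes when serialized, and the class name TextishRawComparator is hypothetical.

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

// Illustrative sketch only; the real CustomComparator in TestTezMerger may differ.
public class TextishRawComparator implements RawComparator<Text> {

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // Text serializes as a VInt length followed by UTF-8 bytes;
        // skip the header on both sides and compare the payload bytes.
        int n1 = WritableUtils.decodeVIntSize(b1[s1]);
        int n2 = WritableUtils.decodeVIntSize(b2[s2]);
        return WritableComparator.compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
    }

    @Override
    public int compare(Text a, Text b) {
        // Object-based fallback; delegates to Text's natural ordering.
        return a.compareTo(b);
    }
}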

Example 7 with RawComparator

Use of org.apache.hadoop.io.RawComparator in project tez by apache.

From the class TestTezMerger, method testWithCustomComparator_No_RLE.

@Test(timeout = 5000)
public void testWithCustomComparator_No_RLE() throws Exception {
    List<Path> pathList = new LinkedList<Path>();
    List<String> data = Lists.newLinkedList();
    // Merge datasets with custom comparator
    RawComparator rc = new CustomComparator();
    LOG.info("Test with custom comparator with no RLE");
    // Test with 3 files
    // First file
    data.add("1");
    data.add("4");
    data.add("5");
    pathList.add(createIFileWithTextData(data));
    // Second file
    data.clear();
    data.add("2");
    data.add("6");
    data.add("7");
    pathList.add(createIFileWithTextData(data));
    // Third file
    data.clear();
    data.add("3");
    data.add("8");
    data.add("9");
    pathList.add(createIFileWithTextData(data));
    TezRawKeyValueIterator records = merge(pathList, rc);
    String[][] expectedResult = {
        { "1", DIFF_KEY }, { "2", DIFF_KEY }, { "3", DIFF_KEY },
        { "4", DIFF_KEY }, { "5", DIFF_KEY }, { "6", DIFF_KEY },
        { "7", DIFF_KEY }, { "8", DIFF_KEY }, { "9", DIFF_KEY }
    };
    verify(records, expectedResult);
    pathList.clear();
    data.clear();
}
Also used: Path (org.apache.hadoop.fs.Path), RawComparator (org.apache.hadoop.io.RawComparator), LinkedList (java.util.LinkedList), Test (org.junit.Test)
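
The verify() helper is likewise internal to TestTezMerger. Below is a minimal sketch of what it could look like, assuming SAME_KEY and DIFF_KEY are String markers and that TezRawKeyValueIterator.isSameKey() reports whether the merger flagged the current key as a repeat of the previous one; the actual helper may differ.

import static org.junit.Assert.assertEquals;

// Sketch of a verify() helper inside TestTezMerger.
private void verify(TezRawKeyValueIterator records, String[][] expected) throws IOException {
    int i = 0;
    while (records.next()) {
        // Deserialize the raw key bytes back into a Text.
        Text key = new Text();
        key.readFields(records.getKey());
        assertEquals(expected[i][0], key.toString());
        // SAME_KEY means the merger marked this key equal to the previous one (an RLE hit).
        assertEquals(expected[i][1], records.isSameKey() ? SAME_KEY : DIFF_KEY);
        i++;
    }
    assertEquals(expected.length, i);
}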

Example 8 with RawComparator

Use of org.apache.hadoop.io.RawComparator in project tez by apache.

From the class OrderedGroupedKVInput, method createValuesIterator.

@SuppressWarnings({ "rawtypes", "unchecked" })
protected synchronized void createValuesIterator() throws IOException {
    // Not used by ReduceProcessor
    RawComparator rawComparator = ConfigUtils.getIntermediateInputKeyComparator(conf);
    Class<?> keyClass = ConfigUtils.getIntermediateInputKeyClass(conf);
    Class<?> valClass = ConfigUtils.getIntermediateInputValueClass(conf);
    LOG.info(getContext().getSourceVertexName() + ": creating ValuesIterator with"
        + " comparator=" + rawComparator.getClass().getName()
        + ", keyClass=" + keyClass.getName()
        + ", valClass=" + valClass.getName());
    vIter = new ValuesIterator(rawIter, rawComparator, keyClass, valClass, conf, inputKeyCounter, inputValueCounter);
}
Also used: RawComparator (org.apache.hadoop.io.RawComparator), ValuesIterator (org.apache.tez.runtime.library.common.ValuesIterator)
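
For context, the vIter built here is what backs the grouped reader handed to downstream consumers. A hedged sketch of how a ValuesIterator is typically consumed, using Tez's moveToNext()/getKey()/getValues() methods; process() is a hypothetical placeholder for the caller's logic, and the loop is assumed to run inside a method that may throw IOException.

// Keys arrive in comparator order; values arrive grouped per key.
while (vIter.moveToNext()) {
    Object currentKey = vIter.getKey();
    for (Object value : vIter.getValues()) {
        // Every value whose key rawComparator considers equal lands in this inner loop.
        process(currentKey, value); // hypothetical consumer
    }
}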

Example 9 with RawComparator

Use of org.apache.hadoop.io.RawComparator in project tez by apache.

From the class ReduceProcessor, method run.

@Override
public void run(Map<String, LogicalInput> _inputs, Map<String, LogicalOutput> _outputs) throws Exception {
    this.inputs = _inputs;
    this.outputs = _outputs;
    progressHelper = new ProgressHelper(this.inputs, processorContext, this.getClass().getSimpleName());
    LOG.info("Running reduce: " + processorContext.getUniqueIdentifier());
    if (_outputs.size() <= 0 || _outputs.size() > 1) {
        throw new IOException("Invalid number of _outputs" + ", outputCount=" + _outputs.size());
    }
    if (_inputs.size() <= 0 || _inputs.size() > 1) {
        throw new IOException("Invalid number of _inputs" + ", inputCount=" + _inputs.size());
    }
    LogicalInput in = _inputs.values().iterator().next();
    in.start();
    List<Input> pendingInputs = new LinkedList<Input>();
    pendingInputs.add(in);
    processorContext.waitForAllInputsReady(pendingInputs);
    LOG.info("Input is ready for consumption. Starting Output");
    LogicalOutput out = _outputs.values().iterator().next();
    out.start();
    initTask(out);
    progressHelper.scheduleProgressTaskService(0, 100);
    this.statusUpdate();
    Class keyClass = ConfigUtils.getIntermediateInputKeyClass(jobConf);
    Class valueClass = ConfigUtils.getIntermediateInputValueClass(jobConf);
    LOG.info("Using keyClass: " + keyClass);
    LOG.info("Using valueClass: " + valueClass);
    RawComparator comparator = ConfigUtils.getInputKeySecondaryGroupingComparator(jobConf);
    LOG.info("Using comparator: " + comparator);
    reduceInputKeyCounter = mrReporter.getCounter(TaskCounter.REDUCE_INPUT_GROUPS);
    reduceInputValueCounter = mrReporter.getCounter(TaskCounter.REDUCE_INPUT_RECORDS);
    // Sanity check
    if (!(in instanceof OrderedGroupedInputLegacy)) {
        throw new IOException("Illegal input to reduce: " + in.getClass());
    }
    OrderedGroupedInputLegacy shuffleInput = (OrderedGroupedInputLegacy) in;
    KeyValuesReader kvReader = shuffleInput.getReader();
    KeyValueWriter kvWriter = null;
    if ((out instanceof MROutputLegacy)) {
        kvWriter = ((MROutputLegacy) out).getWriter();
    } else if ((out instanceof OrderedPartitionedKVOutput)) {
        kvWriter = ((OrderedPartitionedKVOutput) out).getWriter();
    } else {
        throw new IOException("Illegal output to reduce: " + in.getClass());
    }
    if (useNewApi) {
        try {
            runNewReducer(jobConf, mrReporter, shuffleInput, comparator, keyClass, valueClass, kvWriter);
        } catch (ClassNotFoundException cnfe) {
            throw new IOException(cnfe);
        }
    } else {
        runOldReducer(jobConf, mrReporter, kvReader, comparator, keyClass, valueClass, kvWriter);
    }
    done();
}
Also used: OrderedGroupedInputLegacy (org.apache.tez.runtime.library.input.OrderedGroupedInputLegacy), ProgressHelper (org.apache.tez.common.ProgressHelper), LogicalOutput (org.apache.tez.runtime.api.LogicalOutput), MROutputLegacy (org.apache.tez.mapreduce.output.MROutputLegacy), OrderedPartitionedKVOutput (org.apache.tez.runtime.library.output.OrderedPartitionedKVOutput), IOException (java.io.IOException), LinkedList (java.util.LinkedList), KeyValueWriter (org.apache.tez.runtime.library.api.KeyValueWriter), RawComparator (org.apache.hadoop.io.RawComparator), LogicalInput (org.apache.tez.runtime.api.LogicalInput), Input (org.apache.tez.runtime.api.Input), KeyValuesReader (org.apache.tez.runtime.library.api.KeyValuesReader)
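
The comparator fetched via ConfigUtils.getInputKeySecondaryGroupingComparator(jobConf) corresponds to the grouping comparator a classic MapReduce job registers for secondary sort. A minimal sketch of that registration, where MyKeyComparator and MyGroupingComparator are hypothetical RawComparator implementations:

import org.apache.hadoop.mapred.JobConf;

JobConf jobConf = new JobConf();
// Full key ordering, used while sorting and merging map outputs.
jobConf.setOutputKeyComparatorClass(MyKeyComparator.class);
// Coarser ordering, used only to decide which sorted records share one reduce() call.
jobConf.setOutputValueGroupingComparator(MyGroupingComparator.class);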

Example 10 with RawComparator

Use of org.apache.hadoop.io.RawComparator in project tez by apache.

From the class MergeManager, method finalMerge.

private TezRawKeyValueIterator finalMerge(Configuration job, FileSystem fs, List<MapOutput> inMemoryMapOutputs, List<FileChunk> onDiskMapOutputs) throws IOException, InterruptedException {
    logFinalMergeStart(inMemoryMapOutputs, onDiskMapOutputs);
    StringBuilder finalMergeLog = new StringBuilder();
    inputContext.notifyProgress();
    // merge config params
    Class keyClass = (Class) ConfigUtils.getIntermediateInputKeyClass(job);
    Class valueClass = (Class) ConfigUtils.getIntermediateInputValueClass(job);
    final Path tmpDir = new Path(inputContext.getUniqueIdentifier());
    final RawComparator comparator = (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(job);
    // segments required to vacate memory
    List<Segment> memDiskSegments = new ArrayList<Segment>();
    long inMemToDiskBytes = 0;
    boolean mergePhaseFinished = false;
    if (inMemoryMapOutputs.size() > 0) {
        int srcTaskId = inMemoryMapOutputs.get(0).getAttemptIdentifier().getInputIdentifier();
        inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, this.postMergeMemLimit);
        final int numMemDiskSegments = memDiskSegments.size();
        if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
            // If we reach here, it implies that we have less than io.sort.factor
            // disk segments and this will be incremented by 1 (result of the
            // memory segments merge). Since this total would still be
            // <= io.sort.factor, we will not do any more intermediate merges,
            // the merge of all these disk segments would be directly fed to the
            // reduce method
            mergePhaseFinished = true;
            // must spill to disk, but can't retain in-mem for intermediate merge
            // Cannot use the spill id in the final merge, as it could clobber other
            // files; hence Integer.MAX_VALUE is used
            final Path outputPath = mapOutputFile.getInputFileForWrite(srcTaskId, Integer.MAX_VALUE, inMemToDiskBytes).suffix(Constants.MERGED_OUTPUT_PREFIX);
            final TezRawKeyValueIterator rIter = TezMerger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
            final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec, null, null);
            try {
                TezMerger.writeFile(rIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
            } catch (IOException e) {
                if (null != outputPath) {
                    try {
                        fs.delete(outputPath, true);
                    } catch (IOException ie) {
                    // NOTHING
                    }
                }
                throw e;
            } finally {
                if (null != writer) {
                    writer.close();
                    additionalBytesWritten.increment(writer.getCompressedLength());
                }
            }
            final FileStatus fStatus = localFS.getFileStatus(outputPath);
            // add to list of final disk outputs.
            onDiskMapOutputs.add(new FileChunk(outputPath, 0, fStatus.getLen()));
            if (LOG.isInfoEnabled()) {
                finalMergeLog.append("MemMerged: " + numMemDiskSegments + ", " + inMemToDiskBytes);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Merged " + numMemDiskSegments + "segments, size=" + inMemToDiskBytes + " to " + outputPath);
                }
            }
            inMemToDiskBytes = 0;
            memDiskSegments.clear();
        } else if (inMemToDiskBytes != 0) {
            if (LOG.isInfoEnabled()) {
                finalMergeLog.append("DelayedMemMerge: " + numMemDiskSegments + ", " + inMemToDiskBytes);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for " + "intermediate, on-disk merge");
                }
            }
        }
    }
    // segments on disk
    List<Segment> diskSegments = new ArrayList<Segment>();
    long onDiskBytes = inMemToDiskBytes;
    FileChunk[] onDisk = onDiskMapOutputs.toArray(new FileChunk[onDiskMapOutputs.size()]);
    for (FileChunk fileChunk : onDisk) {
        final long fileLength = fileChunk.getLength();
        onDiskBytes += fileLength;
        if (LOG.isDebugEnabled()) {
            LOG.debug("Disk file=" + fileChunk.getPath() + ", len=" + fileLength + ", isLocal=" + fileChunk.isLocalFile());
        }
        final Path file = fileChunk.getPath();
        TezCounter counter = file.toString().endsWith(Constants.MERGED_OUTPUT_PREFIX) ? null : mergedMapOutputsCounter;
        final long fileOffset = fileChunk.getOffset();
        final boolean preserve = fileChunk.isLocalFile();
        diskSegments.add(new DiskSegment(fs, file, fileOffset, fileLength, codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, preserve, counter));
    }
    if (LOG.isInfoEnabled()) {
        finalMergeLog.append(". DiskSeg: " + onDisk.length + ", " + onDiskBytes);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
        }
    }
    Collections.sort(diskSegments, new Comparator<Segment>() {

        public int compare(Segment o1, Segment o2) {
            if (o1.getLength() == o2.getLength()) {
                return 0;
            }
            return o1.getLength() < o2.getLength() ? -1 : 1;
        }
    });
    // build the final list of segments: the merged disk-backed segments plus in-memory segments
    List<Segment> finalSegments = new ArrayList<Segment>();
    long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
    if (LOG.isInfoEnabled()) {
        finalMergeLog.append(". MemSeg: " + finalSegments.size() + ", " + inMemBytes);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
        }
    }
    if (0 != onDiskBytes) {
        final int numInMemSegments = memDiskSegments.size();
        diskSegments.addAll(0, memDiskSegments);
        memDiskSegments.clear();
        TezRawKeyValueIterator diskMerge = TezMerger.merge(job, fs, keyClass, valueClass, codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, progressable, false, spilledRecordsCounter, null, additionalBytesRead, null);
        diskSegments.clear();
        if (0 == finalSegments.size()) {
            return diskMerge;
        }
        finalSegments.add(new Segment(new RawKVIteratorReader(diskMerge, onDiskBytes), null));
    }
    if (LOG.isInfoEnabled()) {
        LOG.info(finalMergeLog.toString());
    }
    // This is doing nothing but creating an iterator over the segments.
    return TezMerger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
}
Also used: Path (org.apache.hadoop.fs.Path), DiskSegment (org.apache.tez.runtime.library.common.sort.impl.TezMerger.DiskSegment), FileStatus (org.apache.hadoop.fs.FileStatus), ArrayList (java.util.ArrayList), IOException (java.io.IOException), TezCounter (org.apache.tez.common.counters.TezCounter), Segment (org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment), RawComparator (org.apache.hadoop.io.RawComparator), FileChunk (org.apache.hadoop.io.FileChunk), TezRawKeyValueIterator (org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator), Writer (org.apache.tez.runtime.library.common.sort.impl.IFile.Writer)
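
To make the branch at the top of finalMerge concrete, here is a small worked example with assumed figures (tez.runtime.io.sort.factor is taken at its usual default of 100):

// Assumed figures, for illustration only.
int ioSortFactor = 100;       // tez.runtime.io.sort.factor
int onDiskOutputs = 4;        // onDiskMapOutputs.size()
int numMemDiskSegments = 20;  // in-memory segments that must be spilled

// 100 > 4, so the 20 in-memory segments collapse into one extra disk file.
// The disk segment count becomes 4 + 1 = 5, still <= 100, so a single final
// merge can feed the reduce directly and mergePhaseFinished is set to true.
boolean mergePhaseFinished = numMemDiskSegments > 0 && ioSortFactor > onDiskOutputs;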

Aggregations

RawComparator (org.apache.hadoop.io.RawComparator): 12
Path (org.apache.hadoop.fs.Path): 8
LinkedList (java.util.LinkedList): 5
IOException (java.io.IOException): 4
Test (org.junit.Test): 4
ArrayList (java.util.ArrayList): 2
FileSystem (org.apache.hadoop.fs.FileSystem): 2
Configuration (org.apache.hadoop.conf.Configuration): 1
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1
FileStatus (org.apache.hadoop.fs.FileStatus): 1
BinaryComparable (org.apache.hadoop.io.BinaryComparable): 1
FileChunk (org.apache.hadoop.io.FileChunk): 1
NullWritable (org.apache.hadoop.io.NullWritable): 1
SequenceFile (org.apache.hadoop.io.SequenceFile): 1
Writer (org.apache.hadoop.mapred.IFile.Writer): 1
JobConf (org.apache.hadoop.mapred.JobConf): 1
Segment (org.apache.hadoop.mapred.Merger.Segment): 1
RawKeyValueIterator (org.apache.hadoop.mapred.RawKeyValueIterator): 1
Reducer (org.apache.hadoop.mapred.Reducer): 1
CombineValuesIterator (org.apache.hadoop.mapred.Task.CombineValuesIterator): 1