Search in sources :

Example 6 with Writer

use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.

the class TestIFile method testReadToDisk.

/**
 * Exercises {@code IFile.Reader.readToDisk} with three inputs:
 * <ol>
 *   <li>a stream of zero bytes, which must be rejected with an {@link IOException};</li>
 *   <li>{@code IFile.HEADER} followed by zero bytes, which must be rejected with a
 *       {@code ChecksumException};</li>
 *   <li>a file produced by {@code writeTestFile}, whose copied bytes must read back as
 *       the generated test data.</li>
 * </ol>
 *
 * @throws IOException if writing or reading the test file fails unexpectedly
 */
@Test(timeout = 20000)
public void testReadToDisk() throws IOException {
    // verify sending a stream of zeroes generates an error
    byte[] zeroData = new byte[1000];
    Arrays.fill(zeroData, (byte) 0);
    ByteArrayInputStream in = new ByteArrayInputStream(zeroData);
    try {
        IFile.Reader.readToDisk(new ByteArrayOutputStream(), in, zeroData.length, false, 0);
        fail("Exception should have been thrown");
    } catch (IOException expected) {
        // Expected: the all-zero stream must be rejected. fail() raises an
        // AssertionError, so it is not swallowed by this handler.
    }
    // verify sending same stream of zeroes with a valid IFile header still
    // generates an error
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    baos.write(IFile.HEADER);
    baos.write(zeroData);
    try {
        IFile.Reader.readToDisk(new ByteArrayOutputStream(), new ByteArrayInputStream(baos.toByteArray()), zeroData.length, false, 0);
        fail("Exception should have been thrown");
    } catch (IOException e) {
        assertTrue(e instanceof ChecksumException);
    }
    // verify valid data is copied properly
    List<KVPair> data = KVDataGen.generateTestData(true, 0);
    Writer writer = writeTestFile(false, false, data, codec);
    baos.reset();
    // Close the file stream even when the copy fails, so the test does not leak
    // an open handle on outputPath.
    java.io.InputStream fileIn = localFs.open(outputPath);
    try {
        IFile.Reader.readToDisk(baos, fileIn, writer.getCompressedLength(), false, 0);
    } finally {
        fileIn.close();
    }
    byte[] diskData = baos.toByteArray();
    Reader reader = new Reader(new ByteArrayInputStream(diskData), diskData.length, codec, null, null, false, 0, 1024);
    try {
        verifyData(reader, data);
    } finally {
        // Release the reader even when verification throws.
        reader.close();
    }
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) ChecksumException(org.apache.hadoop.fs.ChecksumException) KVPair(org.apache.tez.runtime.library.testutils.KVDataGen.KVPair) Reader(org.apache.tez.runtime.library.common.sort.impl.IFile.Reader) InMemoryReader(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryReader) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BoundedByteArrayOutputStream(org.apache.hadoop.io.BoundedByteArrayOutputStream) IOException(java.io.IOException) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) Test(org.junit.Test)

Example 7 with Writer

use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.

the class TestIFile method testAppendValueWithDataInputBuffer.

/**
 * Tests {@code IFile.Writer.appendValue(DataInputBuffer)}.
 *
 * <p>Each generated pair is serialized into {@code key}/{@code value} buffers. When the
 * key compares equal to the key of the previous pair, only the value is appended via
 * {@code appendValue}; otherwise the full key/value pair is appended. The written file
 * is then read back and checked against the generated data.
 *
 * @throws IOException if writing or reading the test file fails
 */
@Test(timeout = 5000)
public void testAppendValueWithDataInputBuffer() throws IOException {
    List<KVPair> data = KVDataGen.generateTestData(false, rnd.nextInt(100));
    IFile.Writer writer = new IFile.Writer(defaultConf, localFs, outputPath, Text.class, IntWritable.class, codec, null, null);
    // previousKey starts empty, so the first pair is expected to take the append(key, value) path.
    final DataInputBuffer previousKey = new DataInputBuffer();
    DataInputBuffer key = new DataInputBuffer();
    DataInputBuffer value = new DataInputBuffer();
    for (KVPair kvp : data) {
        populateData(kvp, key, value);
        if (BufferUtils.compare(key, previousKey) == 0) {
            writer.appendValue(value);
        } else {
            writer.append(key, value);
        }
        // Track the key that was just handled so the next iteration can detect a repeat.
        // NOTE(review): this previously read from `k`, which is not declared in this
        // method; `key` is the buffer populated above — confirm no field `k` was intended.
        previousKey.reset(key.getData(), 0, key.getLength());
    }
    writer.close();
    readAndVerifyData(writer.getRawLength(), writer.getCompressedLength(), data, codec);
}
Also used : DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) KVPair(org.apache.tez.runtime.library.testutils.KVDataGen.KVPair) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) InMemoryWriter(org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter) Test(org.junit.Test)

Example 8 with Writer

use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.

the class TestMRCombiner method testRunNewCombiner.

/**
 * Runs {@code MRCombiner} with {@code NewReducer} registered under
 * {@code MRJobConfig.COMBINE_CLASS_ATTR} and the "mapred.mapper.new-api" flag set,
 * then checks the combine input/output record counters and the records handed
 * to the (mocked) writer.
 */
@Test
public void testRunNewCombiner() throws IOException, InterruptedException {
    final TezConfiguration tezConf = new TezConfiguration();
    setKeyAndValueClassTypes(tezConf);
    tezConf.setBoolean("mapred.mapper.new-api", true);
    tezConf.setClass(MRJobConfig.COMBINE_CLASS_ATTR, NewReducer.class, Object.class);

    final TaskContext context = getTaskContext(tezConf);
    final MRCombiner mrCombiner = new MRCombiner(context);
    final Writer mockWriter = Mockito.mock(Writer.class);
    mrCombiner.combine(new TezRawKeyValueIteratorTest(), mockWriter);

    // Read both counters before asserting on either of them.
    final long combineIn = context.getCounters().findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
    final long combineOut = context.getCounters().findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
    assertEquals(6, combineIn);
    assertEquals(3, combineOut);

    // The mock records what the combiner emitted; check keys and values.
    verifyKeyAndValues(mockWriter);
}
Also used : TaskContext(org.apache.tez.runtime.api.TaskContext) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) Test(org.junit.Test)

Example 9 with Writer

use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.

the class TestMRCombiner method testRunOldCombiner.

/**
 * Runs {@code MRCombiner} with {@code OldReducer} registered under
 * "mapred.combiner.class", then checks the combine input/output record counters
 * and the records handed to the (mocked) writer.
 */
@Test
public void testRunOldCombiner() throws IOException, InterruptedException {
    final TezConfiguration tezConf = new TezConfiguration();
    setKeyAndValueClassTypes(tezConf);
    tezConf.setClass("mapred.combiner.class", OldReducer.class, Object.class);

    final TaskContext context = getTaskContext(tezConf);
    final MRCombiner mrCombiner = new MRCombiner(context);
    final Writer mockWriter = Mockito.mock(Writer.class);
    mrCombiner.combine(new TezRawKeyValueIteratorTest(), mockWriter);

    // Read both counters before asserting on either of them.
    final long combineIn = context.getCounters().findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
    final long combineOut = context.getCounters().findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
    assertEquals(6, combineIn);
    assertEquals(3, combineOut);

    // The mock records what the combiner emitted; check keys and values.
    verifyKeyAndValues(mockWriter);
}
Also used : TaskContext(org.apache.tez.runtime.api.TaskContext) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) TezConfiguration(org.apache.tez.dag.api.TezConfiguration) Test(org.junit.Test)

Example 10 with Writer

use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.

the class DefaultSorter method mergeParts.

/**
 * Produces the final output and index files from the spill files written so far.
 *
 * <p>The method takes one of four paths:
 * <ul>
 *   <li><b>Exactly one spill:</b> no merge is performed. With final merge enabled the
 *       spill file (and its index) is renamed or written to the final output location;
 *       otherwise the cached spill record is written to a spill index file and an event
 *       is sent for that spill. Partition stats are updated if enabled.</li>
 *   <li><b>One or more spills, final merge disabled:</b> missing index records are loaded,
 *       events are added for the spills and the method returns without merging.</li>
 *   <li><b>Zero spills:</b> a dummy output file and index are written, one record per
 *       partition.</li>
 *   <li><b>Multiple spills, final merge enabled:</b> for every partition the matching
 *       segments of all spills are merged (optionally through the combiner) into the
 *       final output file, the index is written and the spill files are deleted.</li>
 * </ul>
 *
 * @throws IOException if a file system operation, merge or index write fails
 * @throws InterruptedException declared by this method; presumably raised by the
 *         merge/combine helpers — confirm against their signatures
 */
private void mergeParts() throws IOException, InterruptedException {
    // get the approximate size of the final output/index files
    long finalOutFileSize = 0;
    long finalIndexFileSize = 0;
    final Path[] filename = new Path[numSpills];
    final String taskIdentifier = outputContext.getUniqueIdentifier();
    for (int i = 0; i < numSpills; i++) {
        filename[i] = spillFilePaths.get(i);
        finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
    }
    if (numSpills == 1) {
        // the spill is the final output
        TezSpillRecord spillRecord = null;
        if (isFinalMergeEnabled()) {
            finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename[0]);
            finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]);
            sameVolRename(filename[0], finalOutputFile);
            if (indexCacheList.size() == 0) {
                // Index is not cached: move the on-disk index file and load it from its new location.
                sameVolRename(spillFileIndexPaths.get(0), finalIndexFile);
                spillRecord = new TezSpillRecord(finalIndexFile, conf);
            } else {
                // Index is cached: write it out directly as the final index.
                spillRecord = indexCacheList.get(0);
                spillRecord.writeToFile(finalIndexFile, conf);
            }
        } else {
            List<Event> events = Lists.newLinkedList();
            // Since there is only one spill, spill record would be present in cache.
            spillRecord = indexCacheList.get(0);
            Path indexPath = mapOutputFile.getSpillIndexFileForWrite(numSpills - 1, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
            spillRecord.writeToFile(indexPath, conf);
            maybeSendEventForSpill(events, true, spillRecord, 0, true);
            fileOutputByteCounter.increment(rfs.getFileStatus(spillFilePaths.get(0)).getLen());
            // No need to populate finalIndexFile, finalOutputFile etc when finalMerge is disabled
        }
        if (spillRecord != null && reportPartitionStats()) {
            for (int i = 0; i < spillRecord.size(); i++) {
                partitionStats[i] += spillRecord.getIndex(i).getPartLength();
            }
        }
        numShuffleChunks.setValue(numSpills);
        return;
    }
    // read in paged indices
    for (int i = indexCacheList.size(); i < numSpills; ++i) {
        Path indexFileName = spillFileIndexPaths.get(i);
        indexCacheList.add(new TezSpillRecord(indexFileName, conf));
    }
    // Check if it is needed to do final merge. Or else, exit early.
    if (numSpills > 0 && !isFinalMergeEnabled()) {
        maybeAddEventsForSpills();
        // No need to do final merge.
        return;
    }
    // make correction in the length to include the sequence file header
    // lengths for each partition
    finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
    finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
    if (isFinalMergeEnabled()) {
        finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
        finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);
    } else if (numSpills == 0) {
        // e.g attempt_1424502260528_0119_1_07_000058_0_10012_0/file.out when final merge is
        // disabled
        finalOutputFile = mapOutputFile.getSpillFileForWrite(numSpills, finalOutFileSize);
        finalIndexFile = mapOutputFile.getSpillIndexFileForWrite(numSpills, finalIndexFileSize);
    }
    // The output stream for the final single output file
    FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
    if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
        // The umask would alter the intended permissions, so set them explicitly.
        rfs.setPermission(finalOutputFile, SPILL_FILE_PERMS);
    }
    if (numSpills == 0) {
        // TODO Change event generation to say there is no data rather than generating a dummy file
        // create dummy files
        long rawLength = 0;
        long partLength = 0;
        TezSpillRecord sr = new TezSpillRecord(partitions);
        try {
            for (int i = 0; i < partitions; i++) {
                long segmentStart = finalOut.getPos();
                if (!sendEmptyPartitionDetails) {
                    // Open and immediately close a writer to emit an empty segment for this partition.
                    Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null);
                    writer.close();
                    rawLength = writer.getRawLength();
                    partLength = writer.getCompressedLength();
                }
                TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
                // Covers the case of multiple spills.
                outputBytesWithOverheadCounter.increment(rawLength);
                sr.putIndex(rec, i);
            }
            sr.writeToFile(finalIndexFile, conf);
        } finally {
            finalOut.close();
        }
        ++numSpills;
        if (!isFinalMergeEnabled()) {
            List<Event> events = Lists.newLinkedList();
            maybeSendEventForSpill(events, true, sr, 0, true);
            fileOutputByteCounter.increment(rfs.getFileStatus(finalOutputFile).getLen());
        }
        numShuffleChunks.setValue(numSpills);
        return;
    } else {
        final TezSpillRecord spillRec = new TezSpillRecord(partitions);
        // Close finalOut even when merging, combining or writing the index throws,
        // mirroring the try/finally used by the zero-spill branch above.
        try {
            for (int parts = 0; parts < partitions; parts++) {
                boolean shouldWrite = false;
                // create the segments to be merged
                List<Segment> segmentList = new ArrayList<Segment>(numSpills);
                for (int i = 0; i < numSpills; i++) {
                    outputContext.notifyProgress();
                    TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
                    if (indexRecord.hasData() || !sendEmptyPartitionDetails) {
                        shouldWrite = true;
                        DiskSegment s = new DiskSegment(rfs, filename[i], indexRecord.getStartOffset(), indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, true);
                        segmentList.add(s);
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(outputContext.getDestinationVertexName() + ": " + "TaskIdentifier=" + taskIdentifier + " Partition=" + parts + "Spill =" + i + "(" + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", " + indexRecord.getPartLength() + ")");
                    }
                }
                int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
                // sort the segments only if there are intermediate merges
                boolean sortSegments = segmentList.size() > mergeFactor;
                // merge
                TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList, mergeFactor, new Path(taskIdentifier), (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), progressable, sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead,
                    // Not using any Progress in TezMerger. Should just work.
                    null);
                // write merged output to disk
                long segmentStart = finalOut.getPos();
                long rawLength = 0;
                long partLength = 0;
                if (shouldWrite) {
                    Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null);
                    if (combiner == null || numSpills < minSpillsForCombine) {
                        TezMerger.writeFile(kvIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
                    } else {
                        runCombineProcessor(kvIter, writer);
                    }
                    writer.close();
                    rawLength = writer.getRawLength();
                    partLength = writer.getCompressedLength();
                }
                outputBytesWithOverheadCounter.increment(rawLength);
                // record offsets
                final TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
                spillRec.putIndex(rec, parts);
                if (reportPartitionStats()) {
                    partitionStats[parts] += partLength;
                }
            }
            // final merge has happened
            numShuffleChunks.setValue(1);
            spillRec.writeToFile(finalIndexFile, conf);
        } finally {
            finalOut.close();
        }
        // Spill files are removed only after the merged output and index were written successfully.
        for (int i = 0; i < numSpills; i++) {
            rfs.delete(filename[i], true);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DiskSegment(org.apache.tez.runtime.library.common.sort.impl.TezMerger.DiskSegment) ArrayList(java.util.ArrayList) DiskSegment(org.apache.tez.runtime.library.common.sort.impl.TezMerger.DiskSegment) Segment(org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) TezIndexRecord(org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord) Event(org.apache.tez.runtime.api.Event) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer) TezRawKeyValueIterator(org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator)

Aggregations

Writer (org.apache.tez.runtime.library.common.sort.impl.IFile.Writer)25 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)12 Test (org.junit.Test)12 InMemoryWriter (org.apache.tez.runtime.library.common.shuffle.orderedgrouped.InMemoryWriter)11 Path (org.apache.hadoop.fs.Path)8 DataInputBuffer (org.apache.hadoop.io.DataInputBuffer)6 ArrayList (java.util.ArrayList)5 Text (org.apache.hadoop.io.Text)5 TezIndexRecord (org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord)5 TezSpillRecord (org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord)5 KVPair (org.apache.tez.runtime.library.testutils.KVDataGen.KVPair)5 TezConfiguration (org.apache.tez.dag.api.TezConfiguration)4 TaskContext (org.apache.tez.runtime.api.TaskContext)4 IFile (org.apache.tez.runtime.library.common.sort.impl.IFile)4 IOException (java.io.IOException)3 IntWritable (org.apache.hadoop.io.IntWritable)3 Reader (org.apache.tez.runtime.library.common.sort.impl.IFile.Reader)3 DiskSegment (org.apache.tez.runtime.library.common.sort.impl.TezMerger.DiskSegment)3 Segment (org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment)3 TezRawKeyValueIterator (org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator)3