Search in sources :

Example 1 with IFile

Use of org.apache.tez.runtime.library.common.sort.impl.IFile in the Apache Tez project.

The method mergeAll of the class UnorderedPartitionedKVWriter.

/**
 * Merges the in-memory {@code currentBuffer} and every spill listed in
 * {@code spillInfoList} into one final output file, then writes the matching index file.
 *
 * <p>For each partition that has records, a new {@link Writer} segment is appended to
 * {@code finalOutPath}: first the records still held in {@code currentBuffer}, then the
 * records of that partition from each spill file. The start offset, raw length and
 * compressed length of the segment are stored in the final {@link TezSpillRecord}.
 * Partitions with no records are skipped and get no explicit index entry here.
 *
 * <p>The output stream is closed and {@code deleteIntermediateSpills()} is invoked whether or
 * not the merge succeeds. Each spill input stream is closed even when reading or appending
 * fails, so a failed merge does not leak open file handles.
 *
 * @throws IOException if creating, reading or writing any of the involved files fails
 */
private void mergeAll() throws IOException {
    long expectedSize = spilledSize;
    if (currentBuffer.nextPosition != 0) {
        // Payload bytes in the buffer (excluding per-record metadata and skipped bytes),
        // plus an approximate header allowance per partition.
        expectedSize += currentBuffer.nextPosition - (currentBuffer.numRecords * META_SIZE) - currentBuffer.skipSize + numPartitions * APPROX_HEADER_LENGTH;
        // Update final statistics.
        updateGlobalStats(currentBuffer);
    }
    SpillPathDetails spillPathDetails = getSpillPathDetails(true, expectedSize);
    finalIndexPath = spillPathDetails.indexFilePath;
    finalOutPath = spillPathDetails.outputFilePath;
    TezSpillRecord finalSpillRecord = new TezSpillRecord(numPartitions);
    // Separate buffer pairs for the in-memory path and the spill-file path.
    DataInputBuffer keyBuffer = new DataInputBuffer();
    DataInputBuffer valBuffer = new DataInputBuffer();
    DataInputBuffer keyBufferIFile = new DataInputBuffer();
    DataInputBuffer valBufferIFile = new DataInputBuffer();
    FSDataOutputStream out = null;
    try {
        out = rfs.create(finalOutPath);
        // Only set permissions explicitly when the configured umask would alter them.
        if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
            rfs.setPermission(finalOutPath, SPILL_FILE_PERMS);
        }
        Writer writer = null;
        for (int i = 0; i < numPartitions; i++) {
            long segmentStart = out.getPos();
            if (numRecordsPerPartition[i] == 0) {
                LOG.info(destNameTrimmed + ": " + "Skipping partition: " + i + " in final merge since it has no records");
                continue;
            }
            writer = new Writer(conf, out, keyClass, valClass, codec, null, null);
            try {
                if (currentBuffer.nextPosition != 0 && currentBuffer.partitionPositions[i] != WrappedBuffer.PARTITION_ABSENT_POSITION) {
                    // Write current buffer.
                    writePartition(currentBuffer.partitionPositions[i], currentBuffer, writer, keyBuffer, valBuffer);
                }
                synchronized (spillInfoList) {
                    for (SpillInfo spillInfo : spillInfoList) {
                        TezIndexRecord indexRecord = spillInfo.spillRecord.getIndex(i);
                        if (indexRecord.getPartLength() == 0) {
                            // Skip empty partitions within a spill
                            continue;
                        }
                        FSDataInputStream in = rfs.open(spillInfo.outPath);
                        IFile.Reader reader = null;
                        try {
                            in.seek(indexRecord.getStartOffset());
                            reader = new IFile.Reader(in, indexRecord.getPartLength(), codec, null, additionalSpillBytesReadCounter, ifileReadAhead, ifileReadAheadLength, ifileBufferSize);
                            while (reader.nextRawKey(keyBufferIFile)) {
                                // TODO Inefficient. If spills are not compressed, a direct copy should be possible
                                // given the current IFile format. Also extremely inefficient for large records,
                                // since the entire record will be read into memory.
                                reader.nextRawValue(valBufferIFile);
                                writer.append(keyBufferIFile, valBufferIFile);
                            }
                        } finally {
                            // Always release the spill file handle. Close through the reader when it
                            // exists (same single close as the success path); otherwise the reader was
                            // never constructed, so close the raw stream directly.
                            if (reader != null) {
                                reader.close();
                            } else {
                                in.close();
                            }
                        }
                    }
                }
                writer.close();
                fileOutputBytesCounter.increment(writer.getCompressedLength());
                TezIndexRecord indexRecord = new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength());
                // Mark the writer as cleanly closed so the finally block does not close it again.
                writer = null;
                finalSpillRecord.putIndex(indexRecord, i);
                outputContext.notifyProgress();
            } finally {
                if (writer != null) {
                    writer.close();
                }
            }
        }
    } finally {
        if (out != null) {
            out.close();
        }
        deleteIntermediateSpills();
    }
    finalSpillRecord.writeToFile(finalIndexPath, conf);
    fileOutputBytesCounter.increment(indexFileSizeEstimate);
    LOG.info(destNameTrimmed + ": " + "Finished final spill after merging : " + numSpills.get() + " spills");
}
Also used : IFile(org.apache.tez.runtime.library.common.sort.impl.IFile) TezSpillRecord(org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord) DataInputBuffer(org.apache.hadoop.io.DataInputBuffer) TezIndexRecord(org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.tez.runtime.library.common.sort.impl.IFile.Writer)

Aggregations

FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 1, FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 1, DataInputBuffer (org.apache.hadoop.io.DataInputBuffer): 1, IFile (org.apache.tez.runtime.library.common.sort.impl.IFile): 1, Writer (org.apache.tez.runtime.library.common.sort.impl.IFile.Writer): 1, TezIndexRecord (org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord): 1, TezSpillRecord (org.apache.tez.runtime.library.common.sort.impl.TezSpillRecord): 1