Use of org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment in project tez by apache.
The class DefaultSorter, method mergeParts.
private void mergeParts() throws IOException, InterruptedException {
// get the approximate size of the final output/index files
long finalOutFileSize = 0;
long finalIndexFileSize = 0;
final Path[] filename = new Path[numSpills];
final String taskIdentifier = outputContext.getUniqueIdentifier();
for (int i = 0; i < numSpills; i++) {
filename[i] = spillFilePaths.get(i);
finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
}
if (numSpills == 1) {
// the spill is the final output
TezSpillRecord spillRecord = null;
if (isFinalMergeEnabled()) {
finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename[0]);
finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]);
sameVolRename(filename[0], finalOutputFile);
if (indexCacheList.size() == 0) {
sameVolRename(spillFileIndexPaths.get(0), finalIndexFile);
spillRecord = new TezSpillRecord(finalIndexFile, conf);
} else {
spillRecord = indexCacheList.get(0);
spillRecord.writeToFile(finalIndexFile, conf);
}
} else {
List<Event> events = Lists.newLinkedList();
// Since there is only one spill, the spill record will be present in the cache.
spillRecord = indexCacheList.get(0);
Path indexPath = mapOutputFile.getSpillIndexFileForWrite(numSpills - 1, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillRecord.writeToFile(indexPath, conf);
maybeSendEventForSpill(events, true, spillRecord, 0, true);
fileOutputByteCounter.increment(rfs.getFileStatus(spillFilePaths.get(0)).getLen());
// No need to populate finalIndexFile, finalOutputFile etc when finalMerge is disabled
}
if (spillRecord != null && reportPartitionStats()) {
for (int i = 0; i < spillRecord.size(); i++) {
partitionStats[i] += spillRecord.getIndex(i).getPartLength();
}
}
numShuffleChunks.setValue(numSpills);
return;
}
// read in paged indices
for (int i = indexCacheList.size(); i < numSpills; ++i) {
Path indexFileName = spillFileIndexPaths.get(i);
indexCacheList.add(new TezSpillRecord(indexFileName, conf));
}
// Check whether a final merge is needed; otherwise exit early.
if (numSpills > 0 && !isFinalMergeEnabled()) {
maybeAddEventsForSpills();
// No need to do final merge.
return;
}
// make correction in the length to include the sequence file header
// lengths for each partition
finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;
if (isFinalMergeEnabled()) {
finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);
} else if (numSpills == 0) {
// e.g. attempt_1424502260528_0119_1_07_000058_0_10012_0/file.out when final merge is disabled
finalOutputFile = mapOutputFile.getSpillFileForWrite(numSpills, finalOutFileSize);
finalIndexFile = mapOutputFile.getSpillIndexFileForWrite(numSpills, finalIndexFileSize);
}
// The output stream for the final single output file
FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
rfs.setPermission(finalOutputFile, SPILL_FILE_PERMS);
}
if (numSpills == 0) {
// TODO Change event generation to say there is no data rather than generating a dummy file
// create dummy files
long rawLength = 0;
long partLength = 0;
TezSpillRecord sr = new TezSpillRecord(partitions);
try {
for (int i = 0; i < partitions; i++) {
long segmentStart = finalOut.getPos();
if (!sendEmptyPartitionDetails) {
Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null);
writer.close();
rawLength = writer.getRawLength();
partLength = writer.getCompressedLength();
}
TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
// Covers the case of multiple spills.
outputBytesWithOverheadCounter.increment(rawLength);
sr.putIndex(rec, i);
}
sr.writeToFile(finalIndexFile, conf);
} finally {
finalOut.close();
}
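// Account for the dummy output as a spill so the event and numShuffleChunks bookkeeping below stays consistent.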
++numSpills;
if (!isFinalMergeEnabled()) {
List<Event> events = Lists.newLinkedList();
maybeSendEventForSpill(events, true, sr, 0, true);
fileOutputByteCounter.increment(rfs.getFileStatus(finalOutputFile).getLen());
}
numShuffleChunks.setValue(numSpills);
return;
} else {
final TezSpillRecord spillRec = new TezSpillRecord(partitions);
for (int parts = 0; parts < partitions; parts++) {
boolean shouldWrite = false;
// create the segments to be merged
List<Segment> segmentList = new ArrayList<Segment>(numSpills);
for (int i = 0; i < numSpills; i++) {
outputContext.notifyProgress();
TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
if (indexRecord.hasData() || !sendEmptyPartitionDetails) {
shouldWrite = true;
DiskSegment s = new DiskSegment(rfs, filename[i], indexRecord.getStartOffset(), indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, true);
segmentList.add(s);
}
if (LOG.isDebugEnabled()) {
LOG.debug(outputContext.getDestinationVertexName() + ": " + "TaskIdentifier=" + taskIdentifier + " Partition=" + parts + " Spill=" + i + " (" + indexRecord.getStartOffset() + ", " + indexRecord.getRawLength() + ", " + indexRecord.getPartLength() + ")");
}
}
int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
// sort the segments only if there are intermediate merges
boolean sortSegments = segmentList.size() > mergeFactor;
// merge
// Not using any Progress in TezMerger. Should just work.
TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList, mergeFactor, new Path(taskIdentifier), (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), progressable, sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null);
// write merged output to disk
long segmentStart = finalOut.getPos();
long rawLength = 0;
long partLength = 0;
if (shouldWrite) {
Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null);
if (combiner == null || numSpills < minSpillsForCombine) {
TezMerger.writeFile(kvIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
} else {
runCombineProcessor(kvIter, writer);
}
writer.close();
rawLength = writer.getRawLength();
partLength = writer.getCompressedLength();
}
outputBytesWithOverheadCounter.increment(rawLength);
// record offsets
final TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
spillRec.putIndex(rec, parts);
if (reportPartitionStats()) {
partitionStats[parts] += partLength;
}
}
// final merge has happened
numShuffleChunks.setValue(1);
spillRec.writeToFile(finalIndexFile, conf);
finalOut.close();
for (int i = 0; i < numSpills; i++) {
rfs.delete(filename[i], true);
}
}
}
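The core of mergeParts is a per-partition k-way merge: for each partition it builds one sorted DiskSegment per spill, lets TezMerger.merge produce a single sorted stream, writes that stream to the final output file, and records the resulting offsets in the TezSpillRecord. Below is a minimal, JDK-only sketch of that k-way merge pattern; MergeSketch, SpillSegment and PeekingIter are hypothetical stand-ins for illustration, not Tez classes.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;

// Illustrative sketch of merging several already-sorted runs, as mergeParts does per partition.
class MergeSketch {

  // One sorted run, standing in for a spill's slice of a single partition.
  static final class SpillSegment {
    final List<String> sortedKeys;
    SpillSegment(List<String> sortedKeys) { this.sortedKeys = sortedKeys; }
  }

  // Iterator wrapper that exposes the next key so a priority queue can order the runs.
  static final class PeekingIter {
    final Iterator<String> it;
    String peek;
    PeekingIter(Iterator<String> it) { this.it = it; this.peek = it.next(); }
    boolean advance() { if (it.hasNext()) { peek = it.next(); return true; } return false; }
  }

  // k-way merge of sorted segments into one sorted output list.
  static List<String> merge(List<SpillSegment> segments) {
    PriorityQueue<PeekingIter> pq = new PriorityQueue<>((a, b) -> a.peek.compareTo(b.peek));
    for (SpillSegment s : segments) {
      Iterator<String> it = s.sortedKeys.iterator();
      if (it.hasNext()) {
        pq.add(new PeekingIter(it));
      }
    }
    List<String> out = new ArrayList<>();
    while (!pq.isEmpty()) {
      PeekingIter top = pq.poll();
      out.add(top.peek);
      // Re-insert the run if it still has data, keeping the queue ordered by next key.
      if (top.advance()) {
        pq.add(top);
      }
    }
    return out;
  }
}

Note that in the code above DefaultSorter only asks TezMerger to sort the segment list when its size exceeds mergeFactor, i.e. when intermediate merge passes will be required.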
Use of org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment in project tez by apache.
The class MergeManager, method finalMerge.
private TezRawKeyValueIterator finalMerge(Configuration job, FileSystem fs, List<MapOutput> inMemoryMapOutputs, List<FileChunk> onDiskMapOutputs) throws IOException, InterruptedException {
logFinalMergeStart(inMemoryMapOutputs, onDiskMapOutputs);
StringBuilder finalMergeLog = new StringBuilder();
inputContext.notifyProgress();
// merge config params
Class keyClass = (Class) ConfigUtils.getIntermediateInputKeyClass(job);
Class valueClass = (Class) ConfigUtils.getIntermediateInputValueClass(job);
final Path tmpDir = new Path(inputContext.getUniqueIdentifier());
final RawComparator comparator = (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(job);
// segments required to vacate memory
List<Segment> memDiskSegments = new ArrayList<Segment>();
long inMemToDiskBytes = 0;
boolean mergePhaseFinished = false;
if (inMemoryMapOutputs.size() > 0) {
int srcTaskId = inMemoryMapOutputs.get(0).getAttemptIdentifier().getInputIdentifier();
inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, this.postMergeMemLimit);
final int numMemDiskSegments = memDiskSegments.size();
if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
// If we reach here, it implies that we have less than io.sort.factor
// disk segments and this will be incremented by 1 (result of the
// memory segments merge). Since this total would still be
// <= io.sort.factor, we will not do any more intermediate merges,
// the merge of all these disk segments would be directly fed to the
// reduce method
mergePhaseFinished = true;
// must spill to disk, but can't retain in-mem for intermediate merge
// Cannot use the spill id in the final merge as it would clobber other files, hence using
// Integer.MAX_VALUE
final Path outputPath = mapOutputFile.getInputFileForWrite(srcTaskId, Integer.MAX_VALUE, inMemToDiskBytes).suffix(Constants.MERGED_OUTPUT_PREFIX);
final TezRawKeyValueIterator rIter = TezMerger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec, null, null);
try {
TezMerger.writeFile(rIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
} catch (IOException e) {
if (null != outputPath) {
try {
fs.delete(outputPath, true);
} catch (IOException ie) {
// NOTHING
}
}
throw e;
} finally {
if (null != writer) {
writer.close();
additionalBytesWritten.increment(writer.getCompressedLength());
}
}
final FileStatus fStatus = localFS.getFileStatus(outputPath);
// add to list of final disk outputs.
onDiskMapOutputs.add(new FileChunk(outputPath, 0, fStatus.getLen()));
if (LOG.isInfoEnabled()) {
finalMergeLog.append("MemMerged: " + numMemDiskSegments + ", " + inMemToDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merged " + numMemDiskSegments + "segments, size=" + inMemToDiskBytes + " to " + outputPath);
}
}
inMemToDiskBytes = 0;
memDiskSegments.clear();
} else if (inMemToDiskBytes != 0) {
if (LOG.isInfoEnabled()) {
finalMergeLog.append("DelayedMemMerge: " + numMemDiskSegments + ", " + inMemToDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for " + "intermediate, on-disk merge");
}
}
}
}
// segments on disk
List<Segment> diskSegments = new ArrayList<Segment>();
long onDiskBytes = inMemToDiskBytes;
FileChunk[] onDisk = onDiskMapOutputs.toArray(new FileChunk[onDiskMapOutputs.size()]);
for (FileChunk fileChunk : onDisk) {
final long fileLength = fileChunk.getLength();
onDiskBytes += fileLength;
if (LOG.isDebugEnabled()) {
LOG.debug("Disk file=" + fileChunk.getPath() + ", len=" + fileLength + ", isLocal=" + fileChunk.isLocalFile());
}
final Path file = fileChunk.getPath();
TezCounter counter = file.toString().endsWith(Constants.MERGED_OUTPUT_PREFIX) ? null : mergedMapOutputsCounter;
final long fileOffset = fileChunk.getOffset();
final boolean preserve = fileChunk.isLocalFile();
diskSegments.add(new DiskSegment(fs, file, fileOffset, fileLength, codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, preserve, counter));
}
if (LOG.isInfoEnabled()) {
finalMergeLog.append(". DiskSeg: " + onDisk.length + ", " + onDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
}
}
Collections.sort(diskSegments, new Comparator<Segment>() {
public int compare(Segment o1, Segment o2) {
if (o1.getLength() == o2.getLength()) {
return 0;
}
return o1.getLength() < o2.getLength() ? -1 : 1;
}
});
// build final list of segments from merged backed by disk + in-mem
List<Segment> finalSegments = new ArrayList<Segment>();
long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
if (LOG.isInfoEnabled()) {
finalMergeLog.append(". MemSeg: " + finalSegments.size() + ", " + inMemBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
}
}
if (0 != onDiskBytes) {
final int numInMemSegments = memDiskSegments.size();
diskSegments.addAll(0, memDiskSegments);
memDiskSegments.clear();
TezRawKeyValueIterator diskMerge = TezMerger.merge(job, fs, keyClass, valueClass, codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, progressable, false, spilledRecordsCounter, null, additionalBytesRead, null);
diskSegments.clear();
if (0 == finalSegments.size()) {
return diskMerge;
}
finalSegments.add(new Segment(new RawKVIteratorReader(diskMerge, onDiskBytes), null));
}
if (LOG.isInfoEnabled()) {
LOG.info(finalMergeLog.toString());
}
// This is doing nothing but creating an iterator over the segments.
return TezMerger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
}
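Before merging, finalMerge orders the on-disk segments smallest-first, presumably so that any intermediate merge passes fold the cheaper, smaller runs together while the largest runs feed directly into the last pass. The explicit three-way Comparator above can be written more compactly; a sketch assuming only a getLength() accessor (SizedSegment is a hypothetical stand-in for TezMerger.Segment, not the Tez API):

import java.util.Comparator;
import java.util.List;

class SegmentSortSketch {

  // Hypothetical stand-in exposing the only property the ordering needs.
  interface SizedSegment {
    long getLength();
  }

  // Equivalent to the anonymous comparator: order segments by length, smallest first.
  static void sortBySize(List<? extends SizedSegment> segments) {
    segments.sort(Comparator.comparingLong(SizedSegment::getLength));
  }
}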
Use of org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment in project tez by apache.
The class MergeManager, method createInMemorySegments.
private long createInMemorySegments(List<MapOutput> inMemoryMapOutputs, List<Segment> inMemorySegments, long leaveBytes) throws IOException {
long totalSize = 0L;
// fullSize could come from the RamManager, but files can be
// closed but not yet present in inMemoryMapOutputs
long fullSize = 0L;
for (MapOutput mo : inMemoryMapOutputs) {
fullSize += mo.getSize();
}
int inMemoryMapOutputsOffset = 0;
while ((fullSize > leaveBytes) && !Thread.currentThread().isInterrupted()) {
MapOutput mo = inMemoryMapOutputs.get(inMemoryMapOutputsOffset++);
byte[] data = mo.getMemory();
long size = data.length;
totalSize += size;
fullSize -= size;
IFile.Reader reader = new InMemoryReader(MergeManager.this, mo.getAttemptIdentifier(), data, 0, (int) size);
inMemorySegments.add(new Segment(reader, (mo.isPrimaryMapOutput() ? mergedMapOutputsCounter : null)));
}
// Bulk-remove the consumed in-memory map outputs efficiently
inMemoryMapOutputs.subList(0, inMemoryMapOutputsOffset).clear();
return totalSize;
}
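createInMemorySegments drains map outputs from the front of the list, wrapping each byte[] in an InMemoryReader-backed Segment, until at most leaveBytes of data remains resident, and then removes everything it consumed with a single subList(0, offset).clear(). A minimal JDK-only sketch of that drain-until-threshold pattern (the names are illustrative, not Tez API):

import java.util.List;

class DrainSketch {

  // Move buffers from inMemory to drained until at most leaveBytes remain; returns bytes moved.
  static long drain(List<byte[]> inMemory, List<byte[]> drained, long leaveBytes) {
    long fullSize = 0L;
    for (byte[] b : inMemory) {
      fullSize += b.length;
    }
    long totalMoved = 0L;
    int offset = 0;
    // Keep consuming from the front while more than leaveBytes would otherwise stay resident.
    while (fullSize > leaveBytes && offset < inMemory.size()) {
      byte[] b = inMemory.get(offset++);
      drained.add(b);
      totalMoved += b.length;
      fullSize -= b.length;
    }
    // Single bulk removal of everything consumed, mirroring subList(0, offset).clear() above.
    inMemory.subList(0, offset).clear();
    return totalMoved;
  }
}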
Use of org.apache.tez.runtime.library.common.sort.impl.TezMerger.Segment in project tez by apache.
The class PipelinedSorter, method flush.
@Override
public void flush() throws IOException {
final String uniqueIdentifier = outputContext.getUniqueIdentifier();
outputContext.notifyProgress();
/**
* Possible that the thread got interrupted when flush was happening or when the flush was
* never invoked. As a part of cleanup activity in TezTaskRunner, it would invoke close()
* on all I/O. At that point, it is safe to clean up.
*/
if (isThreadInterrupted()) {
return;
}
try {
LOG.info(outputContext.getDestinationVertexName() + ": Starting flush of map output");
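// Close out the current sort span and hand its sorted output to the merger before forcing the final spill.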
span.end();
merger.add(span.sort(sorter));
// force a spill in flush()
// case 1: we want to force because of following scenarios:
// we have no keys written, and flush got called
// we want at least one spill (be it empty)
// case 2: in pipeline shuffle case, we have no way of
// knowing the last key being written until flush is called
// so for flush()->spill() we want to force spill so that
// we can send pipeline shuffle event with last event true.
spill(false);
sortmaster.shutdown();
// safe to clean up
buffers.clear();
if (indexCacheList.isEmpty()) {
/*
* If we do not have this check, and if the task gets killed in the middle, it can throw
* an NPE, which is a distraction when debugging.
*/
if (LOG.isDebugEnabled()) {
LOG.debug(outputContext.getDestinationVertexName() + ": Index list is empty... returning");
}
return;
}
if (!isFinalMergeEnabled()) {
// Generate events for all spills
List<Event> events = Lists.newLinkedList();
// For pipelined shuffle, previous events are already sent. Just generate the last event alone
int startIndex = (pipelinedShuffle) ? (numSpills - 1) : 0;
int endIndex = numSpills;
for (int i = startIndex; i < endIndex; i++) {
boolean isLastEvent = (i == numSpills - 1);
String pathComponent = (outputContext.getUniqueIdentifier() + "_" + i);
ShuffleUtils.generateEventOnSpill(events, isFinalMergeEnabled(), isLastEvent, outputContext, i, indexCacheList.get(i), partitions, sendEmptyPartitionDetails, pathComponent, partitionStats, reportDetailedPartitionStats(), auxiliaryService, deflater);
LOG.info(outputContext.getDestinationVertexName() + ": Adding spill event for spill (final update=" + isLastEvent + "), spillId=" + i);
}
outputContext.sendEvents(events);
return;
}
numAdditionalSpills.increment(numSpills - 1);
// In case final merge is required, the following code path is executed.
if (numSpills == 1) {
// Someday we may be able to pass this directly to shuffle
// without writing to disk
final Path filename = spillFilePaths.get(0);
final Path indexFilename = spillFileIndexPaths.get(0);
finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename);
finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(indexFilename);
sameVolRename(filename, finalOutputFile);
sameVolRename(indexFilename, finalIndexFile);
if (LOG.isDebugEnabled()) {
LOG.debug(outputContext.getDestinationVertexName() + ": numSpills=" + numSpills + ", finalOutputFile=" + finalOutputFile + ", " + "finalIndexFile=" + finalIndexFile + ", filename=" + filename + ", indexFilename=" + indexFilename);
}
TezSpillRecord spillRecord = new TezSpillRecord(finalIndexFile, conf);
if (reportPartitionStats()) {
for (int i = 0; i < spillRecord.size(); i++) {
partitionStats[i] += spillRecord.getIndex(i).getPartLength();
}
}
numShuffleChunks.setValue(numSpills);
fileOutputByteCounter.increment(rfs.getFileStatus(finalOutputFile).getLen());
// ??? why are events not being sent here?
return;
}
// TODO
finalOutputFile = mapOutputFile.getOutputFileForWrite(0);
// TODO
finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(0);
if (LOG.isDebugEnabled()) {
LOG.debug(outputContext.getDestinationVertexName() + ": " + "numSpills: " + numSpills + ", finalOutputFile:" + finalOutputFile + ", finalIndexFile:" + finalIndexFile);
}
// The output stream for the final single output file
FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
rfs.setPermission(finalOutputFile, SPILL_FILE_PERMS);
}
final TezSpillRecord spillRec = new TezSpillRecord(partitions);
for (int parts = 0; parts < partitions; parts++) {
boolean shouldWrite = false;
// create the segments to be merged
List<Segment> segmentList = new ArrayList<Segment>(numSpills);
for (int i = 0; i < numSpills; i++) {
Path spillFilename = spillFilePaths.get(i);
TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
if (indexRecord.hasData() || !sendEmptyPartitionDetails) {
shouldWrite = true;
DiskSegment s = new DiskSegment(rfs, spillFilename, indexRecord.getStartOffset(), indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, true);
segmentList.add(s);
}
}
int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
// sort the segments only if there are intermediate merges
boolean sortSegments = segmentList.size() > mergeFactor;
// merge
// Not using any Progress in TezMerger. Should just work.
TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList, mergeFactor, new Path(uniqueIdentifier), (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), progressable, sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null, merger.needsRLE());
// write merged output to disk
long segmentStart = finalOut.getPos();
long rawLength = 0;
long partLength = 0;
if (shouldWrite) {
Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null, merger.needsRLE());
if (combiner == null || numSpills < minSpillsForCombine) {
TezMerger.writeFile(kvIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
} else {
runCombineProcessor(kvIter, writer);
}
// close
writer.close();
rawLength = writer.getRawLength();
partLength = writer.getCompressedLength();
}
outputBytesWithOverheadCounter.increment(rawLength);
// record offsets
final TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
spillRec.putIndex(rec, parts);
if (reportPartitionStats()) {
partitionStats[parts] += partLength;
}
}
// final merge has happened.
numShuffleChunks.setValue(1);
fileOutputByteCounter.increment(rfs.getFileStatus(finalOutputFile).getLen());
spillRec.writeToFile(finalIndexFile, conf);
finalOut.close();
for (int i = 0; i < numSpills; i++) {
Path indexFilename = spillFileIndexPaths.get(i);
Path spillFilename = spillFilePaths.get(i);
rfs.delete(indexFilename, true);
rfs.delete(spillFilename, true);
}
spillFileIndexPaths.clear();
spillFilePaths.clear();
} catch (InterruptedException ie) {
if (cleanup) {
cleanup();
}
Thread.currentThread().interrupt();
throw new IOInterruptedException("Interrupted while closing Output", ie);
}
}
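Both sorters finish the same way: they walk the partitions in order, note the output stream position before each partition's merged data is written, and store the (startOffset, rawLength, partLength) triple in a TezSpillRecord that becomes the index file the shuffle reads. A small self-contained sketch of that bookkeeping, with IndexEntry standing in for TezIndexRecord and a ByteArrayOutputStream standing in for the FSDataOutputStream (both hypothetical simplifications, not Tez API):

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

class IndexSketch {

  // Hypothetical stand-in for TezIndexRecord: where a partition starts and how long it is.
  static final class IndexEntry {
    final long startOffset, rawLength, partLength;
    IndexEntry(long startOffset, long rawLength, long partLength) {
      this.startOffset = startOffset;
      this.rawLength = rawLength;
      this.partLength = partLength;
    }
  }

  // Write each partition's merged bytes and record its offsets, like the per-partition loop above.
  static List<IndexEntry> writePartitions(List<byte[]> mergedPartitionData) throws IOException {
    ByteArrayOutputStream finalOut = new ByteArrayOutputStream(); // stand-in for the final FSDataOutputStream
    List<IndexEntry> index = new ArrayList<>();
    for (byte[] partition : mergedPartitionData) {
      long segmentStart = finalOut.size();   // analogous to finalOut.getPos()
      finalOut.write(partition);             // analogous to writing the merged (possibly compressed) stream
      long partLength = finalOut.size() - segmentStart;
      // Without compression the raw and on-disk lengths coincide in this sketch.
      index.add(new IndexEntry(segmentStart, partition.length, partLength));
    }
    return index;                            // analogous to spillRec.writeToFile(finalIndexFile, conf)
  }
}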