use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class TestMergeManager method testLocalDiskMergeMultipleTasks.
void testLocalDiskMergeMultipleTasks(final boolean interruptInMiddle) throws IOException, InterruptedException {
Configuration conf = new TezConfiguration(defaultConf);
conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, false);
conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, IntWritable.class.getName());
conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
Path localDir = new Path(workDir, "local");
Path srcDir = new Path(workDir, "srcData");
localFs.mkdirs(localDir);
localFs.mkdirs(srcDir);
conf.setStrings(TezRuntimeFrameworkConfigs.LOCAL_DIRS, localDir.toString());
FileSystem localFs = FileSystem.getLocal(conf); // shadows the test class' static localFs field used by the mkdirs calls above
LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
InputContext t0inputContext = createMockInputContext(UUID.randomUUID().toString());
InputContext t1inputContext = createMockInputContext(UUID.randomUUID().toString());
ExceptionReporter t0exceptionReporter = mock(ExceptionReporter.class);
ExceptionReporter t1exceptionReporter = mock(ExceptionReporter.class);
MergeManager t0mergeManagerReal = new MergeManager(conf, localFs, localDirAllocator, t0inputContext, null, null, null, null, t0exceptionReporter, 2000000, null, false, -1) {
// override for interruptInMiddle testing
@Override
public synchronized void closeOnDiskFile(FileChunk file) {
if (interruptInMiddle) {
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
}
}
super.closeOnDiskFile(file);
}
};
MergeManager t0mergeManager = spy(t0mergeManagerReal);
t0mergeManager.configureAndStart();
MergeManager t1mergeManagerReal = new MergeManager(conf, localFs, localDirAllocator, t1inputContext, null, null, null, null, t1exceptionReporter, 2000000, null, false, -1);
MergeManager t1mergeManager = spy(t1mergeManagerReal);
// Partition 0 Keys 0-2, Partition 1 Keys 3-5
SrcFileInfo src1Info = createFile(conf, localFs, new Path(srcDir, InputAttemptIdentifier.PATH_PREFIX + "src1.out"), 2, 3, 0);
// Partition 0 Keys 6-8, Partition 1 Keys 9-11
SrcFileInfo src2Info = createFile(conf, localFs, new Path(srcDir, InputAttemptIdentifier.PATH_PREFIX + "src2.out"), 2, 3, 6);
// Simulating Task 0 fetches partition 0. (targetIndex = 0,1)
InputAttemptIdentifier t0Identifier0 = new InputAttemptIdentifier(0, 0, src1Info.path.getName());
InputAttemptIdentifier t0Identifier1 = new InputAttemptIdentifier(1, 0, src2Info.path.getName());
// Simulating Task 1 fetches partition 1. (targetIndex = 0,1)
InputAttemptIdentifier t1Identifier0 = new InputAttemptIdentifier(0, 0, src1Info.path.getName());
InputAttemptIdentifier t1Identifier1 = new InputAttemptIdentifier(1, 0, src2Info.path.getName());
MapOutput t0MapOutput0 = getMapOutputForDirectDiskFetch(t0Identifier0, src1Info.path, src1Info.indexedRecords[0], t0mergeManager);
MapOutput t0MapOutput1 = getMapOutputForDirectDiskFetch(t0Identifier1, src2Info.path, src2Info.indexedRecords[0], t0mergeManager);
MapOutput t1MapOutput0 = getMapOutputForDirectDiskFetch(t1Identifier0, src1Info.path, src1Info.indexedRecords[1], t1mergeManager);
MapOutput t1MapOutput1 = getMapOutputForDirectDiskFetch(t1Identifier1, src2Info.path, src2Info.indexedRecords[1], t1mergeManager);
t0MapOutput0.commit();
t0MapOutput1.commit();
verify(t0mergeManager).closeOnDiskFile(t0MapOutput0.getOutputPath());
verify(t0mergeManager).closeOnDiskFile(t0MapOutput1.getOutputPath());
// Run the OnDiskMerge via MergeManager
// Simulate the thread invocation - remove files, and invoke merge
List<FileChunk> t0MergeFiles = new LinkedList<FileChunk>();
t0MergeFiles.addAll(t0mergeManager.onDiskMapOutputs);
t0mergeManager.onDiskMapOutputs.clear();
if (!interruptInMiddle) {
t0mergeManager.onDiskMerger.merge(t0MergeFiles);
Assert.assertEquals(1, t0mergeManager.onDiskMapOutputs.size());
} else {
// Start Interrupting thread
Thread interruptingThread = new Thread(new InterruptingThread(t0mergeManager.onDiskMerger));
interruptingThread.start();
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// Will be interrupted in the middle by interruptingThread.
t0mergeManager.onDiskMerger.startMerge(Sets.newHashSet(t0MergeFiles));
t0mergeManager.onDiskMerger.waitForMerge();
Assert.assertNotEquals(1, t0mergeManager.onDiskMapOutputs.size());
}
if (!interruptInMiddle) {
t1MapOutput0.commit();
t1MapOutput1.commit();
verify(t1mergeManager).closeOnDiskFile(t1MapOutput0.getOutputPath());
verify(t1mergeManager).closeOnDiskFile(t1MapOutput1.getOutputPath());
// Run the OnDiskMerge via MergeManager
// Simulate the thread invocation - remove files, and invoke merge
List<FileChunk> t1MergeFiles = new LinkedList<FileChunk>();
t1MergeFiles.addAll(t1mergeManager.onDiskMapOutputs);
t1mergeManager.onDiskMapOutputs.clear();
t1mergeManager.onDiskMerger.merge(t1MergeFiles);
Assert.assertEquals(1, t1mergeManager.onDiskMapOutputs.size());
Assert.assertNotEquals(t0mergeManager.onDiskMapOutputs.iterator().next().getPath(), t1mergeManager.onDiskMapOutputs.iterator().next().getPath());
Assert.assertTrue(t0mergeManager.onDiskMapOutputs.iterator().next().getPath().toString().contains(t0inputContext.getUniqueIdentifier()));
Assert.assertTrue(t1mergeManager.onDiskMapOutputs.iterator().next().getPath().toString().contains(t1inputContext.getUniqueIdentifier()));
}
}
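The getMapOutputForDirectDiskFetch helper above is test scaffolding; what the commit() calls hand to the MergeManager (see the verify(...) lines) is a FileChunk describing the on-disk segment. Below is a minimal sketch, not part of the test, of registering an already-written segment file with a MergeManager using the three-argument FileChunk constructor and closeOnDiskFile(FileChunk) shown elsewhere on this page; the registerOnDiskSegment method name and the assumption that the segment file already exists are ours.

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FileChunk;

static void registerOnDiskSegment(FileSystem fs, Path segmentFile, MergeManager mergeManager)
    throws IOException {
  FileStatus status = fs.getFileStatus(segmentFile);
  // A FileChunk is a (path, offset, length) view of a file; here it covers the whole file.
  FileChunk chunk = new FileChunk(segmentFile, 0, status.getLen());
  // closeOnDiskFile queues the chunk for the on-disk merger once enough chunks accumulate.
  mergeManager.closeOnDiskFile(chunk);
}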
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method finalMerge.
private TezRawKeyValueIterator finalMerge(Configuration job, FileSystem fs, List<MapOutput> inMemoryMapOutputs, List<FileChunk> onDiskMapOutputs) throws IOException, InterruptedException {
logFinalMergeStart(inMemoryMapOutputs, onDiskMapOutputs);
StringBuilder finalMergeLog = new StringBuilder();
inputContext.notifyProgress();
// merge config params
Class keyClass = (Class) ConfigUtils.getIntermediateInputKeyClass(job);
Class valueClass = (Class) ConfigUtils.getIntermediateInputValueClass(job);
final Path tmpDir = new Path(inputContext.getUniqueIdentifier());
final RawComparator comparator = (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(job);
// segments required to vacate memory
List<Segment> memDiskSegments = new ArrayList<Segment>();
long inMemToDiskBytes = 0;
boolean mergePhaseFinished = false;
if (inMemoryMapOutputs.size() > 0) {
int srcTaskId = inMemoryMapOutputs.get(0).getAttemptIdentifier().getInputIdentifier();
inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, this.postMergeMemLimit);
final int numMemDiskSegments = memDiskSegments.size();
if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
// If we reach here, it implies that we have less than io.sort.factor
// disk segments and this will be incremented by 1 (result of the
// memory segments merge). Since this total would still be
// <= io.sort.factor, we will not do any more intermediate merges,
// the merge of all these disk segments would be directly fed to the
// reduce method
mergePhaseFinished = true;
// must spill to disk, but can't retain in-mem for intermediate merge
// Cannot use the spill id in the final merge as it could clobber other files, hence using
// Integer.MAX_VALUE
final Path outputPath = mapOutputFile.getInputFileForWrite(srcTaskId, Integer.MAX_VALUE, inMemToDiskBytes).suffix(Constants.MERGED_OUTPUT_PREFIX);
final TezRawKeyValueIterator rIter = TezMerger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec, null, null);
try {
TezMerger.writeFile(rIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
} catch (IOException e) {
if (null != outputPath) {
try {
fs.delete(outputPath, true);
} catch (IOException ie) {
// NOTHING
}
}
throw e;
} finally {
if (null != writer) {
writer.close();
additionalBytesWritten.increment(writer.getCompressedLength());
}
}
final FileStatus fStatus = localFS.getFileStatus(outputPath);
// add to list of final disk outputs.
onDiskMapOutputs.add(new FileChunk(outputPath, 0, fStatus.getLen()));
if (LOG.isInfoEnabled()) {
finalMergeLog.append("MemMerged: " + numMemDiskSegments + ", " + inMemToDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merged " + numMemDiskSegments + "segments, size=" + inMemToDiskBytes + " to " + outputPath);
}
}
inMemToDiskBytes = 0;
memDiskSegments.clear();
} else if (inMemToDiskBytes != 0) {
if (LOG.isInfoEnabled()) {
finalMergeLog.append("DelayedMemMerge: " + numMemDiskSegments + ", " + inMemToDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for " + "intermediate, on-disk merge");
}
}
}
}
// segments on disk
List<Segment> diskSegments = new ArrayList<Segment>();
long onDiskBytes = inMemToDiskBytes;
FileChunk[] onDisk = onDiskMapOutputs.toArray(new FileChunk[onDiskMapOutputs.size()]);
for (FileChunk fileChunk : onDisk) {
final long fileLength = fileChunk.getLength();
onDiskBytes += fileLength;
if (LOG.isDebugEnabled()) {
LOG.debug("Disk file=" + fileChunk.getPath() + ", len=" + fileLength + ", isLocal=" + fileChunk.isLocalFile());
}
final Path file = fileChunk.getPath();
TezCounter counter = file.toString().endsWith(Constants.MERGED_OUTPUT_PREFIX) ? null : mergedMapOutputsCounter;
final long fileOffset = fileChunk.getOffset();
final boolean preserve = fileChunk.isLocalFile();
diskSegments.add(new DiskSegment(fs, file, fileOffset, fileLength, codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, preserve, counter));
}
if (LOG.isInfoEnabled()) {
finalMergeLog.append(". DiskSeg: " + onDisk.length + ", " + onDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
}
}
Collections.sort(diskSegments, new Comparator<Segment>() {
public int compare(Segment o1, Segment o2) {
if (o1.getLength() == o2.getLength()) {
return 0;
}
return o1.getLength() < o2.getLength() ? -1 : 1;
}
});
// build the final list of segments: the disk-backed merge output plus remaining in-mem segments
List<Segment> finalSegments = new ArrayList<Segment>();
long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
if (LOG.isInfoEnabled()) {
finalMergeLog.append(". MemSeg: " + finalSegments.size() + ", " + inMemBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
}
}
if (0 != onDiskBytes) {
final int numInMemSegments = memDiskSegments.size();
diskSegments.addAll(0, memDiskSegments);
memDiskSegments.clear();
TezRawKeyValueIterator diskMerge = TezMerger.merge(job, fs, keyClass, valueClass, codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, progressable, false, spilledRecordsCounter, null, additionalBytesRead, null);
diskSegments.clear();
if (0 == finalSegments.size()) {
return diskMerge;
}
finalSegments.add(new Segment(new RawKVIteratorReader(diskMerge, onDiskBytes), null));
}
if (LOG.isInfoEnabled()) {
LOG.info(finalMergeLog.toString());
}
// This is doing nothing but creating an iterator over the segments.
return TezMerger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
}
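For context, the iterator returned by finalMerge is what the downstream reader consumes. A hedged sketch of draining it follows, assuming TezRawKeyValueIterator follows the familiar next()/getKey()/getValue() contract where the returned DataInputBuffers hold serialized key and value bytes; the countMergedRecords helper is ours, not Tez API.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;

static long countMergedRecords(TezRawKeyValueIterator iter) throws IOException {
  long records = 0;
  while (iter.next()) {
    DataInputBuffer key = iter.getKey();     // serialized key bytes of the current record
    DataInputBuffer value = iter.getValue(); // serialized value bytes of the current record
    records += 1;
  }
  return records;
}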
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method logFinalMergeStart.
private void logFinalMergeStart(List<MapOutput> inMemoryMapOutputs, List<FileChunk> onDiskMapOutputs) {
long inMemSegmentSize = 0;
for (MapOutput inMemoryMapOutput : inMemoryMapOutputs) {
inMemSegmentSize += inMemoryMapOutput.getSize();
if (LOG.isDebugEnabled()) {
LOG.debug("finalMerge: inMemoryOutput=" + inMemoryMapOutput + ", size=" + inMemoryMapOutput.getSize());
}
}
long onDiskSegmentSize = 0;
for (FileChunk onDiskMapOutput : onDiskMapOutputs) {
onDiskSegmentSize += onDiskMapOutput.getLength();
if (LOG.isDebugEnabled()) {
LOG.debug("finalMerge: onDiskMapOutput=" + onDiskMapOutput.getPath() + ", size=" + onDiskMapOutput.getLength());
}
}
LOG.info("finalMerge with #inMemoryOutputs={}, size={} and #onDiskOutputs={}, size={}", inMemoryMapOutputs.size(), inMemSegmentSize, onDiskMapOutputs.size(), onDiskSegmentSize);
}
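Most FileChunk bookkeeping in this class relies on the same few accessors. As a small illustrative helper (name and placement ours, not Tez API), a list of FileChunks can be summarized with getLength() and isLocalFile(), both of which appear in the snippets on this page:

import java.util.List;
import org.apache.hadoop.io.FileChunk;

static String summarizeChunks(List<FileChunk> chunks) {
  long totalBytes = 0;
  int localFiles = 0;
  for (FileChunk chunk : chunks) {
    totalBytes += chunk.getLength();
    if (chunk.isLocalFile()) {
      localFiles++;
    }
  }
  return "chunks=" + chunks.size() + ", bytes=" + totalBytes + ", localFiles=" + localFiles;
}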
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method closeOnDiskFile.
@Override
public synchronized void closeOnDiskFile(FileChunk file) {
// including only path & offset for validations.
for (FileChunk fileChunk : onDiskMapOutputs) {
if (fileChunk.getPath().equals(file.getPath())) {
// ensure offsets are not the same.
Preconditions.checkArgument(fileChunk.getOffset() != file.getOffset(), "Can't have a file with same path and offset." + "OldFilePath=" + fileChunk.getPath() + ", OldFileOffset=" + fileChunk.getOffset() + ", newFilePath=" + file.getPath() + ", newFileOffset=" + file.getOffset());
}
}
onDiskMapOutputs.add(file);
logCloseOnDiskFile(file);
synchronized (onDiskMerger) {
if (!onDiskMerger.isInProgress() && onDiskMapOutputs.size() >= (2 * ioSortFactor - 1)) {
onDiskMerger.startMerge(onDiskMapOutputs);
}
}
}
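The precondition above only rejects an exact path-plus-offset duplicate, so several chunks of one spill file (different offsets, e.g. different partitions fetched locally) can coexist in onDiskMapOutputs. An illustrative sketch, with a hypothetical path and a mergeManager assumed to be already configured:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FileChunk;

static void registerPartitions(MergeManager mergeManager) {
  Path spill = new Path("/tmp/attempt_0/file.out");        // hypothetical spill file
  FileChunk partition0 = new FileChunk(spill, 0, 1024);     // bytes [0, 1024)
  FileChunk partition1 = new FileChunk(spill, 1024, 2048);  // bytes [1024, 3072)
  mergeManager.closeOnDiskFile(partition0);                 // accepted
  mergeManager.closeOnDiskFile(partition1);                 // accepted: same path, different offset
  // mergeManager.closeOnDiskFile(new FileChunk(spill, 0, 1024));
  //   -> would fail the checkArgument above: same path and same offset as partition0
}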
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method close.
public TezRawKeyValueIterator close(boolean tryFinalMerge) throws Throwable {
if (!isShutdown.getAndSet(true)) {
// Wait for on-going merges to complete
if (memToMemMerger != null) {
memToMemMerger.close();
}
inMemoryMerger.close();
onDiskMerger.close();
List<MapOutput> memory = new ArrayList<MapOutput>(inMemoryMergedMapOutputs);
inMemoryMergedMapOutputs.clear();
memory.addAll(inMemoryMapOutputs);
inMemoryMapOutputs.clear();
List<FileChunk> disk = new ArrayList<FileChunk>(onDiskMapOutputs);
onDiskMapOutputs.clear();
if (statsInMemTotal.count > 0) {
LOG.info("TotalInMemFetchStats: count={}, totalSize={}, min={}, max={}, avg={}", statsInMemTotal.count, statsInMemTotal.size, statsInMemTotal.minSize, statsInMemTotal.maxSize, (statsInMemTotal.size / (float) statsInMemTotal.size));
}
// Don't attempt a final merge if close() was invoked as a result of a previous shuffle exception / error.
if (tryFinalMerge) {
try {
TezRawKeyValueIterator kvIter = finalMerge(conf, rfs, memory, disk);
this.finalMergeComplete = true;
return kvIter;
} catch (InterruptedException e) {
// Cleanup the disk segments
if (cleanup) {
cleanup(localFS, disk);
cleanup(localFS, onDiskMapOutputs);
}
// reset interrupt status
Thread.currentThread().interrupt();
throw e;
}
}
}
return null;
}
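Callers typically reach finalMerge through close(true) once fetching has finished. A hedged sketch of that call pattern follows; the wrapper method is ours, and the error handling is illustrative since close() declares Throwable.

import java.io.IOException;

static TezRawKeyValueIterator finishShuffle(MergeManager mergeManager) throws IOException {
  try {
    // close(true) waits for in-flight merges, then runs finalMerge over the remaining
    // in-memory outputs and on-disk FileChunks; close(false) only releases resources.
    return mergeManager.close(true);
  } catch (Throwable t) {
    throw new IOException("Final merge failed", t);
  }
}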