Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
The class UnorderedPartitionedKVWriter, method writeLargeRecord.
private void writeLargeRecord(final Object key, final Object value, final int partition) throws IOException {
numAdditionalSpillsCounter.increment(1);
long size = sizePerBuffer - (currentBuffer.numRecords * META_SIZE) - currentBuffer.skipSize + numPartitions * APPROX_HEADER_LENGTH;
SpillPathDetails spillPathDetails = getSpillPathDetails(false, size);
int spillIndex = spillPathDetails.spillIndex;
FSDataOutputStream out = null;
long outSize = 0;
try {
final TezSpillRecord spillRecord = new TezSpillRecord(numPartitions);
final Path outPath = spillPathDetails.outputFilePath;
out = rfs.create(outPath);
if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
rfs.setPermission(outPath, SPILL_FILE_PERMS);
}
BitSet emptyPartitions = null;
if (pipelinedShuffle || !isFinalMergeEnabled) {
emptyPartitions = new BitSet(numPartitions);
}
for (int i = 0; i < numPartitions; i++) {
final long recordStart = out.getPos();
if (i == partition) {
spilledRecordsCounter.increment(1);
Writer writer = null;
try {
writer = new IFile.Writer(conf, out, keyClass, valClass, codec, null, null);
writer.append(key, value);
outputLargeRecordsCounter.increment(1);
numRecordsPerPartition[i]++;
if (reportPartitionStats()) {
sizePerPartition[i] += writer.getRawLength();
}
writer.close();
synchronized (additionalSpillBytesWritternCounter) {
additionalSpillBytesWritternCounter.increment(writer.getCompressedLength());
}
TezIndexRecord indexRecord = new TezIndexRecord(recordStart, writer.getRawLength(), writer.getCompressedLength());
spillRecord.putIndex(indexRecord, i);
outSize = writer.getCompressedLength();
writer = null;
} finally {
if (writer != null) {
writer.close();
}
}
} else {
if (emptyPartitions != null) {
emptyPartitions.set(i);
}
}
}
handleSpillIndex(spillPathDetails, spillRecord);
mayBeSendEventsForSpill(emptyPartitions, sizePerPartition, spillIndex, false);
LOG.info(destNameTrimmed + ": " + "Finished writing large record of size " + outSize + " to spill file " + spillIndex);
if (LOG.isDebugEnabled()) {
LOG.debug(destNameTrimmed + ": " + "LargeRecord Spill=" + spillIndex + ", indexPath=" + spillPathDetails.indexFilePath + ", outputPath=" + spillPathDetails.outputFilePath);
}
} finally {
if (out != null) {
out.close();
}
}
}
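For reference, the write-and-index pattern at the core of writeLargeRecord can be reduced to a short, self-contained sketch. The class and method names below are hypothetical; the IFile.Writer and TezIndexRecord calls mirror those in the method above.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.tez.runtime.library.common.sort.impl.IFile;
import org.apache.tez.runtime.library.common.sort.impl.TezIndexRecord;

// Hypothetical helper illustrating the pattern above: write one record as a
// single IFile segment and return the index entry that locates it.
class LargeRecordSpillSketch {
  static TezIndexRecord spillOneRecord(Configuration conf, FileSystem fs, Path outPath,
      Class<?> keyClass, Class<?> valClass, CompressionCodec codec,
      Object key, Object value) throws IOException {
    FSDataOutputStream out = fs.create(outPath);
    try {
      // Remember where the segment starts before the writer emits its header.
      long recordStart = out.getPos();
      IFile.Writer writer = new IFile.Writer(conf, out, keyClass, valClass, codec, null, null);
      writer.append(key, value);
      writer.close();
      // The index record tells readers where the segment lives in the file.
      return new TezIndexRecord(recordStart, writer.getRawLength(), writer.getCompressedLength());
    } finally {
      out.close();
    }
  }
}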
Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
The class DefaultSorter, method spillSingleRecord.
/**
* Handles the degenerate case where serialization fails to fit in
* the in-memory buffer, so we must spill the record from collect
* directly to a spill file. Consider this "losing".
*/
private void spillSingleRecord(final Object key, final Object value, int partition) throws IOException {
long size = kvbuffer.length + partitions * APPROX_HEADER_LENGTH;
FSDataOutputStream out = null;
try {
// create spill file
final TezSpillRecord spillRec = new TezSpillRecord(partitions);
final Path filename = mapOutputFile.getSpillFileForWrite(numSpills, size);
spillFilePaths.put(numSpills, filename);
out = rfs.create(filename);
if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
rfs.setPermission(filename, SPILL_FILE_PERMS);
}
// we don't run the combiner for a single record
for (int i = 0; i < partitions; ++i) {
IFile.Writer writer = null;
try {
long segmentStart = out.getPos();
// Create a new codec, don't care!
if (!sendEmptyPartitionDetails || (i == partition)) {
writer = new Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null, false);
}
if (i == partition) {
final long recordStart = out.getPos();
writer.append(key, value);
// Note that our map byte count will not be accurate with
// compression
mapOutputByteCounter.increment(out.getPos() - recordStart);
}
long rawLength = 0;
long partLength = 0;
if (writer != null) {
writer.close();
rawLength = writer.getRawLength();
partLength = writer.getCompressedLength();
}
adjustSpillCounters(rawLength, partLength);
// record offsets
TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
spillRec.putIndex(rec, i);
writer = null;
} catch (IOException e) {
if (null != writer)
writer.close();
throw e;
}
}
if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
// create spill index file
Path indexFilename = mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillFileIndexPaths.put(numSpills, indexFilename);
spillRec.writeToFile(indexFilename, conf);
} else {
indexCacheList.add(spillRec);
totalIndexCacheMemory += spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
}
++numSpills;
if (!isFinalMergeEnabled()) {
numShuffleChunks.setValue(numSpills);
} else if (numSpills > 1) {
// Increment only when there is at least one previous spill
numAdditionalSpills.increment(1);
}
} finally {
if (out != null)
out.close();
}
}
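The essence of the loop above is that a Writer is only instantiated for a partition that actually holds the record, or when empty-partition details must still be materialized in the file. A reduced sketch of that decision, reusing the field names from DefaultSorter (exception handling and counters omitted):

for (int i = 0; i < partitions; ++i) {
  long segmentStart = out.getPos();
  long rawLength = 0;
  long partLength = 0;
  if (!sendEmptyPartitionDetails || i == partition) {
    IFile.Writer writer = new IFile.Writer(conf, out, keyClass, valClass, codec, spilledRecordsCounter, null, false);
    if (i == partition) {
      writer.append(key, value); // only this partition holds the overflowing record
    }
    writer.close();
    rawLength = writer.getRawLength();
    partLength = writer.getCompressedLength();
  }
  // Empty partitions keep a zero-length entry when their details are elided.
  spillRec.putIndex(new TezIndexRecord(segmentStart, rawLength, partLength), i);
}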
Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
The class PipelinedSorter, method flush.
@Override
public void flush() throws IOException {
final String uniqueIdentifier = outputContext.getUniqueIdentifier();
outputContext.notifyProgress();
/**
 * It is possible that the thread was interrupted while flush was happening, or that flush
 * was never invoked. As a part of its cleanup activity, TezTaskRunner invokes close()
 * on all I/O; at that point it is safe to clean up.
 */
if (isThreadInterrupted()) {
return;
}
try {
LOG.info(outputContext.getDestinationVertexName() + ": Starting flush of map output");
span.end();
merger.add(span.sort(sorter));
// force a spill in flush()
// case 1: we want to force a spill in the following scenario:
// no keys were written and flush got called, but we still want
// at least one spill (even if it is empty)
// case 2: in the pipelined shuffle case, we have no way of
// knowing the last key being written until flush is called,
// so for flush()->spill() we force the spill so that we can
// send the pipelined shuffle event with lastEvent=true.
spill(false);
sortmaster.shutdown();
// safe to clean up
buffers.clear();
if (indexCacheList.isEmpty()) {
/*
 * Without this check, if the task gets killed in the middle, it can throw an
 * NPE, which is distracting when debugging.
 */
if (LOG.isDebugEnabled()) {
LOG.debug(outputContext.getDestinationVertexName() + ": Index list is empty... returning");
}
return;
}
if (!isFinalMergeEnabled()) {
// Generate events for all spills
List<Event> events = Lists.newLinkedList();
// For pipelined shuffle, previous events are already sent. Just generate the last event alone
int startIndex = (pipelinedShuffle) ? (numSpills - 1) : 0;
int endIndex = numSpills;
for (int i = startIndex; i < endIndex; i++) {
boolean isLastEvent = (i == numSpills - 1);
String pathComponent = (outputContext.getUniqueIdentifier() + "_" + i);
ShuffleUtils.generateEventOnSpill(events, isFinalMergeEnabled(), isLastEvent, outputContext, i, indexCacheList.get(i), partitions, sendEmptyPartitionDetails, pathComponent, partitionStats, reportDetailedPartitionStats(), auxiliaryService, deflater);
LOG.info(outputContext.getDestinationVertexName() + ": Adding spill event for spill (final update=" + isLastEvent + "), spillId=" + i);
}
outputContext.sendEvents(events);
return;
}
numAdditionalSpills.increment(numSpills - 1);
// In case final merge is required, the following code path is executed.
if (numSpills == 1) {
// someday be able to pass this directly to shuffle
// without writing to disk
final Path filename = spillFilePaths.get(0);
final Path indexFilename = spillFileIndexPaths.get(0);
finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename);
finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(indexFilename);
sameVolRename(filename, finalOutputFile);
sameVolRename(indexFilename, finalIndexFile);
if (LOG.isDebugEnabled()) {
LOG.debug(outputContext.getDestinationVertexName() + ": numSpills=" + numSpills + ", finalOutputFile=" + finalOutputFile + ", " + "finalIndexFile=" + finalIndexFile + ", filename=" + filename + ", indexFilename=" + indexFilename);
}
TezSpillRecord spillRecord = new TezSpillRecord(finalIndexFile, conf);
if (reportPartitionStats()) {
for (int i = 0; i < spillRecord.size(); i++) {
partitionStats[i] += spillRecord.getIndex(i).getPartLength();
}
}
numShuffleChunks.setValue(numSpills);
fileOutputByteCounter.increment(rfs.getFileStatus(finalOutputFile).getLen());
// ??? why are events not being sent here?
return;
}
finalOutputFile = mapOutputFile.getOutputFileForWrite(0); // TODO
finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(0); // TODO
if (LOG.isDebugEnabled()) {
LOG.debug(outputContext.getDestinationVertexName() + ": " + "numSpills: " + numSpills + ", finalOutputFile:" + finalOutputFile + ", finalIndexFile:" + finalIndexFile);
}
// The output stream for the final single output file
FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
rfs.setPermission(finalOutputFile, SPILL_FILE_PERMS);
}
final TezSpillRecord spillRec = new TezSpillRecord(partitions);
for (int parts = 0; parts < partitions; parts++) {
boolean shouldWrite = false;
// create the segments to be merged
List<Segment> segmentList = new ArrayList<Segment>(numSpills);
for (int i = 0; i < numSpills; i++) {
Path spillFilename = spillFilePaths.get(i);
TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
if (indexRecord.hasData() || !sendEmptyPartitionDetails) {
shouldWrite = true;
DiskSegment s = new DiskSegment(rfs, spillFilename, indexRecord.getStartOffset(), indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, true);
segmentList.add(s);
}
}
int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR, TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
// sort the segments only if there are intermediate merges
boolean sortSegments = segmentList.size() > mergeFactor;
// merge
TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec, segmentList, mergeFactor, new Path(uniqueIdentifier), (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), progressable, sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead, null, // Not using any Progress in TezMerger. Should just work.
merger.needsRLE());
// write merged output to disk
long segmentStart = finalOut.getPos();
long rawLength = 0;
long partLength = 0;
if (shouldWrite) {
Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null, merger.needsRLE());
if (combiner == null || numSpills < minSpillsForCombine) {
TezMerger.writeFile(kvIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
} else {
runCombineProcessor(kvIter, writer);
}
// close
writer.close();
rawLength = writer.getRawLength();
partLength = writer.getCompressedLength();
}
outputBytesWithOverheadCounter.increment(rawLength);
// record offsets
final TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
spillRec.putIndex(rec, parts);
if (reportPartitionStats()) {
partitionStats[parts] += partLength;
}
}
// final merge has happened.
numShuffleChunks.setValue(1);
fileOutputByteCounter.increment(rfs.getFileStatus(finalOutputFile).getLen());
spillRec.writeToFile(finalIndexFile, conf);
finalOut.close();
for (int i = 0; i < numSpills; i++) {
Path indexFilename = spillFileIndexPaths.get(i);
Path spillFilename = spillFilePaths.get(i);
rfs.delete(indexFilename, true);
rfs.delete(spillFilename, true);
}
spillFileIndexPaths.clear();
spillFilePaths.clear();
} catch (InterruptedException ie) {
if (cleanup) {
cleanup();
}
Thread.currentThread().interrupt();
throw new IOInterruptedException("Interrupted while closing Output", ie);
}
}
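Stripped of counters and cleanup, each partition in the final merge follows the same three steps: merge the spill segments, stream the merged iterator into one IFile segment, and index that segment. A hedged skeleton, where kvIter stands in for the TezMerger.merge(...) result shown above and the other names follow PipelinedSorter:

// Per-partition skeleton of the final merge: merge, write one segment, index it.
long segmentStart = finalOut.getPos();
IFile.Writer writer = new IFile.Writer(conf, finalOut, keyClass, valClass, codec, spilledRecordsCounter, null, merger.needsRLE());
// kvIter is assumed to be the merged iterator produced by TezMerger.merge(...).
TezMerger.writeFile(kvIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
writer.close();
spillRec.putIndex(new TezIndexRecord(segmentStart, writer.getRawLength(), writer.getCompressedLength()), parts);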
Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
The class TestIFile, method testExceedMaxSize.
@Test(timeout = 5000)
// test overflow
public void testExceedMaxSize() throws IOException {
final int oldMaxBufferSize = IFile.Reader.MAX_BUFFER_SIZE;
Text shortString = new Text("string");
Text longString = new Text("A string of length 22.");
assertEquals(22, longString.getLength());
Text readKey = new Text();
Text readValue = new Text();
DataInputBuffer keyIn = new DataInputBuffer();
DataInputBuffer valIn = new DataInputBuffer();
IFile.Writer writer;
IFile.Reader reader;
FSDataOutputStream out;
// Check Key length exceeding MAX_BUFFER_SIZE
out = localFs.create(outputPath);
writer = new IFile.Writer(defaultConf, out, Text.class, Text.class, null, null, null, false);
writer.append(longString, shortString);
writer.close();
out.close();
// Set this to a smaller value for testing
IFile.Reader.MAX_BUFFER_SIZE = 16;
reader = new IFile.Reader(localFs, outputPath, null, null, null, false, 0, -1);
try {
reader.nextRawKey(keyIn);
Assert.fail("Expected IllegalArgumentException to be thrown");
} catch (IllegalArgumentException e) {
// test passed
}
reader.close();
// Check Value length exceeding MAX_BUFFER_SIZE
out = localFs.create(outputPath);
writer = new IFile.Writer(defaultConf, out, Text.class, Text.class, null, null, null, false);
writer.append(shortString, longString);
writer.close();
out.close();
// Set this to a smaller value for testing
IFile.Reader.MAX_BUFFER_SIZE = 16;
reader = new IFile.Reader(localFs, outputPath, null, null, null, false, 0, -1);
try {
reader.nextRawKey(keyIn);
reader.nextRawValue(valIn);
Assert.fail("Expected IllegalArgumentException to be thrown");
} catch (IllegalArgumentException e) {
// test passed
}
reader.close();
// Check Key length not getting doubled
out = localFs.create(outputPath);
writer = new IFile.Writer(defaultConf, out, Text.class, Text.class, null, null, null, false);
writer.append(longString, shortString);
writer.close();
out.close();
// Set this to a smaller value for testing
IFile.Reader.MAX_BUFFER_SIZE = 32;
reader = new IFile.Reader(localFs, outputPath, null, null, null, false, 0, -1);
reader.nextRawKey(keyIn);
assertEquals(longString.getLength() + 1, keyIn.getData().length);
reader.close();
// Check Value length not getting doubled
out = localFs.create(outputPath);
writer = new IFile.Writer(defaultConf, out, Text.class, Text.class, null, null, null, false);
writer.append(shortString, longString);
writer.close();
out.close();
// Set this to a smaller value for testing
IFile.Reader.MAX_BUFFER_SIZE = 32;
reader = new IFile.Reader(localFs, outputPath, null, null, null, false, 0, -1);
reader.nextRawKey(keyIn);
reader.nextRawValue(valIn);
assertEquals(longString.getLength() + 1, valIn.getData().length);
reader.close();
// revert back to original value
IFile.Reader.MAX_BUFFER_SIZE = oldMaxBufferSize;
}
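The constructors exercised by this test also support a plain write/read round trip. A minimal sketch, assuming the test fixture's localFs, outputPath and defaultConf, and assuming nextRawKey returns false at end of stream, as in Hadoop's IFile:

// Minimal IFile round trip using the same constructors as the test above.
FSDataOutputStream out = localFs.create(outputPath);
IFile.Writer writer = new IFile.Writer(defaultConf, out, Text.class, Text.class, null, null, null, false);
writer.append(new Text("key"), new Text("value"));
writer.close();
out.close();
IFile.Reader reader = new IFile.Reader(localFs, outputPath, null, null, null, false, 0, -1);
DataInputBuffer keyIn = new DataInputBuffer();
DataInputBuffer valIn = new DataInputBuffer();
while (reader.nextRawKey(keyIn)) {
  reader.nextRawValue(valIn);
  // keyIn/valIn now hold the raw serialized bytes of one record.
}
reader.close();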
Use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
The class TestIFile, method testAppendKeyValues.
@Test(timeout = 5000)
// Test appendKeyValues feature
public void testAppendKeyValues() throws IOException {
List<KVPair> data = new ArrayList<KVPair>();
List<IntWritable> values = new ArrayList<IntWritable>();
Text key = new Text("key");
IntWritable val = new IntWritable(1);
for (int i = 0; i < 5; i++) {
data.add(new KVPair(key, val));
values.add(val);
}
IFile.Writer writer = new IFile.Writer(defaultConf, localFs, outputPath, Text.class, IntWritable.class, codec, null, null);
writer.appendKeyValues(data.get(0).getKey(), values.iterator());
Text lastKey = new Text("key3");
IntWritable lastVal = new IntWritable(10);
data.add(new KVPair(lastKey, lastVal));
writer.append(lastKey, lastVal);
writer.close();
readAndVerifyData(writer.getRawLength(), writer.getCompressedLength(), data, codec);
}
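As the test demonstrates, appendKeyValues writes the key once followed by every value from the iterator, so a run of identical keys does not have to be appended pair by pair. A minimal sketch using the same Writer constructor, with defaultConf, localFs, outputPath and codec assumed from the test fixture:

// Hedged sketch of appendKeyValues: one key, many values.
List<IntWritable> values = Arrays.asList(new IntWritable(1), new IntWritable(2), new IntWritable(3));
IFile.Writer writer = new IFile.Writer(defaultConf, localFs, outputPath, Text.class, IntWritable.class, codec, null, null);
writer.appendKeyValues(new Text("key"), values.iterator());
writer.close();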