use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
the class TestIFile method testReadToDisk.
/**
 * Exercises {@code IFile.Reader.readToDisk} with three inputs:
 * <ol>
 *   <li>a stream consisting only of zero bytes, which must raise an {@link IOException};</li>
 *   <li>the same zero bytes preceded by {@code IFile.HEADER}, which must raise a
 *       {@code ChecksumException};</li>
 *   <li>a file produced by {@code writeTestFile}, whose copied bytes must read back as the
 *       generated test data.</li>
 * </ol>
 *
 * @throws IOException if writing or reading the test file fails unexpectedly
 */
@Test(timeout = 20000)
public void testReadToDisk() throws IOException {
  // verify sending a stream of zeroes generates an error
  byte[] zeroData = new byte[1000];
  Arrays.fill(zeroData, (byte) 0);
  ByteArrayInputStream in = new ByteArrayInputStream(zeroData);
  try {
    IFile.Reader.readToDisk(new ByteArrayOutputStream(), in, zeroData.length, false, 0);
    fail("Exception should have been thrown");
  } catch (IOException expected) {
    // Intentionally empty: any IOException is the outcome this case is looking for.
  }

  // verify sending same stream of zeroes with a valid IFile header still
  // generates an error
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  baos.write(IFile.HEADER);
  baos.write(zeroData);
  try {
    IFile.Reader.readToDisk(new ByteArrayOutputStream(),
        new ByteArrayInputStream(baos.toByteArray()), zeroData.length, false, 0);
    fail("Exception should have been thrown");
  } catch (IOException e) {
    assertTrue(e instanceof ChecksumException);
  }

  // verify valid data is copied properly
  List<KVPair> data = KVDataGen.generateTestData(true, 0);
  Writer writer = writeTestFile(false, false, data, codec);
  baos.reset();
  // Close the opened file stream even if the copy throws, so the test does not leak it.
  java.io.InputStream fileIn = localFs.open(outputPath);
  try {
    IFile.Reader.readToDisk(baos, fileIn, writer.getCompressedLength(), false, 0);
  } finally {
    fileIn.close();
  }

  byte[] diskData = baos.toByteArray();
  Reader reader = new Reader(new ByteArrayInputStream(diskData), diskData.length, codec, null,
      null, false, 0, 1024);
  try {
    verifyData(reader, data);
  } finally {
    // Release the reader even when verification fails.
    reader.close();
  }
}
use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
the class TestIFile method testAppendValueWithDataInputBuffer.
/**
 * Test appendValue with DataInputBuffer.
 *
 * <p>Writes generated key/value pairs through an {@code IFile.Writer}: a pair whose key compares
 * equal to the previously seen key is written with {@code appendValue(value)}, any other pair with
 * {@code append(key, value)}. The written file is then checked via {@code readAndVerifyData}.
 *
 * @throws IOException if writing or reading the test file fails
 */
@Test(timeout = 5000)
public void testAppendValueWithDataInputBuffer() throws IOException {
  List<KVPair> data = KVDataGen.generateTestData(false, rnd.nextInt(100));
  IFile.Writer writer = new IFile.Writer(defaultConf, localFs, outputPath, Text.class,
      IntWritable.class, codec, null, null);

  // previousKey is a final, freshly constructed buffer, so it can never be null here.
  final DataInputBuffer previousKey = new DataInputBuffer();
  DataInputBuffer key = new DataInputBuffer();
  DataInputBuffer value = new DataInputBuffer();
  for (KVPair kvp : data) {
    populateData(kvp, key, value);

    if (BufferUtils.compare(key, previousKey) == 0) {
      // Same key as the previous record: only the value is appended.
      writer.appendValue(value);
    } else {
      writer.append(key, value);
    }
    // NOTE(review): `k` is not declared in this method; presumably a class-level buffer that
    // populateData fills with the serialized key -- confirm against the enclosing class.
    previousKey.reset(k.getData(), 0, k.getLength());
  }

  writer.close();
  readAndVerifyData(writer.getRawLength(), writer.getCompressedLength(), data, codec);
}
use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
the class TestMRCombiner method testRunNewCombiner.
/**
 * Runs an {@code MRCombiner} configured with {@code NewReducer} (registered under
 * {@code MRJobConfig.COMBINE_CLASS_ATTR}, with "mapred.mapper.new-api" enabled) over a
 * {@code TezRawKeyValueIteratorTest} input and checks the combine record counters: 6 input
 * records and 3 output records. The mocked writer is then handed to {@code verifyKeyAndValues}.
 */
@Test
public void testRunNewCombiner() throws IOException, InterruptedException {
  // Configuration selecting the new-API combiner class.
  final TezConfiguration tezConf = new TezConfiguration();
  setKeyAndValueClassTypes(tezConf);
  tezConf.setBoolean("mapred.mapper.new-api", true);
  tezConf.setClass(MRJobConfig.COMBINE_CLASS_ATTR, NewReducer.class, Object.class);

  // Run the combiner against a mocked writer.
  final TaskContext context = getTaskContext(tezConf);
  final MRCombiner mrCombiner = new MRCombiner(context);
  final Writer mockedWriter = Mockito.mock(Writer.class);
  mrCombiner.combine(new TezRawKeyValueIteratorTest(), mockedWriter);

  // Counters are read after the combine call has finished.
  final long combineInputCount =
      context.getCounters().findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
  final long combineOutputCount =
      context.getCounters().findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
  assertEquals(6, combineInputCount);
  assertEquals(3, combineOutputCount);

  // verify combiner output keys and values
  verifyKeyAndValues(mockedWriter);
}
use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
the class TestMRCombiner method testRunOldCombiner.
/**
 * Runs an {@code MRCombiner} configured with {@code OldReducer} (registered under
 * "mapred.combiner.class") over a {@code TezRawKeyValueIteratorTest} input and checks the combine
 * record counters: 6 input records and 3 output records. The mocked writer is then handed to
 * {@code verifyKeyAndValues}.
 */
@Test
public void testRunOldCombiner() throws IOException, InterruptedException {
  // Configuration selecting the old-API combiner class.
  final TezConfiguration tezConf = new TezConfiguration();
  setKeyAndValueClassTypes(tezConf);
  tezConf.setClass("mapred.combiner.class", OldReducer.class, Object.class);

  // Run the combiner against a mocked writer.
  final TaskContext context = getTaskContext(tezConf);
  final MRCombiner mrCombiner = new MRCombiner(context);
  final Writer mockedWriter = Mockito.mock(Writer.class);
  mrCombiner.combine(new TezRawKeyValueIteratorTest(), mockedWriter);

  // Counters are read after the combine call has finished.
  final long combineInputCount =
      context.getCounters().findCounter(TaskCounter.COMBINE_INPUT_RECORDS).getValue();
  final long combineOutputCount =
      context.getCounters().findCounter(TaskCounter.COMBINE_OUTPUT_RECORDS).getValue();
  assertEquals(6, combineInputCount);
  assertEquals(3, combineOutputCount);

  // verify combiner output keys and values
  verifyKeyAndValues(mockedWriter);
}
use of org.apache.tez.runtime.library.common.sort.impl.IFile.Writer in project tez by apache.
the class DefaultSorter method mergeParts.
/**
 * Turns the spill files written so far into the sorter's final output.
 *
 * <p>The method handles these cases in order:
 * <ul>
 *   <li><b>Exactly one spill:</b> with final merge enabled, the spill file is renamed to the final
 *       output file and the index is either renamed or written from the cached record; with final
 *       merge disabled, the cached index is written to a spill index file and a spill event is
 *       passed to {@code maybeSendEventForSpill}. Returns without merging.</li>
 *   <li><b>Several spills, final merge disabled:</b> remaining index files are loaded into
 *       {@code indexCacheList}, {@code maybeAddEventsForSpills()} is called, and the method
 *       returns.</li>
 *   <li><b>No spills:</b> a placeholder output file and index are written, one index record per
 *       partition (an empty {@code Writer} segment is emitted per partition unless
 *       {@code sendEmptyPartitionDetails} is set).</li>
 *   <li><b>Several spills, final merge enabled:</b> for every partition the matching segments of
 *       all spills are merged through {@code TezMerger} (optionally through the combiner) into one
 *       output file, the index is written, and the spill files are deleted.</li>
 * </ul>
 *
 * <p>The final output stream is closed in a {@code finally} block in both writing branches, so a
 * failure during merge, combine or index write does not leave the stream open.
 *
 * @throws IOException if any file-system or merge operation fails
 * @throws InterruptedException declared for the merge/combine calls made by this method
 */
private void mergeParts() throws IOException, InterruptedException {
  // get the approximate size of the final output/index files
  long finalOutFileSize = 0;
  long finalIndexFileSize = 0;
  final Path[] filename = new Path[numSpills];
  final String taskIdentifier = outputContext.getUniqueIdentifier();

  for (int i = 0; i < numSpills; i++) {
    filename[i] = spillFilePaths.get(i);
    finalOutFileSize += rfs.getFileStatus(filename[i]).getLen();
  }

  if (numSpills == 1) {
    // the spill is the final output
    TezSpillRecord spillRecord = null;
    if (isFinalMergeEnabled()) {
      finalOutputFile = mapOutputFile.getOutputFileForWriteInVolume(filename[0]);
      finalIndexFile = mapOutputFile.getOutputIndexFileForWriteInVolume(filename[0]);
      sameVolRename(filename[0], finalOutputFile);
      if (indexCacheList.size() == 0) {
        // Index was not cached: rename the on-disk index and load it from its new location.
        sameVolRename(spillFileIndexPaths.get(0), finalIndexFile);
        spillRecord = new TezSpillRecord(finalIndexFile, conf);
      } else {
        spillRecord = indexCacheList.get(0);
        spillRecord.writeToFile(finalIndexFile, conf);
      }
    } else {
      List<Event> events = Lists.newLinkedList();
      // Since there is only one spill, spill record would be present in cache.
      spillRecord = indexCacheList.get(0);
      Path indexPath = mapOutputFile.getSpillIndexFileForWrite(numSpills - 1,
          partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH);
      spillRecord.writeToFile(indexPath, conf);
      maybeSendEventForSpill(events, true, spillRecord, 0, true);
      fileOutputByteCounter.increment(rfs.getFileStatus(spillFilePaths.get(0)).getLen());
      // No need to populate finalIndexFile, finalOutputFile etc when finalMerge is disabled
    }
    if (spillRecord != null && reportPartitionStats()) {
      for (int i = 0; i < spillRecord.size(); i++) {
        partitionStats[i] += spillRecord.getIndex(i).getPartLength();
      }
    }
    numShuffleChunks.setValue(numSpills);
    return;
  }

  // read in paged indices
  for (int i = indexCacheList.size(); i < numSpills; ++i) {
    Path indexFileName = spillFileIndexPaths.get(i);
    indexCacheList.add(new TezSpillRecord(indexFileName, conf));
  }

  // Check if it is needed to do final merge. Or else, exit early.
  if (numSpills > 0 && !isFinalMergeEnabled()) {
    maybeAddEventsForSpills();
    // No need to do final merge.
    return;
  }

  // make correction in the length to include the sequence file header
  // lengths for each partition
  finalOutFileSize += partitions * APPROX_HEADER_LENGTH;
  finalIndexFileSize = partitions * MAP_OUTPUT_INDEX_RECORD_LENGTH;

  if (isFinalMergeEnabled()) {
    finalOutputFile = mapOutputFile.getOutputFileForWrite(finalOutFileSize);
    finalIndexFile = mapOutputFile.getOutputIndexFileForWrite(finalIndexFileSize);
  } else if (numSpills == 0) {
    // e.g attempt_1424502260528_0119_1_07_000058_0_10012_0/file.out when final merge is
    // disabled
    finalOutputFile = mapOutputFile.getSpillFileForWrite(numSpills, finalOutFileSize);
    finalIndexFile = mapOutputFile.getSpillIndexFileForWrite(numSpills, finalIndexFileSize);
  }

  // The output stream for the final single output file
  FSDataOutputStream finalOut = rfs.create(finalOutputFile, true, 4096);
  if (!SPILL_FILE_PERMS.equals(SPILL_FILE_PERMS.applyUMask(FsPermission.getUMask(conf)))) {
    rfs.setPermission(finalOutputFile, SPILL_FILE_PERMS);
  }

  if (numSpills == 0) {
    // TODO Change event generation to say there is no data rather than generating a dummy file
    // create dummy files
    long rawLength = 0;
    long partLength = 0;
    TezSpillRecord sr = new TezSpillRecord(partitions);
    try {
      for (int i = 0; i < partitions; i++) {
        long segmentStart = finalOut.getPos();
        if (!sendEmptyPartitionDetails) {
          Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec, null, null);
          writer.close();
          rawLength = writer.getRawLength();
          partLength = writer.getCompressedLength();
        }
        TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
        // Covers the case of multiple spills.
        outputBytesWithOverheadCounter.increment(rawLength);
        sr.putIndex(rec, i);
      }
      sr.writeToFile(finalIndexFile, conf);
    } finally {
      finalOut.close();
    }
    ++numSpills;
    if (!isFinalMergeEnabled()) {
      List<Event> events = Lists.newLinkedList();
      maybeSendEventForSpill(events, true, sr, 0, true);
      fileOutputByteCounter.increment(rfs.getFileStatus(finalOutputFile).getLen());
    }
    numShuffleChunks.setValue(numSpills);
    return;
  } else {
    final TezSpillRecord spillRec = new TezSpillRecord(partitions);
    // Mirror the numSpills == 0 branch: finalOut is closed even if merging, combining or writing
    // the index throws.
    try {
      for (int parts = 0; parts < partitions; parts++) {
        boolean shouldWrite = false;
        // create the segments to be merged
        List<Segment> segmentList = new ArrayList<Segment>(numSpills);
        for (int i = 0; i < numSpills; i++) {
          outputContext.notifyProgress();
          TezIndexRecord indexRecord = indexCacheList.get(i).getIndex(parts);
          if (indexRecord.hasData() || !sendEmptyPartitionDetails) {
            shouldWrite = true;
            DiskSegment s = new DiskSegment(rfs, filename[i], indexRecord.getStartOffset(),
                indexRecord.getPartLength(), codec, ifileReadAhead, ifileReadAheadLength,
                ifileBufferSize, true);
            segmentList.add(s);
          }
          if (LOG.isDebugEnabled()) {
            LOG.debug(outputContext.getDestinationVertexName() + ": " + "TaskIdentifier=" + taskIdentifier + " Partition=" + parts + "Spill =" + i + "(" + indexRecord.getStartOffset() + "," + indexRecord.getRawLength() + ", " + indexRecord.getPartLength() + ")");
          }
        }

        int mergeFactor = this.conf.getInt(TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR,
            TezRuntimeConfiguration.TEZ_RUNTIME_IO_SORT_FACTOR_DEFAULT);
        // sort the segments only if there are intermediate merges
        boolean sortSegments = segmentList.size() > mergeFactor;
        // merge
        TezRawKeyValueIterator kvIter = TezMerger.merge(conf, rfs, keyClass, valClass, codec,
            segmentList, mergeFactor, new Path(taskIdentifier),
            (RawComparator) ConfigUtils.getIntermediateOutputKeyComparator(conf), progressable,
            sortSegments, true, null, spilledRecordsCounter, additionalSpillBytesRead,
            // Not using any Progress in TezMerger. Should just work.
            null);

        // write merged output to disk
        long segmentStart = finalOut.getPos();
        long rawLength = 0;
        long partLength = 0;
        if (shouldWrite) {
          Writer writer = new Writer(conf, finalOut, keyClass, valClass, codec,
              spilledRecordsCounter, null);
          if (combiner == null || numSpills < minSpillsForCombine) {
            TezMerger.writeFile(kvIter, writer, progressable,
                TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
          } else {
            runCombineProcessor(kvIter, writer);
          }
          writer.close();
          rawLength = writer.getRawLength();
          partLength = writer.getCompressedLength();
        }
        outputBytesWithOverheadCounter.increment(rawLength);

        // record offsets
        final TezIndexRecord rec = new TezIndexRecord(segmentStart, rawLength, partLength);
        spillRec.putIndex(rec, parts);
        if (reportPartitionStats()) {
          partitionStats[parts] += partLength;
        }
      }
      // final merge has happened
      numShuffleChunks.setValue(1);
      spillRec.writeToFile(finalIndexFile, conf);
    } finally {
      finalOut.close();
    }
    // Spill files are removed only after the merged output and index were written successfully.
    for (int i = 0; i < numSpills; i++) {
      rfs.delete(filename[i], true);
    }
  }
}
Aggregations