Use of org.apache.hadoop.io.RawComparator in project tez by apache.
From the class TestTezMerger, the method testWithCustomComparator_mixedFiles:
@Test(timeout = 5000)
public void testWithCustomComparator_mixedFiles() throws Exception {
  List<Path> pathList = new LinkedList<Path>();
  List<String> data = Lists.newLinkedList();
  LOG.info("Test with custom comparator with mixed set of segments (empty, non-empty etc)");
  // Test with 4 files, where the RLE keys can span across files
  // First file
  data.clear();
  data.add("0");
  pathList.add(createIFileWithTextData(data));
  // Second file; empty file
  data.clear();
  pathList.add(createIFileWithTextData(data));
  // Third file with empty key
  data.clear();
  data.add("");
  pathList.add(createIFileWithTextData(data));
  // Fourth file with repeated keys
  data.clear();
  data.add("0");
  data.add("0");
  data.add("0");
  pathList.add(createIFileWithTextData(data));
  // Merge datasets with custom comparator
  RawComparator rc = new CustomComparator();
  TezRawKeyValueIterator records = merge(pathList, rc);
  // expected result
  String[][] expectedResult = { // formatting intentionally
      { "", DIFF_KEY },
      { "0", DIFF_KEY },
      { "0", SAME_KEY },
      { "0", SAME_KEY },
      { "0", SAME_KEY }
  };
  verify(records, expectedResult);
  pathList.clear();
  data.clear();
}
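The CustomComparator used in these tests is not shown on this page; the SAME_KEY/DIFF_KEY expectations suggest the merged iterator flags whether each record's key equals the previous one, which is what the RLE optimization mentioned in the comment relies on. As a hedged sketch only, a byte-level comparator for Text keys could look like the following; the class name and the assumption that the test keys are Text are illustrative, not the project's actual implementation.

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

// Hypothetical byte-level comparator for Text keys (sketch only).
public class CustomComparator implements RawComparator<Text> {

  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    // Text is serialized as a vInt length followed by UTF-8 bytes; skip the
    // length prefix on both sides and compare the remaining bytes.
    int n1 = WritableUtils.decodeVIntSize(b1[s1]);
    int n2 = WritableUtils.decodeVIntSize(b2[s2]);
    return WritableComparator.compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
  }

  @Override
  public int compare(Text o1, Text o2) {
    return o1.compareTo(o2);
  }
}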
Use of org.apache.hadoop.io.RawComparator in project tez by apache.
From the class TestTezMerger, the method testWithCustomComparator_No_RLE:
@Test(timeout = 5000)
public void testWithCustomComparator_No_RLE() throws Exception {
  List<Path> pathList = new LinkedList<Path>();
  List<String> data = Lists.newLinkedList();
  // Merge datasets with custom comparator
  RawComparator rc = new CustomComparator();
  LOG.info("Test with custom comparator with no RLE");
  // Test with 3 files
  data.add("1");
  data.add("4");
  data.add("5");
  pathList.add(createIFileWithTextData(data));
  // Second file
  data.clear();
  data.add("2");
  data.add("6");
  data.add("7");
  pathList.add(createIFileWithTextData(data));
  // Third file
  data.clear();
  data.add("3");
  data.add("8");
  data.add("9");
  pathList.add(createIFileWithTextData(data));
  TezRawKeyValueIterator records = merge(pathList, rc);
  String[][] expectedResult = {
      { "1", DIFF_KEY }, { "2", DIFF_KEY }, { "3", DIFF_KEY },
      { "4", DIFF_KEY }, { "5", DIFF_KEY }, { "6", DIFF_KEY },
      { "7", DIFF_KEY }, { "8", DIFF_KEY }, { "9", DIFF_KEY }
  };
  verify(records, expectedResult);
  pathList.clear();
  data.clear();
}
Use of org.apache.hadoop.io.RawComparator in project tez by apache.
From the class OrderedGroupedKVInput, the method createValuesIterator:
@SuppressWarnings({ "rawtypes", "unchecked" })
protected synchronized void createValuesIterator() throws IOException {
  // Not used by ReduceProcessor
  RawComparator rawComparator = ConfigUtils.getIntermediateInputKeyComparator(conf);
  Class<?> keyClass = ConfigUtils.getIntermediateInputKeyClass(conf);
  Class<?> valClass = ConfigUtils.getIntermediateInputValueClass(conf);
  LOG.info(getContext().getSourceVertexName() + ": " + "creating ValuesIterator with "
      + "comparator=" + rawComparator.getClass().getName()
      + ", keyClass=" + keyClass.getName()
      + ", valClass=" + valClass.getName());
  vIter = new ValuesIterator(rawIter, rawComparator, keyClass, valClass, conf,
      inputKeyCounter, inputValueCounter);
}
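ConfigUtils.getIntermediateInputKeyComparator resolves the comparator class from the input's configuration (typically falling back to the key class's default WritableComparator), so a custom RawComparator can be plugged in through the runtime configuration. A minimal sketch follows; the property name used here is an assumption, so verify it against TezRuntimeConfiguration.TEZ_RUNTIME_KEY_COMPARATOR_CLASS for the Tez version in use.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;

// Sketch only: registers a custom RawComparator for the intermediate input keys.
// The property name is an assumption; check TezRuntimeConfiguration for your version.
public class ComparatorConfigExample {
  static Configuration withCustomComparator(Configuration conf,
      Class<? extends RawComparator> comparatorClass) {
    conf.setClass("tez.runtime.key.comparator.class", comparatorClass, RawComparator.class);
    return conf;
  }
}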
Use of org.apache.hadoop.io.RawComparator in project tez by apache.
From the class ReduceProcessor, the method run:
@Override
public void run(Map<String, LogicalInput> _inputs, Map<String, LogicalOutput> _outputs)
    throws Exception {
  this.inputs = _inputs;
  this.outputs = _outputs;
  progressHelper = new ProgressHelper(this.inputs, processorContext,
      this.getClass().getSimpleName());
  LOG.info("Running reduce: " + processorContext.getUniqueIdentifier());
  if (_outputs.size() <= 0 || _outputs.size() > 1) {
    throw new IOException("Invalid number of _outputs" + ", outputCount=" + _outputs.size());
  }
  if (_inputs.size() <= 0 || _inputs.size() > 1) {
    throw new IOException("Invalid number of _inputs" + ", inputCount=" + _inputs.size());
  }
  LogicalInput in = _inputs.values().iterator().next();
  in.start();
  List<Input> pendingInputs = new LinkedList<Input>();
  pendingInputs.add(in);
  processorContext.waitForAllInputsReady(pendingInputs);
  LOG.info("Input is ready for consumption. Starting Output");
  LogicalOutput out = _outputs.values().iterator().next();
  out.start();
  initTask(out);
  progressHelper.scheduleProgressTaskService(0, 100);
  this.statusUpdate();
  Class keyClass = ConfigUtils.getIntermediateInputKeyClass(jobConf);
  Class valueClass = ConfigUtils.getIntermediateInputValueClass(jobConf);
  LOG.info("Using keyClass: " + keyClass);
  LOG.info("Using valueClass: " + valueClass);
  RawComparator comparator = ConfigUtils.getInputKeySecondaryGroupingComparator(jobConf);
  LOG.info("Using comparator: " + comparator);
  reduceInputKeyCounter = mrReporter.getCounter(TaskCounter.REDUCE_INPUT_GROUPS);
  reduceInputValueCounter = mrReporter.getCounter(TaskCounter.REDUCE_INPUT_RECORDS);
  // Sanity check
  if (!(in instanceof OrderedGroupedInputLegacy)) {
    throw new IOException("Illegal input to reduce: " + in.getClass());
  }
  OrderedGroupedInputLegacy shuffleInput = (OrderedGroupedInputLegacy) in;
  KeyValuesReader kvReader = shuffleInput.getReader();
  KeyValueWriter kvWriter = null;
  if ((out instanceof MROutputLegacy)) {
    kvWriter = ((MROutputLegacy) out).getWriter();
  } else if ((out instanceof OrderedPartitionedKVOutput)) {
    kvWriter = ((OrderedPartitionedKVOutput) out).getWriter();
  } else {
throw new IOException("Illegal output to reduce: " + in.getClass());
  }
  if (useNewApi) {
    try {
      runNewReducer(jobConf, mrReporter, shuffleInput, comparator, keyClass, valueClass, kvWriter);
    } catch (ClassNotFoundException cnfe) {
      throw new IOException(cnfe);
    }
  } else {
    runOldReducer(jobConf, mrReporter, kvReader, comparator, keyClass, valueClass, kvWriter);
  }
  done();
}
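In the old-API path, runOldReducer drives the user's reducer off the KeyValuesReader obtained from the shuffle input; the secondary-grouping RawComparator selected above determines where one key group ends and the next begins. A rough consumption sketch (illustrative only, not the actual runOldReducer body):

import java.io.IOException;
import org.apache.tez.runtime.library.api.KeyValuesReader;

// Illustrative loop over grouped shuffle data; the real runOldReducer also
// wires in counters, progress reporting and the MapReduce Reducer contract.
public class KeyValuesReaderExample {
  static void drain(KeyValuesReader kvReader) throws IOException {
    while (kvReader.next()) {
      // one key group per next() call
      Object key = kvReader.getCurrentKey();
      for (Object value : kvReader.getCurrentValues()) {
        System.out.println(key + " -> " + value);
      }
    }
  }
}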
Use of org.apache.hadoop.io.RawComparator in project tez by apache.
From the class MergeManager, the method finalMerge:
private TezRawKeyValueIterator finalMerge(Configuration job, FileSystem fs,
    List<MapOutput> inMemoryMapOutputs, List<FileChunk> onDiskMapOutputs)
    throws IOException, InterruptedException {
  logFinalMergeStart(inMemoryMapOutputs, onDiskMapOutputs);
  StringBuilder finalMergeLog = new StringBuilder();
  inputContext.notifyProgress();
  // merge config params
  Class keyClass = (Class) ConfigUtils.getIntermediateInputKeyClass(job);
  Class valueClass = (Class) ConfigUtils.getIntermediateInputValueClass(job);
  final Path tmpDir = new Path(inputContext.getUniqueIdentifier());
  final RawComparator comparator =
      (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(job);
  // segments required to vacate memory
  List<Segment> memDiskSegments = new ArrayList<Segment>();
  long inMemToDiskBytes = 0;
  boolean mergePhaseFinished = false;
  if (inMemoryMapOutputs.size() > 0) {
    int srcTaskId = inMemoryMapOutputs.get(0).getAttemptIdentifier().getInputIdentifier();
    inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments,
        this.postMergeMemLimit);
    final int numMemDiskSegments = memDiskSegments.size();
    if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
      // If we reach here, it implies that we have less than io.sort.factor
      // disk segments and this will be incremented by 1 (result of the
      // memory segments merge). Since this total would still be
      // <= io.sort.factor, we will not do any more intermediate merges,
      // the merge of all these disk segments would be directly fed to the
      // reduce method
      mergePhaseFinished = true;
      // must spill to disk, but can't retain in-mem for intermediate merge
      // Can not use spill id in final merge as it would clobber with other files, hence using
      // Integer.MAX_VALUE
      final Path outputPath = mapOutputFile.getInputFileForWrite(srcTaskId, Integer.MAX_VALUE,
          inMemToDiskBytes).suffix(Constants.MERGED_OUTPUT_PREFIX);
      final TezRawKeyValueIterator rIter = TezMerger.merge(job, fs, keyClass, valueClass,
          memDiskSegments, numMemDiskSegments, tmpDir, comparator, progressable,
          spilledRecordsCounter, null, additionalBytesRead, null);
      final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec,
          null, null);
      try {
        TezMerger.writeFile(rIter, writer, progressable,
            TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
      } catch (IOException e) {
        if (null != outputPath) {
          try {
            fs.delete(outputPath, true);
          } catch (IOException ie) {
            // NOTHING
          }
        }
        throw e;
      } finally {
        if (null != writer) {
          writer.close();
          additionalBytesWritten.increment(writer.getCompressedLength());
        }
      }
      final FileStatus fStatus = localFS.getFileStatus(outputPath);
      // add to list of final disk outputs.
      onDiskMapOutputs.add(new FileChunk(outputPath, 0, fStatus.getLen()));
      if (LOG.isInfoEnabled()) {
        finalMergeLog.append("MemMerged: " + numMemDiskSegments + ", " + inMemToDiskBytes);
        if (LOG.isDebugEnabled()) {
LOG.debug("Merged " + numMemDiskSegments + "segments, size=" + inMemToDiskBytes + " to " + outputPath);
        }
      }
      inMemToDiskBytes = 0;
      memDiskSegments.clear();
    } else if (inMemToDiskBytes != 0) {
      if (LOG.isInfoEnabled()) {
        finalMergeLog.append("DelayedMemMerge: " + numMemDiskSegments + ", " + inMemToDiskBytes);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes
              + " bytes in memory for " + "intermediate, on-disk merge");
        }
      }
    }
  }
  // segments on disk
  List<Segment> diskSegments = new ArrayList<Segment>();
  long onDiskBytes = inMemToDiskBytes;
  FileChunk[] onDisk = onDiskMapOutputs.toArray(new FileChunk[onDiskMapOutputs.size()]);
  for (FileChunk fileChunk : onDisk) {
    final long fileLength = fileChunk.getLength();
    onDiskBytes += fileLength;
    if (LOG.isDebugEnabled()) {
      LOG.debug("Disk file=" + fileChunk.getPath() + ", len=" + fileLength
          + ", isLocal=" + fileChunk.isLocalFile());
    }
    final Path file = fileChunk.getPath();
    TezCounter counter = file.toString().endsWith(Constants.MERGED_OUTPUT_PREFIX)
        ? null : mergedMapOutputsCounter;
    final long fileOffset = fileChunk.getOffset();
    final boolean preserve = fileChunk.isLocalFile();
    diskSegments.add(new DiskSegment(fs, file, fileOffset, fileLength, codec, ifileReadAhead,
        ifileReadAheadLength, ifileBufferSize, preserve, counter));
  }
  if (LOG.isInfoEnabled()) {
    finalMergeLog.append(". DiskSeg: " + onDisk.length + ", " + onDiskBytes);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
    }
  }
  Collections.sort(diskSegments, new Comparator<Segment>() {
    public int compare(Segment o1, Segment o2) {
      if (o1.getLength() == o2.getLength()) {
        return 0;
      }
      return o1.getLength() < o2.getLength() ? -1 : 1;
    }
  });
  // build final list of segments from merged backed by disk + in-mem
  List<Segment> finalSegments = new ArrayList<Segment>();
  long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
  if (LOG.isInfoEnabled()) {
    finalMergeLog.append(". MemSeg: " + finalSegments.size() + ", " + inMemBytes);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Merging " + finalSegments.size() + " segments, " + inMemBytes
          + " bytes from memory into reduce");
    }
  }
  if (0 != onDiskBytes) {
    final int numInMemSegments = memDiskSegments.size();
    diskSegments.addAll(0, memDiskSegments);
    memDiskSegments.clear();
    TezRawKeyValueIterator diskMerge = TezMerger.merge(job, fs, keyClass, valueClass, codec,
        diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, progressable,
        false, spilledRecordsCounter, null, additionalBytesRead, null);
    diskSegments.clear();
    if (0 == finalSegments.size()) {
      return diskMerge;
    }
    finalSegments.add(new Segment(new RawKVIteratorReader(diskMerge, onDiskBytes), null));
  }
  if (LOG.isInfoEnabled()) {
    LOG.info(finalMergeLog.toString());
  }
  // This is doing nothing but creating an iterator over the segments.
  return TezMerger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(),
      tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
}
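The iterator returned from finalMerge hands back each key and value as raw serialized bytes, which is why a byte-level RawComparator is used throughout the merge; callers deserialize the buffers as needed. A rough sketch of draining such an iterator (method names as used elsewhere in the Tez runtime library; treat the snippet as illustrative, not the project's own code):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.tez.runtime.library.common.sort.impl.TezRawKeyValueIterator;

// Illustrative only: counts the raw serialized bytes coming out of a merge.
// Real callers deserialize the buffers with the configured key/value serialization.
public class MergedIteratorExample {
  static long rawBytes(TezRawKeyValueIterator rIter) throws IOException {
    long bytes = 0;
    while (rIter.next()) {
      DataInputBuffer key = rIter.getKey();
      DataInputBuffer value = rIter.getValue();
      bytes += (key.getLength() - key.getPosition())
          + (value.getLength() - value.getPosition());
    }
    rIter.close();
    return bytes;
  }
}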