Use of org.apache.hadoop.io.RawComparator in project hadoop by apache.
The class TotalOrderPartitioner, method setConf.
/**
* Read in the partition file and build indexing data structures.
* If the keytype is {@link org.apache.hadoop.io.BinaryComparable} and
* <tt>total.order.partitioner.natural.order</tt> is not false, a trie
* of the first <tt>total.order.partitioner.max.trie.depth</tt>(2) + 1 bytes
* will be built. Otherwise, keys will be located using a binary search of
* the partition keyset using the {@link org.apache.hadoop.io.RawComparator}
* defined for this job. The input file must be sorted with the same
* comparator and contain {@link Job#getNumReduceTasks()} - 1 keys.
*/
// keytype from conf not static
@SuppressWarnings("unchecked")
public void setConf(Configuration conf) {
  try {
    this.conf = conf;
    String parts = getPartitionFile(conf);
    final Path partFile = new Path(parts);
    final FileSystem fs = (DEFAULT_PATH.equals(parts))
        ? FileSystem.getLocal(conf)      // assume in DistributedCache
        : partFile.getFileSystem(conf);
    Job job = Job.getInstance(conf);
    Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
    K[] splitPoints = readPartitions(fs, partFile, keyClass, conf);
    if (splitPoints.length != job.getNumReduceTasks() - 1) {
      throw new IOException("Wrong number of partitions in keyset");
    }
    RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
    for (int i = 0; i < splitPoints.length - 1; ++i) {
      if (comparator.compare(splitPoints[i], splitPoints[i + 1]) >= 0) {
        throw new IOException("Split points are out of order");
      }
    }
    boolean natOrder = conf.getBoolean(NATURAL_ORDER, true);
    if (natOrder && BinaryComparable.class.isAssignableFrom(keyClass)) {
      partitions = buildTrie((BinaryComparable[]) splitPoints, 0,
          splitPoints.length, new byte[0],
          // limit large but not huge.
          conf.getInt(MAX_TRIE_DEPTH, 200));
    } else {
      partitions = new BinarySearchNode(splitPoints, comparator);
    }
  } catch (IOException e) {
    throw new IllegalArgumentException("Can't read partitions file", e);
  }
}
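
For context, the partition file that setConf reads is registered on the job side. A minimal wiring sketch, assuming the modern mapreduce API and a Text key (class name, key type, and tuning values are illustrative, not taken from the snippet above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderWiringSketch {
  public static void configure(Job job, Path partitionFile) {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    // A BinaryComparable key such as Text enables the trie lookup path.
    job.setMapOutputKeyClass(Text.class);
    // Point the partitioner at a file holding getNumReduceTasks() - 1 keys,
    // sorted with the job's sort comparator.
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    // Optional knobs read by setConf above (config-key constants assumed public).
    conf.setBoolean(TotalOrderPartitioner.NATURAL_ORDER, true);
    conf.setInt(TotalOrderPartitioner.MAX_TRIE_DEPTH, 200);
  }
}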
Use of org.apache.hadoop.io.RawComparator in project hadoop by apache.
The class MergeManagerImpl, method finalMerge.
private RawKeyValueIterator finalMerge(JobConf job, FileSystem fs,
    List<InMemoryMapOutput<K, V>> inMemoryMapOutputs,
    List<CompressAwarePath> onDiskMapOutputs) throws IOException {
  LOG.info("finalMerge called with " + inMemoryMapOutputs.size()
      + " in-memory map-outputs and " + onDiskMapOutputs.size()
      + " on-disk map-outputs");
  final long maxInMemReduce = getMaxInMemReduceLimit();
  // merge config params
  Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
  Class<V> valueClass = (Class<V>) job.getMapOutputValueClass();
  boolean keepInputs = job.getKeepFailedTaskFiles();
  final Path tmpDir = new Path(reduceId.toString());
  final RawComparator<K> comparator =
      (RawComparator<K>) job.getOutputKeyComparator();
  // segments required to vacate memory
  List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>();
  long inMemToDiskBytes = 0;
  boolean mergePhaseFinished = false;
  if (inMemoryMapOutputs.size() > 0) {
    TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
    inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs,
        memDiskSegments, maxInMemReduce);
    final int numMemDiskSegments = memDiskSegments.size();
    if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
      // If we reach here, it implies that we have less than io.sort.factor
      // disk segments and this will be incremented by 1 (result of the
      // memory segments merge). Since this total would still be
      // <= io.sort.factor, we will not do any more intermediate merges,
      // the merge of all these disk segments would be directly fed to the
      // reduce method
      mergePhaseFinished = true;
      // must spill to disk, but can't retain in-mem for intermediate merge
      final Path outputPath = mapOutputFile.getInputFileForWrite(mapId,
          inMemToDiskBytes).suffix(Task.MERGED_OUTPUT_PREFIX);
      final RawKeyValueIterator rIter = Merger.merge(job, fs, keyClass,
          valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator,
          reporter, spilledRecordsCounter, null, mergePhase);
      FSDataOutputStream out =
          CryptoUtils.wrapIfNecessary(job, fs.create(outputPath));
      Writer<K, V> writer =
          new Writer<K, V>(job, out, keyClass, valueClass, codec, null, true);
      try {
        Merger.writeFile(rIter, writer, reporter, job);
        writer.close();
        onDiskMapOutputs.add(new CompressAwarePath(outputPath,
            writer.getRawLength(), writer.getCompressedLength()));
        writer = null;
        // add to list of final disk outputs.
      } catch (IOException e) {
        if (null != outputPath) {
          try {
            fs.delete(outputPath, true);
          } catch (IOException ie) {
            // NOTHING
          }
        }
        throw e;
      } finally {
        if (null != writer) {
          writer.close();
        }
      }
      LOG.info("Merged " + numMemDiskSegments + " segments, "
          + inMemToDiskBytes + " bytes to disk to satisfy "
          + "reduce memory limit");
      inMemToDiskBytes = 0;
      memDiskSegments.clear();
    } else if (inMemToDiskBytes != 0) {
      LOG.info("Keeping " + numMemDiskSegments + " segments, "
          + inMemToDiskBytes + " bytes in memory for "
          + "intermediate, on-disk merge");
    }
  }
  // segments on disk
  List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>();
  long onDiskBytes = inMemToDiskBytes;
  long rawBytes = inMemToDiskBytes;
  CompressAwarePath[] onDisk = onDiskMapOutputs.toArray(
      new CompressAwarePath[onDiskMapOutputs.size()]);
  for (CompressAwarePath file : onDisk) {
    long fileLength = fs.getFileStatus(file).getLen();
    onDiskBytes += fileLength;
    rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength()
                                              : fileLength;
    LOG.debug("Disk file: " + file + " Length is " + fileLength);
    diskSegments.add(new Segment<K, V>(job, fs, file, codec, keepInputs,
        (file.toString().endsWith(Task.MERGED_OUTPUT_PREFIX)
            ? null : mergedMapOutputsCounter), file.getRawDataLength()));
  }
  LOG.info("Merging " + onDisk.length + " files, "
      + onDiskBytes + " bytes from disk");
  Collections.sort(diskSegments, new Comparator<Segment<K, V>>() {
    public int compare(Segment<K, V> o1, Segment<K, V> o2) {
      if (o1.getLength() == o2.getLength()) {
        return 0;
      }
      return o1.getLength() < o2.getLength() ? -1 : 1;
    }
  });
  // build final list of segments from merged backed by disk + in-mem
  List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>();
  long inMemBytes = createInMemorySegments(inMemoryMapOutputs,
      finalSegments, 0);
  LOG.info("Merging " + finalSegments.size() + " segments, "
      + inMemBytes + " bytes from memory into reduce");
  if (0 != onDiskBytes) {
    final int numInMemSegments = memDiskSegments.size();
    diskSegments.addAll(0, memDiskSegments);
    memDiskSegments.clear();
    // Pass mergePhase only if there is going to be intermediate
    // merges. See comment where mergePhaseFinished is being set
    Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
    RawKeyValueIterator diskMerge = Merger.merge(job, fs, keyClass, valueClass,
        codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir,
        comparator, reporter, false, spilledRecordsCounter, null, thisPhase);
    diskSegments.clear();
    if (0 == finalSegments.size()) {
      return diskMerge;
    }
    finalSegments.add(new Segment<K, V>(
        new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
  }
  return Merger.merge(job, fs, keyClass, valueClass, finalSegments,
      finalSegments.size(), tmpDir, comparator, reporter,
      spilledRecordsCounter, null, null);
}
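
A minimal sketch, assuming the old mapred API that MergeManagerImpl builds on: the comparator driving the merges above is whatever JobConf.getOutputKeyComparator() resolves to, which a job may override explicitly (Text.Comparator is an illustrative choice):

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;

public class MergeComparatorSketch {
  public static RawComparator<?> resolveComparator() {
    JobConf job = new JobConf();
    job.setMapOutputKeyClass(Text.class);
    // Explicit override; if omitted, the WritableComparator registered for
    // the map output key class is returned instead.
    job.setOutputKeyComparatorClass(Text.Comparator.class);
    return job.getOutputKeyComparator();
  }
}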
Use of org.apache.hadoop.io.RawComparator in project hadoop by apache.
The class MergeManagerImpl, method combineAndSpill.
private void combineAndSpill(RawKeyValueIterator kvIter,
    Counters.Counter inCounter) throws IOException {
  JobConf job = jobConf;
  Reducer combiner = ReflectionUtils.newInstance(combinerClass, job);
  Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
  Class<V> valClass = (Class<V>) job.getMapOutputValueClass();
  RawComparator<K> comparator =
      (RawComparator<K>) job.getCombinerKeyGroupingComparator();
  try {
    CombineValuesIterator values = new CombineValuesIterator(kvIter,
        comparator, keyClass, valClass, job, Reporter.NULL, inCounter);
    while (values.more()) {
      combiner.reduce(values.getKey(), values, combineCollector, Reporter.NULL);
      values.nextKey();
    }
  } finally {
    combiner.close();
  }
}
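
A minimal sketch of the job-side configuration that combineAndSpill reads back. LongSumCombiner is a hypothetical combiner written only for illustration, and setCombinerKeyGroupingComparator is assumed to be the setter paired with the getter used above; when it is not set, the job's default grouping comparator applies:

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class CombinerConfigSketch {

  /** Illustrative combiner: sums the values seen for each key. */
  public static class LongSumCombiner extends MapReduceBase
      implements Reducer<Text, LongWritable, Text, LongWritable> {
    public void reduce(Text key, Iterator<LongWritable> values,
        OutputCollector<Text, LongWritable> out, Reporter reporter)
        throws IOException {
      long sum = 0;
      while (values.hasNext()) {
        sum += values.next().get();
      }
      out.collect(key, new LongWritable(sum));
    }
  }

  public static void configure(JobConf job) {
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setCombinerClass(LongSumCombiner.class);
    // Assumed setter matching getCombinerKeyGroupingComparator() above.
    job.setCombinerKeyGroupingComparator(Text.Comparator.class);
  }
}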
Use of org.apache.hadoop.io.RawComparator in project hadoop by apache.
The class InputSampler, method writePartitionFile.
/**
* Write a partition file for the given job, using the Sampler provided.
* Queries the sampler for a sample keyset, sorts by the output key
* comparator, selects the keys for each rank, and writes to the destination
* returned from {@link TotalOrderPartitioner#getPartitionFile}.
*/
// getInputFormat, getOutputKeyComparator
@SuppressWarnings("unchecked")
public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
    throws IOException, ClassNotFoundException, InterruptedException {
  Configuration conf = job.getConfiguration();
  final InputFormat inf =
      ReflectionUtils.newInstance(job.getInputFormatClass(), conf);
  int numPartitions = job.getNumReduceTasks();
  K[] samples = (K[]) sampler.getSample(inf, job);
  LOG.info("Using " + samples.length + " samples");
  RawComparator<K> comparator = (RawComparator<K>) job.getSortComparator();
  Arrays.sort(samples, comparator);
  Path dst = new Path(TotalOrderPartitioner.getPartitionFile(conf));
  FileSystem fs = dst.getFileSystem(conf);
  fs.delete(dst, false);
  SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, dst,
      job.getMapOutputKeyClass(), NullWritable.class);
  NullWritable nullValue = NullWritable.get();
  float stepSize = samples.length / (float) numPartitions;
  int last = -1;
  for (int i = 1; i < numPartitions; ++i) {
    int k = Math.round(stepSize * i);
    while (last >= k && comparator.compare(samples[last], samples[k]) == 0) {
      ++k;
    }
    writer.append(samples[k], nullValue);
    last = k;
  }
  writer.close();
}
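
A minimal usage sketch of the method above, assuming Text map output keys; the sampler parameters and key/value types are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class PartitionFileSketch {
  public static void samplePartitions(Job job, Path partitionFile)
      throws Exception {
    Configuration conf = job.getConfiguration();
    TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
    // Pick roughly 1% of records, up to 10,000 samples from at most 10 splits.
    InputSampler.Sampler<Text, NullWritable> sampler =
        new InputSampler.RandomSampler<Text, NullWritable>(0.01, 10000, 10);
    // Sorts the sample with the job's sort comparator and writes
    // getNumReduceTasks() - 1 keys to the registered partition file.
    InputSampler.writePartitionFile(job, sampler);
  }
}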
Use of org.apache.hadoop.io.RawComparator in project tez by apache.
The class TestTezMerger, method testWithCustomComparator_RLE_acrossFiles.
@Test(timeout = 5000)
public void testWithCustomComparator_RLE_acrossFiles() throws Exception {
  List<Path> pathList = new LinkedList<Path>();
  List<String> data = Lists.newLinkedList();
  LOG.info("Test with custom comparator with RLE spanning across segment boundaries");
  // Test with 2 files, where the RLE keys can span across files
  // First file
  data.clear();
  data.add("0");
  data.add("0");
  pathList.add(createIFileWithTextData(data));
  // Second file
  data.clear();
  data.add("0");
  data.add("1");
  pathList.add(createIFileWithTextData(data));
  // Merge datasets with custom comparator
  RawComparator rc = new CustomComparator();
  TezRawKeyValueIterator records = merge(pathList, rc);
  // expected result
  String[][] expectedResult = { // formatting intentionally
      { "0", DIFF_KEY }, { "0", SAME_KEY }, { "0", SAME_KEY }, { "1", DIFF_KEY } };
  verify(records, expectedResult);
  pathList.clear();
  data.clear();
}
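
The CustomComparator used by this test is not shown in the snippet. As a hypothetical illustration of the shape such a comparator can take, a byte-level RawComparator for Text keys could be built on WritableComparator like this:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

public class CustomComparatorSketch extends WritableComparator {

  public CustomComparatorSketch() {
    super(Text.class);
  }

  @Override
  public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
    // Skip the vint that encodes each Text length, then compare the raw bytes.
    int n1 = WritableUtils.decodeVIntSize(b1[s1]);
    int n2 = WritableUtils.decodeVIntSize(b2[s2]);
    return compareBytes(b1, s1 + n1, l1 - n1, b2, s2 + n2, l2 - n2);
  }
}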