Use of org.apache.hadoop.io.RawComparator in the Apache Hadoop project: the ReduceTask.run method.
/**
 * Runs the reduce task: registers the copy/sort/reduce progress phases,
 * handles the job-setup / job-cleanup / task-cleanup shortcut tasks, shuffles
 * and merges map outputs through the configured {@code ShuffleConsumerPlugin},
 * and finally invokes the user's reducer via the old or new MapReduce API.
 *
 * @param job       the job configuration for this task
 * @param umbilical RPC channel back to the parent (for status/progress updates)
 * @throws IOException            on shuffle, merge, or output errors
 * @throws InterruptedException   if the task is interrupted
 * @throws ClassNotFoundException if a configured class (e.g. combiner, plugin) cannot be loaded
 */
@Override
@SuppressWarnings("unchecked")
public void run(JobConf job, final TaskUmbilicalProtocol umbilical) throws IOException, InterruptedException, ClassNotFoundException {
// Propagate the skip-records mode into the job so downstream code can see it.
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
// Only a real map/reduce task gets the three progress phases; the
// setup/cleanup shortcut tasks below report progress differently.
if (isMapOrReduce()) {
copyPhase = getProgress().addPhase("copy");
sortPhase = getProgress().addPhase("sort");
reducePhase = getProgress().addPhase("reduce");
}
// start thread that will handle communication with parent
TaskReporter reporter = startReporter(umbilical);
boolean useNewApi = job.getUseNewReducer();
initialize(job, getJobID(), reporter, useNewApi);
// check if it is a cleanupJobTask
// Each of the three shortcut task types runs its dedicated routine and
// returns immediately — no shuffle or reduce happens for them.
if (jobCleanup) {
runJobCleanupTask(umbilical, reporter);
return;
}
if (jobSetup) {
runJobSetupTask(umbilical, reporter);
return;
}
if (taskCleanup) {
runTaskCleanupTask(umbilical, reporter);
return;
}
// Initialize the codec
codec = initCodec();
RawKeyValueIterator rIter = null;
ShuffleConsumerPlugin shuffleConsumerPlugin = null;
// A combiner is optional; only build the combine collector when one is set.
// NOTE(review): this reads from the 'conf' field while the rest of the
// method reads from the 'job' parameter — presumably they are the same
// configuration; confirm against the enclosing class.
Class combinerClass = conf.getCombinerClass();
CombineOutputCollector combineCollector = (null != combinerClass) ? new CombineOutputCollector(reduceCombineOutputCounter, reporter, conf) : null;
// The shuffle implementation is pluggable; Shuffle is the default.
Class<? extends ShuffleConsumerPlugin> clazz = job.getClass(MRConfig.SHUFFLE_CONSUMER_PLUGIN, Shuffle.class, ShuffleConsumerPlugin.class);
shuffleConsumerPlugin = ReflectionUtils.newInstance(clazz, job);
LOG.info("Using ShuffleConsumerPlugin: " + shuffleConsumerPlugin);
// Bundle everything the plugin needs (counters, phases, local dirs, codec,
// optional combiner) into a single context object.
ShuffleConsumerPlugin.Context shuffleContext = new ShuffleConsumerPlugin.Context(getTaskID(), job, FileSystem.getLocal(job), umbilical, super.lDirAlloc, reporter, codec, combinerClass, combineCollector, spilledRecordsCounter, reduceCombineInputCounter, shuffledMapsCounter, reduceShuffleBytes, failedShuffleCounter, mergedMapOutputsCounter, taskStatus, copyPhase, sortPhase, this, mapOutputFile, localMapFiles);
shuffleConsumerPlugin.init(shuffleContext);
// run() performs the copy + merge and hands back an iterator over the
// sorted, merged map outputs.
rIter = shuffleConsumerPlugin.run();
// free up the data structures
mapOutputFilesOnDisk.clear();
// sort is complete
sortPhase.complete();
// Tell the parent we are moving into the reduce phase before starting it.
setPhase(TaskStatus.Phase.REDUCE);
statusUpdate(umbilical);
// Reduce input key/value types are the map output types.
Class keyClass = job.getMapOutputKeyClass();
Class valueClass = job.getMapOutputValueClass();
// The grouping comparator decides which keys share one reduce() call.
RawComparator comparator = job.getOutputValueGroupingComparator();
// Dispatch to the new (mapreduce) or old (mapred) reducer API.
if (useNewApi) {
runNewReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
} else {
runOldReducer(job, umbilical, reporter, rIter, comparator, keyClass, valueClass);
}
// Release plugin resources, then report final completion to the parent.
shuffleConsumerPlugin.close();
done(umbilical, reporter);
}
Use of org.apache.hadoop.io.RawComparator in the Apache Tez project: the TestTezMerger.testWithCustomComparator_WithEmptyStrings method.
@Test(timeout = 5000)
public void testWithCustomComparator_WithEmptyStrings() throws Exception {
  // Merge four IFiles under a custom grouping comparator, where one of the
  // inputs contributes an empty-string key.
  RawComparator comparator = new CustomComparator();
  LOG.info("Test with custom comparator with empty strings in middle");

  List<Path> inputFiles = new LinkedList<Path>();
  List<String> rows = Lists.newLinkedList();

  // File 1: two identical non-empty keys.
  rows.add("0");
  rows.add("0");
  inputFiles.add(createIFileWithTextData(rows));

  // File 2: a single empty key.
  rows.clear();
  rows.add("");
  inputFiles.add(createIFileWithTextData(rows));

  // File 3: more duplicates of the key from file 1.
  rows.clear();
  rows.add("0");
  rows.add("0");
  inputFiles.add(createIFileWithTextData(rows));

  // File 4: two distinct keys.
  rows.clear();
  rows.add("1");
  rows.add("2");
  inputFiles.add(createIFileWithTextData(rows));

  TezRawKeyValueIterator merged = merge(inputFiles, comparator);

  // Expected order: the empty key first, then the four "0" records grouped
  // together, then "1" and "2" each opening a new key group.
  String[][] expected = {
      { "", DIFF_KEY },
      { "0", DIFF_KEY }, { "0", SAME_KEY }, { "0", SAME_KEY }, { "0", SAME_KEY },
      { "1", DIFF_KEY }, { "2", DIFF_KEY }
  };
  verify(merged, expected);

  inputFiles.clear();
  rows.clear();
}
Aggregations