Search in sources :

Example 1 with Writer

use of org.apache.hadoop.mapred.IFile.Writer in project hadoop by apache.

the class MergeManagerImpl method finalMerge.

private RawKeyValueIterator finalMerge(JobConf job, FileSystem fs, List<InMemoryMapOutput<K, V>> inMemoryMapOutputs, List<CompressAwarePath> onDiskMapOutputs) throws IOException {
    LOG.info("finalMerge called with " + inMemoryMapOutputs.size() + " in-memory map-outputs and " + onDiskMapOutputs.size() + " on-disk map-outputs");
    final long maxInMemReduce = getMaxInMemReduceLimit();
    // merge config params
    Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
    Class<V> valueClass = (Class<V>) job.getMapOutputValueClass();
    boolean keepInputs = job.getKeepFailedTaskFiles();
    final Path tmpDir = new Path(reduceId.toString());
    final RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
    // segments required to vacate memory
    List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>();
    long inMemToDiskBytes = 0;
    boolean mergePhaseFinished = false;
    if (inMemoryMapOutputs.size() > 0) {
        TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
        inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
        final int numMemDiskSegments = memDiskSegments.size();
        if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
            // If we reach here, it implies that we have less than io.sort.factor
            // disk segments and this will be incremented by 1 (result of the 
            // memory segments merge). Since this total would still be 
            // <= io.sort.factor, we will not do any more intermediate merges,
            // the merge of all these disk segments would be directly fed to the
            // reduce method
            mergePhaseFinished = true;
            // must spill to disk, but can't retain in-mem for intermediate merge
            final Path outputPath = mapOutputFile.getInputFileForWrite(mapId, inMemToDiskBytes).suffix(Task.MERGED_OUTPUT_PREFIX);
            final RawKeyValueIterator rIter = Merger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, reporter, spilledRecordsCounter, null, mergePhase);
            FSDataOutputStream out = CryptoUtils.wrapIfNecessary(job, fs.create(outputPath));
            Writer<K, V> writer = new Writer<K, V>(job, out, keyClass, valueClass, codec, null, true);
            try {
                Merger.writeFile(rIter, writer, reporter, job);
                writer.close();
                onDiskMapOutputs.add(new CompressAwarePath(outputPath, writer.getRawLength(), writer.getCompressedLength()));
                writer = null;
            // add to list of final disk outputs.
            } catch (IOException e) {
                if (null != outputPath) {
                    try {
                        fs.delete(outputPath, true);
                    } catch (IOException ie) {
                    // NOTHING
                    }
                }
                throw e;
            } finally {
                if (null != writer) {
                    writer.close();
                }
            }
            LOG.info("Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes to disk to satisfy " + "reduce memory limit");
            inMemToDiskBytes = 0;
            memDiskSegments.clear();
        } else if (inMemToDiskBytes != 0) {
            LOG.info("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for " + "intermediate, on-disk merge");
        }
    }
    // segments on disk
    List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>();
    long onDiskBytes = inMemToDiskBytes;
    long rawBytes = inMemToDiskBytes;
    CompressAwarePath[] onDisk = onDiskMapOutputs.toArray(new CompressAwarePath[onDiskMapOutputs.size()]);
    for (CompressAwarePath file : onDisk) {
        long fileLength = fs.getFileStatus(file).getLen();
        onDiskBytes += fileLength;
        rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength() : fileLength;
        LOG.debug("Disk file: " + file + " Length is " + fileLength);
        diskSegments.add(new Segment<K, V>(job, fs, file, codec, keepInputs, (file.toString().endsWith(Task.MERGED_OUTPUT_PREFIX) ? null : mergedMapOutputsCounter), file.getRawDataLength()));
    }
    LOG.info("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
    Collections.sort(diskSegments, new Comparator<Segment<K, V>>() {

        public int compare(Segment<K, V> o1, Segment<K, V> o2) {
            if (o1.getLength() == o2.getLength()) {
                return 0;
            }
            return o1.getLength() < o2.getLength() ? -1 : 1;
        }
    });
    // build final list of segments from merged backed by disk + in-mem
    List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>();
    long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
    LOG.info("Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
    if (0 != onDiskBytes) {
        final int numInMemSegments = memDiskSegments.size();
        diskSegments.addAll(0, memDiskSegments);
        memDiskSegments.clear();
        // Pass mergePhase only if there is a going to be intermediate
        // merges. See comment where mergePhaseFinished is being set
        Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
        RawKeyValueIterator diskMerge = Merger.merge(job, fs, keyClass, valueClass, codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, reporter, false, spilledRecordsCounter, null, thisPhase);
        diskSegments.clear();
        if (0 == finalSegments.size()) {
            return diskMerge;
        }
        finalSegments.add(new Segment<K, V>(new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
    }
    return Merger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir, comparator, reporter, spilledRecordsCounter, null, null);
}
Also used : ArrayList(java.util.ArrayList) Segment(org.apache.hadoop.mapred.Merger.Segment) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Path(org.apache.hadoop.fs.Path) Progress(org.apache.hadoop.util.Progress) TaskID(org.apache.hadoop.mapreduce.TaskID) IOException(java.io.IOException) RawKeyValueIterator(org.apache.hadoop.mapred.RawKeyValueIterator) RawComparator(org.apache.hadoop.io.RawComparator) Writer(org.apache.hadoop.mapred.IFile.Writer)

Example 2 with Writer

use of org.apache.hadoop.mapred.IFile.Writer in project hadoop by apache.

the class TestPipeApplication method testApplication.

/**
   * test org.apache.hadoop.mapred.pipes.Application
   * test a internal functions: MessageType.REGISTER_COUNTER,  INCREMENT_COUNTER, STATUS, PROGRESS...
   *
   * @throws Throwable
   */
@Test
public void testApplication() throws Throwable {
    JobConf conf = new JobConf();
    RecordReader<FloatWritable, NullWritable> rReader = new Reader();
    // client for test
    File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationStub");
    TestTaskReporter reporter = new TestTaskReporter();
    File[] psw = cleanTokenPasswordFile();
    try {
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
        conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
        // token for authorization
        Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service"));
        TokenCache.setJobToken(token, conf.getCredentials());
        FakeCollector output = new FakeCollector(new Counters.Counter(), new Progress());
        FileSystem fs = new RawLocalFileSystem();
        fs.initialize(FsConstants.LOCAL_FS_URI, conf);
        Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(new Path(workSpace.getAbsolutePath() + File.separator + "outfile")), IntWritable.class, Text.class, null, null, true);
        output.setWriter(wr);
        conf.set(Submitter.PRESERVE_COMMANDFILE, "true");
        initStdOut(conf);
        Application<WritableComparable<IntWritable>, Writable, IntWritable, Text> application = new Application<WritableComparable<IntWritable>, Writable, IntWritable, Text>(conf, rReader, output, reporter, IntWritable.class, Text.class);
        application.getDownlink().flush();
        application.getDownlink().mapItem(new IntWritable(3), new Text("txt"));
        application.getDownlink().flush();
        application.waitForFinish();
        wr.close();
        // test getDownlink().mapItem();
        String stdOut = readStdOut(conf);
        assertTrue(stdOut.contains("key:3"));
        assertTrue(stdOut.contains("value:txt"));
        // reporter test counter, and status should be sended
        // test MessageType.REGISTER_COUNTER and INCREMENT_COUNTER
        assertEquals(1.0, reporter.getProgress(), 0.01);
        assertNotNull(reporter.getCounter("group", "name"));
        // test status MessageType.STATUS
        assertEquals(reporter.getStatus(), "PROGRESS");
        stdOut = readFile(new File(workSpace.getAbsolutePath() + File.separator + "outfile"));
        // check MessageType.PROGRESS
        assertEquals(0.55f, rReader.getProgress(), 0.001);
        application.getDownlink().close();
        // test MessageType.OUTPUT
        Entry<IntWritable, Text> entry = output.getCollect().entrySet().iterator().next();
        assertEquals(123, entry.getKey().get());
        assertEquals("value", entry.getValue().toString());
        try {
            // try to abort
            application.abort(new Throwable());
            fail();
        } catch (IOException e) {
            // abort works ?
            assertEquals("pipe child exception", e.getMessage());
        }
    } finally {
        if (psw != null) {
            // remove password files
            for (File file : psw) {
                file.deleteOnExit();
            }
        }
    }
}
Also used : RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) RecordReader(org.apache.hadoop.mapred.RecordReader) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Token(org.apache.hadoop.security.token.Token) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) FileSystem(org.apache.hadoop.fs.FileSystem) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) Counter(org.apache.hadoop.mapred.Counters.Counter) FloatWritable(org.apache.hadoop.io.FloatWritable) WritableComparable(org.apache.hadoop.io.WritableComparable) Counters(org.apache.hadoop.mapred.Counters) File(java.io.File) Writer(org.apache.hadoop.mapred.IFile.Writer) Test(org.junit.Test)

Example 3 with Writer

use of org.apache.hadoop.mapred.IFile.Writer in project hadoop by apache.

the class TestPipeApplication method testRunner.

/**
   * test PipesMapRunner    test the transfer data from reader
   *
   * @throws Exception
   */
@Test
public void testRunner() throws Exception {
    // clean old password files
    File[] psw = cleanTokenPasswordFile();
    try {
        RecordReader<FloatWritable, NullWritable> rReader = new ReaderPipesMapRunner();
        JobConf conf = new JobConf();
        conf.set(Submitter.IS_JAVA_RR, "true");
        // for stdour and stderror
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
        CombineOutputCollector<IntWritable, Text> output = new CombineOutputCollector<IntWritable, Text>(new Counters.Counter(), new Progress());
        FileSystem fs = new RawLocalFileSystem();
        fs.initialize(FsConstants.LOCAL_FS_URI, conf);
        Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(new Path(workSpace + File.separator + "outfile")), IntWritable.class, Text.class, null, null, true);
        output.setWriter(wr);
        // stub for client
        File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationRunnableStub");
        conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
        // token for authorization
        Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service"));
        TokenCache.setJobToken(token, conf.getCredentials());
        conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
        TestTaskReporter reporter = new TestTaskReporter();
        PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text> runner = new PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text>();
        initStdOut(conf);
        runner.configure(conf);
        runner.run(rReader, output, reporter);
        String stdOut = readStdOut(conf);
        // test part of translated data. As common file for client and test -
        // clients stdOut
        // check version
        assertTrue(stdOut.contains("CURRENT_PROTOCOL_VERSION:0"));
        // check key and value classes
        assertTrue(stdOut.contains("Key class:org.apache.hadoop.io.FloatWritable"));
        assertTrue(stdOut.contains("Value class:org.apache.hadoop.io.NullWritable"));
        // test have sent all data from reader
        assertTrue(stdOut.contains("value:0.0"));
        assertTrue(stdOut.contains("value:9.0"));
    } finally {
        if (psw != null) {
            // remove password files
            for (File file : psw) {
                file.deleteOnExit();
            }
        }
    }
}
Also used : RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) Token(org.apache.hadoop.security.token.Token) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) FileSystem(org.apache.hadoop.fs.FileSystem) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) NullWritable(org.apache.hadoop.io.NullWritable) Counter(org.apache.hadoop.mapred.Counters.Counter) FloatWritable(org.apache.hadoop.io.FloatWritable) Counters(org.apache.hadoop.mapred.Counters) File(java.io.File) Writer(org.apache.hadoop.mapred.IFile.Writer) Test(org.junit.Test)

Aggregations

Path (org.apache.hadoop.fs.Path)3 Writer (org.apache.hadoop.mapred.IFile.Writer)3 File (java.io.File)2 IOException (java.io.IOException)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 RawLocalFileSystem (org.apache.hadoop.fs.RawLocalFileSystem)2 FloatWritable (org.apache.hadoop.io.FloatWritable)2 IntWritable (org.apache.hadoop.io.IntWritable)2 NullWritable (org.apache.hadoop.io.NullWritable)2 Text (org.apache.hadoop.io.Text)2 Counters (org.apache.hadoop.mapred.Counters)2 Counter (org.apache.hadoop.mapred.Counters.Counter)2 JobConf (org.apache.hadoop.mapred.JobConf)2 Token (org.apache.hadoop.security.token.Token)2 AMRMTokenIdentifier (org.apache.hadoop.yarn.security.AMRMTokenIdentifier)2 Test (org.junit.Test)2 ArrayList (java.util.ArrayList)1 FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream)1 BooleanWritable (org.apache.hadoop.io.BooleanWritable)1 RawComparator (org.apache.hadoop.io.RawComparator)1