Example 1 with Writer

use of org.apache.hadoop.mapred.IFile.Writer in project hadoop by apache.

the class MergeManagerImpl method finalMerge.

/**
 * Builds the iterator that is returned to the caller by combining the given
 * in-memory map outputs with the given on-disk map outputs.
 *
 * <p>As visible in this excerpt, the method (1) optionally merges some in-memory
 * segments into a single file that is appended to {@code onDiskMapOutputs},
 * (2) wraps every on-disk file in a {@code Segment} and sorts those segments by
 * length, and (3) returns either the disk merge directly or a merge of the
 * remaining in-memory segments plus the disk merge.
 *
 * <p>NOTE(review): this excerpt looks damaged by extraction. Several string
 * concatenations have lost the call that wrapped them (presumably a logging
 * call), and a number of closing braces / statements are not present. Confirm
 * against the real source before relying on the control flow shown here.
 *
 * @param job job configuration; supplies the key/value classes, the key
 *        comparator and the keep-failed-task-files flag
 * @param fs file system used to create, stat and delete merge files
 * @param inMemoryMapOutputs map outputs currently held in memory
 * @param onDiskMapOutputs map outputs already on disk; a merged spill file may
 *        be added to this list by this method
 * @return an iterator over the merged key/value data
 * @throws IOException rethrown after a best-effort delete of the spill file
 *         when writing it fails; other callees may also throw it
 */
private RawKeyValueIterator finalMerge(JobConf job, FileSystem fs, List<InMemoryMapOutput<K, V>> inMemoryMapOutputs, List<CompressAwarePath> onDiskMapOutputs) throws IOException {"finalMerge called with " + inMemoryMapOutputs.size() + " in-memory map-outputs and " + onDiskMapOutputs.size() + " on-disk map-outputs");
    // NOTE(review): the string expression on the signature line above has no
    // enclosing call in this excerpt; presumably a log statement — confirm.
    final long maxInMemReduce = getMaxInMemReduceLimit();
    // merge config params
    Class<K> keyClass = (Class<K>) job.getMapOutputKeyClass();
    Class<V> valueClass = (Class<V>) job.getMapOutputValueClass();
    boolean keepInputs = job.getKeepFailedTaskFiles();
    final Path tmpDir = new Path(reduceId.toString());
    final RawComparator<K> comparator = (RawComparator<K>) job.getOutputKeyComparator();
    // segments required to vacate memory
    List<Segment<K, V>> memDiskSegments = new ArrayList<Segment<K, V>>();
    long inMemToDiskBytes = 0;
    boolean mergePhaseFinished = false;
    if (inMemoryMapOutputs.size() > 0) {
        // The first in-memory output's task id is used to name the spill file below.
        TaskID mapId = inMemoryMapOutputs.get(0).getMapId().getTaskID();
        // Fills memDiskSegments and returns a byte count; presumably the bytes that
        // must leave memory to honour maxInMemReduce — confirm in createInMemorySegments.
        inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, maxInMemReduce);
        final int numMemDiskSegments = memDiskSegments.size();
        if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
            // If we reach here, it implies that we have less than io.sort.factor
            // disk segments and this will be incremented by 1 (result of the
            // memory segments merge). Since this total would still be
            // <= io.sort.factor, we will not do any more intermediate merges,
            // the merge of all these disk segments would be directly fed to the
            // reduce method
            mergePhaseFinished = true;
            // must spill to disk, but can't retain in-mem for intermediate merge
            final Path outputPath = mapOutputFile.getInputFileForWrite(mapId, inMemToDiskBytes).suffix(Task.MERGED_OUTPUT_PREFIX);
            final RawKeyValueIterator rIter = Merger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, reporter, spilledRecordsCounter, null, mergePhase);
            FSDataOutputStream out = CryptoUtils.wrapIfNecessary(job, fs.create(outputPath));
            Writer<K, V> writer = new Writer<K, V>(job, out, keyClass, valueClass, codec, null, true);
            try {
                Merger.writeFile(rIter, writer, reporter, job);
                // Register the merged file, with the raw and compressed lengths
                // reported by the writer, as one more on-disk output.
                onDiskMapOutputs.add(new CompressAwarePath(outputPath, writer.getRawLength(), writer.getCompressedLength()));
                // Cleared on success so the null check in the finally block is skipped.
                writer = null;
            // add to list of final disk outputs.
            } catch (IOException e) {
                // Best-effort removal of the partially written file; the original
                // exception is rethrown regardless of whether the delete succeeds.
                if (null != outputPath) {
                    try {
                        fs.delete(outputPath, true);
                    } catch (IOException ie) {
                    // NOTHING
                throw e;
            } finally {
                // NOTE(review): the body of this null check is not present in the
                // excerpt; presumably it closes the writer — confirm.
                if (null != writer) {
  "Merged " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes to disk to satisfy " + "reduce memory limit");
            // The spilled bytes are now counted through the file added to
            // onDiskMapOutputs, so the in-memory-to-disk tally is reset.
            inMemToDiskBytes = 0;
        } else if (inMemToDiskBytes != 0) {
  "Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for " + "intermediate, on-disk merge");
    // segments on disk
    List<Segment<K, V>> diskSegments = new ArrayList<Segment<K, V>>();
    // Both totals start from whatever in-memory bytes are still earmarked for the disk merge.
    long onDiskBytes = inMemToDiskBytes;
    long rawBytes = inMemToDiskBytes;
    CompressAwarePath[] onDisk = onDiskMapOutputs.toArray(new CompressAwarePath[onDiskMapOutputs.size()]);
    for (CompressAwarePath file : onDisk) {
        long fileLength = fs.getFileStatus(file).getLen();
        onDiskBytes += fileLength;
        // Use the recorded raw length when it is positive; otherwise fall back
        // to the file's on-disk length.
        rawBytes += (file.getRawDataLength() > 0) ? file.getRawDataLength() : fileLength;
        LOG.debug("Disk file: " + file + " Length is " + fileLength);
        // Files whose path ends with Task.MERGED_OUTPUT_PREFIX are given a null
        // counter instead of mergedMapOutputsCounter.
        diskSegments.add(new Segment<K, V>(job, fs, file, codec, keepInputs, (file.toString().endsWith(Task.MERGED_OUTPUT_PREFIX) ? null : mergedMapOutputsCounter), file.getRawDataLength()));
    }"Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
    // Order the disk segments by ascending length.
    Collections.sort(diskSegments, new Comparator<Segment<K, V>>() {

        public int compare(Segment<K, V> o1, Segment<K, V> o2) {
            if (o1.getLength() == o2.getLength()) {
                return 0;
            return o1.getLength() < o2.getLength() ? -1 : 1;
    // build final list of segments from merged backed by disk + in-mem
    List<Segment<K, V>> finalSegments = new ArrayList<Segment<K, V>>();
    long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);"Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
    if (0 != onDiskBytes) {
        final int numInMemSegments = memDiskSegments.size();
        // Put the in-memory segments destined for the disk merge ahead of the
        // sorted on-disk segments.
        diskSegments.addAll(0, memDiskSegments);
        // Pass mergePhase only if there is a going to be intermediate
        // merges. See comment where mergePhaseFinished is being set
        Progress thisPhase = (mergePhaseFinished) ? null : mergePhase;
        RawKeyValueIterator diskMerge = Merger.merge(job, fs, keyClass, valueClass, codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, reporter, false, spilledRecordsCounter, null, thisPhase);
        // With no remaining in-memory segments the disk merge is the final result.
        if (0 == finalSegments.size()) {
            return diskMerge;
        // Otherwise expose the disk merge as one more segment of the final merge.
        finalSegments.add(new Segment<K, V>(new RawKVIteratorReader(diskMerge, onDiskBytes), true, rawBytes));
    return Merger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir, comparator, reporter, spilledRecordsCounter, null, null);
Also used : ArrayList(java.util.ArrayList) Segment(org.apache.hadoop.mapred.Merger.Segment) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Path(org.apache.hadoop.fs.Path) Progress(org.apache.hadoop.util.Progress) TaskID(org.apache.hadoop.mapreduce.TaskID) IOException(java.io.IOException) RawKeyValueIterator(org.apache.hadoop.mapred.RawKeyValueIterator) RawComparator(org.apache.hadoop.io.RawComparator) Writer(org.apache.hadoop.mapred.IFile.Writer)

Example 2 with Writer

use of org.apache.hadoop.mapred.IFile.Writer in project hadoop by apache.

the class TestPipeApplication method testApplication.

  /**
   * Tests org.apache.hadoop.mapred.pipes.Application.
   * Tests internal functions: MessageType.REGISTER_COUNTER, INCREMENT_COUNTER, STATUS, PROGRESS...
   *
   * @throws Throwable
   */
public void testApplication() throws Throwable {
    // Drives a pipes Application against the PipeApplicationStub client command and
    // checks the progress, counter, status and collected output it reports back.
    // NOTE(review): this excerpt ends inside the finally block; the remaining
    // statements and closing braces are not visible here.
    JobConf conf = new JobConf();
    RecordReader<FloatWritable, NullWritable> rReader = new Reader();
    // client for test
    File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationStub");
    TestTaskReporter reporter = new TestTaskReporter();
    File[] psw = cleanTokenPasswordFile();
    try {
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
        conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
        // token for authorization
        Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service"));
        TokenCache.setJobToken(token, conf.getCredentials());
        FakeCollector output = new FakeCollector(new Counters.Counter(), new Progress());
        FileSystem fs = new RawLocalFileSystem();
        fs.initialize(FsConstants.LOCAL_FS_URI, conf);
        // Creates "outfile" in the workspace; wr is not referenced again in the
        // visible part of this method.
        Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(new Path(workSpace.getAbsolutePath() + File.separator + "outfile")), IntWritable.class, Text.class, null, null, true);
        conf.set(Submitter.PRESERVE_COMMANDFILE, "true");
        Application<WritableComparable<IntWritable>, Writable, IntWritable, Text> application = new Application<WritableComparable<IntWritable>, Writable, IntWritable, Text>(conf, rReader, output, reporter, IntWritable.class, Text.class);
        application.getDownlink().mapItem(new IntWritable(3), new Text("txt"));
        // test getDownlink().mapItem();
        // NOTE(review): stdOut is assigned here and again below, but no assertion on
        // it is visible in this excerpt — checks may have been lost in extraction.
        String stdOut = readStdOut(conf);
        // reporter test counter, and status should be sent
        // test MessageType.REGISTER_COUNTER and INCREMENT_COUNTER
        assertEquals(1.0, reporter.getProgress(), 0.01);
        assertNotNull(reporter.getCounter("group", "name"));
        // test status MessageType.STATUS
        // NOTE(review): arguments are in (actual, expected) order, the reverse of
        // JUnit's assertEquals(expected, actual) convention.
        assertEquals(reporter.getStatus(), "PROGRESS");
        stdOut = readFile(new File(workSpace.getAbsolutePath() + File.separator + "outfile"));
        // check MessageType.PROGRESS
        assertEquals(0.55f, rReader.getProgress(), 0.001);
        // test MessageType.OUTPUT
        Entry<IntWritable, Text> entry = output.getCollect().entrySet().iterator().next();
        assertEquals(123, entry.getKey().get());
        assertEquals("value", entry.getValue().toString());
        try {
            // try to abort
            application.abort(new Throwable());
        } catch (IOException e) {
            // abort works ?
            assertEquals("pipe child exception", e.getMessage());
    } finally {
        if (psw != null) {
            // remove password files
            for (File file : psw) {
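Also used : RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) RecordReader(org.apache.hadoop.mapred.RecordReader) NullWritable(org.apache.hadoop.io.NullWritable) Writable(org.apache.hadoop.io.Writable) IntWritable(org.apache.hadoop.io.IntWritable) BooleanWritable(org.apache.hadoop.io.BooleanWritable) FloatWritable(org.apache.hadoop.io.FloatWritable) Token(org.apache.hadoop.security.token.Token) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) FileSystem(org.apache.hadoop.fs.FileSystem) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) NullWritable(org.apache.hadoop.io.NullWritable) Counter(org.apache.hadoop.mapred.Counters.Counter) FloatWritable(org.apache.hadoop.io.FloatWritable) WritableComparable(org.apache.hadoop.io.WritableComparable) Counters(org.apache.hadoop.mapred.Counters) File(java.io.File) Writer(org.apache.hadoop.mapred.IFile.Writer) Test(org.junit.Test)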

Example 3 with Writer

use of org.apache.hadoop.mapred.IFile.Writer in project hadoop by apache.

the class TestPipeApplication method testRunner.

  /**
   * Tests PipesMapRunner: tests the transfer of data from the reader.
   *
   * @throws Exception
   */
public void testRunner() throws Exception {
    // Configures a PipesMapRunner against the PipeApplicationRunnableStub client
    // command and reads back the client's standard output.
    // NOTE(review): this excerpt ends inside the finally block, and the assertions
    // that the trailing comments describe are not visible here.
    // clean old password files
    File[] psw = cleanTokenPasswordFile();
    try {
        RecordReader<FloatWritable, NullWritable> rReader = new ReaderPipesMapRunner();
        JobConf conf = new JobConf();
        conf.set(Submitter.IS_JAVA_RR, "true");
        // for stdout and stderr
        conf.set(MRJobConfig.TASK_ATTEMPT_ID, taskName);
        CombineOutputCollector<IntWritable, Text> output = new CombineOutputCollector<IntWritable, Text>(new Counters.Counter(), new Progress());
        FileSystem fs = new RawLocalFileSystem();
        fs.initialize(FsConstants.LOCAL_FS_URI, conf);
        // Creates "outfile" in the workspace; wr is not referenced again in the
        // visible part of this method.
        Writer<IntWritable, Text> wr = new Writer<IntWritable, Text>(conf, fs.create(new Path(workSpace + File.separator + "outfile")), IntWritable.class, Text.class, null, null, true);
        // stub for client
        File fCommand = getFileCommand("org.apache.hadoop.mapred.pipes.PipeApplicationRunnableStub");
        conf.set(MRJobConfig.CACHE_LOCALFILES, fCommand.getAbsolutePath());
        // token for authorization
        Token<AMRMTokenIdentifier> token = new Token<AMRMTokenIdentifier>("user".getBytes(), "password".getBytes(), new Text("kind"), new Text("service"));
        TokenCache.setJobToken(token, conf.getCredentials());
        conf.setBoolean(MRJobConfig.SKIP_RECORDS, true);
        TestTaskReporter reporter = new TestTaskReporter();
        PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text> runner = new PipesMapRunner<FloatWritable, NullWritable, IntWritable, Text>();
        // NOTE(review): the next line is garbled in this excerpt; the trailing
        // ", output, reporter);" looks like the tail of a second call on runner
        // whose beginning was lost — confirm against the real source.
        runner.configure(conf);, output, reporter);
        String stdOut = readStdOut(conf);
        // test part of translated data. As common file for client and test -
        // clients stdOut
        // check version
        // check key and value classes
        // test have sent all data from reader
    } finally {
        if (psw != null) {
            // remove password files
            for (File file : psw) {
Also used : RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) Token(org.apache.hadoop.security.token.Token) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) FileSystem(org.apache.hadoop.fs.FileSystem) RawLocalFileSystem(org.apache.hadoop.fs.RawLocalFileSystem) JobConf(org.apache.hadoop.mapred.JobConf) IntWritable(org.apache.hadoop.io.IntWritable) Path(org.apache.hadoop.fs.Path) Text(org.apache.hadoop.io.Text) NullWritable(org.apache.hadoop.io.NullWritable) Counter(org.apache.hadoop.mapred.Counters.Counter) FloatWritable(org.apache.hadoop.io.FloatWritable) Counters(org.apache.hadoop.mapred.Counters) File(java.io.File) Writer(org.apache.hadoop.mapred.IFile.Writer) Test(org.junit.Test)


