use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class TestMergeManager method testLocalDiskMergeMultipleTasks.
void testLocalDiskMergeMultipleTasks(final boolean interruptInMiddle) throws IOException, InterruptedException {
Configuration conf = new TezConfiguration(defaultConf);
conf.setBoolean(TezRuntimeConfiguration.TEZ_RUNTIME_COMPRESS, false);
conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_KEY_CLASS, IntWritable.class.getName());
conf.set(TezRuntimeConfiguration.TEZ_RUNTIME_VALUE_CLASS, IntWritable.class.getName());
Path localDir = new Path(workDir, "local");
Path srcDir = new Path(workDir, "srcData");
localFs.mkdirs(localDir);
localFs.mkdirs(srcDir);
conf.setStrings(TezRuntimeFrameworkConfigs.LOCAL_DIRS, localDir.toString());
FileSystem localFs = FileSystem.getLocal(conf); // shadows the test class' static localFs field used by the mkdirs calls above
LocalDirAllocator localDirAllocator = new LocalDirAllocator(TezRuntimeFrameworkConfigs.LOCAL_DIRS);
InputContext t0inputContext = createMockInputContext(UUID.randomUUID().toString());
InputContext t1inputContext = createMockInputContext(UUID.randomUUID().toString());
ExceptionReporter t0exceptionReporter = mock(ExceptionReporter.class);
ExceptionReporter t1exceptionReporter = mock(ExceptionReporter.class);
MergeManager t0mergeManagerReal = new MergeManager(conf, localFs, localDirAllocator, t0inputContext, null, null, null, null, t0exceptionReporter, 2000000, null, false, -1) {
// override for interruptInMiddle testing
@Override
public synchronized void closeOnDiskFile(FileChunk file) {
if (interruptInMiddle) {
try {
Thread.sleep(2000);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
return;
}
}
super.closeOnDiskFile(file);
}
};
MergeManager t0mergeManager = spy(t0mergeManagerReal);
t0mergeManager.configureAndStart();
MergeManager t1mergeManagerReal = new MergeManager(conf, localFs, localDirAllocator, t1inputContext, null, null, null, null, t1exceptionReporter, 2000000, null, false, -1);
MergeManager t1mergeManager = spy(t1mergeManagerReal);
// Partition 0 Keys 0-2, Partition 1 Keys 3-5
SrcFileInfo src1Info = createFile(conf, localFs, new Path(srcDir, InputAttemptIdentifier.PATH_PREFIX + "src1.out"), 2, 3, 0);
// Partition 0 Keys 6-8, Partition 1 Keys 9-11
SrcFileInfo src2Info = createFile(conf, localFs, new Path(srcDir, InputAttemptIdentifier.PATH_PREFIX + "src2.out"), 2, 3, 6);
// Simulating Task 0 fetches partition 0. (targetIndex = 0,1)
InputAttemptIdentifier t0Identifier0 = new InputAttemptIdentifier(0, 0, src1Info.path.getName());
InputAttemptIdentifier t0Identifier1 = new InputAttemptIdentifier(1, 0, src2Info.path.getName());
// Simulating Task 1 fetches partition 1. (targetIndex = 0,1)
InputAttemptIdentifier t1Identifier0 = new InputAttemptIdentifier(0, 0, src1Info.path.getName());
InputAttemptIdentifier t1Identifier1 = new InputAttemptIdentifier(1, 0, src2Info.path.getName());
MapOutput t0MapOutput0 = getMapOutputForDirectDiskFetch(t0Identifier0, src1Info.path, src1Info.indexedRecords[0], t0mergeManager);
MapOutput t0MapOutput1 = getMapOutputForDirectDiskFetch(t0Identifier1, src2Info.path, src2Info.indexedRecords[0], t0mergeManager);
MapOutput t1MapOutput0 = getMapOutputForDirectDiskFetch(t1Identifier0, src1Info.path, src1Info.indexedRecords[1], t1mergeManager);
MapOutput t1MapOutput1 = getMapOutputForDirectDiskFetch(t1Identifier1, src2Info.path, src2Info.indexedRecords[1], t1mergeManager);
t0MapOutput0.commit();
t0MapOutput1.commit();
verify(t0mergeManager).closeOnDiskFile(t0MapOutput0.getOutputPath());
verify(t0mergeManager).closeOnDiskFile(t0MapOutput1.getOutputPath());
// Run the OnDiskMerge via MergeManager
// Simulate the thread invocation - remove files, and invoke merge
List<FileChunk> t0MergeFiles = new LinkedList<FileChunk>();
t0MergeFiles.addAll(t0mergeManager.onDiskMapOutputs);
t0mergeManager.onDiskMapOutputs.clear();
if (!interruptInMiddle) {
t0mergeManager.onDiskMerger.merge(t0MergeFiles);
Assert.assertEquals(1, t0mergeManager.onDiskMapOutputs.size());
} else {
// Start Interrupting thread
Thread interruptingThread = new Thread(new InterruptingThread(t0mergeManager.onDiskMerger));
interruptingThread.start();
try {
Thread.sleep(1000);
} catch (InterruptedException e) {
e.printStackTrace();
}
// Will be interrupted in the middle by interruptingThread.
t0mergeManager.onDiskMerger.startMerge(Sets.newHashSet(t0MergeFiles));
t0mergeManager.onDiskMerger.waitForMerge();
Assert.assertNotEquals(1, t0mergeManager.onDiskMapOutputs.size());
}
if (!interruptInMiddle) {
t1MapOutput0.commit();
t1MapOutput1.commit();
verify(t1mergeManager).closeOnDiskFile(t1MapOutput0.getOutputPath());
verify(t1mergeManager).closeOnDiskFile(t1MapOutput1.getOutputPath());
// Run the OnDiskMerge via MergeManager
// Simulate the thread invocation - remove files, and invoke merge
List<FileChunk> t1MergeFiles = new LinkedList<FileChunk>();
t1MergeFiles.addAll(t1mergeManager.onDiskMapOutputs);
t1mergeManager.onDiskMapOutputs.clear();
t1mergeManager.onDiskMerger.merge(t1MergeFiles);
Assert.assertEquals(1, t1mergeManager.onDiskMapOutputs.size());
Assert.assertNotEquals(t0mergeManager.onDiskMapOutputs.iterator().next().getPath(), t1mergeManager.onDiskMapOutputs.iterator().next().getPath());
Assert.assertTrue(t0mergeManager.onDiskMapOutputs.iterator().next().getPath().toString().contains(t0inputContext.getUniqueIdentifier()));
Assert.assertTrue(t1mergeManager.onDiskMapOutputs.iterator().next().getPath().toString().contains(t1inputContext.getUniqueIdentifier()));
}
}
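The getMapOutputForDirectDiskFetch helper above is test scaffolding; what the commit() calls hand to the MergeManager (see the verify(...) lines) is a FileChunk describing the on-disk segment. Below is a minimal sketch, not part of the test, of registering an already-written segment file with a MergeManager using the three-argument FileChunk constructor and closeOnDiskFile(FileChunk) shown elsewhere on this page; the registerOnDiskSegment method name and the assumption that the segment file already exists are ours.

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FileChunk;

static void registerOnDiskSegment(FileSystem fs, Path segmentFile, MergeManager mergeManager)
    throws IOException {
  FileStatus status = fs.getFileStatus(segmentFile);
  // A FileChunk is a (path, offset, length) view of a file; here it covers the whole file.
  FileChunk chunk = new FileChunk(segmentFile, 0, status.getLen());
  // closeOnDiskFile queues the chunk for the on-disk merger once enough chunks accumulate.
  mergeManager.closeOnDiskFile(chunk);
}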
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method finalMerge.
private TezRawKeyValueIterator finalMerge(Configuration job, FileSystem fs, List<MapOutput> inMemoryMapOutputs, List<FileChunk> onDiskMapOutputs) throws IOException, InterruptedException {
logFinalMergeStart(inMemoryMapOutputs, onDiskMapOutputs);
StringBuilder finalMergeLog = new StringBuilder();
inputContext.notifyProgress();
// merge config params
Class keyClass = (Class) ConfigUtils.getIntermediateInputKeyClass(job);
Class valueClass = (Class) ConfigUtils.getIntermediateInputValueClass(job);
final Path tmpDir = new Path(inputContext.getUniqueIdentifier());
final RawComparator comparator = (RawComparator) ConfigUtils.getIntermediateInputKeyComparator(job);
// segments required to vacate memory
List<Segment> memDiskSegments = new ArrayList<Segment>();
long inMemToDiskBytes = 0;
boolean mergePhaseFinished = false;
if (inMemoryMapOutputs.size() > 0) {
int srcTaskId = inMemoryMapOutputs.get(0).getAttemptIdentifier().getInputIdentifier();
inMemToDiskBytes = createInMemorySegments(inMemoryMapOutputs, memDiskSegments, this.postMergeMemLimit);
final int numMemDiskSegments = memDiskSegments.size();
if (numMemDiskSegments > 0 && ioSortFactor > onDiskMapOutputs.size()) {
// If we reach here, it implies that we have less than io.sort.factor
// disk segments and this will be incremented by 1 (result of the
// memory segments merge). Since this total would still be
// <= io.sort.factor, we will not do any more intermediate merges,
// the merge of all these disk segments would be directly fed to the
// reduce method
mergePhaseFinished = true;
// must spill to disk, but can't retain in-mem for intermediate merge
// Cannot use the spill id in the final merge as it could clobber other files, hence using
// Integer.MAX_VALUE
final Path outputPath = mapOutputFile.getInputFileForWrite(srcTaskId, Integer.MAX_VALUE, inMemToDiskBytes).suffix(Constants.MERGED_OUTPUT_PREFIX);
final TezRawKeyValueIterator rIter = TezMerger.merge(job, fs, keyClass, valueClass, memDiskSegments, numMemDiskSegments, tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
final Writer writer = new Writer(job, fs, outputPath, keyClass, valueClass, codec, null, null);
try {
TezMerger.writeFile(rIter, writer, progressable, TezRuntimeConfiguration.TEZ_RUNTIME_RECORDS_BEFORE_PROGRESS_DEFAULT);
} catch (IOException e) {
if (null != outputPath) {
try {
fs.delete(outputPath, true);
} catch (IOException ie) {
// NOTHING
}
}
throw e;
} finally {
if (null != writer) {
writer.close();
additionalBytesWritten.increment(writer.getCompressedLength());
}
}
final FileStatus fStatus = localFS.getFileStatus(outputPath);
// add to list of final disk outputs.
onDiskMapOutputs.add(new FileChunk(outputPath, 0, fStatus.getLen()));
if (LOG.isInfoEnabled()) {
finalMergeLog.append("MemMerged: " + numMemDiskSegments + ", " + inMemToDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merged " + numMemDiskSegments + "segments, size=" + inMemToDiskBytes + " to " + outputPath);
}
}
inMemToDiskBytes = 0;
memDiskSegments.clear();
} else if (inMemToDiskBytes != 0) {
if (LOG.isInfoEnabled()) {
finalMergeLog.append("DelayedMemMerge: " + numMemDiskSegments + ", " + inMemToDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Keeping " + numMemDiskSegments + " segments, " + inMemToDiskBytes + " bytes in memory for " + "intermediate, on-disk merge");
}
}
}
}
// segments on disk
List<Segment> diskSegments = new ArrayList<Segment>();
long onDiskBytes = inMemToDiskBytes;
FileChunk[] onDisk = onDiskMapOutputs.toArray(new FileChunk[onDiskMapOutputs.size()]);
for (FileChunk fileChunk : onDisk) {
final long fileLength = fileChunk.getLength();
onDiskBytes += fileLength;
if (LOG.isDebugEnabled()) {
LOG.debug("Disk file=" + fileChunk.getPath() + ", len=" + fileLength + ", isLocal=" + fileChunk.isLocalFile());
}
final Path file = fileChunk.getPath();
TezCounter counter = file.toString().endsWith(Constants.MERGED_OUTPUT_PREFIX) ? null : mergedMapOutputsCounter;
final long fileOffset = fileChunk.getOffset();
final boolean preserve = fileChunk.isLocalFile();
diskSegments.add(new DiskSegment(fs, file, fileOffset, fileLength, codec, ifileReadAhead, ifileReadAheadLength, ifileBufferSize, preserve, counter));
}
if (LOG.isInfoEnabled()) {
finalMergeLog.append(". DiskSeg: " + onDisk.length + ", " + onDiskBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merging " + onDisk.length + " files, " + onDiskBytes + " bytes from disk");
}
}
Collections.sort(diskSegments, new Comparator<Segment>() {
public int compare(Segment o1, Segment o2) {
if (o1.getLength() == o2.getLength()) {
return 0;
}
return o1.getLength() < o2.getLength() ? -1 : 1;
}
});
// build the final list of segments: the disk-backed merge output plus remaining in-mem segments
List<Segment> finalSegments = new ArrayList<Segment>();
long inMemBytes = createInMemorySegments(inMemoryMapOutputs, finalSegments, 0);
if (LOG.isInfoEnabled()) {
finalMergeLog.append(". MemSeg: " + finalSegments.size() + ", " + inMemBytes);
if (LOG.isDebugEnabled()) {
LOG.debug("Merging " + finalSegments.size() + " segments, " + inMemBytes + " bytes from memory into reduce");
}
}
if (0 != onDiskBytes) {
final int numInMemSegments = memDiskSegments.size();
diskSegments.addAll(0, memDiskSegments);
memDiskSegments.clear();
TezRawKeyValueIterator diskMerge = TezMerger.merge(job, fs, keyClass, valueClass, codec, diskSegments, ioSortFactor, numInMemSegments, tmpDir, comparator, progressable, false, spilledRecordsCounter, null, additionalBytesRead, null);
diskSegments.clear();
if (0 == finalSegments.size()) {
return diskMerge;
}
finalSegments.add(new Segment(new RawKVIteratorReader(diskMerge, onDiskBytes), null));
}
if (LOG.isInfoEnabled()) {
LOG.info(finalMergeLog.toString());
}
// This is doing nothing but creating an iterator over the segments.
return TezMerger.merge(job, fs, keyClass, valueClass, finalSegments, finalSegments.size(), tmpDir, comparator, progressable, spilledRecordsCounter, null, additionalBytesRead, null);
}
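For context, the iterator returned by finalMerge is what the downstream reader consumes. A hedged sketch of draining it follows, assuming TezRawKeyValueIterator follows the familiar next()/getKey()/getValue() contract where the returned DataInputBuffers hold serialized key and value bytes; the countMergedRecords helper is ours, not Tez API.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;

static long countMergedRecords(TezRawKeyValueIterator iter) throws IOException {
  long records = 0;
  while (iter.next()) {
    DataInputBuffer key = iter.getKey();     // serialized key bytes of the current record
    DataInputBuffer value = iter.getValue(); // serialized value bytes of the current record
    records += 1;
  }
  return records;
}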
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method logFinalMergeStart.
private void logFinalMergeStart(List<MapOutput> inMemoryMapOutputs, List<FileChunk> onDiskMapOutputs) {
long inMemSegmentSize = 0;
for (MapOutput inMemoryMapOutput : inMemoryMapOutputs) {
inMemSegmentSize += inMemoryMapOutput.getSize();
if (LOG.isDebugEnabled()) {
LOG.debug("finalMerge: inMemoryOutput=" + inMemoryMapOutput + ", size=" + inMemoryMapOutput.getSize());
}
}
long onDiskSegmentSize = 0;
for (FileChunk onDiskMapOutput : onDiskMapOutputs) {
onDiskSegmentSize += onDiskMapOutput.getLength();
if (LOG.isDebugEnabled()) {
LOG.debug("finalMerge: onDiskMapOutput=" + onDiskMapOutput.getPath() + ", size=" + onDiskMapOutput.getLength());
}
}
LOG.info("finalMerge with #inMemoryOutputs={}, size={} and #onDiskOutputs={}, size={}", inMemoryMapOutputs.size(), inMemSegmentSize, onDiskMapOutputs.size(), onDiskSegmentSize);
}
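Most FileChunk bookkeeping in this class relies on the same few accessors. As a small illustrative helper (name and placement ours, not Tez API), a list of FileChunks can be summarized with getLength() and isLocalFile(), both of which appear in the snippets on this page:

import java.util.List;
import org.apache.hadoop.io.FileChunk;

static String summarizeChunks(List<FileChunk> chunks) {
  long totalBytes = 0;
  int localFiles = 0;
  for (FileChunk chunk : chunks) {
    totalBytes += chunk.getLength();
    if (chunk.isLocalFile()) {
      localFiles++;
    }
  }
  return "chunks=" + chunks.size() + ", bytes=" + totalBytes + ", localFiles=" + localFiles;
}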
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method closeOnDiskFile.
@Override
public synchronized void closeOnDiskFile(FileChunk file) {
// including only path & offset for validations.
for (FileChunk fileChunk : onDiskMapOutputs) {
if (fileChunk.getPath().equals(file.getPath())) {
// ensure offsets are not the same.
Preconditions.checkArgument(fileChunk.getOffset() != file.getOffset(), "Can't have a file with same path and offset." + "OldFilePath=" + fileChunk.getPath() + ", OldFileOffset=" + fileChunk.getOffset() + ", newFilePath=" + file.getPath() + ", newFileOffset=" + file.getOffset());
}
}
onDiskMapOutputs.add(file);
logCloseOnDiskFile(file);
synchronized (onDiskMerger) {
if (!onDiskMerger.isInProgress() && onDiskMapOutputs.size() >= (2 * ioSortFactor - 1)) {
onDiskMerger.startMerge(onDiskMapOutputs);
}
}
}
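The precondition above only rejects an exact path-plus-offset duplicate, so several chunks of one spill file (different offsets, e.g. different partitions fetched locally) can coexist in onDiskMapOutputs. An illustrative sketch, with a hypothetical path and a mergeManager assumed to be already configured:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FileChunk;

static void registerPartitions(MergeManager mergeManager) {
  Path spill = new Path("/tmp/attempt_0/file.out");        // hypothetical spill file
  FileChunk partition0 = new FileChunk(spill, 0, 1024);     // bytes [0, 1024)
  FileChunk partition1 = new FileChunk(spill, 1024, 2048);  // bytes [1024, 3072)
  mergeManager.closeOnDiskFile(partition0);                 // accepted
  mergeManager.closeOnDiskFile(partition1);                 // accepted: same path, different offset
  // mergeManager.closeOnDiskFile(new FileChunk(spill, 0, 1024));
  //   -> would fail the checkArgument above: same path and same offset as partition0
}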
use of org.apache.hadoop.io.FileChunk in project tez by apache.
the class MergeManager method close.
public TezRawKeyValueIterator close(boolean tryFinalMerge) throws Throwable {
if (!isShutdown.getAndSet(true)) {
// Wait for on-going merges to complete
if (memToMemMerger != null) {
memToMemMerger.close();
}
inMemoryMerger.close();
onDiskMerger.close();
List<MapOutput> memory = new ArrayList<MapOutput>(inMemoryMergedMapOutputs);
inMemoryMergedMapOutputs.clear();
memory.addAll(inMemoryMapOutputs);
inMemoryMapOutputs.clear();
List<FileChunk> disk = new ArrayList<FileChunk>(onDiskMapOutputs);
onDiskMapOutputs.clear();
if (statsInMemTotal.count > 0) {
LOG.info("TotalInMemFetchStats: count={}, totalSize={}, min={}, max={}, avg={}", statsInMemTotal.count, statsInMemTotal.size, statsInMemTotal.minSize, statsInMemTotal.maxSize, (statsInMemTotal.size / (float) statsInMemTotal.size));
}
// Don't attempt a final merge if close() was invoked as a result of a previous shuffle exception / error.
if (tryFinalMerge) {
try {
TezRawKeyValueIterator kvIter = finalMerge(conf, rfs, memory, disk);
this.finalMergeComplete = true;
return kvIter;
} catch (InterruptedException e) {
// Cleanup the disk segments
if (cleanup) {
cleanup(localFS, disk);
cleanup(localFS, onDiskMapOutputs);
}
// reset interrupt status
Thread.currentThread().interrupt();
throw e;
}
}
}
return null;
}
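Callers typically reach finalMerge through close(true) once fetching has finished. A hedged sketch of that call pattern follows; the wrapper method is ours, and the error handling is illustrative since close() declares Throwable.

import java.io.IOException;

static TezRawKeyValueIterator finishShuffle(MergeManager mergeManager) throws IOException {
  try {
    // close(true) waits for in-flight merges, then runs finalMerge over the remaining
    // in-memory outputs and on-disk FileChunks; close(false) only releases resources.
    return mergeManager.close(true);
  } catch (Throwable t) {
    throw new IOException("Final merge failed", t);
  }
}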