Example 61 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

The class TestFsck, method testFsckMoveAfterCorruption:

@Test(timeout = 300000)
public void testFsckMoveAfterCorruption() throws Exception {
    final int dfsBlockSize = 512 * 1024;
    final int numDatanodes = 1;
    final int replication = 1;
    conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, dfsBlockSize);
    conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
    conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_INTERVAL_KEY, 1);
    conf.setInt(DFSConfigKeys.DFS_REPLICATION_KEY, replication);
    cluster = new MiniDFSCluster.Builder(conf).build();
    DistributedFileSystem dfs = cluster.getFileSystem();
    cluster.waitActive();
    final String srcDir = "/srcdat";
    final DFSTestUtil util = new DFSTestUtil.Builder().setName("TestFsck").setMinSize(dfsBlockSize * 2).setMaxSize(dfsBlockSize * 3).setNumFiles(1).build();
    util.createFiles(dfs, srcDir, (short) replication);
    final String[] fileNames = util.getFileNames(srcDir);
    LOG.info("Created files: " + Arrays.toString(fileNames));
    // Run fsck here. The output is automatically logged for easier debugging
    String outStr = runFsck(conf, 0, true, "/", "-files", "-blocks");
    assertTrue(outStr.contains(NamenodeFsck.HEALTHY_STATUS));
    // Corrupt the first block
    final DFSClient dfsClient = new DFSClient(new InetSocketAddress("localhost", cluster.getNameNodePort()), conf);
    final String blockFileToCorrupt = fileNames[0];
    final CorruptedTestFile ctf = new CorruptedTestFile(blockFileToCorrupt, Sets.newHashSet(0), dfsClient, numDatanodes, dfsBlockSize);
    ctf.corruptBlocks(cluster);
    // Wait for fsck to discover all the missing blocks
    GenericTestUtils.waitFor(new Supplier<Boolean>() {

        @Override
        public Boolean get() {
            try {
                final String str = runFsck(conf, 1, false, "/");
                String numCorrupt = null;
                for (String line : str.split(LINE_SEPARATOR)) {
                    Matcher m = NUM_CORRUPT_BLOCKS_PATTERN.matcher(line);
                    if (m.matches()) {
                        numCorrupt = m.group(1);
                        break;
                    }
                }
                if (numCorrupt == null) {
                    Assert.fail("Cannot find corrupt blocks count in fsck output.");
                }
                if (Integer.parseInt(numCorrupt) == ctf.getTotalMissingBlocks()) {
                    assertTrue(str.contains(NamenodeFsck.CORRUPT_STATUS));
                    return true;
                }
            } catch (Exception e) {
                LOG.error("Exception caught", e);
                Assert.fail("Caught unexpected exception.");
            }
            return false;
        }
    }, 1000, 60000);
    runFsck(conf, 1, true, "/", "-files", "-blocks", "-racks");
    LOG.info("Moving blocks to lost+found");
    // Fsck will return error since we corrupted a block
    runFsck(conf, 1, false, "/", "-move");
    final List<LocatedFileStatus> retVal = new ArrayList<>();
    final RemoteIterator<LocatedFileStatus> iter = dfs.listFiles(new Path("/lost+found"), true);
    while (iter.hasNext()) {
        retVal.add(iter.next());
    }
    LOG.info("Items in lost+found: " + retVal);
    // Expect all good blocks moved, only corrupted block skipped.
    long totalLength = 0;
    for (LocatedFileStatus lfs : retVal) {
        totalLength += lfs.getLen();
    }
    Assert.assertTrue("Nothing is moved to lost+found!", totalLength > 0);
    util.cleanup(dfs, srcDir);
}
Also used: DFSClient(org.apache.hadoop.hdfs.DFSClient) Path(org.apache.hadoop.fs.Path) DFSTestUtil(org.apache.hadoop.hdfs.DFSTestUtil) Matcher(java.util.regex.Matcher) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Matchers.anyString(org.mockito.Matchers.anyString) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) IOException(java.io.IOException) ChecksumException(org.apache.hadoop.fs.ChecksumException) TimeoutException(java.util.concurrent.TimeoutException) UnresolvedLinkException(org.apache.hadoop.fs.UnresolvedLinkException) FileNotFoundException(java.io.FileNotFoundException) AccessControlException(org.apache.hadoop.security.AccessControlException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Test(org.junit.Test)
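
The test confirms the move by walking /lost+found with FileSystem#listFiles(path, true), which recurses and yields LocatedFileStatus entries. Below is a minimal standalone sketch of that same pattern outside the MiniDFSCluster harness; the class name and hard-coded path are illustrative assumptions.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class LostFoundSizer {

    // Sum the lengths of every file under dir, recursively.
    public static long totalBytesUnder(FileSystem fs, Path dir) throws IOException {
        long total = 0;
        // listFiles(dir, true) walks the tree and returns LocatedFileStatus
        // entries, so block locations arrive with the listing itself.
        RemoteIterator<LocatedFileStatus> iter = fs.listFiles(dir, true);
        while (iter.hasNext()) {
            total += iter.next().getLen();
        }
        return total;
    }

    public static void main(String[] args) throws IOException {
        // Illustrative: reads fs.defaultFS from the classpath configuration.
        FileSystem fs = FileSystem.get(new Configuration());
        System.out.println("Bytes under /lost+found: "
            + totalBytesUnder(fs, new Path("/lost+found")));
    }
}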

Example 62 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

The class FileInputFormat (mapred API), method singleThreadedListStatus:

private List<FileStatus> singleThreadedListStatus(JobConf job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (Path p : dirs) {
        FileSystem fs = p.getFileSystem(job);
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                    while (iter.hasNext()) {
                        LocatedFileStatus stat = iter.next();
                        if (inputFilter.accept(stat.getPath())) {
                            if (recursive && stat.isDirectory()) {
                                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                            } else {
                                result.add(stat);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
Also used: Path(org.apache.hadoop.fs.Path) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)
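
For context, here is a hedged sketch of the driver-side setup that feeds this listing path. setInputPaths accepts glob patterns (each entry is expanded through globStatus above), and the recursive branch is gated by a boolean job property; the key name below matches Hadoop 2.x (FileInputFormat.INPUT_DIR_RECURSIVE) and should be treated as an assumption for other versions.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;

public class RecursiveInputSetup {

    public static JobConf configure() {
        JobConf job = new JobConf();
        job.setInputFormat(TextInputFormat.class);
        // Globs are fine: each path goes through globStatus() during
        // listing. The path itself is illustrative.
        FileInputFormat.setInputPaths(job, new Path("/data/logs/2017-*"));
        // Enable the recursive flag consulted by listStatus()
        // (assumed key name, per Hadoop 2.x INPUT_DIR_RECURSIVE).
        job.setBoolean("mapreduce.input.fileinputformat.input.dir.recursive", true);
        return job;
    }
}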

Example 63 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

The class FileInputFormat (mapred API), method getSplits:

/** Splits files returned by {@link #listStatus(JobConf)} when
   * they're too big.*/
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    StopWatch sw = new StopWatch().start();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    // compute total size
    long totalSize = 0;
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job);
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(fs, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(goalSize, minSize, blockSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts[0], splitHosts[1]));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, bytesRemaining, clusterMap);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts[0], splitHosts[1]));
                }
            } else {
                if (LOG.isDebugEnabled()) {
                    // Log only if the file is big enough to be split
                    if (length > Math.min(file.getBlockSize(), minSize)) {
                        LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
                    }
                }
                String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0, length, clusterMap);
                splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
            }
        } else {
            // Create an empty hosts array for zero-length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
    }
    return splits.toArray(new FileSplit[splits.size()]);
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ArrayList(java.util.ArrayList) IOException(java.io.IOException) BlockLocation(org.apache.hadoop.fs.BlockLocation) StopWatch(org.apache.hadoop.util.StopWatch) NetworkTopology(org.apache.hadoop.net.NetworkTopology) FileSystem(org.apache.hadoop.fs.FileSystem)
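
The split arithmetic above collapses to one expression: in this mapred API, computeSplitSize(goalSize, minSize, blockSize) is effectively max(minSize, min(goalSize, blockSize)), where goalSize is the total input size divided by the requested number of splits. A small worked sketch with illustrative numbers (not taken from any example on this page):

public class SplitSizeMath {

    // Mirrors the mapred-API formula: max(minSize, min(goalSize, blockSize)).
    static long computeSplitSize(long goalSize, long minSize, long blockSize) {
        return Math.max(minSize, Math.min(goalSize, blockSize));
    }

    public static void main(String[] args) {
        long totalSize = 10L * 1024 * 1024 * 1024; // 10 GiB of input files
        int numSplits = 100;                       // hint passed to getSplits()
        long goalSize = totalSize / numSplits;     // ~102 MiB per split
        long blockSize = 128L * 1024 * 1024;       // 128 MiB HDFS block
        long minSize = 1;                          // default SPLIT_MINSIZE
        // goalSize < blockSize, so the goal wins and splits are ~102 MiB.
        System.out.println(computeSplitSize(goalSize, minSize, blockSize));
    }
}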

Example 64 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

The class FileInputFormat (mapreduce API), method getSplits:

/** 
   * Generate the list of files and make them into FileSplits.
   * @param job the job context
   * @throws IOException
   */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    StopWatch sw = new StopWatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else {
                // not splittable
                if (LOG.isDebugEnabled()) {
                    // Log only if the file is big enough to be split
                    if (length > Math.min(file.getBlockSize(), minSize)) {
                        LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
                    }
                }
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), blkLocations[0].getCachedHosts()));
            }
        } else {
            // Create an empty hosts array for zero-length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
    }
    return splits;
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) ArrayList(java.util.ArrayList) BlockLocation(org.apache.hadoop.fs.BlockLocation) StopWatch(org.apache.hadoop.util.StopWatch) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapreduce.InputSplit)
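
Unlike the mapred version above, this API derives the split size from (blockSize, minSize, maxSize) rather than from a goal computed out of numSplits; computeSplitSize here is effectively max(minSize, min(maxSize, blockSize)). A hedged driver-side sketch using the public setters (job name and sizes are illustrative):

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitTuning {

    public static Job configure() throws IOException {
        Job job = Job.getInstance(new Configuration(), "split-tuning-demo");
        // Floor: never produce splits smaller than 32 MiB ...
        FileInputFormat.setMinInputSplitSize(job, 32L * 1024 * 1024);
        // ... and cap them at 64 MiB even on 128 MiB blocks, roughly
        // doubling the number of map tasks for large files.
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
        return job;
    }
}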

Example 65 with LocatedFileStatus

Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.

The class FileInputFormat (mapreduce API), method singleThreadedListStatus:

private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter, boolean recursive) throws IOException {
    List<FileStatus> result = new ArrayList<FileStatus>();
    List<IOException> errors = new ArrayList<IOException>();
    for (int i = 0; i < dirs.length; ++i) {
        Path p = dirs[i];
        FileSystem fs = p.getFileSystem(job.getConfiguration());
        FileStatus[] matches = fs.globStatus(p, inputFilter);
        if (matches == null) {
            errors.add(new IOException("Input path does not exist: " + p));
        } else if (matches.length == 0) {
            errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
        } else {
            for (FileStatus globStat : matches) {
                if (globStat.isDirectory()) {
                    RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                    while (iter.hasNext()) {
                        LocatedFileStatus stat = iter.next();
                        if (inputFilter.accept(stat.getPath())) {
                            if (recursive && stat.isDirectory()) {
                                addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                            } else {
                                result.add(stat);
                            }
                        }
                    }
                } else {
                    result.add(globStat);
                }
            }
        }
    }
    if (!errors.isEmpty()) {
        throw new InvalidInputException(errors);
    }
    return result;
}
Also used: Path(org.apache.hadoop.fs.Path) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) ArrayList(java.util.ArrayList) IOException(java.io.IOException)
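
Examples 63 and 64 branch on file instanceof LocatedFileStatus because a LocatedFileStatus already carries its block locations, so the per-file getFileBlockLocations() RPC can be skipped. A minimal sketch that consumes listLocatedStatus directly; everything except the FileSystem calls (class name, output format, path handling) is illustrative.

import java.io.IOException;

import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class LocationAwareListing {

    // Print the hosts holding each block of every file directly under dir.
    public static void printHosts(FileSystem fs, Path dir) throws IOException {
        // listLocatedStatus() returns listing entries and block locations
        // together, avoiding one getFileBlockLocations() RPC per file.
        RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(dir);
        while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (stat.isFile()) {
                for (BlockLocation loc : stat.getBlockLocations()) {
                    System.out.println(stat.getPath() + " -> "
                        + String.join(",", loc.getHosts()));
                }
            }
        }
    }
}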

Aggregations

LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 70
Path (org.apache.hadoop.fs.Path): 51
FileSystem (org.apache.hadoop.fs.FileSystem): 29
ArrayList (java.util.ArrayList): 24
Test (org.junit.Test): 20
FileStatus (org.apache.hadoop.fs.FileStatus): 18
IOException (java.io.IOException): 14
Configuration (org.apache.hadoop.conf.Configuration): 10
File (java.io.File): 8
FileNotFoundException (java.io.FileNotFoundException): 7
BlockLocation (org.apache.hadoop.fs.BlockLocation): 5
BufferedReader (java.io.BufferedReader): 4
InputStreamReader (java.io.InputStreamReader): 4
Matcher (java.util.regex.Matcher): 4
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 4
RemoteIterator (org.apache.hadoop.fs.RemoteIterator): 4
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 4
PrestoException (com.facebook.presto.spi.PrestoException): 3
DataSegment (io.druid.timeline.DataSegment): 3
Path (java.nio.file.Path): 3