Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
The class TestFsck, method testFsckMoveAfterCorruption.
@Test(timeout = 300000)
public void testFsckMoveAfterCorruption() throws Exception {
  final int dfsBlockSize = 512 * 1024;
  final int numDatanodes = 1;
  final int replication = 1;
  conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, dfsBlockSize);
  conf.setLong(DFSConfigKeys.DFS_BLOCKREPORT_INTERVAL_MSEC_KEY, 1000L);
  conf.setInt(DFSConfigKeys.DFS_DATANODE_DIRECTORYSCAN_INTERVAL_KEY, 1);
  conf.setInt(DFSConfigKeys.DFS_REPLICATION_KEY, replication);
  cluster = new MiniDFSCluster.Builder(conf).build();
  DistributedFileSystem dfs = cluster.getFileSystem();
  cluster.waitActive();

  final String srcDir = "/srcdat";
  final DFSTestUtil util = new DFSTestUtil.Builder()
      .setName("TestFsck")
      .setMinSize(dfsBlockSize * 2)
      .setMaxSize(dfsBlockSize * 3)
      .setNumFiles(1)
      .build();
  util.createFiles(dfs, srcDir, (short) replication);
  final String[] fileNames = util.getFileNames(srcDir);
  LOG.info("Created files: " + Arrays.toString(fileNames));

  // Run fsck here. The output is automatically logged for easier debugging
  String outStr = runFsck(conf, 0, true, "/", "-files", "-blocks");
  assertTrue(outStr.contains(NamenodeFsck.HEALTHY_STATUS));

  // Corrupt the first block
  final DFSClient dfsClient = new DFSClient(
      new InetSocketAddress("localhost", cluster.getNameNodePort()), conf);
  final String blockFileToCorrupt = fileNames[0];
  final CorruptedTestFile ctf = new CorruptedTestFile(blockFileToCorrupt,
      Sets.newHashSet(0), dfsClient, numDatanodes, dfsBlockSize);
  ctf.corruptBlocks(cluster);

  // Wait for fsck to discover all the missing blocks
  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      try {
        final String str = runFsck(conf, 1, false, "/");
        String numCorrupt = null;
        for (String line : str.split(LINE_SEPARATOR)) {
          Matcher m = NUM_CORRUPT_BLOCKS_PATTERN.matcher(line);
          if (m.matches()) {
            numCorrupt = m.group(1);
            break;
          }
        }
        if (numCorrupt == null) {
          Assert.fail("Cannot find corrupt blocks count in fsck output.");
        }
        if (Integer.parseInt(numCorrupt) == ctf.getTotalMissingBlocks()) {
          assertTrue(str.contains(NamenodeFsck.CORRUPT_STATUS));
          return true;
        }
      } catch (Exception e) {
        LOG.error("Exception caught", e);
        Assert.fail("Caught unexpected exception.");
      }
      return false;
    }
  }, 1000, 60000);

  runFsck(conf, 1, true, "/", "-files", "-blocks", "-racks");
  LOG.info("Moving blocks to lost+found");
  // Fsck will return error since we corrupted a block
  runFsck(conf, 1, false, "/", "-move");

  final List<LocatedFileStatus> retVal = new ArrayList<>();
  final RemoteIterator<LocatedFileStatus> iter =
      dfs.listFiles(new Path("/lost+found"), true);
  while (iter.hasNext()) {
    retVal.add(iter.next());
  }
  LOG.info("Items in lost+found: " + retVal);

  // Expect all good blocks moved, only corrupted block skipped.
  long totalLength = 0;
  for (LocatedFileStatus lfs : retVal) {
    totalLength += lfs.getLen();
  }
  Assert.assertTrue("Nothing is moved to lost+found!", totalLength > 0);
  util.cleanup(dfs, srcDir);
}
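For orientation, the recursive listFiles(path, true) / LocatedFileStatus pattern used for the lost+found scan above also works on its own. A minimal standalone sketch; the class name and the /lost+found path are illustrative assumptions, not part of the test:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;

public class LostFoundSizeSketch {
  // Sum the lengths of every file under a directory, recursively,
  // mirroring the lost+found scan in the test above.
  public static long totalLength(FileSystem fs, Path dir) throws IOException {
    long total = 0;
    RemoteIterator<LocatedFileStatus> it = fs.listFiles(dir, true);
    while (it.hasNext()) {
      LocatedFileStatus status = it.next();
      total += status.getLen();
    }
    return total;
  }

  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // The path is only an example; any directory works.
    System.out.println(totalLength(fs, new Path("/lost+found")));
  }
}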
Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
The class FileInputFormat (org.apache.hadoop.mapred), method singleThreadedListStatus.
private List<FileStatus> singleThreadedListStatus(JobConf job, Path[] dirs,
    PathFilter inputFilter, boolean recursive) throws IOException {
  List<FileStatus> result = new ArrayList<FileStatus>();
  List<IOException> errors = new ArrayList<IOException>();
  for (Path p : dirs) {
    FileSystem fs = p.getFileSystem(job);
    FileStatus[] matches = fs.globStatus(p, inputFilter);
    if (matches == null) {
      errors.add(new IOException("Input path does not exist: " + p));
    } else if (matches.length == 0) {
      errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
    } else {
      for (FileStatus globStat : matches) {
        if (globStat.isDirectory()) {
          RemoteIterator<LocatedFileStatus> iter =
              fs.listLocatedStatus(globStat.getPath());
          while (iter.hasNext()) {
            LocatedFileStatus stat = iter.next();
            if (inputFilter.accept(stat.getPath())) {
              if (recursive && stat.isDirectory()) {
                addInputPathRecursively(result, fs, stat.getPath(),
                    inputFilter);
              } else {
                result.add(stat);
              }
            }
          }
        } else {
          result.add(globStat);
        }
      }
    }
  }
  if (!errors.isEmpty()) {
    throw new InvalidInputException(errors);
  }
  return result;
}
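As an aside, the inputFilter parameter is any org.apache.hadoop.fs.PathFilter. A minimal sketch of the kind of filter typically passed in here, using the skip-"." -and-"_" rule that FileInputFormat's default hidden-file filter applies; the class name is an illustrative assumption:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

// Accepts only "visible" paths: anything whose name does not start
// with "_" or ".". This matches the default hidden-file rule, but the
// class itself is only a sketch, not part of the Hadoop code above.
public class VisibleFilesFilter implements PathFilter {
  @Override
  public boolean accept(Path p) {
    String name = p.getName();
    return !name.startsWith("_") && !name.startsWith(".");
  }
}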
Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
The class FileInputFormat (org.apache.hadoop.mapred), method getSplits.
/**
 * Splits files returned by {@link #listStatus(JobConf)} when
 * they're too big.
 */
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
  StopWatch sw = new StopWatch().start();
  FileStatus[] files = listStatus(job);

  // Save the number of input files for metrics/loadgen
  job.setLong(NUM_INPUT_FILES, files.length);

  // compute total size
  long totalSize = 0;
  for (FileStatus file : files) {
    // check we have valid files
    if (file.isDirectory()) {
      throw new IOException("Not a file: " + file.getPath());
    }
    totalSize += file.getLen();
  }

  long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
  long minSize = Math.max(
      job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1),
      minSplitSize);

  // generate splits
  ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
  NetworkTopology clusterMap = new NetworkTopology();
  for (FileStatus file : files) {
    Path path = file.getPath();
    long length = file.getLen();
    if (length != 0) {
      FileSystem fs = path.getFileSystem(job);
      BlockLocation[] blkLocations;
      if (file instanceof LocatedFileStatus) {
        blkLocations = ((LocatedFileStatus) file).getBlockLocations();
      } else {
        blkLocations = fs.getFileBlockLocations(file, 0, length);
      }
      if (isSplitable(fs, path)) {
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(goalSize, minSize, blockSize);

        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
          String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,
              length - bytesRemaining, splitSize, clusterMap);
          splits.add(makeSplit(path, length - bytesRemaining, splitSize,
              splitHosts[0], splitHosts[1]));
          bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
          String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations,
              length - bytesRemaining, bytesRemaining, clusterMap);
          splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
              splitHosts[0], splitHosts[1]));
        }
      } else {
        if (LOG.isDebugEnabled()) {
          // Log only if the file is big enough to be split
          if (length > Math.min(file.getBlockSize(), minSize)) {
            LOG.debug("File is not splittable so no parallelization "
                + "is possible: " + file.getPath());
          }
        }
        String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0,
            length, clusterMap);
        splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
      }
    } else {
      // Create empty hosts array for zero-length files
      splits.add(makeSplit(path, 0, length, new String[0]));
    }
  }
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Total # of splits generated by getSplits: " + splits.size()
        + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
  }
  return splits.toArray(new FileSplit[splits.size()]);
}
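For reference, computeSplitSize(goalSize, minSize, blockSize) in this old mapred API resolves to Math.max(minSize, Math.min(goalSize, blockSize)), with goalSize derived from the requested numSplits. A small self-contained sketch with illustrative numbers:

public class SplitSizeSketch {
  // Mirrors the old-API split-size rule: clamp the per-split goal
  // by the block size, but never go below minSize.
  static long computeSplitSize(long goalSize, long minSize, long blockSize) {
    return Math.max(minSize, Math.min(goalSize, blockSize));
  }

  public static void main(String[] args) {
    long mb = 1024L * 1024;
    long totalSize = 1024 * mb;            // 1 GiB of input (example value)
    int numSplits = 4;                     // requested by the caller
    long goalSize = totalSize / numSplits; // 256 MiB
    long blockSize = 128 * mb;
    long minSize = 1;
    // Prints 128: the block size caps the 256 MiB goal.
    System.out.println(computeSplitSize(goalSize, minSize, blockSize) / mb);
  }
}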
Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
The class FileInputFormat (org.apache.hadoop.mapreduce.lib.input), method getSplits.
/**
* Generate the list of files and make them into FileSplits.
* @param job the job context
* @throws IOException
*/
public List<InputSplit> getSplits(JobContext job) throws IOException {
  StopWatch sw = new StopWatch().start();
  long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  long maxSize = getMaxSplitSize(job);

  // generate splits
  List<InputSplit> splits = new ArrayList<InputSplit>();
  List<FileStatus> files = listStatus(job);
  for (FileStatus file : files) {
    Path path = file.getPath();
    long length = file.getLen();
    if (length != 0) {
      BlockLocation[] blkLocations;
      if (file instanceof LocatedFileStatus) {
        blkLocations = ((LocatedFileStatus) file).getBlockLocations();
      } else {
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        blkLocations = fs.getFileBlockLocations(file, 0, length);
      }
      if (isSplitable(job, path)) {
        long blockSize = file.getBlockSize();
        long splitSize = computeSplitSize(blockSize, minSize, maxSize);

        long bytesRemaining = length;
        while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(makeSplit(path, length - bytesRemaining, splitSize,
              blkLocations[blkIndex].getHosts(),
              blkLocations[blkIndex].getCachedHosts()));
          bytesRemaining -= splitSize;
        }

        if (bytesRemaining != 0) {
          int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
          splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
              blkLocations[blkIndex].getHosts(),
              blkLocations[blkIndex].getCachedHosts()));
        }
      } else {
        // not splitable
        if (LOG.isDebugEnabled()) {
          // Log only if the file is big enough to be split
          if (length > Math.min(file.getBlockSize(), minSize)) {
            LOG.debug("File is not splittable so no parallelization "
                + "is possible: " + file.getPath());
          }
        }
        splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
            blkLocations[0].getCachedHosts()));
      }
    } else {
      // Create empty hosts array for zero-length files
      splits.add(makeSplit(path, 0, length, new String[0]));
    }
  }
  // Save the number of input files for metrics/loadgen
  job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
  sw.stop();
  if (LOG.isDebugEnabled()) {
    LOG.debug("Total # of splits generated by getSplits: " + splits.size()
        + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
  }
  return splits;
}
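In this newer API, computeSplitSize(blockSize, minSize, maxSize) resolves to Math.max(minSize, Math.min(maxSize, blockSize)), so split sizes are steered through the min/max split-size settings rather than a requested split count. A short sketch of setting those knobs on a job; the job name and the particular sizes are illustrative assumptions:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeConfigSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "split-size-sketch"); // example job name
    // With a 128 MiB block size, these settings keep each split between
    // 64 MiB and 256 MiB; the resulting split size here would be 128 MiB.
    FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);
    FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);
  }
}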
Use of org.apache.hadoop.fs.LocatedFileStatus in project hadoop by apache.
The class AbstractContractGetFileStatusTest, method testListFilesFile.
@Test
public void testListFilesFile() throws Throwable {
  describe("test the listStatus(path) on a file");
  Path f = touchf("listfilesfile");
  List<LocatedFileStatus> statusList = toList(
      getFileSystem().listFiles(f, false));
  assertEquals("size of file list returned", 1, statusList.size());
  assertIsNamedFile(f, statusList.get(0));
  List<LocatedFileStatus> statusList2 = toListThroughNextCallsAlone(
      getFileSystem().listFiles(f, false));
  assertEquals("size of file list returned through next() calls",
      1, statusList2.size());
  assertIsNamedFile(f, statusList2.get(0));
}
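A related sketch: the toListThroughNextCallsAlone helper drains the iterator without calling hasNext(), relying on RemoteIterator.next() throwing NoSuchElementException once the listing is exhausted. A minimal standalone version of that pattern, with illustrative class and method names:

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.NoSuchElementException;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.RemoteIterator;

public class RemoteIteratorSketch {
  // Drain a RemoteIterator using next() alone; the NoSuchElementException
  // at the end of the listing terminates the loop.
  static List<LocatedFileStatus> drainWithNextOnly(
      RemoteIterator<LocatedFileStatus> it) throws IOException {
    List<LocatedFileStatus> results = new ArrayList<>();
    try {
      while (true) {
        results.add(it.next());
      }
    } catch (NoSuchElementException endOfListing) {
      // Expected: no more entries.
    }
    return results;
  }
}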