Use of org.apache.hadoop.fs.BlockLocation in the Apache Hadoop project.
Class TestWebHDFS, method verifyEquals.
private void verifyEquals(BlockLocation[] locations1, BlockLocation[] locations2) throws IOException {
for (int i = 0; i < locations1.length; i++) {
BlockLocation location1 = locations1[i];
BlockLocation location2 = locations2[i];
Assert.assertEquals(location1.getLength(), location2.getLength());
Assert.assertEquals(location1.getOffset(), location2.getOffset());
Assert.assertArrayEquals(location1.getCachedHosts(), location2.getCachedHosts());
Assert.assertArrayEquals(location1.getHosts(), location2.getHosts());
Assert.assertArrayEquals(location1.getNames(), location2.getNames());
Assert.assertArrayEquals(location1.getStorageIds(), location2.getStorageIds());
Assert.assertArrayEquals(location1.getTopologyPaths(), location2.getTopologyPaths());
Assert.assertArrayEquals(location1.getStorageTypes(), location2.getStorageTypes());
Use of org.apache.hadoop.fs.BlockLocation in the Apache Hadoop project.
Class FileInputFormat, method getSplits.
/** Splits files returned by {@link #listStatus(JobConf)} when
* they're too big.*/
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
StopWatch sw = new StopWatch().start();
FileStatus[] files = listStatus(job);
// Save the number of input files for metrics/loadgen
job.setLong(NUM_INPUT_FILES, files.length);
// compute total size
long totalSize = 0;
for (FileStatus file : files) {
// check we have valid files
if (file.isDirectory()) {
throw new IOException("Not a file: " + file.getPath());
totalSize += file.getLen();
long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
// generate splits
ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
NetworkTopology clusterMap = new NetworkTopology();
for (FileStatus file : files) {
Path path = file.getPath();
long length = file.getLen();
if (length != 0) {
FileSystem fs = path.getFileSystem(job);
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations();
} else {
blkLocations = fs.getFileBlockLocations(file, 0, length);
if (isSplitable(fs, path)) {
long blockSize = file.getBlockSize();
long splitSize = computeSplitSize(goalSize, minSize, blockSize);
long bytesRemaining = length;
while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts[0], splitHosts[1]));
bytesRemaining -= splitSize;
if (bytesRemaining != 0) {
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, bytesRemaining, clusterMap);
splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts[0], splitHosts[1]));
} else {
if (LOG.isDebugEnabled()) {
// Log only if the file is big enough to be splitted
if (length > Math.min(file.getBlockSize(), minSize)) {
LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0, length, clusterMap);
splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
} else {
//Create empty hosts array for zero length files
splits.add(makeSplit(path, 0, length, new String[0]));
if (LOG.isDebugEnabled()) {
LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " +;
return splits.toArray(new FileSplit[splits.size()]);
Use of org.apache.hadoop.fs.BlockLocation in the Apache Hadoop project.
Class FileInputFormat, method getBlockIndex.
protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
for (int i = 0; i < blkLocations.length; i++) {
// is the offset inside this block?
if ((blkLocations[i].getOffset() <= offset) && (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
return i;
BlockLocation last = blkLocations[blkLocations.length - 1];
long fileLength = last.getOffset() + last.getLength() - 1;
throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
Use of org.apache.hadoop.fs.BlockLocation in the Apache Hadoop project.
Class FileInputFormat, method getSplits.
* Generate the list of files and make them into FileSplits.
* @param job the job context
* @throws IOException
public List<InputSplit> getSplits(JobContext job) throws IOException {
StopWatch sw = new StopWatch().start();
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
long maxSize = getMaxSplitSize(job);
// generate splits
List<InputSplit> splits = new ArrayList<InputSplit>();
List<FileStatus> files = listStatus(job);
for (FileStatus file : files) {
Path path = file.getPath();
long length = file.getLen();
if (length != 0) {
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations();
} else {
FileSystem fs = path.getFileSystem(job.getConfiguration());
blkLocations = fs.getFileBlockLocations(file, 0, length);
if (isSplitable(job, path)) {
long blockSize = file.getBlockSize();
long splitSize = computeSplitSize(blockSize, minSize, maxSize);
long bytesRemaining = length;
while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
splits.add(makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
bytesRemaining -= splitSize;
if (bytesRemaining != 0) {
int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
} else {
// not splitable
if (LOG.isDebugEnabled()) {
// Log only if the file is big enough to be splitted
if (length > Math.min(file.getBlockSize(), minSize)) {
LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), blkLocations[0].getCachedHosts()));
} else {
//Create empty hosts array for zero length files
splits.add(makeSplit(path, 0, length, new String[0]));
// Save the number of input files for metrics/loadgen
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
if (LOG.isDebugEnabled()) {
LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " +;
return splits;
Use of org.apache.hadoop.fs.BlockLocation in the MongoDB mongo-hadoop project.
Class BSONSplitter, method getBlockIndex.
* Get the index of the block within the given BlockLocations that
* contains the given offset. Raises IllegalArgumentException if the
* offset is outside the file.
* @param blockLocations BlockLocations to search.
* @param offset the offset into the file.
* @return the index of the BlockLocation containing the offset.
private static int getBlockIndex(final BlockLocation[] blockLocations, final long offset) {
for (int i = 0; i < blockLocations.length; i++) {
BlockLocation bl = blockLocations[i];
if (bl.getOffset() <= offset && offset < bl.getOffset() + bl.getLength()) {
return i;
BlockLocation lastBlock = blockLocations[blockLocations.length - 1];
long fileLength = lastBlock.getOffset() + lastBlock.getLength() - 1;
throw new IllegalArgumentException(String.format("Offset %d is outside the file [0..%d].", offset, fileLength));