Example 56 with BlockLocation

Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

From the class TestWebHDFS, method verifyEquals.

private void verifyEquals(BlockLocation[] locations1, BlockLocation[] locations2) throws IOException {
    // The two arrays must pair up element by element, so their lengths
    // have to match before the per-field comparisons below.
    Assert.assertEquals(locations1.length, locations2.length);
    for (int i = 0; i < locations1.length; i++) {
        BlockLocation location1 = locations1[i];
        BlockLocation location2 = locations2[i];
        Assert.assertEquals(location1.getLength(), location2.getLength());
        Assert.assertEquals(location1.getOffset(), location2.getOffset());
        Assert.assertArrayEquals(location1.getCachedHosts(), location2.getCachedHosts());
        Assert.assertArrayEquals(location1.getHosts(), location2.getHosts());
        Assert.assertArrayEquals(location1.getNames(), location2.getNames());
        Assert.assertArrayEquals(location1.getStorageIds(), location2.getStorageIds());
        Assert.assertArrayEquals(location1.getTopologyPaths(), location2.getTopologyPaths());
        Assert.assertArrayEquals(location1.getStorageTypes(), location2.getStorageTypes());
    }
}
Also used: BlockLocation (org.apache.hadoop.fs.BlockLocation)
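
For context, a comparison like this usually pits two FileSystem implementations against each other over the same file, e.g. webhdfs:// versus hdfs://. A minimal sketch, assuming dfs and webHdfs are already-initialized FileSystem instances; the path is hypothetical:

// Sketch: fetch block locations for one file through two FileSystem
// implementations and assert they agree. "dfs", "webHdfs" and the path
// are assumptions, not part of the original test.
Path file = new Path("/test/file.dat");
FileStatus status = dfs.getFileStatus(file);
BlockLocation[] fromDfs = dfs.getFileBlockLocations(status, 0, status.getLen());
BlockLocation[] fromWebHdfs = webHdfs.getFileBlockLocations(file, 0, status.getLen());
verifyEquals(fromDfs, fromWebHdfs);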

Example 57 with BlockLocation

Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

From the class FileInputFormat (org.apache.hadoop.mapred, the old MapReduce API), method getSplits.

/** Splits files returned by {@link #listStatus(JobConf)} when
   * they're too big.*/
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    StopWatch sw = new StopWatch().start();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    // compute total size
    long totalSize = 0;
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job);
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(fs, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(goalSize, minSize, blockSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts[0], splitHosts[1]));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, bytesRemaining, clusterMap);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts[0], splitHosts[1]));
                }
            } else {
                if (LOG.isDebugEnabled()) {
                    // Log only if the file is big enough to be split
                    if (length > Math.min(file.getBlockSize(), minSize)) {
                        LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
                    }
                }
                String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0, length, clusterMap);
                splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
    }
    return splits.toArray(new FileSplit[splits.size()]);
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), ArrayList (java.util.ArrayList), IOException (java.io.IOException), BlockLocation (org.apache.hadoop.fs.BlockLocation), StopWatch (org.apache.hadoop.util.StopWatch), NetworkTopology (org.apache.hadoop.net.NetworkTopology), FileSystem (org.apache.hadoop.fs.FileSystem)
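
The computeSplitSize call above clamps the per-split goal between the configured minimum and the block size. In the old-API FileInputFormat this is, as far as I know, a one-liner:

protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
    // Aim for goalSize (totalSize / numSplits), but never exceed the
    // HDFS block size and never fall below the configured minimum.
    return Math.max(minSize, Math.min(goalSize, blockSize));
}

The SPLIT_SLOP factor in the loop (1.1 in the Hadoop sources I have seen) lets the final split run up to 10% over splitSize rather than emitting a tiny trailing split.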

Example 58 with BlockLocation

Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

From the class FileInputFormat, method getBlockIndex.

protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
    for (int i = 0; i < blkLocations.length; i++) {
        // is the offset inside this block?
        if ((blkLocations[i].getOffset() <= offset) && (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
            return i;
        }
    }
    BlockLocation last = blkLocations[blkLocations.length - 1];
    long fileLength = last.getOffset() + last.getLength() - 1;
    throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
}
Also used: BlockLocation (org.apache.hadoop.fs.BlockLocation)
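
A small synthetic check illustrates the boundary semantics; the datanode names and hosts below are made up, and since getBlockIndex is protected this would live in a subclass or test:

// Two fake 128-byte blocks covering offsets [0, 256).
BlockLocation[] blocks = new BlockLocation[] {
    new BlockLocation(new String[] { "dn1:9866" }, new String[] { "dn1" }, 0L, 128L),
    new BlockLocation(new String[] { "dn2:9866" }, new String[] { "dn2" }, 128L, 128L)
};
Assert.assertEquals(0, getBlockIndex(blocks, 0));    // first byte of block 0
Assert.assertEquals(0, getBlockIndex(blocks, 127));  // last byte of block 0
Assert.assertEquals(1, getBlockIndex(blocks, 128));  // first byte of block 1
// getBlockIndex(blocks, 256) would throw IllegalArgumentException.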

Example 59 with BlockLocation

Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

From the class FileInputFormat (org.apache.hadoop.mapreduce.lib.input, the new MapReduce API), method getSplits.

/**
   * Generate the list of files and make them into FileSplits.
   * @param job the job context
   * @return the list of input splits for the job
   * @throws IOException if listing the input files fails
   */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    StopWatch sw = new StopWatch().start();
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);
    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                FileSystem fs = path.getFileSystem(job.getConfiguration());
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(job, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);
                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, blkLocations[blkIndex].getHosts(), blkLocations[blkIndex].getCachedHosts()));
                }
            } else {
                // not splittable
                if (LOG.isDebugEnabled()) {
                    // Log only if the file is big enough to be split
                    if (length > Math.min(file.getBlockSize(), minSize)) {
                        LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
                    }
                }
                splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(), blkLocations[0].getCachedHosts()));
            }
        } else {
            //Create empty hosts array for zero length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
    }
    return splits;
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), ArrayList (java.util.ArrayList), BlockLocation (org.apache.hadoop.fs.BlockLocation), StopWatch (org.apache.hadoop.util.StopWatch), FileSystem (org.apache.hadoop.fs.FileSystem), InputSplit (org.apache.hadoop.mapreduce.InputSplit)
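
The minSize and maxSize bounds above come from the job configuration; org.apache.hadoop.mapreduce.lib.input.FileInputFormat exposes static setters for both. A minimal driver sketch (the sizes are hypothetical examples):

Job job = Job.getInstance(new Configuration());
// Each split will be at least 64 MB ...
FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);
// ... and at most 256 MB, regardless of block size.
FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);
// The new API then computes: splitSize = max(minSize, min(maxSize, blockSize)).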

Example 60 with BlockLocation

Use of org.apache.hadoop.fs.BlockLocation in project mongo-hadoop by mongodb.

From the class BSONSplitter, method getBlockIndex.

/**
     * Get the index of the block within the given BlockLocations that
     * contains the given offset. Raises IllegalArgumentException if the
     * offset is outside the file.
     *
     * @param blockLocations BlockLocations to search.
     * @param offset the offset into the file.
     * @return the index of the BlockLocation containing the offset.
     */
private static int getBlockIndex(final BlockLocation[] blockLocations, final long offset) {
    for (int i = 0; i < blockLocations.length; i++) {
        BlockLocation bl = blockLocations[i];
        if (bl.getOffset() <= offset && offset < bl.getOffset() + bl.getLength()) {
            return i;
        }
    }
    BlockLocation lastBlock = blockLocations[blockLocations.length - 1];
    long fileLength = lastBlock.getOffset() + lastBlock.getLength() - 1;
    throw new IllegalArgumentException(String.format("Offset %d is outside the file [0..%d].", offset, fileLength));
}
Also used: BlockLocation (org.apache.hadoop.fs.BlockLocation)
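
This mirrors Hadoop's own FileInputFormat.getBlockIndex (Example 58). A splitter typically uses the returned index to pick locality hints for a split; roughly like the sketch below, where fs, fileStatus, path, splitStart, and splitLength are hypothetical locals, not BSONSplitter's actual fields:

// Find which block holds the split's first byte and reuse that block's
// hosts so the split is scheduled close to its data.
BlockLocation[] blocks = fs.getFileBlockLocations(fileStatus, 0L, fileStatus.getLen());
int idx = getBlockIndex(blocks, splitStart);
FileSplit split = new FileSplit(path, splitStart, splitLength, blocks[idx].getHosts());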

Aggregations

Types most often used together with BlockLocation across the indexed examples, with usage counts:

BlockLocation (org.apache.hadoop.fs.BlockLocation): 88
Path (org.apache.hadoop.fs.Path): 41
FileStatus (org.apache.hadoop.fs.FileStatus): 30
Test (org.junit.Test): 29
FileSystem (org.apache.hadoop.fs.FileSystem): 16
ArrayList (java.util.ArrayList): 14
Configuration (org.apache.hadoop.conf.Configuration): 14
IOException (java.io.IOException): 10
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 10
DatanodeInfo (org.apache.hadoop.hdfs.protocol.DatanodeInfo): 7
DataNode (org.apache.hadoop.hdfs.server.datanode.DataNode): 7
InetSocketAddress (java.net.InetSocketAddress): 5
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 5
LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 5
HdfsConfiguration (org.apache.hadoop.hdfs.HdfsConfiguration): 5
LocatedBlocks (org.apache.hadoop.hdfs.protocol.LocatedBlocks): 5
IgfsBlockLocation (org.apache.ignite.igfs.IgfsBlockLocation): 5
IgfsPath (org.apache.ignite.igfs.IgfsPath): 5
HashMap (java.util.HashMap): 4
Random (java.util.Random): 4