
Example 21 with BlockLocation

use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

the class InputStriper method splitFor.

/**
   * @param inputDir Pool used to resolve block locations.
   * @param bytes Target byte count
   * @param nLocs Number of block locations per split.
   * @return A set of files satisfying the byte count, with locations weighted
   *         to the dominating proportion of input bytes.
   */
CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs) throws IOException {
    final ArrayList<Path> paths = new ArrayList<Path>();
    final ArrayList<Long> start = new ArrayList<Long>();
    final ArrayList<Long> length = new ArrayList<Long>();
    final HashMap<String, Double> sb = new HashMap<String, Double>();
    // current, currentStart, files, idx, hostRank, and toLongArray are
    // members of InputStriper, initialized by the surrounding class.
    do {
        paths.add(current.getPath());
        start.add(currentStart);
        final long fromFile = Math.min(bytes, current.getLen() - currentStart);
        length.add(fromFile);
        for (BlockLocation loc : inputDir.locationsFor(current, currentStart, fromFile)) {
            // Credit each replica host with this block's share of the
            // bytes still wanted for the split.
            final double tedium = loc.getLength() / (1.0 * bytes);
            for (String l : loc.getHosts()) {
                Double j = sb.get(l);
                if (null == j) {
                    sb.put(l, tedium);
                } else {
                    sb.put(l, j.doubleValue() + tedium);
                }
            }
        }
        currentStart += fromFile;
        bytes -= fromFile;
        // Switch to a new file if
        //  - the current file is uncompressed and completely used
        //  - the current file is compressed
        CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
        CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
        if (current.getLen() - currentStart == 0 || codec != null) {
            current = files.get(++idx % files.size());
            currentStart = 0;
        }
    } while (bytes > 0);
    final ArrayList<Entry<String, Double>> sort = new ArrayList<Entry<String, Double>>(sb.entrySet());
    Collections.sort(sort, hostRank);
    final String[] hosts = new String[Math.min(nLocs, sort.size())];
    for (int i = 0; i < nLocs && i < sort.size(); ++i) {
        hosts[i] = sort.get(i).getKey();
    }
    return new CombineFileSplit(paths.toArray(new Path[0]), toLongArray(start), toLongArray(length), hosts);
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CombineFileSplit(org.apache.hadoop.mapreduce.lib.input.CombineFileSplit) BlockLocation(org.apache.hadoop.fs.BlockLocation) Entry(java.util.Map.Entry) CompressionCodecFactory(org.apache.hadoop.io.compress.CompressionCodecFactory) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec)
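
To make the weighting step easier to follow in isolation, here is a minimal standalone sketch (the class name and the sample hosts and sizes are hypothetical, not part of Hadoop) of the same accumulation splitFor performs over BlockLocation hosts:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.fs.BlockLocation;

public class HostWeightSketch {
    public static void main(String[] args) throws IOException {
        // Pretend a split wants 200 bytes covered by two blocks.
        final long bytes = 200L;
        BlockLocation[] locs = {
            new BlockLocation(new String[] { "h1:100", "h2:100" },
                              new String[] { "h1", "h2" }, 0, 128),
            new BlockLocation(new String[] { "h2:100", "h3:100" },
                              new String[] { "h2", "h3" }, 128, 72)
        };
        Map<String, Double> weight = new HashMap<String, Double>();
        for (BlockLocation loc : locs) {
            // Same fraction splitFor calls "tedium": this block's
            // share of the requested bytes.
            double share = loc.getLength() / (1.0 * bytes);
            for (String host : loc.getHosts()) {
                Double w = weight.get(host);
                weight.put(host, w == null ? share : w + share);
            }
        }
        // h2 appears in both blocks, so it ends up with the top weight (1.0).
        System.out.println(weight);
    }
}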

Example 22 with BlockLocation

use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

the class TestFilePool method testStriper.

@Test
public void testStriper() throws Exception {
    final Random r = new Random();
    final Configuration conf = new Configuration();
    final FileSystem fs = FileSystem.getLocal(conf).getRaw();
    conf.setLong(FilePool.GRIDMIX_MIN_FILE, 3 * 1024);
    final FilePool pool = new FilePool(conf, base) {

        @Override
        public BlockLocation[] locationsFor(FileStatus stat, long start, long len) throws IOException {
            return new BlockLocation[] { new BlockLocation() };
        }
    };
    pool.refresh();
    final int expectedPoolSize = (NFILES / 2 * (NFILES / 2 + 1) - 6) * 1024;
    final InputStriper striper = new InputStriper(pool, expectedPoolSize);
    int last = 0;
    // Draw random split sizes until the whole pool has been requested once.
    for (int i = 0; i < expectedPoolSize; last = Math.min(expectedPoolSize - i, r.nextInt(expectedPoolSize))) {
        checkSplitEq(fs, striper.splitFor(pool, last, 0), last);
        i += last;
    }
    final InputStriper striper2 = new InputStriper(pool, expectedPoolSize);
    checkSplitEq(fs, striper2.splitFor(pool, expectedPoolSize, 0), expectedPoolSize);
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) Random(java.util.Random) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) BlockLocation(org.apache.hadoop.fs.BlockLocation) Test(org.junit.Test)
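
The override above sidesteps real block resolution by returning a default-constructed BlockLocation. A quick hypothetical sketch of what that stub object looks like, which is all the striper's arithmetic needs:

import org.apache.hadoop.fs.BlockLocation;

public class EmptyLocationSketch {
    public static void main(String[] args) throws Exception {
        BlockLocation loc = new BlockLocation();
        // The no-arg constructor yields empty host/name arrays and a
        // zero offset and length, so no host gathers any weight.
        System.out.println(loc.getHosts().length);  // 0
        System.out.println(loc.getOffset() + ", " + loc.getLength());  // 0, 0
    }
}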

Example 23 with BlockLocation

use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

the class MultiFileSplit method getLocations.

public String[] getLocations() throws IOException {
    HashSet<String> hostSet = new HashSet<String>();
    for (Path file : getPaths()) {
        FileSystem fs = file.getFileSystem(getJob());
        FileStatus status = fs.getFileStatus(file);
        BlockLocation[] blkLocations = fs.getFileBlockLocations(status, 0, status.getLen());
        if (blkLocations != null && blkLocations.length > 0) {
            addToSet(hostSet, blkLocations[0].getHosts());
        }
    }
    return hostSet.toArray(new String[hostSet.size()]);
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) BlockLocation(org.apache.hadoop.fs.BlockLocation) HashSet(java.util.HashSet)
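
Note that getLocations() above samples only the first block's hosts for each file. A self-contained variant (the driver class is hypothetical) that walks every block of a file instead:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class AllBlockHosts {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path file = new Path(args[0]);
        FileSystem fs = file.getFileSystem(conf);
        FileStatus status = fs.getFileStatus(file);
        // Ask for the locations of every byte range in the file.
        BlockLocation[] blocks =
            fs.getFileBlockLocations(status, 0, status.getLen());
        for (BlockLocation b : blocks) {
            System.out.println(b.getOffset() + "+" + b.getLength()
                + " -> " + String.join(",", b.getHosts()));
        }
    }
}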

Example 24 with BlockLocation

use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

the class FileInputFormat method getBlockIndex.

protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
    for (int i = 0; i < blkLocations.length; i++) {
        // is the offset inside this block?
        if ((blkLocations[i].getOffset() <= offset) && (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
            return i;
        }
    }
    BlockLocation last = blkLocations[blkLocations.length - 1];
    long fileLength = last.getOffset() + last.getLength() - 1;
    throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
}
Also used : BlockLocation(org.apache.hadoop.fs.BlockLocation)
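
A small worked check of the offset-to-index mapping, using three hypothetical blocks of sizes 100, 150, and 75 (the same shape as the test data in Example 25 below):

import org.apache.hadoop.fs.BlockLocation;

public class BlockIndexSketch {
    static int getBlockIndex(BlockLocation[] blks, long offset) {
        // Same linear scan as FileInputFormat.getBlockIndex above.
        for (int i = 0; i < blks.length; i++) {
            if (blks[i].getOffset() <= offset
                && offset < blks[i].getOffset() + blks[i].getLength()) {
                return i;
            }
        }
        throw new IllegalArgumentException("Offset " + offset + " is out of range");
    }

    public static void main(String[] args) {
        String[] none = new String[0];
        BlockLocation[] blks = {
            new BlockLocation(none, none, 0, 100),
            new BlockLocation(none, none, 100, 150),
            new BlockLocation(none, none, 250, 75)
        };
        System.out.println(getBlockIndex(blks, 0));    // 0
        System.out.println(getBlockIndex(blks, 249));  // 1
        System.out.println(getBlockIndex(blks, 250));  // 2
    }
}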

Example 25 with BlockLocation

use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.

the class TestGetSplitHosts method testGetSplitHosts.

@Test
public void testGetSplitHosts() throws Exception {
    int numBlocks = 3;
    int block1Size = 100, block2Size = 150, block3Size = 75;
    int fileSize = block1Size + block2Size + block3Size;
    int replicationFactor = 3;
    NetworkTopology clusterMap = new NetworkTopology();
    BlockLocation[] bs = new BlockLocation[numBlocks];
    String[] block1Hosts = { "host1", "host2", "host3" };
    String[] block1Names = { "host1:100", "host2:100", "host3:100" };
    String[] block1Racks = { "/rack1/", "/rack1/", "/rack2/" };
    String[] block1Paths = new String[replicationFactor];
    for (int i = 0; i < replicationFactor; i++) {
        block1Paths[i] = block1Racks[i] + block1Names[i];
    }
    bs[0] = new BlockLocation(block1Names, block1Hosts, block1Paths, 0, block1Size);
    String[] block2Hosts = { "host4", "host5", "host6" };
    String[] block2Names = { "host4:100", "host5:100", "host6:100" };
    String[] block2Racks = { "/rack2/", "/rack3/", "/rack3/" };
    String[] block2Paths = new String[replicationFactor];
    for (int i = 0; i < replicationFactor; i++) {
        block2Paths[i] = block2Racks[i] + block2Names[i];
    }
    bs[1] = new BlockLocation(block2Names, block2Hosts, block2Paths, block1Size, block2Size);
    String[] block3Hosts = { "host1", "host7", "host8" };
    String[] block3Names = { "host1:100", "host7:100", "host8:100" };
    String[] block3Racks = { "/rack1/", "/rack4/", "/rack4/" };
    String[] block3Paths = new String[replicationFactor];
    for (int i = 0; i < replicationFactor; i++) {
        block3Paths[i] = block3Racks[i] + block3Names[i];
    }
    bs[2] = new BlockLocation(block3Names, block3Hosts, block3Paths, block1Size + block2Size, block3Size);
    SequenceFileInputFormat<String, String> sif = new SequenceFileInputFormat<String, String>();
    String[] hosts = sif.getSplitHosts(bs, 0, fileSize, clusterMap);
    // Contributions by racks are:
    //   Rack1  175
    //   Rack2  250
    //   Rack3  150
    // So rack2's hosts, host4 and host3, should be returned
    // even if their individual contributions are not the highest.
    assertTrue(hosts.length == replicationFactor);
    assertTrue(hosts[0].equalsIgnoreCase("host4"));
    assertTrue(hosts[1].equalsIgnoreCase("host3"));
    assertTrue(hosts[2].equalsIgnoreCase("host1"));
    // Now Create the blocks without topology information
    bs[0] = new BlockLocation(block1Names, block1Hosts, 0, block1Size);
    bs[1] = new BlockLocation(block2Names, block2Hosts, block1Size, block2Size);
    bs[2] = new BlockLocation(block3Names, block3Hosts, block1Size + block2Size, block3Size);
    hosts = sif.getSplitHosts(bs, 0, fileSize, clusterMap);
    // host1 makes the highest contribution among all hosts
    // So, that should be returned before others
    assertTrue(hosts.length == replicationFactor);
    assertTrue(hosts[0].equalsIgnoreCase("host1"));
}
Also used : NetworkTopology(org.apache.hadoop.net.NetworkTopology) BlockLocation(org.apache.hadoop.fs.BlockLocation) Test(org.junit.Test)
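
The three per-block stanzas above are identical except for their data; a hypothetical helper that condenses the boilerplate while producing the same topology-aware BlockLocation values:

import org.apache.hadoop.fs.BlockLocation;

public class TopologyBlockSketch {
    static BlockLocation block(String[] hosts, String[] racks,
                               long offset, long length) {
        String[] names = new String[hosts.length];
        String[] paths = new String[hosts.length];
        for (int i = 0; i < hosts.length; i++) {
            names[i] = hosts[i] + ":100";    // host:port, as in the test
            paths[i] = racks[i] + names[i];  // rack path + name
        }
        return new BlockLocation(names, hosts, paths, offset, length);
    }

    public static void main(String[] args) {
        BlockLocation b = block(new String[] { "host1", "host2", "host3" },
                                new String[] { "/rack1/", "/rack1/", "/rack2/" },
                                0, 100);
        System.out.println(b);  // offset, length, and hosts
    }
}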

Aggregations

BlockLocation (org.apache.hadoop.fs.BlockLocation): 87 uses
Path (org.apache.hadoop.fs.Path): 41 uses
FileStatus (org.apache.hadoop.fs.FileStatus): 30 uses
Test (org.junit.Test): 29 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 16 uses
Configuration (org.apache.hadoop.conf.Configuration): 14 uses
ArrayList (java.util.ArrayList): 13 uses
IOException (java.io.IOException): 10 uses
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 10 uses
DatanodeInfo (org.apache.hadoop.hdfs.protocol.DatanodeInfo): 7 uses
DataNode (org.apache.hadoop.hdfs.server.datanode.DataNode): 7 uses
InetSocketAddress (java.net.InetSocketAddress): 5 uses
FSDataInputStream (org.apache.hadoop.fs.FSDataInputStream): 5 uses
LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 5 uses
HdfsConfiguration (org.apache.hadoop.hdfs.HdfsConfiguration): 5 uses
LocatedBlocks (org.apache.hadoop.hdfs.protocol.LocatedBlocks): 5 uses
IgfsBlockLocation (org.apache.ignite.igfs.IgfsBlockLocation): 5 uses
IgfsPath (org.apache.ignite.igfs.IgfsPath): 5 uses
HashMap (java.util.HashMap): 4 uses
Random (java.util.Random): 4 uses