Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.
From the class InputStriper, method splitFor.
/**
 * @param inputDir Pool used to resolve block locations.
 * @param bytes Target byte count.
 * @param nLocs Number of block locations per split.
 * @return A set of files satisfying the byte count, with locations weighted
 *         to the dominating proportion of input bytes.
 */
CombineFileSplit splitFor(FilePool inputDir, long bytes, int nLocs) throws IOException {
  final ArrayList<Path> paths = new ArrayList<Path>();
  final ArrayList<Long> start = new ArrayList<Long>();
  final ArrayList<Long> length = new ArrayList<Long>();
  // Per-host weight: fraction of the requested bytes served by each host.
  final HashMap<String, Double> sb = new HashMap<String, Double>();
  do {
    paths.add(current.getPath());
    start.add(currentStart);
    final long fromFile = Math.min(bytes, current.getLen() - currentStart);
    length.add(fromFile);
    for (BlockLocation loc : inputDir.locationsFor(current, currentStart, fromFile)) {
      final double tedium = loc.getLength() / (1.0 * bytes);
      for (String l : loc.getHosts()) {
        // Accumulate this block's share of the split for every replica host.
        Double j = sb.get(l);
        if (null == j) {
          sb.put(l, tedium);
        } else {
          sb.put(l, j.doubleValue() + tedium);
        }
      }
    }
    currentStart += fromFile;
    bytes -= fromFile;
    // Switch to a new file if
    // - the current file is uncompressed and completely used
    // - the current file is compressed
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(conf);
    CompressionCodec codec = compressionCodecs.getCodec(current.getPath());
    if (current.getLen() - currentStart == 0 || codec != null) {
      current = files.get(++idx % files.size());
      currentStart = 0;
    }
  } while (bytes > 0);
  // Rank hosts by accumulated weight and keep at most nLocs of them.
  final ArrayList<Entry<String, Double>> sort =
      new ArrayList<Entry<String, Double>>(sb.entrySet());
  Collections.sort(sort, hostRank);
  final String[] hosts = new String[Math.min(nLocs, sort.size())];
  for (int i = 0; i < nLocs && i < sort.size(); ++i) {
    hosts[i] = sort.get(i).getKey();
  }
  return new CombineFileSplit(paths.toArray(new Path[0]), toLongArray(start),
      toLongArray(length), hosts);
}
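The method leans on two helpers from the enclosing InputStriper, hostRank and toLongArray, which are not shown in the snippet. A minimal sketch of what they plausibly do (assumptions: hostRank orders hosts by descending accumulated weight, and toLongArray unboxes the offset/length lists into the primitive arrays CombineFileSplit expects):

// Hypothetical reconstruction of the helpers referenced above; the names match
// the snippet, but the bodies are assumptions about their behavior.
// (Assumes java.util.Comparator, java.util.List, and java.util.Map.Entry are imported.)
static final Comparator<Entry<String, Double>> hostRank =
    new Comparator<Entry<String, Double>>() {
      @Override
      public int compare(Entry<String, Double> a, Entry<String, Double> b) {
        // Descending order: the most heavily weighted hosts come first.
        return Double.compare(b.getValue(), a.getValue());
      }
    };

static long[] toLongArray(List<Long> values) {
  final long[] out = new long[values.size()];
  for (int i = 0; i < out.length; ++i) {
    out[i] = values.get(i);
  }
  return out;
}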
Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.
From the class TestFilePool, method testStriper.
@Test
public void testStriper() throws Exception {
  final Random r = new Random();
  final Configuration conf = new Configuration();
  final FileSystem fs = FileSystem.getLocal(conf).getRaw();
  conf.setLong(FilePool.GRIDMIX_MIN_FILE, 3 * 1024);
  final FilePool pool = new FilePool(conf, base) {
    @Override
    public BlockLocation[] locationsFor(FileStatus stat, long start, long len) throws IOException {
      return new BlockLocation[] { new BlockLocation() };
    }
  };
  pool.refresh();
  final int expectedPoolSize = (NFILES / 2 * (NFILES / 2 + 1) - 6) * 1024;
  final InputStriper striper = new InputStriper(pool, expectedPoolSize);
  int last = 0;
  for (int i = 0; i < expectedPoolSize; last = Math.min(expectedPoolSize - i, r.nextInt(expectedPoolSize))) {
    checkSplitEq(fs, striper.splitFor(pool, last, 0), last);
    i += last;
  }
  final InputStriper striper2 = new InputStriper(pool, expectedPoolSize);
  checkSplitEq(fs, striper2.splitFor(pool, expectedPoolSize, 0), expectedPoolSize);
}
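checkSplitEq is a helper defined elsewhere in TestFilePool. A rough sketch of the check it presumably performs (an assumption: the split accounts for exactly the requested byte count and every range fits inside its backing file):

// Hypothetical sketch of the checkSplitEq helper used above; the real helper
// in TestFilePool may check more or different invariants.
void checkSplitEq(FileSystem fs, CombineFileSplit split, long bytes)
    throws Exception {
  long splitBytes = 0L;
  for (int i = 0; i < split.getNumPaths(); ++i) {
    splitBytes += split.getLength(i);
    final long fileLen = fs.getFileStatus(split.getPath(i)).getLen();
    // Each (offset, length) range must stay within its backing file.
    assertTrue(split.getOffset(i) + split.getLength(i) <= fileLen);
  }
  // The split must account for exactly the requested number of bytes.
  assertEquals(bytes, splitBytes);
}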
Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.
From the class MultiFileSplit, method getLocations.
public String[] getLocations() throws IOException {
  HashSet<String> hostSet = new HashSet<String>();
  for (Path file : getPaths()) {
    FileSystem fs = file.getFileSystem(getJob());
    FileStatus status = fs.getFileStatus(file);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(status, 0, status.getLen());
    if (blkLocations != null && blkLocations.length > 0) {
      // Only the hosts of each file's first block contribute to the result.
      addToSet(hostSet, blkLocations[0].getHosts());
    }
  }
  return hostSet.toArray(new String[hostSet.size()]);
}
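addToSet is a small private helper of MultiFileSplit that is not shown in the snippet; a plausible sketch (assumption: it simply copies the host array into the set) would be:

// Hypothetical sketch of the addToSet helper referenced above.
private void addToSet(Set<String> set, String[] array) {
  for (String s : array) {
    set.add(s);
  }
}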
Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.
From the class FileInputFormat, method getBlockIndex.
protected int getBlockIndex(BlockLocation[] blkLocations, long offset) {
  for (int i = 0; i < blkLocations.length; i++) {
    // is the offset inside this block?
    if ((blkLocations[i].getOffset() <= offset) &&
        (offset < blkLocations[i].getOffset() + blkLocations[i].getLength())) {
      return i;
    }
  }
  BlockLocation last = blkLocations[blkLocations.length - 1];
  long fileLength = last.getOffset() + last.getLength() - 1;
  throw new IllegalArgumentException("Offset " + offset + " is outside of file (0.." + fileLength + ")");
}
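getBlockIndex is typically called while a file is being carved into splits, to find the block that contains a split's start offset and reuse that block's hosts for locality. A hedged sketch of that usage inside a FileInputFormat subclass (the method name and surrounding logic are illustrative, not taken from the original):

// Illustrative sketch only: build a host-aware split for the range starting
// at splitStart by borrowing the hosts of the block that contains the offset.
protected FileSplit hostAwareSplit(Path path, long splitStart, long splitSize,
    BlockLocation[] blkLocations) throws IOException {
  final int blkIndex = getBlockIndex(blkLocations, splitStart);
  final String[] splitHosts = blkLocations[blkIndex].getHosts();
  return makeSplit(path, splitStart, splitSize, splitHosts);
}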
Use of org.apache.hadoop.fs.BlockLocation in project hadoop by apache.
From the class TestGetSplitHosts, method testGetSplitHosts.
@Test
public void testGetSplitHosts() throws Exception {
  int numBlocks = 3;
  int block1Size = 100, block2Size = 150, block3Size = 75;
  int fileSize = block1Size + block2Size + block3Size;
  int replicationFactor = 3;
  NetworkTopology clusterMap = new NetworkTopology();
  BlockLocation[] bs = new BlockLocation[numBlocks];
  String[] block1Hosts = { "host1", "host2", "host3" };
  String[] block1Names = { "host1:100", "host2:100", "host3:100" };
  String[] block1Racks = { "/rack1/", "/rack1/", "/rack2/" };
  String[] block1Paths = new String[replicationFactor];
  for (int i = 0; i < replicationFactor; i++) {
    block1Paths[i] = block1Racks[i] + block1Names[i];
  }
  bs[0] = new BlockLocation(block1Names, block1Hosts, block1Paths, 0, block1Size);
  String[] block2Hosts = { "host4", "host5", "host6" };
  String[] block2Names = { "host4:100", "host5:100", "host6:100" };
  String[] block2Racks = { "/rack2/", "/rack3/", "/rack3/" };
  String[] block2Paths = new String[replicationFactor];
  for (int i = 0; i < replicationFactor; i++) {
    block2Paths[i] = block2Racks[i] + block2Names[i];
  }
  bs[1] = new BlockLocation(block2Names, block2Hosts, block2Paths, block1Size, block2Size);
  String[] block3Hosts = { "host1", "host7", "host8" };
  String[] block3Names = { "host1:100", "host7:100", "host8:100" };
  String[] block3Racks = { "/rack1/", "/rack4/", "/rack4/" };
  String[] block3Paths = new String[replicationFactor];
  for (int i = 0; i < replicationFactor; i++) {
    block3Paths[i] = block3Racks[i] + block3Names[i];
  }
  bs[2] = new BlockLocation(block3Names, block3Hosts, block3Paths, block1Size + block2Size, block3Size);
  SequenceFileInputFormat<String, String> sif = new SequenceFileInputFormat<String, String>();
  String[] hosts = sif.getSplitHosts(bs, 0, fileSize, clusterMap);
  // Contributions by rack are
  //   rack1: 175, rack2: 250, rack3: 150, rack4: 75
  // so the rack2 hosts, host4 and host3, should be returned first,
  // even though their individual contributions are not the highest.
  assertTrue(hosts.length == replicationFactor);
  assertTrue(hosts[0].equalsIgnoreCase("host4"));
  assertTrue(hosts[1].equalsIgnoreCase("host3"));
  assertTrue(hosts[2].equalsIgnoreCase("host1"));
  // Now create the blocks without topology information.
  bs[0] = new BlockLocation(block1Names, block1Hosts, 0, block1Size);
  bs[1] = new BlockLocation(block2Names, block2Hosts, block1Size, block2Size);
  bs[2] = new BlockLocation(block3Names, block3Hosts, block1Size + block2Size, block3Size);
  hosts = sif.getSplitHosts(bs, 0, fileSize, clusterMap);
  // host1 makes the highest contribution among all hosts,
  // so it should be returned before the others.
  assertTrue(hosts.length == replicationFactor);
  assertTrue(hosts[0].equalsIgnoreCase("host1"));
}
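To see where those rack totals come from, the contributions can be tallied directly from the test data. The short stand-alone sketch below is hypothetical; it mimics how getSplitHosts credits each rack at most once per block, but it is not the getSplitHosts algorithm itself:

// Hypothetical tally reproducing the per-rack contributions quoted above.
// (Assumes java.util.Arrays, HashMap, HashSet, and Map are imported.)
Map<String, Long> rackBytes = new HashMap<String, Long>();
long[] blockSizes = { 100, 150, 75 };
String[][] blockRacks = {
  { "/rack1/", "/rack1/", "/rack2/" },
  { "/rack2/", "/rack3/", "/rack3/" },
  { "/rack1/", "/rack4/", "/rack4/" }
};
for (int b = 0; b < blockRacks.length; b++) {
  // Credit each rack at most once per block, regardless of replica count.
  for (String rack : new HashSet<String>(Arrays.asList(blockRacks[b]))) {
    Long prev = rackBytes.get(rack);
    rackBytes.put(rack, (prev == null ? 0L : prev) + blockSizes[b]);
  }
}
// rackBytes is now {rack1=175, rack2=250, rack3=150, rack4=75}.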