use of org.apache.hadoop.net.NetworkTopology in project hadoop by apache.
the class FileInputFormat method getSplits.
/** Splits files returned by {@link #listStatus(JobConf)} when
* they're too big.*/
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
StopWatch sw = new StopWatch().start();
FileStatus[] files = listStatus(job);
// Save the number of input files for metrics/loadgen
job.setLong(NUM_INPUT_FILES, files.length);
// compute total size
long totalSize = 0;
for (FileStatus file : files) {
// check we have valid files
if (file.isDirectory()) {
throw new IOException("Not a file: " + file.getPath());
}
totalSize += file.getLen();
}
long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
// generate splits
ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
NetworkTopology clusterMap = new NetworkTopology();
for (FileStatus file : files) {
Path path = file.getPath();
long length = file.getLen();
if (length != 0) {
FileSystem fs = path.getFileSystem(job);
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations();
} else {
blkLocations = fs.getFileBlockLocations(file, 0, length);
}
if (isSplitable(fs, path)) {
long blockSize = file.getBlockSize();
long splitSize = computeSplitSize(goalSize, minSize, blockSize);
long bytesRemaining = length;
while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts[0], splitHosts[1]));
bytesRemaining -= splitSize;
}
if (bytesRemaining != 0) {
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, bytesRemaining, clusterMap);
splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts[0], splitHosts[1]));
}
} else {
if (LOG.isDebugEnabled()) {
// Log only if the file is big enough to be splitted
if (length > Math.min(file.getBlockSize(), minSize)) {
LOG.debug("File is not splittable so no parallelization " + "is possible: " + file.getPath());
}
}
String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0, length, clusterMap);
splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
}
} else {
//Create empty hosts array for zero length files
splits.add(makeSplit(path, 0, length, new String[0]));
}
}
sw.stop();
if (LOG.isDebugEnabled()) {
LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
}
return splits.toArray(new FileSplit[splits.size()]);
}
use of org.apache.hadoop.net.NetworkTopology in project hadoop by apache.
the class TestBlockStoragePolicy method testChooseSsdOverDisk.
@Test
public void testChooseSsdOverDisk() throws Exception {
BlockStoragePolicy policy = new BlockStoragePolicy((byte) 9, "TEST1", new StorageType[] { StorageType.SSD, StorageType.DISK, StorageType.ARCHIVE }, new StorageType[] {}, new StorageType[] {});
final String[] racks = { "/d1/r1", "/d1/r1", "/d1/r1" };
final String[] hosts = { "host1", "host2", "host3" };
final StorageType[] disks = { StorageType.DISK, StorageType.DISK, StorageType.DISK };
final DatanodeStorageInfo[] diskStorages = DFSTestUtil.createDatanodeStorageInfos(3, racks, hosts, disks);
final DatanodeDescriptor[] dataNodes = DFSTestUtil.toDatanodeDescriptor(diskStorages);
for (int i = 0; i < dataNodes.length; i++) {
BlockManagerTestUtil.updateStorage(dataNodes[i], new DatanodeStorage("ssd" + i, DatanodeStorage.State.NORMAL, StorageType.SSD));
}
FileSystem.setDefaultUri(conf, "hdfs://localhost:0");
conf.set(DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY, "0.0.0.0:0");
File baseDir = PathUtils.getTestDir(TestReplicationPolicy.class);
conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY, new File(baseDir, "name").getPath());
DFSTestUtil.formatNameNode(conf);
NameNode namenode = new NameNode(conf);
final BlockManager bm = namenode.getNamesystem().getBlockManager();
BlockPlacementPolicy replicator = bm.getBlockPlacementPolicy();
NetworkTopology cluster = bm.getDatanodeManager().getNetworkTopology();
for (DatanodeDescriptor datanode : dataNodes) {
cluster.add(datanode);
}
DatanodeStorageInfo[] targets = replicator.chooseTarget("/foo", 3, dataNodes[0], Collections.<DatanodeStorageInfo>emptyList(), false, new HashSet<Node>(), 0, policy, null);
System.out.println(policy.getName() + ": " + Arrays.asList(targets));
Assert.assertEquals(2, targets.length);
Assert.assertEquals(StorageType.SSD, targets[0].getStorageType());
Assert.assertEquals(StorageType.DISK, targets[1].getStorageType());
}
Aggregations