
Example 11 with NetworkTopology

Use of org.apache.hadoop.net.NetworkTopology in project hadoop by apache.

From the class FileInputFormat, method getSplits.

/** Splits files returned by {@link #listStatus(JobConf)} when
   * they're too big.*/
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    StopWatch sw = new StopWatch().start();
    FileStatus[] files = listStatus(job);
    // Save the number of input files for metrics/loadgen
    job.setLong(NUM_INPUT_FILES, files.length);
    // compute total size
    long totalSize = 0;
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDirectory()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
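    // goalSize is the ideal bytes per split for the requested numSplits; minSize is the
    // configured lower bound (mapreduce.input.fileinputformat.split.minsize).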
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), minSplitSize);
    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
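    // A fresh NetworkTopology models the rack hierarchy of the hosts holding the blocks;
    // getSplitHostsAndCachedHosts uses it to pick the hosts contributing the most bytes to each split.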
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        long length = file.getLen();
        if (length != 0) {
            FileSystem fs = path.getFileSystem(job);
            BlockLocation[] blkLocations;
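            // A LocatedFileStatus already carries block locations, so reuse them and skip
            // the extra getFileBlockLocations() call to the filesystem.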
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, 0, length);
            }
            if (isSplitable(fs, path)) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(goalSize, minSize, blockSize);
                long bytesRemaining = length;
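                // SPLIT_SLOP (1.1) lets the final chunk run up to 10% past splitSize
                // rather than producing a tiny trailing split.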
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, splitSize, clusterMap);
                    splits.add(makeSplit(path, length - bytesRemaining, splitSize, splitHosts[0], splitHosts[1]));
                    bytesRemaining -= splitSize;
                }
                if (bytesRemaining != 0) {
                    String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, length - bytesRemaining, bytesRemaining, clusterMap);
                    splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining, splitHosts[0], splitHosts[1]));
                }
            } else {
                if (LOG.isDebugEnabled()) {
                    // Log only if the file is big enough to be split
                    if (length > Math.min(file.getBlockSize(), minSize)) {
                        LOG.debug("File is not splittable so no parallelization is possible: " + file.getPath());
                    }
                }
                String[][] splitHosts = getSplitHostsAndCachedHosts(blkLocations, 0, length, clusterMap);
                splits.add(makeSplit(path, 0, length, splitHosts[0], splitHosts[1]));
            }
        } else {
            // Create an empty hosts array for zero-length files
            splits.add(makeSplit(path, 0, length, new String[0]));
        }
    }
    sw.stop();
    if (LOG.isDebugEnabled()) {
        LOG.debug("Total # of splits generated by getSplits: " + splits.size() + ", TimeTaken: " + sw.now(TimeUnit.MILLISECONDS));
    }
    return splits.toArray(new FileSplit[splits.size()]);
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus), ArrayList (java.util.ArrayList), IOException (java.io.IOException), BlockLocation (org.apache.hadoop.fs.BlockLocation), StopWatch (org.apache.hadoop.util.StopWatch), NetworkTopology (org.apache.hadoop.net.NetworkTopology), FileSystem (org.apache.hadoop.fs.FileSystem)
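
The split size used in the loop above comes from computeSplitSize, which clamps the per-split goal (total input size divided by the requested number of splits) between the configured minimum and the file's block size. A minimal sketch of that rule, shown standalone rather than as the exact Hadoop source:

// Sketch: bound the goal size below by minSize and above by the HDFS block size,
// so a split never spans more than one block unless minSize forces it to.
protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
    return Math.max(minSize, Math.min(goalSize, blockSize));
}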

Example 12 with NetworkTopology

Use of org.apache.hadoop.net.NetworkTopology in project hadoop by apache.

From the class TestBlockStoragePolicy, method testChooseSsdOverDisk.

@Test
public void testChooseSsdOverDisk() throws Exception {
    BlockStoragePolicy policy = new BlockStoragePolicy((byte) 9, "TEST1", new StorageType[] { StorageType.SSD, StorageType.DISK, StorageType.ARCHIVE }, new StorageType[] {}, new StorageType[] {});
    final String[] racks = { "/d1/r1", "/d1/r1", "/d1/r1" };
    final String[] hosts = { "host1", "host2", "host3" };
    final StorageType[] disks = { StorageType.DISK, StorageType.DISK, StorageType.DISK };
    final DatanodeStorageInfo[] diskStorages = DFSTestUtil.createDatanodeStorageInfos(3, racks, hosts, disks);
    final DatanodeDescriptor[] dataNodes = DFSTestUtil.toDatanodeDescriptor(diskStorages);
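    // The datanodes above were created with DISK storages only; add an SSD storage
    // to each one so the SSD-first test policy has SSD targets to choose.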
    for (int i = 0; i < dataNodes.length; i++) {
        BlockManagerTestUtil.updateStorage(dataNodes[i], new DatanodeStorage("ssd" + i, DatanodeStorage.State.NORMAL, StorageType.SSD));
    }
    FileSystem.setDefaultUri(conf, "hdfs://localhost:0");
    conf.set(DFSConfigKeys.DFS_NAMENODE_HTTP_ADDRESS_KEY, "0.0.0.0:0");
    File baseDir = PathUtils.getTestDir(TestReplicationPolicy.class);
    conf.set(DFSConfigKeys.DFS_NAMENODE_NAME_DIR_KEY, new File(baseDir, "name").getPath());
    DFSTestUtil.formatNameNode(conf);
    NameNode namenode = new NameNode(conf);
    final BlockManager bm = namenode.getNamesystem().getBlockManager();
    BlockPlacementPolicy replicator = bm.getBlockPlacementPolicy();
    NetworkTopology cluster = bm.getDatanodeManager().getNetworkTopology();
    for (DatanodeDescriptor datanode : dataNodes) {
        cluster.add(datanode);
    }
    DatanodeStorageInfo[] targets = replicator.chooseTarget("/foo", 3, dataNodes[0], Collections.<DatanodeStorageInfo>emptyList(), false, new HashSet<Node>(), 0, policy, null);
    System.out.println(policy.getName() + ": " + Arrays.asList(targets));
    Assert.assertEquals(2, targets.length);
    Assert.assertEquals(StorageType.SSD, targets[0].getStorageType());
    Assert.assertEquals(StorageType.DISK, targets[1].getStorageType());
}
Also used: NameNode (org.apache.hadoop.hdfs.server.namenode.NameNode), StorageType (org.apache.hadoop.fs.StorageType), DataNode (org.apache.hadoop.hdfs.server.datanode.DataNode), Node (org.apache.hadoop.net.Node), DatanodeStorage (org.apache.hadoop.hdfs.server.protocol.DatanodeStorage), NetworkTopology (org.apache.hadoop.net.NetworkTopology), File (java.io.File), Test (org.junit.Test)
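
Both examples only construct a NetworkTopology, or obtain one from the DatanodeManager, and hand it to other components. The sketch below exercises the class directly; the host names and rack paths are made up for illustration:

import org.apache.hadoop.net.NetworkTopology;
import org.apache.hadoop.net.Node;
import org.apache.hadoop.net.NodeBase;

public class NetworkTopologyDemo {
    public static void main(String[] args) {
        NetworkTopology topology = new NetworkTopology();
        // Register leaf nodes with their rack location (hypothetical hosts/racks).
        Node n1 = new NodeBase("host1", "/d1/r1");
        Node n2 = new NodeBase("host2", "/d1/r1");
        Node n3 = new NodeBase("host3", "/d1/r2");
        topology.add(n1);
        topology.add(n2);
        topology.add(n3);

        System.out.println("racks  = " + topology.getNumOfRacks());        // 2
        System.out.println("leaves = " + topology.getNumOfLeaves());       // 3
        System.out.println("host1/host2 same rack? " + topology.isOnSameRack(n1, n2)); // true
        System.out.println("host1/host3 same rack? " + topology.isOnSameRack(n1, n3)); // false
    }
}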

Aggregations

NetworkTopology (org.apache.hadoop.net.NetworkTopology): 12
Test (org.junit.Test): 7
Path (org.apache.hadoop.fs.Path): 4
PrintWriter (java.io.PrintWriter): 3
StringWriter (java.io.StringWriter): 3
Writer (java.io.Writer): 3
InetAddress (java.net.InetAddress): 3
HashMap (java.util.HashMap): 3
StorageType (org.apache.hadoop.fs.StorageType): 3
MiniDFSCluster (org.apache.hadoop.hdfs.MiniDFSCluster): 3
HdfsFileStatus (org.apache.hadoop.hdfs.protocol.HdfsFileStatus): 3
DataNode (org.apache.hadoop.hdfs.server.datanode.DataNode): 3
ErasureCodingResult (org.apache.hadoop.hdfs.server.namenode.NamenodeFsck.ErasureCodingResult): 3
ReplicationResult (org.apache.hadoop.hdfs.server.namenode.NamenodeFsck.ReplicationResult): 3
Result (org.apache.hadoop.hdfs.server.namenode.NamenodeFsck.Result): 3
HostsFileWriter (org.apache.hadoop.hdfs.util.HostsFileWriter): 3
Matchers.anyString (org.mockito.Matchers.anyString): 3
File (java.io.File): 2
IOException (java.io.IOException): 2
BlockLocation (org.apache.hadoop.fs.BlockLocation): 2