Example 1 with OneBlockInfo

Use of org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneBlockInfo in project hadoop by apache.

From the class TestCombineFileInputFormat, method testNodeInputSplit.

@Test
public void testNodeInputSplit() throws IOException, InterruptedException {
    // Regression test for MAPREDUCE-4892. There are 2 nodes, and all blocks are
    // replicated on both of them. The grouping ensures that both nodes get
    // splits instead of just the first node.
    DummyInputFormat inFormat = new DummyInputFormat();
    int numBlocks = 12;
    long totLength = 0;
    long blockSize = 100;
    long maxSize = 200;
    long minSizeNode = 50;
    long minSizeRack = 50;
    String[] locations = { "h1", "h2" };
    String[] racks = new String[0];
    Path path = new Path("hdfs://file");
    OneBlockInfo[] blocks = new OneBlockInfo[numBlocks];
    for (int i = 0; i < numBlocks; ++i) {
        blocks[i] = new OneBlockInfo(path, i * blockSize, blockSize, locations, racks);
        totLength += blockSize;
    }
    List<InputSplit> splits = new ArrayList<InputSplit>();
    HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
    HashMap<String, Set<OneBlockInfo>> nodeToBlocks = new HashMap<String, Set<OneBlockInfo>>();
    OneFileInfo.populateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);
    inFormat.createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength, maxSize, minSizeNode, minSizeRack, splits);
    int expectedSplitCount = (int) (totLength / maxSize);
    assertEquals(expectedSplitCount, splits.size());
    HashMultiset<String> nodeSplits = HashMultiset.create();
    for (int i = 0; i < expectedSplitCount; ++i) {
        InputSplit inSplit = splits.get(i);
        assertEquals(maxSize, inSplit.getLength());
        assertEquals(1, inSplit.getLocations().length);
        nodeSplits.add(inSplit.getLocations()[0]);
    }
    assertEquals(3, nodeSplits.count(locations[0]));
    assertEquals(3, nodeSplits.count(locations[1]));
}
Also used : Path(org.apache.hadoop.fs.Path) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) OneBlockInfo(org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneBlockInfo) List(java.util.List) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
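
Both examples construct a DummyInputFormat, a test helper that this listing does not show. In the Hadoop test class it is essentially a minimal CombineFileInputFormat subclass that exists only so the split-building machinery can be exercised; the sketch below is an assumption of what such a helper looks like, not the verbatim Hadoop code (only the class name DummyInputFormat is taken from the usage above).

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;

// Minimal stand-in for the DummyInputFormat used in the tests above (a sketch,
// not the exact Hadoop helper). CombineFileInputFormat leaves createRecordReader
// abstract, so a subclass is required even though the split-placement tests
// never read any records.
class DummyInputFormat extends CombineFileInputFormat<Text, Text> {
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) throws IOException {
        // Never invoked by the split-placement tests.
        return null;
    }
}

Note that the tests call createSplits and OneFileInfo.populateBlockInfo directly; those helpers have restricted visibility, which is presumably why TestCombineFileInputFormat (and this helper) live in the org.apache.hadoop.mapreduce.lib.input package itself.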

Example 2 with OneBlockInfo

Use of org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneBlockInfo in project hadoop by apache.

From the class TestCombineFileInputFormat, method testNodeDistribution.

@Test
public void testNodeDistribution() throws IOException, InterruptedException {
    DummyInputFormat inFormat = new DummyInputFormat();
    int numBlocks = 60;
    long totLength = 0;
    long blockSize = 100;
    int numNodes = 10;
    long minSizeNode = 50;
    long minSizeRack = 50;
    // 2 blocks (of blockSize 100) per split with maxSplitSize 200.
    int maxSplitSize = 200;
    String[] locations = new String[numNodes];
    for (int i = 0; i < numNodes; i++) {
        locations[i] = "h" + i;
    }
    String[] racks = new String[0];
    Path path = new Path("hdfs://file");
    OneBlockInfo[] blocks = new OneBlockInfo[numBlocks];
    int hostCountBase = 0;
    // Generate block list. Replication 3 per block.
    for (int i = 0; i < numBlocks; i++) {
        int localHostCount = hostCountBase;
        String[] blockHosts = new String[3];
        for (int j = 0; j < 3; j++) {
            int hostNum = localHostCount % numNodes;
            blockHosts[j] = "h" + hostNum;
            localHostCount++;
        }
        hostCountBase++;
        blocks[i] = new OneBlockInfo(path, i * blockSize, blockSize, blockHosts, racks);
        totLength += blockSize;
    }
    List<InputSplit> splits = new ArrayList<InputSplit>();
    HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
    Map<String, Set<OneBlockInfo>> nodeToBlocks = new TreeMap<String, Set<OneBlockInfo>>();
    OneFileInfo.populateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);
    inFormat.createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength, maxSplitSize, minSizeNode, minSizeRack, splits);
    int expectedSplitCount = (int) (totLength / maxSplitSize);
    assertEquals(expectedSplitCount, splits.size());
    // Ensure 90+% of the splits have node local blocks.
    // 100% locality may not always be achieved.
    int numLocalSplits = 0;
    for (InputSplit inputSplit : splits) {
        assertEquals(maxSplitSize, inputSplit.getLength());
        if (inputSplit.getLocations().length == 1) {
            numLocalSplits++;
        }
    }
    assertTrue(numLocalSplits >= 0.9 * splits.size());
}
Also used : Path(org.apache.hadoop.fs.Path) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) OneBlockInfo(org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneBlockInfo) TreeMap(java.util.TreeMap) List(java.util.List) ArrayList(java.util.ArrayList) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
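
Example 2 only asserts that at least 90% of the splits are node-local; it does not check how evenly those splits spread across the ten hosts. If that distribution is of interest, the Guava HashMultiset counting idiom from Example 1 can be reused. The lines below are a hypothetical extension of the test body above, not part of the original Hadoop test:

// Hypothetical addition after the existing assertions: count node-local
// splits per host using com.google.common.collect.HashMultiset.
HashMultiset<String> splitsPerNode = HashMultiset.create();
for (InputSplit split : splits) {
    String[] hosts = split.getLocations();
    if (hosts.length == 1) {
        // Exactly one location means the split was placed node-locally.
        splitsPerNode.add(hosts[0]);
    }
}
for (String host : locations) {
    System.out.println(host + " -> " + splitsPerNode.count(host) + " node-local splits");
}

InputSplit.getLocations() declares IOException and InterruptedException, both of which the enclosing test method already throws, so the snippet compiles in that context.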

Aggregations

ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Set (java.util.Set)2 Path (org.apache.hadoop.fs.Path)2 InputSplit (org.apache.hadoop.mapreduce.InputSplit)2 OneBlockInfo (org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneBlockInfo)2 Test (org.junit.Test)2 TreeMap (java.util.TreeMap)1