
Example 11 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestCombineFileInputFormat, method testNodeInputSplit.

@Test
public void testNodeInputSplit() throws IOException, InterruptedException {
    // Regression test for MAPREDUCE-4892. There are 2 nodes with all blocks on 
    // both nodes. The grouping ensures that both nodes get splits instead of 
    // just the first node
    DummyInputFormat inFormat = new DummyInputFormat();
    int numBlocks = 12;
    long totLength = 0;
    long blockSize = 100;
    long maxSize = 200;
    long minSizeNode = 50;
    long minSizeRack = 50;
    String[] locations = { "h1", "h2" };
    String[] racks = new String[0];
    Path path = new Path("hdfs://file");
    OneBlockInfo[] blocks = new OneBlockInfo[numBlocks];
    for (int i = 0; i < numBlocks; ++i) {
        blocks[i] = new OneBlockInfo(path, i * blockSize, blockSize, locations, racks);
        totLength += blockSize;
    }
    List<InputSplit> splits = new ArrayList<InputSplit>();
    HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();
    HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
    HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
    HashMap<String, Set<OneBlockInfo>> nodeToBlocks = new HashMap<String, Set<OneBlockInfo>>();
    OneFileInfo.populateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);
    inFormat.createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength, maxSize, minSizeNode, minSizeRack, splits);
    int expectedSplitCount = (int) (totLength / maxSize);
    assertEquals(expectedSplitCount, splits.size());
    HashMultiset<String> nodeSplits = HashMultiset.create();
    for (int i = 0; i < expectedSplitCount; ++i) {
        InputSplit inSplit = splits.get(i);
        assertEquals(maxSize, inSplit.getLength());
        assertEquals(1, inSplit.getLocations().length);
        nodeSplits.add(inSplit.getLocations()[0]);
    }
    assertEquals(3, nodeSplits.count(locations[0]));
    assertEquals(3, nodeSplits.count(locations[1]));
}
Also used : Path(org.apache.hadoop.fs.Path) Set(java.util.Set) HashSet(java.util.HashSet) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) List(java.util.List) OneBlockInfo(org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat.OneBlockInfo) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
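
Note: the maxSize, minSizeNode, and minSizeRack arguments exercised above correspond to public job configuration in application code, which cannot reach the protected setters on CombineFileInputFormat. Below is a minimal driver sketch, assuming the standard Hadoop 2.x+ property names; the job name and input path are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class CombineSplitSizing {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Upper bound on bytes combined into one split (maxSize in the test).
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 256L * 1024 * 1024);
        // Node-local leftovers below this are deferred to rack-level grouping.
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 64L * 1024 * 1024);
        // Rack-local leftovers below this are combined across racks.
        conf.setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 64L * 1024 * 1024);

        Job job = Job.getInstance(conf, "combine-split-sizing");  // hypothetical job name
        job.setInputFormatClass(CombineTextInputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));    // hypothetical input path
        // ...set mapper, reducer, and output as usual, then job.waitForCompletion(true)...
    }
}

With these settings, node-local leftovers smaller than the per-node minimum are pooled at the rack level, and leftovers below the per-rack minimum are combined across racks, which is the grouping behavior the regression test above checks.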

Example 12 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestCombineFileInputFormat, method testSplitPlacement.

/**
   * The test suppresses unchecked warnings in
   * {@link org.mockito.Mockito#reset}. Although calling the method is
   * bad practice, we call it instead of splitting the test
   * (i.e. restarting MiniDFSCluster) to save time.
   */
@Test
@SuppressWarnings("unchecked")
public void testSplitPlacement() throws Exception {
    MiniDFSCluster dfs = null;
    FileSystem fileSys = null;
    try {
        /* Start 3 datanodes, one each in rack r1, r2, r3. Create five files:
       * 1) file1 and file5, just after starting the datanode on r1, with
       *    a replication factor of 1, and,
       * 2) file2, just after starting the datanode on r2, with
       *    a replication factor of 2, and,
       * 3) file3 and file4, after starting all three datanodes, with a
       *    replication factor of 3.
       * At the end, file1 and file5 will be present only on datanode1,
       * file2 will be present on datanode1 and datanode2, and
       * file3 and file4 will be present on all datanodes.
       */
        Configuration conf = new Configuration();
        conf.setBoolean("dfs.replication.considerLoad", false);
        dfs = new MiniDFSCluster.Builder(conf).racks(rack1).hosts(hosts1).build();
        dfs.waitActive();
        fileSys = dfs.getFileSystem();
        if (!fileSys.mkdirs(inDir)) {
            throw new IOException("Mkdirs failed to create " + inDir.toString());
        }
        Path file1 = new Path(dir1 + "/file1");
        writeFile(conf, file1, (short) 1, 1);
        // create another file on the same datanode
        Path file5 = new Path(dir5 + "/file5");
        writeFile(conf, file5, (short) 1, 1);
        // split it using a CombineFile input format
        DummyInputFormat inFormat = new DummyInputFormat();
        Job job = Job.getInstance(conf);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir5);
        List<InputSplit> splits = inFormat.getSplits(job);
        System.out.println("Made splits(Test0): " + splits.size());
        for (InputSplit split : splits) {
            System.out.println("File split(Test0): " + split);
        }
        assertEquals(1, splits.size());
        CombineFileSplit fileSplit = (CombineFileSplit) splits.get(0);
        assertEquals(2, fileSplit.getNumPaths());
        assertEquals(1, fileSplit.getLocations().length);
        assertEquals(file1.getName(), fileSplit.getPath(0).getName());
        assertEquals(0, fileSplit.getOffset(0));
        assertEquals(BLOCKSIZE, fileSplit.getLength(0));
        assertEquals(file5.getName(), fileSplit.getPath(1).getName());
        assertEquals(0, fileSplit.getOffset(1));
        assertEquals(BLOCKSIZE, fileSplit.getLength(1));
        assertEquals(hosts1[0], fileSplit.getLocations()[0]);
        dfs.startDataNodes(conf, 1, true, null, rack2, hosts2, null);
        dfs.waitActive();
        // create file on two datanodes.
        Path file2 = new Path(dir2 + "/file2");
        writeFile(conf, file2, (short) 2, 2);
        // split it using a CombineFile input format
        inFormat = new DummyInputFormat();
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2);
        inFormat.setMinSplitSizeRack(BLOCKSIZE);
        splits = inFormat.getSplits(job);
        System.out.println("Made splits(Test1): " + splits.size());
        for (InputSplit split : splits) {
            System.out.println("File split(Test1): " + split);
        }
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            /**
         * If rack1 is processed first by
         * {@link CombineFileInputFormat#createSplits},
         * create only one split on rack1. Otherwise create two splits.
         */
            if (splits.size() == 2) {
                // first split is on rack2, contains file2
                if (split.equals(splits.get(0))) {
                    assertEquals(2, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file2.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file2.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                // second split is on rack1, contains file1
                if (split.equals(splits.get(1))) {
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 1) {
                // first split is on rack1, contains file1 and file2.
                assertEquals(3, fileSplit.getNumPaths());
                Set<Split> expected = new HashSet<>();
                expected.add(new Split(file1.getName(), BLOCKSIZE, 0));
                expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
                expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
                List<Split> actual = new ArrayList<>();
                for (int i = 0; i < 3; i++) {
                    String name = fileSplit.getPath(i).getName();
                    long length = fileSplit.getLength(i);
                    long offset = fileSplit.getOffset(i);
                    actual.add(new Split(name, length, offset));
                }
                assertTrue(actual.containsAll(expected));
                assertEquals(1, fileSplit.getLocations().length);
                assertEquals(hosts1[0], fileSplit.getLocations()[0]);
            } else {
                fail("Expected split size is 1 or 2, but actual size is " + splits.size());
            }
        }
        // create another file on 3 datanodes and 3 racks.
        dfs.startDataNodes(conf, 1, true, null, rack3, hosts3, null);
        dfs.waitActive();
        Path file3 = new Path(dir3 + "/file3");
        writeFile(conf, file3, (short) 3, 3);
        inFormat = new DummyInputFormat();
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3);
        inFormat.setMinSplitSizeRack(BLOCKSIZE);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test2): " + split);
        }
        Set<Split> expected = new HashSet<>();
        expected.add(new Split(file1.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
        expected.add(new Split(file3.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE));
        expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE * 2));
        List<Split> actual = new ArrayList<>();
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            /**
         * If rack1 is processed first by
         * {@link CombineFileInputFormat#createSplits},
         * create only one split on rack1.
         * If rack2 or rack3 is processed first and rack1 is processed second,
         * create one split on rack2 or rack3 and the other split is on rack1.
         * Otherwise create 3 splits for each rack.
         */
            if (splits.size() == 3) {
                // first split is on rack3, contains file3
                if (split.equals(splits.get(0))) {
                    assertEquals(3, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file3.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file3.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(file3.getName(), fileSplit.getPath(2).getName());
                    assertEquals(2 * BLOCKSIZE, fileSplit.getOffset(2));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(2));
                    assertEquals(hosts3[0], fileSplit.getLocations()[0]);
                }
                // second split is on rack2, contains file2
                if (split.equals(splits.get(1))) {
                    assertEquals(2, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file2.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file2.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                // third split is on rack1, contains file1
                if (split.equals(splits.get(2))) {
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 2) {
                // first split is on rack2 or rack3, contains one or two files.
                if (split.equals(splits.get(0))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    if (fileSplit.getLocations()[0].equals(hosts2[0])) {
                        assertEquals(2, fileSplit.getNumPaths());
                    } else if (fileSplit.getLocations()[0].equals(hosts3[0])) {
                        assertEquals(3, fileSplit.getNumPaths());
                    } else {
                        fail("First split should be on rack2 or rack3.");
                    }
                }
                // second split is on rack1, contains the rest of the files.
                if (split.equals(splits.get(1))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 1) {
                // first split is on rack1, contains all three files.
                assertEquals(1, fileSplit.getLocations().length);
                assertEquals(6, fileSplit.getNumPaths());
                assertEquals(hosts1[0], fileSplit.getLocations()[0]);
            } else {
                fail("Split size should be 1, 2, or 3.");
            }
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
        }
        assertEquals(6, actual.size());
        assertTrue(actual.containsAll(expected));
        // create file4 on all three racks
        Path file4 = new Path(dir4 + "/file4");
        writeFile(conf, file4, (short) 3, 3);
        inFormat = new DummyInputFormat();
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        inFormat.setMinSplitSizeRack(BLOCKSIZE);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test3): " + split);
        }
        expected.add(new Split(file4.getName(), BLOCKSIZE, 0));
        expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE));
        expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE * 2));
        actual.clear();
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            /**
         * If rack1 is processed first by
         * {@link CombineFileInputFormat#createSplits},
         * create only one split on rack1.
         * If rack2 or rack3 is processed first and rack1 is processed second,
         * create one split on rack2 or rack3 and the other split is on rack1.
         * Otherwise create 3 splits for each rack.
         */
            if (splits.size() == 3) {
                // first split is on rack3, contains file3 and file4
                if (split.equals(splits.get(0))) {
                    assertEquals(6, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts3[0], fileSplit.getLocations()[0]);
                }
                // second split is on rack2, contains file2
                if (split.equals(splits.get(1))) {
                    assertEquals(2, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file2.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(file2.getName(), fileSplit.getPath(1).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getOffset(1));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(1));
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                // third split is on rack1, contains file1
                if (split.equals(splits.get(2))) {
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 2) {
                // first split is on rack2 or rack3, contains two or three files.
                if (split.equals(splits.get(0))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    if (fileSplit.getLocations()[0].equals(hosts2[0])) {
                        assertEquals(5, fileSplit.getNumPaths());
                    } else if (fileSplit.getLocations()[0].equals(hosts3[0])) {
                        assertEquals(6, fileSplit.getNumPaths());
                    } else {
                        fail("First split should be on rack2 or rack3.");
                    }
                }
                // second split is on rack1, contains the rest of the files.
                if (split.equals(splits.get(1))) {
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
            } else if (splits.size() == 1) {
                // first split is on rack1, contains all four files.
                assertEquals(1, fileSplit.getLocations().length);
                assertEquals(9, fileSplit.getNumPaths());
                assertEquals(hosts1[0], fileSplit.getLocations()[0]);
            } else {
                fail("Split size should be 1, 2, or 3.");
            }
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        // maximum split size is 2 blocks 
        inFormat = new DummyInputFormat();
        inFormat.setMinSplitSizeNode(BLOCKSIZE);
        inFormat.setMaxSplitSize(2 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test4): " + split);
        }
        assertEquals(5, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        // verify the splits are on all the racks
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        verify(mockList, atLeastOnce()).add(hosts2[0]);
        verify(mockList, atLeastOnce()).add(hosts3[0]);
        // maximum split size is 3 blocks 
        inFormat = new DummyInputFormat();
        inFormat.setMinSplitSizeNode(BLOCKSIZE);
        inFormat.setMaxSplitSize(3 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test5): " + split);
        }
        assertEquals(3, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        verify(mockList, atLeastOnce()).add(hosts2[0]);
        // maximum split size is 4 blocks 
        inFormat = new DummyInputFormat();
        inFormat.setMaxSplitSize(4 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test6): " + split);
        }
        assertEquals(3, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        // maximum split size is 7 blocks and min is 3 blocks
        inFormat = new DummyInputFormat();
        inFormat.setMaxSplitSize(7 * BLOCKSIZE);
        inFormat.setMinSplitSizeNode(3 * BLOCKSIZE);
        inFormat.setMinSplitSizeRack(3 * BLOCKSIZE);
        FileInputFormat.setInputPaths(job, dir1 + "," + dir2 + "," + dir3 + "," + dir4);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test7): " + split);
        }
        assertEquals(2, splits.size());
        actual.clear();
        reset(mockList);
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            for (int i = 0; i < fileSplit.getNumPaths(); i++) {
                String name = fileSplit.getPath(i).getName();
                long length = fileSplit.getLength(i);
                long offset = fileSplit.getOffset(i);
                actual.add(new Split(name, length, offset));
            }
            mockList.add(fileSplit.getLocations()[0]);
        }
        assertEquals(9, actual.size());
        assertTrue(actual.containsAll(expected));
        verify(mockList, atLeastOnce()).add(hosts1[0]);
        // Rack 1 has file1, file2 and file3 and file4
        // Rack 2 has file2 and file3 and file4
        // Rack 3 has file3 and file4
        // set up a filter so that only (file1 and file2) or (file3 and file4)
        // can be combined
        inFormat = new DummyInputFormat();
        FileInputFormat.addInputPath(job, inDir);
        // everything is at least rack local
        inFormat.setMinSplitSizeRack(1);
        inFormat.createPool(new TestFilter(dir1), new TestFilter(dir2));
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test1): " + split);
        }
        for (InputSplit split : splits) {
            fileSplit = (CombineFileSplit) split;
            if (splits.size() == 2) {
                // first split is on rack1, contains file1 and file2.
                if (split.equals(splits.get(0))) {
                    assertEquals(3, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file1.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
                    actual.clear();
                    for (int i = 0; i < 3; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
                if (split.equals(splits.get(1))) {
                    // second split contains file3 and file4; however,
                    // the location is undetermined.
                    assertEquals(6, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file3.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    actual.clear();
                    for (int i = 0; i < 6; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                }
            } else if (splits.size() == 3) {
                if (split.equals(splits.get(0))) {
                    // first split is on rack2, contains file2
                    assertEquals(2, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file2.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file2.getName(), BLOCKSIZE, BLOCKSIZE));
                    actual.clear();
                    for (int i = 0; i < 2; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts2[0], fileSplit.getLocations()[0]);
                }
                if (split.equals(splits.get(1))) {
                    // second split is on rack1, contains file1
                    assertEquals(1, fileSplit.getNumPaths());
                    assertEquals(file1.getName(), fileSplit.getPath(0).getName());
                    assertEquals(BLOCKSIZE, fileSplit.getLength(0));
                    assertEquals(0, fileSplit.getOffset(0));
                    assertEquals(1, fileSplit.getLocations().length);
                    assertEquals(hosts1[0], fileSplit.getLocations()[0]);
                }
                if (split.equals(splits.get(2))) {
                    // third split contains file3 and file4; however,
                    // the location is undetermined.
                    assertEquals(6, fileSplit.getNumPaths());
                    expected.clear();
                    expected.add(new Split(file3.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file3.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, 0));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE));
                    expected.add(new Split(file4.getName(), BLOCKSIZE, BLOCKSIZE * 2));
                    actual.clear();
                    for (int i = 0; i < 6; i++) {
                        String name = fileSplit.getPath(i).getName();
                        long length = fileSplit.getLength(i);
                        long offset = fileSplit.getOffset(i);
                        actual.add(new Split(name, length, offset));
                    }
                    assertTrue(actual.containsAll(expected));
                    assertEquals(1, fileSplit.getLocations().length);
                }
            } else {
                fail("Split size should be 2 or 3.");
            }
        }
        // measure performance when there are multiple pools and
        // many files in each pool.
        int numPools = 100;
        int numFiles = 1000;
        DummyInputFormat1 inFormat1 = new DummyInputFormat1();
        for (int i = 0; i < numFiles; i++) {
            FileInputFormat.setInputPaths(job, file1);
        }
        // everything is at least rack local
        inFormat1.setMinSplitSizeRack(1);
        final Path dirNoMatch1 = new Path(inDir, "/dirxx");
        final Path dirNoMatch2 = new Path(inDir, "/diryy");
        for (int i = 0; i < numPools; i++) {
            inFormat1.createPool(new TestFilter(dirNoMatch1), new TestFilter(dirNoMatch2));
        }
        long start = System.currentTimeMillis();
        splits = inFormat1.getSplits(job);
        long end = System.currentTimeMillis();
        System.out.println("Elapsed time for " + numPools + " pools " + " and " + numFiles + " files is " + ((end - start) / 1000) + " seconds.");
        // This file has three whole blocks. If the max split size is
        // half the block size, then there should be six splits.
        inFormat = new DummyInputFormat();
        inFormat.setMaxSplitSize(BLOCKSIZE / 2);
        FileInputFormat.setInputPaths(job, dir3);
        splits = inFormat.getSplits(job);
        for (InputSplit split : splits) {
            System.out.println("File split(Test8): " + split);
        }
        assertEquals(6, splits.size());
    } finally {
        if (dfs != null) {
            dfs.shutdown();
        }
    }
}
Also used : Set(java.util.Set) HashSet(java.util.HashSet) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) List(java.util.List) FileSystem(org.apache.hadoop.fs.FileSystem) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Path(org.apache.hadoop.fs.Path) MiniDFSCluster(org.apache.hadoop.hdfs.MiniDFSCluster) IOException(java.io.IOException) Test(org.junit.Test)
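
The assertions above compare CombineFileSplit contents through a Split value helper defined elsewhere in TestCombineFileInputFormat and not shown here. A sketch of its likely shape follows; this is an assumption rather than the actual source, but containsAll() only needs a (name, length, offset) triple with value-based equals and hashCode.

import java.util.Objects;

// Hypothetical reconstruction of the test's Split helper: an immutable
// (name, length, offset) triple used to compare expected vs. actual chunks.
final class Split {
    private final String name;
    private final long length;
    private final long offset;

    Split(String name, long length, long offset) {
        this.name = name;
        this.length = length;
        this.offset = offset;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (!(o instanceof Split)) return false;
        Split s = (Split) o;
        return length == s.length && offset == s.offset && Objects.equals(name, s.name);
    }

    @Override
    public int hashCode() {
        return Objects.hash(name, length, offset);
    }

    @Override
    public String toString() {
        return name + "[off=" + offset + ", len=" + length + "]";
    }
}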

Example 13 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestCombineSequenceFileInputFormat, method testFormat.

@Test(timeout = 10000)
public void testFormat() throws IOException, InterruptedException {
    Job job = Job.getInstance(conf);
    Random random = new Random();
    long seed = random.nextLong();
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create files with a variety of lengths
    createFiles(length, numFiles, random, job);
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    // create a combine split for the files
    InputFormat<IntWritable, BytesWritable> format = new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        List<InputSplit> splits = format.getSplits(job);
        LOG.info("splitting: got =        " + splits.size());
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.size());
        InputSplit split = splits.get(0);
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        RecordReader<IntWritable, BytesWritable> reader = format.createRecordReader(split, context);
        MapContext<IntWritable, BytesWritable, IntWritable, BytesWritable> mcontext = new MapContextImpl<IntWritable, BytesWritable, IntWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        assertEquals("reader class is CombineFileRecordReader.", CombineFileRecordReader.class, reader.getClass());
        try {
            while (reader.nextKeyValue()) {
                IntWritable key = reader.getCurrentKey();
                BytesWritable value = reader.getCurrentValue();
                assertNotNull("Value should not be null.", value);
                final int k = key.get();
                LOG.debug("read " + k);
                assertFalse("Key in multiple partitions.", bits.get(k));
                bits.set(k);
            }
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used : MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) BitSet(java.util.BitSet) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BytesWritable(org.apache.hadoop.io.BytesWritable) Random(java.util.Random) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
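
The createFiles helper is defined elsewhere in the test class and is not shown. A plausible sketch follows, assuming it spreads keys 0 through length-1 across numFiles SequenceFiles with randomly sized values, which is what the BitSet cardinality check at the end of the test requires; the class and file names here are illustrative.

import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

final class SeqFileFixtures {
    // Writes numFiles SequenceFiles whose IntWritable keys jointly cover
    // 0..length-1 exactly once, so every key read maps to one BitSet bit.
    static void createFiles(Configuration conf, Path workDir, int length,
                            int numFiles, Random random) throws IOException {
        int perFile = length / numFiles;
        for (int f = 0; f < numFiles; f++) {
            Path file = new Path(workDir, "part-" + f + ".seq");
            try (SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                    SequenceFile.Writer.file(file),
                    SequenceFile.Writer.keyClass(IntWritable.class),
                    SequenceFile.Writer.valueClass(BytesWritable.class))) {
                int start = f * perFile;
                int end = (f == numFiles - 1) ? length : start + perFile;
                for (int k = start; k < end; k++) {
                    byte[] payload = new byte[random.nextInt(10) + 1];
                    random.nextBytes(payload);
                    writer.append(new IntWritable(k), new BytesWritable(payload));
                }
            }
        }
    }
}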

Example 14 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestCombineTextInputFormat, method testFormat.

@Test(timeout = 10000)
public void testFormat() throws Exception {
    Job job = Job.getInstance(new Configuration(defaultConf));
    Random random = new Random();
    long seed = random.nextLong();
    LOG.info("seed = " + seed);
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create files with various lengths
    createFiles(length, numFiles, random);
    // create a combined split for the files
    CombineTextInputFormat format = new CombineTextInputFormat();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / 20) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        List<InputSplit> splits = format.getSplits(job);
        LOG.info("splitting: got =        " + splits.size());
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.size());
        InputSplit split = splits.get(0);
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        LOG.debug("split= " + split);
        TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
        assertEquals("reader class is CombineFileRecordReader.", CombineFileRecordReader.class, reader.getClass());
        MapContext<LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
            int count = 0;
            while (reader.nextKeyValue()) {
                LongWritable key = reader.getCurrentKey();
                assertNotNull("Key should not be null.", key);
                Text value = reader.getCurrentValue();
                final int v = Integer.parseInt(value.toString());
                LOG.debug("read " + v);
                assertFalse("Key in multiple partitions.", bits.get(v));
                bits.set(v);
                count++;
            }
            LOG.debug("split=" + split + " count=" + count);
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) BitSet(java.util.BitSet) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Text(org.apache.hadoop.io.Text) Random(java.util.Random) LongWritable(org.apache.hadoop.io.LongWritable) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
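
The reader-class assertion above holds because CombineTextInputFormat wires a per-chunk delegate into a CombineFileRecordReader. A minimal sketch of a custom subclass doing the same follows; MyCombineFormat and TextChunkReader are illustrative names, and the wrapper constructor signature is assumed from the public CombineFileRecordReaderWrapper API.

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReaderWrapper;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MyCombineFormat extends CombineFileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException {
        // CombineFileRecordReader opens one TextChunkReader per chunk of the
        // combined split and iterates them as a single record stream.
        return new CombineFileRecordReader<>(
                (CombineFileSplit) split, context, TextChunkReader.class);
    }

    // The delegate must expose a (CombineFileSplit, TaskAttemptContext, Integer)
    // constructor; the wrapper adapts a plain TextInputFormat reader to one chunk.
    public static class TextChunkReader
            extends CombineFileRecordReaderWrapper<LongWritable, Text> {
        public TextChunkReader(CombineFileSplit split, TaskAttemptContext context,
                               Integer idx) throws IOException, InterruptedException {
            super(new TextInputFormat(), split, context, idx);
        }
    }
}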

Example 15 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestCombineTextInputFormat, method testGzip.

/**
   * Test using the gzip codec for reading
   */
@Test(timeout = 10000)
public void testGzip() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, conf);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals("compressed splits == 1", 1, splits.size());
    List<Text> results = readSplit(format, splits.get(0), job);
    assertEquals("splits[0] length", 8, results.size());
    final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    final String[] secondList = { "this is a test", "of gzip" };
    String first = results.get(0).toString();
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) Text(org.apache.hadoop.io.Text) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
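
The writeFile helper used above to create the gzip inputs lives elsewhere in the test class. A hedged sketch follows, assuming it simply wraps the file stream in the codec's compressor so the bytes on disk are gzip-compressed and CombineTextInputFormat decompresses them on read; the class name is illustrative.

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.compress.CompressionCodec;

final class CompressedFixtures {
    static void writeFile(FileSystem fs, Path name, CompressionCodec codec,
                          String contents) throws IOException {
        // codec.createOutputStream compresses everything written through it;
        // closing the writer finishes the gzip stream and the underlying file.
        try (Writer writer = new OutputStreamWriter(
                codec.createOutputStream(fs.create(name)), StandardCharsets.UTF_8)) {
            writer.write(contents);
        }
    }
}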

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160
Configuration (org.apache.hadoop.conf.Configuration): 70
Test (org.junit.Test): 68
ArrayList (java.util.ArrayList): 51
Path (org.apache.hadoop.fs.Path): 43
Job (org.apache.hadoop.mapreduce.Job): 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38
IOException (java.io.IOException): 33
JobContext (org.apache.hadoop.mapreduce.JobContext): 20
LongWritable (org.apache.hadoop.io.LongWritable): 19
FileSystem (org.apache.hadoop.fs.FileSystem): 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13
List (java.util.List): 13
Text (org.apache.hadoop.io.Text): 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13
DBObject (com.mongodb.DBObject): 10
File (java.io.File): 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9