Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestMRSequenceFileAsTextInputFormat, method testFormat.
@Test
public void testFormat() throws Exception {
  Job job = Job.getInstance(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");

  int seed = new Random().nextInt();
  Random random = new Random(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 10) + 1) {

    // create a file with length entries
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file,
        IntWritable.class, LongWritable.class);
    try {
      for (int i = 0; i < length; i++) {
        IntWritable key = new IntWritable(i);
        LongWritable value = new LongWritable(10 * i);
        writer.append(key, value);
      }
    } finally {
      writer.close();
    }

    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());

    // try splitting the file in a variety of sizes
    InputFormat<Text, Text> format = new SequenceFileAsTextInputFormat();
    for (int i = 0; i < 3; i++) {
      // check each split
      BitSet bits = new BitSet(length);
      int numSplits =
          random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
      FileInputFormat.setMaxInputSplitSize(job,
          fs.getFileStatus(file).getLen() / numSplits);
      for (InputSplit split : format.getSplits(job)) {
        RecordReader<Text, Text> reader = format.createRecordReader(split, context);
        MapContext<Text, Text, Text, Text> mcontext =
            new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(),
                context.getTaskAttemptID(), reader, null, null,
                MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        Class<?> readerClass = reader.getClass();
        assertEquals("reader class is SequenceFileAsTextRecordReader.",
            SequenceFileAsTextRecordReader.class, readerClass);
        Text key;
        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            key = reader.getCurrentKey();
            int keyInt = Integer.parseInt(key.toString());
            assertFalse("Key in multiple partitions.", bits.get(keyInt));
            bits.set(keyInt);
            count++;
          }
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
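The loop body above is the standard way to drive a new-API RecordReader outside a running job: get the splits, create a reader per split, wrap it in a MapContextImpl, initialize, then iterate nextKeyValue(). A minimal stand-alone sketch of that pattern follows; the class and helper names and the generic signature are illustrative assumptions, and it reuses the same MapReduceTestUtil dummy reporter as the test above.

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.MapReduceTestUtil;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.task.MapContextImpl;

public class SplitReadingSketch {
  // Count every record an InputFormat produces for a job, split by split.
  static <K, V> long countRecords(InputFormat<K, V> format, Job job,
      TaskAttemptContext context) throws Exception {
    long count = 0;
    for (InputSplit split : format.getSplits(job)) {
      RecordReader<K, V> reader = format.createRecordReader(split, context);
      // A reader must be initialized with a context that carries its split;
      // outside a real task, a MapContextImpl with dummy writer/committer works.
      MapContext<K, V, K, V> mcontext = new MapContextImpl<K, V, K, V>(
          job.getConfiguration(), context.getTaskAttemptID(), reader,
          null, null, MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
      try {
        while (reader.nextKeyValue()) {
          count++;
        }
      } finally {
        reader.close();
      }
    }
    return count;
  }
}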
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestMRSequenceFileAsBinaryOutputFormat, method testBinary.
@Test
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  // Re-seed so the verification pass regenerates the same random sequence
  // that was written above.
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
        iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals("Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue("Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
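The DataOutputBuffer-based conversion inside the write loop is the core of using SequenceFileAsBinaryOutputFormat: each key and value is handed over as its raw serialized bytes in a BytesWritable. A small stand-alone sketch of that conversion (the class and method names are illustrative assumptions, not part of the test):

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;

final class WritableBytes {
  // Serialize any Writable into a fresh BytesWritable.
  static BytesWritable toBytes(Writable w) throws IOException {
    DataOutputBuffer buf = new DataOutputBuffer();
    w.write(buf); // the Writable serializes itself into the buffer
    // getData() exposes the whole backing array, which can be larger than the
    // serialized payload, so copy only the first getLength() bytes.
    return new BytesWritable(Arrays.copyOf(buf.getData(), buf.getLength()));
  }
}

The test avoids the per-record allocation by reusing one buffer instead, calling bkey.set(outbuf.getData(), 0, outbuf.getLength()) and then outbuf.reset() for the next field.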
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestCombineFileInputFormat, method testNodeDistribution.
@Test
public void testNodeDistribution() throws IOException, InterruptedException {
  DummyInputFormat inFormat = new DummyInputFormat();
  int numBlocks = 60;
  long totLength = 0;
  long blockSize = 100;
  int numNodes = 10;
  long minSizeNode = 50;
  long minSizeRack = 50;
  // 2 blocks (of 100 bytes each) per 200-byte split.
  int maxSplitSize = 200;

  String[] locations = new String[numNodes];
  for (int i = 0; i < numNodes; i++) {
    locations[i] = "h" + i;
  }
  String[] racks = new String[0];
  Path path = new Path("hdfs://file");

  OneBlockInfo[] blocks = new OneBlockInfo[numBlocks];
  int hostCountBase = 0;
  // Generate block list. Replication 3 per block.
  for (int i = 0; i < numBlocks; i++) {
    int localHostCount = hostCountBase;
    String[] blockHosts = new String[3];
    for (int j = 0; j < 3; j++) {
      int hostNum = localHostCount % numNodes;
      blockHosts[j] = "h" + hostNum;
      localHostCount++;
    }
    hostCountBase++;
    blocks[i] = new OneBlockInfo(path, i * blockSize, blockSize, blockHosts, racks);
    totLength += blockSize;
  }

  List<InputSplit> splits = new ArrayList<InputSplit>();
  HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();
  HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
  HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
  Map<String, Set<OneBlockInfo>> nodeToBlocks = new TreeMap<String, Set<OneBlockInfo>>();

  OneFileInfo.populateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);

  inFormat.createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength,
      maxSplitSize, minSizeNode, minSizeRack, splits);

  int expectedSplitCount = (int) (totLength / maxSplitSize);
  assertEquals(expectedSplitCount, splits.size());

  // Ensure 90+% of the splits have node local blocks.
  // 100% locality may not always be achieved.
  int numLocalSplits = 0;
  for (InputSplit inputSplit : splits) {
    assertEquals(maxSplitSize, inputSplit.getLength());
    if (inputSplit.getLocations().length == 1) {
      numLocalSplits++;
    }
  }
  assertTrue(numLocalSplits >= 0.9 * splits.size());
}
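For reference, the arithmetic behind the assertions above, restated with this test's constants (a worked calculation only, not additional test code):

public class NodeDistributionMath {
  public static void main(String[] args) {
    long totLength = 60 * 100L;                        // 60 blocks of 100 bytes = 6000 bytes
    long maxSplitSize = 200;                           // so each split holds 2 blocks
    long expectedSplitCount = totLength / maxSplitSize; // 30 splits, each of length 200
    long minLocalSplits = (long) (0.9 * expectedSplitCount); // at least 27 must be node-local
    System.out.println(expectedSplitCount + " splits, >= " + minLocalSplits + " node-local");
  }
}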
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestDelegatingInputFormat, method testSplitting.
@SuppressWarnings("unchecked")
public void testSplitting() throws Exception {
  Job job = Job.getInstance();
  MiniDFSCluster dfs = null;
  try {
    dfs = new MiniDFSCluster.Builder(job.getConfiguration()).numDataNodes(4)
        .racks(new String[] { "/rack0", "/rack0", "/rack1", "/rack1" })
        .hosts(new String[] { "host0", "host1", "host2", "host3" })
        .build();
    FileSystem fs = dfs.getFileSystem();

    Path path = getPath("/foo/bar", fs);
    Path path2 = getPath("/foo/baz", fs);
    Path path3 = getPath("/bar/bar", fs);
    Path path4 = getPath("/bar/baz", fs);

    final int numSplits = 100;
    FileInputFormat.setMaxInputSplitSize(job, fs.getFileStatus(path).getLen() / numSplits);
    MultipleInputs.addInputPath(job, path, TextInputFormat.class, MapClass.class);
    MultipleInputs.addInputPath(job, path2, TextInputFormat.class, MapClass2.class);
    MultipleInputs.addInputPath(job, path3, KeyValueTextInputFormat.class, MapClass.class);
    MultipleInputs.addInputPath(job, path4, TextInputFormat.class, MapClass2.class);
    DelegatingInputFormat inFormat = new DelegatingInputFormat();

    int[] bins = new int[3];
    for (InputSplit split : (List<InputSplit>) inFormat.getSplits(job)) {
      assertTrue(split instanceof TaggedInputSplit);
      final TaggedInputSplit tis = (TaggedInputSplit) split;
      int index = -1;
      if (tis.getInputFormatClass().equals(KeyValueTextInputFormat.class)) {
        // path3
        index = 0;
      } else if (tis.getMapperClass().equals(MapClass.class)) {
        // path
        index = 1;
      } else {
        // path2 and path4
        index = 2;
      }
      bins[index]++;
    }

    assertEquals("count is not equal to num splits", numSplits, bins[0]);
    assertEquals("count is not equal to num splits", numSplits, bins[1]);
    assertEquals("count is not equal to 2 * num splits", numSplits * 2, bins[2]);
  } finally {
    if (dfs != null) {
      dfs.shutdown();
    }
  }
}
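A hedged driver-side sketch of the wiring this test exercises: each MultipleInputs.addInputPath call records the format (and per-path mapper) for its path and switches the job over to DelegatingInputFormat and DelegatingMapper, which is why every split seen above is a TaggedInputSplit carrying its own InputFormat and Mapper class. The paths below are placeholders, and the identity Mapper stands in for the test's MapClass/MapClass2.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MultiInputDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "multi-input-sketch");
    // Each path gets its own InputFormat and Mapper; the job's actual input
    // format becomes DelegatingInputFormat, which emits TaggedInputSplits.
    MultipleInputs.addInputPath(job, new Path("/data/text"),
        TextInputFormat.class, Mapper.class);
    MultipleInputs.addInputPath(job, new Path("/data/kv"),
        KeyValueTextInputFormat.class, Mapper.class);
    // ... set output key/value classes, output path, and submit as usual.
  }
}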
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestFixedLengthInputFormat, method testGzipWithTwoInputs.
/**
 * Test using the gzip codec with two input files.
 */
@Test(timeout = 5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);

  // Create two gzipped files of fixed-length records, 5 bytes per record.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");

  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }

  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
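FixedLengthInputFormat only works once a positive record length has been configured, and with a non-splittable codec such as gzip each .gz file becomes exactly one InputSplit whose 5-byte records are carved out of the decompressed stream. A minimal configuration sketch follows; the job name and input path are placeholders, and the record length mirrors the test above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;

public class FixedLengthConfigSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "fixed-length-sketch");
    // The record length must be set to a positive value before reading records.
    FixedLengthInputFormat.setRecordLength(job.getConfiguration(), 5);
    job.setInputFormatClass(FixedLengthInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("/data/fixed"));
    // Compressed inputs (e.g. *.gz) are not split; each file is one InputSplit.
  }
}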