
Example 16 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestMRCJCFileInputFormat, method testLastInputSplitAtSplitBoundary.

@Test
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testLastInputSplitAtSplitBoundary() throws Exception {
    FileInputFormat fif = new FileInputFormatForTest(1024L * 1024 * 1024, 128L * 1024 * 1024);
    Configuration conf = new Configuration();
    JobContext jobContext = mock(JobContext.class);
    when(jobContext.getConfiguration()).thenReturn(conf);
    List<InputSplit> splits = fif.getSplits(jobContext);
    assertEquals(8, splits.size());
    for (int i = 0; i < splits.size(); i++) {
        InputSplit split = splits.get(i);
        assertEquals(("host" + i), split.getLocations()[0]);
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), JobContext (org.apache.hadoop.mapreduce.JobContext), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
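
A note on the helper: FileInputFormatForTest is defined elsewhere in the test class and is not shown on this page. The following is a minimal sketch, assuming the helper fabricates a single 1 GB file backed by 128 MB blocks so that the stock getSplits() produces the eight host-annotated splits asserted above; the class body, the "/fake/file" path, and the port in the block names are assumptions, not the actual Hadoop test code.

import java.io.IOException;
import java.util.Collections;
import java.util.List;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

class FileInputFormatForTest extends FileInputFormat<Text, Text> {

    private final long length;
    private final long blockSize;

    FileInputFormatForTest(long length, long blockSize) {
        this.length = length;
        this.blockSize = blockSize;
    }

    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {
        // Fabricate one block per blockSize bytes, hosted on "host0", "host1", ...
        int blocks = (int) (length / blockSize);
        BlockLocation[] locations = new BlockLocation[blocks];
        for (int i = 0; i < blocks; i++) {
            locations[i] = new BlockLocation(new String[] { "host" + i + ":9866" },
                    new String[] { "host" + i }, i * blockSize, blockSize);
        }
        FileStatus stat = new FileStatus(length, false, 1, blockSize, 0,
                new Path("/fake/file"));
        // A LocatedFileStatus carries its own block locations, so the inherited
        // getSplits() never needs to contact a real FileSystem.
        return Collections.<FileStatus>singletonList(new LocatedFileStatus(stat, locations));
    }

    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) {
        return null; // never invoked by a getSplits()-only test
    }
}

With the default minimum and maximum split sizes, computeSplitSize() returns the block size, so a 1 GB file splits into eight 128 MB splits whose locations come from the fabricated blocks.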

Example 17 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestMRCJCFileInputFormat, method testForEmptyFile.

/**
   * Test when the input file's length is 0.
   */
@Test
public void testForEmptyFile() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fileSys = FileSystem.get(conf);
    Path file = new Path("test/file");
    FSDataOutputStream out = fileSys.create(file, true, conf.getInt("io.file.buffer.size", 4096), (short) 1, (long) 1024);
    out.write(new byte[0]);
    out.close();
    // split it using a File input format
    DummyInputFormat inFormat = new DummyInputFormat();
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, "test");
    List<InputSplit> splits = inFormat.getSplits(job);
    assertEquals(1, splits.size());
    FileSplit fileSplit = (FileSplit) splits.get(0);
    assertEquals(0, fileSplit.getLocations().length);
    assertEquals(file.getName(), fileSplit.getPath().getName());
    assertEquals(0, fileSplit.getStart());
    assertEquals(0, fileSplit.getLength());
    fileSys.delete(file.getParent(), true);
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), FileSystem (org.apache.hadoop.fs.FileSystem), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
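
DummyInputFormat is likewise defined elsewhere in the test class. A minimal stand-in, assuming it exists only to exercise the inherited split logic (the Text type parameters are an assumption), could look like this:

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

class DummyInputFormat extends FileInputFormat<Text, Text> {
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split,
            TaskAttemptContext context) {
        return null; // the test only calls getSplits(), never reads records
    }
}

The test passes because the stock getSplits() special-cases zero-length files: it still emits a single FileSplit of length 0 with an empty host array, which is exactly what the assertions verify.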

Example 18 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestMRKeyValueTextInputFormat, method testFormat.

@Test
public void testFormat() throws Exception {
    Job job = Job.getInstance(new Configuration(defaultConf));
    Path file = new Path(workDir, "test.txt");
    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int MAX_LENGTH = 10000;
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
        LOG.debug("creating; entries = " + length);
        // create a file with length entries
        Writer writer = new OutputStreamWriter(localFs.create(file));
        try {
            for (int i = 0; i < length; i++) {
                writer.write(Integer.toString(i * 2));
                writer.write("\t");
                writer.write(Integer.toString(i));
                writer.write("\n");
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        KeyValueTextInputFormat format = new KeyValueTextInputFormat();
        for (int i = 0; i < 3; i++) {
            // note: the new mapreduce API offers no way to request a split
            // count, so numSplits here only feeds the log line below
            int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
            LOG.debug("splitting: requesting = " + numSplits);
            List<InputSplit> splits = format.getSplits(job);
            LOG.debug("splitting: got =        " + splits.size());
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.size(); j++) {
                LOG.debug("split[" + j + "]= " + splits.get(j));
                TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
                RecordReader<Text, Text> reader = format.createRecordReader(splits.get(j), context);
                Class<?> clazz = reader.getClass();
                assertEquals("reader class is KeyValueLineRecordReader.", KeyValueLineRecordReader.class, clazz);
                MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), splits.get(j));
                reader.initialize(splits.get(j), mcontext);
                Text key = null;
                Text value = null;
                try {
                    int count = 0;
                    while (reader.nextKeyValue()) {
                        key = reader.getCurrentKey();
                        clazz = key.getClass();
                        assertEquals("Key class is Text.", Text.class, clazz);
                        value = reader.getCurrentValue();
                        clazz = value.getClass();
                        assertEquals("Value class is Text.", Text.class, clazz);
                        final int k = Integer.parseInt(key.toString());
                        final int v = Integer.parseInt(value.toString());
                        assertEquals("Bad key", 0, k % 2);
                        assertEquals("Mismatched key/value", k / 2, v);
                        LOG.debug("read " + v);
                        assertFalse("Key in multiple partitions.", bits.get(v));
                        bits.set(v);
                        count++;
                    }
                    LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), BitSet (java.util.BitSet), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), Random (java.util.Random), OutputStreamWriter (java.io.OutputStreamWriter), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Writer (java.io.Writer), Test (org.junit.Test)
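
The tab written between each key and value above is not arbitrary: tab is the default separator of KeyValueLineRecordReader, which this format delegates to. A minimal setup sketch making the separator explicit (the job name and input path are hypothetical; the classes are from org.apache.hadoop.mapreduce.lib.input, plus Job and Configuration):

static Job newKeyValueJob(Configuration conf) throws IOException {
    // Tab is already the default; setting it explicitly documents intent.
    // The upstream constant really is spelled "SEPERATOR".
    conf.set(KeyValueLineRecordReader.KEY_VALUE_SEPERATOR, "\t");
    Job job = Job.getInstance(conf, "kv-read"); // hypothetical job name
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("/input")); // hypothetical path
    return job;
}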

Example 19 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestMRKeyValueTextInputFormat, method testSplitableCodecs.

@Test
public void testSplitableCodecs() throws Exception {
    final Job job = Job.getInstance(defaultConf);
    final Configuration conf = job.getConfiguration();
    // Create the codec
    CompressionCodec codec = null;
    try {
        codec = (CompressionCodec) ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    } catch (ClassNotFoundException cnfe) {
        throw new IOException("Illegal codec!");
    }
    Path file = new Path(workDir, "test" + codec.getDefaultExtension());
    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int MAX_LENGTH = 500000;
    FileInputFormat.setMaxInputSplitSize(job, MAX_LENGTH / 20);
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 4) + 1) {
        LOG.info("creating; entries = " + length);
        // create a file with length entries
        Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
        try {
            for (int i = 0; i < length; i++) {
                writer.write(Integer.toString(i * 2));
                writer.write("\t");
                writer.write(Integer.toString(i));
                writer.write("\n");
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        KeyValueTextInputFormat format = new KeyValueTextInputFormat();
        assertTrue("KVTIF claims not splittable", format.isSplitable(job, file));
        for (int i = 0; i < 3; i++) {
            int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
            LOG.info("splitting: requesting = " + numSplits);
            List<InputSplit> splits = format.getSplits(job);
            LOG.info("splitting: got =        " + splits.size());
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.size(); j++) {
                LOG.debug("split[" + j + "]= " + splits.get(j));
                TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
                RecordReader<Text, Text> reader = format.createRecordReader(splits.get(j), context);
                Class<?> clazz = reader.getClass();
                MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), splits.get(j));
                reader.initialize(splits.get(j), mcontext);
                Text key = null;
                Text value = null;
                try {
                    int count = 0;
                    while (reader.nextKeyValue()) {
                        key = reader.getCurrentKey();
                        value = reader.getCurrentValue();
                        final int k = Integer.parseInt(key.toString());
                        final int v = Integer.parseInt(value.toString());
                        assertEquals("Bad key", 0, k % 2);
                        assertEquals("Mismatched key/value", k / 2, v);
                        LOG.debug("read " + k + "," + v);
                        assertFalse(k + "," + v + " in multiple partitions.", bits.get(v));
                        bits.set(v);
                        count++;
                    }
                    if (count > 0) {
                        LOG.info("splits[" + j + "]=" + splits.get(j) + " count=" + count);
                    } else {
                        LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
                    }
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), BitSet (java.util.BitSet), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), IOException (java.io.IOException), Random (java.util.Random), OutputStreamWriter (java.io.OutputStreamWriter), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Writer (java.io.Writer), Test (org.junit.Test)
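
The isSplitable() assertion holds because BZip2Codec implements SplittableCompressionCodec, so the format can start reading mid-file. A paraphrased sketch of the check the input format performs internally (not the verbatim Hadoop source; the classes are from org.apache.hadoop.io.compress):

static boolean splittable(Configuration conf, Path file) {
    // Uncompressed files are always splittable; compressed files are
    // splittable only if the codec can re-synchronize mid-stream
    // (BZip2Codec can, GzipCodec cannot).
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(file);
    return codec == null || codec instanceof SplittableCompressionCodec;
}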

Example 20 with InputSplit

Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.

The class TestMRSequenceFileAsBinaryInputFormat, method testBinary.

@Test
public void testBinary() throws IOException, InterruptedException {
    Job job = Job.getInstance();
    FileSystem fs = FileSystem.getLocal(job.getConfiguration());
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "testbinary.seq");
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);
    fs.delete(dir, true);
    FileInputFormat.setInputPaths(job, dir);
    Text tkey = new Text();
    Text tval = new Text();
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job.getConfiguration(), file, Text.class, Text.class);
    try {
        for (int i = 0; i < RECORDS; ++i) {
            tkey.set(Integer.toString(r.nextInt(), 36));
            tval.set(Long.toString(r.nextLong(), 36));
            writer.append(tkey, tval);
        }
    } finally {
        writer.close();
    }
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    InputFormat<BytesWritable, BytesWritable> bformat = new SequenceFileAsBinaryInputFormat();
    int count = 0;
    r.setSeed(seed);
    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();
    Text cmpkey = new Text();
    Text cmpval = new Text();
    DataInputBuffer buf = new DataInputBuffer();
    FileInputFormat.setInputPaths(job, file);
    for (InputSplit split : bformat.getSplits(job)) {
        RecordReader<BytesWritable, BytesWritable> reader = bformat.createRecordReader(split, context);
        MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> mcontext = new MapContextImpl<BytesWritable, BytesWritable, BytesWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
            while (reader.nextKeyValue()) {
                bkey = reader.getCurrentKey();
                bval = reader.getCurrentValue();
                tkey.set(Integer.toString(r.nextInt(), 36));
                tval.set(Long.toString(r.nextLong(), 36));
                buf.reset(bkey.getBytes(), bkey.getLength());
                cmpkey.readFields(buf);
                buf.reset(bval.getBytes(), bval.getLength());
                cmpval.readFields(buf);
                assertTrue("Keys don't match: " + "*" + cmpkey.toString() + ":" + tkey.toString() + "*", cmpkey.toString().equals(tkey.toString()));
                assertTrue("Vals don't match: " + "*" + cmpval.toString() + ":" + tval.toString() + "*", cmpval.toString().equals(tval.toString()));
                ++count;
            }
        } finally {
            reader.close();
        }
    }
    assertEquals("Some records not found", RECORDS, count);
}
Also used: Path (org.apache.hadoop.fs.Path), MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), Text (org.apache.hadoop.io.Text), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), BytesWritable (org.apache.hadoop.io.BytesWritable), DataInputBuffer (org.apache.hadoop.io.DataInputBuffer), Random (java.util.Random), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)
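
One modernization note: the SequenceFile.Writer constructor used above is deprecated in current Hadoop releases. An equivalent sketch using the createWriter() options API, with the same file, key class, and value class as the test:

SequenceFile.Writer writer = SequenceFile.createWriter(job.getConfiguration(),
    SequenceFile.Writer.file(file),
    SequenceFile.Writer.keyClass(Text.class),
    SequenceFile.Writer.valueClass(Text.class));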

Aggregations

InputSplit (org.apache.hadoop.mapreduce.InputSplit): 160
Configuration (org.apache.hadoop.conf.Configuration): 70
Test (org.junit.Test): 68
ArrayList (java.util.ArrayList): 51
Path (org.apache.hadoop.fs.Path): 43
Job (org.apache.hadoop.mapreduce.Job): 42
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 38
IOException (java.io.IOException): 33
JobContext (org.apache.hadoop.mapreduce.JobContext): 20
LongWritable (org.apache.hadoop.io.LongWritable): 19
FileSystem (org.apache.hadoop.fs.FileSystem): 16
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 14
MongoInputSplit (com.mongodb.hadoop.input.MongoInputSplit): 13
List (java.util.List): 13
Text (org.apache.hadoop.io.Text): 13
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 13
DBObject (com.mongodb.DBObject): 10
File (java.io.File): 10
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 10
BaseHadoopTest (com.mongodb.hadoop.testutils.BaseHadoopTest): 9