
Example 1 with MapContextImpl

Use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.

From the class TestCombineSequenceFileInputFormat, method testFormat:

@Test(timeout = 10000)
public void testFormat() throws IOException, InterruptedException {
    Job job = Job.getInstance(conf);
    Random random = new Random();
    long seed = random.nextLong();
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create files with a variety of lengths
    createFiles(length, numFiles, random, job);
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    // create a combine split for the files
    InputFormat<IntWritable, BytesWritable> format = new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        List<InputSplit> splits = format.getSplits(job);
        LOG.info("splitting: got =        " + splits.size());
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.size());
        InputSplit split = splits.get(0);
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        RecordReader<IntWritable, BytesWritable> reader = format.createRecordReader(split, context);
        MapContext<IntWritable, BytesWritable, IntWritable, BytesWritable> mcontext = new MapContextImpl<IntWritable, BytesWritable, IntWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        assertEquals("reader class is CombineFileRecordReader.", CombineFileRecordReader.class, reader.getClass());
        try {
            while (reader.nextKeyValue()) {
                IntWritable key = reader.getCurrentKey();
                BytesWritable value = reader.getCurrentValue();
                assertNotNull("Value should not be null.", value);
                final int k = key.get();
                LOG.debug("read " + k);
                assertFalse("Key in multiple partitions.", bits.get(k));
                bits.set(k);
            }
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used: MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), BitSet (java.util.BitSet), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), BytesWritable (org.apache.hadoop.io.BytesWritable), Random (java.util.Random), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
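All five examples wire a RecordReader into a MapContextImpl the same way, so the pattern can be factored into a small helper. The sketch below is not taken from the Hadoop tests; the class and method names are hypothetical, and it assumes MapReduceTestUtil from Hadoop's MapReduce test sources is available on the classpath. The RecordWriter and OutputCommitter arguments are null because a read-only reader test never writes output.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.MapReduceTestUtil;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.task.MapContextImpl;

public final class ReaderTestHarness {

    // Hypothetical helper: wraps a RecordReader in a MapContextImpl and
    // initializes it, mirroring the boilerplate repeated in the tests above.
    public static <K, V> RecordReader<K, V> initReaderForTest(Configuration conf, TaskAttemptContext context, RecordReader<K, V> reader, InputSplit split) throws IOException, InterruptedException {
        MapContext<K, V, K, V> mcontext = new MapContextImpl<K, V, K, V>(
            conf,
            context.getTaskAttemptID(),
            reader,
            // RecordWriter: not needed, the reader under test only reads
            null,
            // OutputCommitter: not needed for the same reason
            null,
            MapReduceTestUtil.createDummyReporter(),
            split);
        reader.initialize(split, mcontext);
        return reader;
    }

    private ReaderTestHarness() {
    }
}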

Example 2 with MapContextImpl

Use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.

From the class TestCombineTextInputFormat, method readSplit:

private static List<Text> readSplit(InputFormat<LongWritable, Text> format, InputSplit split, Job job) throws IOException, InterruptedException {
    List<Text> result = new ArrayList<Text>();
    Configuration conf = job.getConfiguration();
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(conf);
    RecordReader<LongWritable, Text> reader = format.createRecordReader(split, MapReduceTestUtil.createDummyMapTaskAttemptContext(conf));
    MapContext<LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(conf, context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
        result.add(new Text(reader.getCurrentValue()));
    }
    return result;
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), ArrayList (java.util.ArrayList), Text (org.apache.hadoop.io.Text), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), LongWritable (org.apache.hadoop.io.LongWritable)
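A caller for readSplit might look like the hypothetical method below (it is not part of the Hadoop test): it builds the combined splits for the job's input and prints every value read from the first split. It assumes the job's input paths have already been set, for example with FileInputFormat.setInputPaths.

// Hypothetical usage of the readSplit helper above.
private static void printFirstSplit(Job job) throws Exception {
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    List<Text> lines = readSplit(format, splits.get(0), job);
    for (Text line : lines) {
        // each entry is a copy of a record's value (one line of input)
        System.out.println(line);
    }
}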

Example 3 with MapContextImpl

Use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.

From the class TestCombineTextInputFormat, method testFormat:

@Test(timeout = 10000)
public void testFormat() throws Exception {
    Job job = Job.getInstance(new Configuration(defaultConf));
    Random random = new Random();
    long seed = random.nextLong();
    LOG.info("seed = " + seed);
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create files with various lengths
    createFiles(length, numFiles, random);
    // create a combined split for the files
    CombineTextInputFormat format = new CombineTextInputFormat();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / 20) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        List<InputSplit> splits = format.getSplits(job);
        LOG.info("splitting: got =        " + splits.size());
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.size());
        InputSplit split = splits.get(0);
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        LOG.debug("split= " + split);
        TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
        assertEquals("reader class is CombineFileRecordReader.", CombineFileRecordReader.class, reader.getClass());
        MapContext<LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
            int count = 0;
            while (reader.nextKeyValue()) {
                LongWritable key = reader.getCurrentKey();
                assertNotNull("Key should not be null.", key);
                Text value = reader.getCurrentValue();
                final int v = Integer.parseInt(value.toString());
                LOG.debug("read " + v);
                assertFalse("Key in multiple partitions.", bits.get(v));
                bits.set(v);
                count++;
            }
            LOG.debug("split=" + split + " count=" + count);
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), BitSet (java.util.BitSet), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), Text (org.apache.hadoop.io.Text), Random (java.util.Random), LongWritable (org.apache.hadoop.io.LongWritable), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Test (org.junit.Test)

Example 4 with MapContextImpl

Use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.

From the class TestMRKeyValueTextInputFormat, method testFormat:

@Test
public void testFormat() throws Exception {
    Job job = Job.getInstance(new Configuration(defaultConf));
    Path file = new Path(workDir, "test.txt");
    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int MAX_LENGTH = 10000;
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
        LOG.debug("creating; entries = " + length);
        // create a file with length entries
        Writer writer = new OutputStreamWriter(localFs.create(file));
        try {
            for (int i = 0; i < length; i++) {
                writer.write(Integer.toString(i * 2));
                writer.write("\t");
                writer.write(Integer.toString(i));
                writer.write("\n");
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        KeyValueTextInputFormat format = new KeyValueTextInputFormat();
        for (int i = 0; i < 3; i++) {
            int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
            LOG.debug("splitting: requesting = " + numSplits);
            List<InputSplit> splits = format.getSplits(job);
            LOG.debug("splitting: got =        " + splits.size());
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.size(); j++) {
                LOG.debug("split[" + j + "]= " + splits.get(j));
                TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
                RecordReader<Text, Text> reader = format.createRecordReader(splits.get(j), context);
                Class<?> clazz = reader.getClass();
                assertEquals("reader class is KeyValueLineRecordReader.", KeyValueLineRecordReader.class, clazz);
                MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), splits.get(j));
                reader.initialize(splits.get(j), mcontext);
                Text key = null;
                Text value = null;
                try {
                    int count = 0;
                    while (reader.nextKeyValue()) {
                        key = reader.getCurrentKey();
                        clazz = key.getClass();
                        assertEquals("Key class is Text.", Text.class, clazz);
                        value = reader.getCurrentValue();
                        clazz = value.getClass();
                        assertEquals("Value class is Text.", Text.class, clazz);
                        final int k = Integer.parseInt(key.toString());
                        final int v = Integer.parseInt(value.toString());
                        assertEquals("Bad key", 0, k % 2);
                        assertEquals("Mismatched key/value", k / 2, v);
                        LOG.debug("read " + v);
                        assertFalse("Key in multiple partitions.", bits.get(v));
                        bits.set(v);
                        count++;
                    }
                    LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), BitSet (java.util.BitSet), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), Random (java.util.Random), OutputStreamWriter (java.io.OutputStreamWriter), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Writer (java.io.Writer), Test (org.junit.Test)
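KeyValueTextInputFormat splits each input line at the first separator, a tab by default, which is why the test writes key and value joined by "\t". The sketch below shows how the separator could be changed before running a job; the property name is the one used by the new (org.apache.hadoop.mapreduce) API and should be checked against the Hadoop version in use, and the method itself is hypothetical, not part of the test.

// Hedged sketch: configure KeyValueTextInputFormat to split on a comma
// instead of the default tab.
private static void useCommaSeparator(Job job) {
    Configuration conf = job.getConfiguration();
    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", ",");
    job.setInputFormatClass(KeyValueTextInputFormat.class);
}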

Example 5 with MapContextImpl

Use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.

From the class TestMRKeyValueTextInputFormat, method testSplitableCodecs:

@Test
public void testSplitableCodecs() throws Exception {
    final Job job = Job.getInstance(defaultConf);
    final Configuration conf = job.getConfiguration();
    // Create the codec
    CompressionCodec codec = null;
    try {
        codec = (CompressionCodec) ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
    } catch (ClassNotFoundException cnfe) {
        throw new IOException("Illegal codec!");
    }
    Path file = new Path(workDir, "test" + codec.getDefaultExtension());
    int seed = new Random().nextInt();
    LOG.info("seed = " + seed);
    Random random = new Random(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int MAX_LENGTH = 500000;
    FileInputFormat.setMaxInputSplitSize(job, MAX_LENGTH / 20);
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 4) + 1) {
        LOG.info("creating; entries = " + length);
        // create a file with length entries
        Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
        try {
            for (int i = 0; i < length; i++) {
                writer.write(Integer.toString(i * 2));
                writer.write("\t");
                writer.write(Integer.toString(i));
                writer.write("\n");
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        KeyValueTextInputFormat format = new KeyValueTextInputFormat();
        assertTrue("KVTIF claims not splittable", format.isSplitable(job, file));
        for (int i = 0; i < 3; i++) {
            int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
            LOG.info("splitting: requesting = " + numSplits);
            List<InputSplit> splits = format.getSplits(job);
            LOG.info("splitting: got =        " + splits.size());
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.size(); j++) {
                LOG.debug("split[" + j + "]= " + splits.get(j));
                TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
                RecordReader<Text, Text> reader = format.createRecordReader(splits.get(j), context);
                Class<?> clazz = reader.getClass();
                MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), splits.get(j));
                reader.initialize(splits.get(j), mcontext);
                Text key = null;
                Text value = null;
                try {
                    int count = 0;
                    while (reader.nextKeyValue()) {
                        key = reader.getCurrentKey();
                        value = reader.getCurrentValue();
                        final int k = Integer.parseInt(key.toString());
                        final int v = Integer.parseInt(value.toString());
                        assertEquals("Bad key", 0, k % 2);
                        assertEquals("Mismatched key/value", k / 2, v);
                        LOG.debug("read " + k + "," + v);
                        assertFalse(k + "," + v + " in multiple partitions.", bits.get(v));
                        bits.set(v);
                        count++;
                    }
                    if (count > 0) {
                        LOG.info("splits[" + j + "]=" + splits.get(j) + " count=" + count);
                    } else {
                        LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
                    }
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl), BitSet (java.util.BitSet), TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext), IOException (java.io.IOException), Random (java.util.Random), OutputStreamWriter (java.io.OutputStreamWriter), Job (org.apache.hadoop.mapreduce.Job), InputSplit (org.apache.hadoop.mapreduce.InputSplit), Writer (java.io.Writer), Test (org.junit.Test)
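The assertion on isSplitable passes here because BZip2Codec implements SplittableCompressionCodec, so a .bz2 file can be split; a gzip file, whose codec is not splittable, would yield a single split. The method below is a minimal sketch of that check under the usual Hadoop convention, not code taken from the test; it uses CompressionCodecFactory and SplittableCompressionCodec from org.apache.hadoop.io.compress.

// Sketch: a compressed file is treated as splittable only if its codec
// implements SplittableCompressionCodec; uncompressed files always are.
private static boolean isFileSplittable(Configuration conf, Path file) {
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    // getCodec returns null when the file extension matches no known codec
    CompressionCodec codec = factory.getCodec(file);
    return codec == null || codec instanceof SplittableCompressionCodec;
}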

Aggregations

MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 22
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 20
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 14
Test (org.junit.Test): 13
LongWritable (org.apache.hadoop.io.LongWritable): 12
Job (org.apache.hadoop.mapreduce.Job): 11
BytesWritable (org.apache.hadoop.io.BytesWritable): 10
Configuration (org.apache.hadoop.conf.Configuration): 9
Path (org.apache.hadoop.fs.Path): 9
Random (java.util.Random): 8
Text (org.apache.hadoop.io.Text): 6
BitSet (java.util.BitSet): 5
IOException (java.io.IOException): 4
IntWritable (org.apache.hadoop.io.IntWritable): 4
ArrayList (java.util.ArrayList): 3
FileSystem (org.apache.hadoop.fs.FileSystem): 3
NullWritable (org.apache.hadoop.io.NullWritable): 3
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter): 3
StatusReporter (org.apache.hadoop.mapreduce.StatusReporter): 3
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 3