Example 21 with Job

Use of org.apache.hadoop.mapreduce.Job in project hadoop by apache.

From the class TestCombineSequenceFileInputFormat, method testFormat:

@Test(timeout = 10000)
public void testFormat() throws IOException, InterruptedException {
    Job job = Job.getInstance(conf);
    Random random = new Random();
    long seed = random.nextLong();
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create files with a variety of lengths
    createFiles(length, numFiles, random, job);
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    // create a combine split for the files
    InputFormat<IntWritable, BytesWritable> format = new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        List<InputSplit> splits = format.getSplits(job);
        LOG.info("splitting: got =        " + splits.size());
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.size());
        InputSplit split = splits.get(0);
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        RecordReader<IntWritable, BytesWritable> reader = format.createRecordReader(split, context);
        MapContext<IntWritable, BytesWritable, IntWritable, BytesWritable> mcontext = new MapContextImpl<IntWritable, BytesWritable, IntWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        assertEquals("reader class is CombineFileRecordReader.", CombineFileRecordReader.class, reader.getClass());
        try {
            while (reader.nextKeyValue()) {
                IntWritable key = reader.getCurrentKey();
                BytesWritable value = reader.getCurrentValue();
                assertNotNull("Value should not be null.", value);
                final int k = key.get();
                LOG.debug("read " + k);
                assertFalse("Key in multiple partitions.", bits.get(k));
                bits.set(k);
            }
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used : MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) BitSet(java.util.BitSet) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BytesWritable(org.apache.hadoop.io.BytesWritable) Random(java.util.Random) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
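
The testFormat snippet relies on a createFiles helper that the excerpt does not show. Below is a minimal sketch of what such a helper could look like, assuming the test class's workDir and localFs fields; the actual helper in the Hadoop test may differ. It spreads the keys 0 .. length-1 across numFiles SequenceFiles, which is what makes the BitSet assertions in testFormat hold.

private static void createFiles(int length, int numFiles, Random random, Job job)
    throws IOException {
    Configuration conf = job.getConfiguration();
    // ceiling division so the whole key range is covered
    int keysPerFile = (length + numFiles - 1) / numFiles;
    int next = 0;
    for (int i = 0; i < numFiles; i++) {
        // workDir and localFs are assumed to be fields of the test class
        Path file = new Path(workDir, "test_" + i + ".seq");
        SequenceFile.Writer writer = SequenceFile.createWriter(localFs, conf, file, IntWritable.class, BytesWritable.class);
        try {
            for (int j = 0; j < keysPerFile && next < length; j++, next++) {
                // unique key, value of random (possibly zero) length
                byte[] data = new byte[random.nextInt(10)];
                random.nextBytes(data);
                writer.append(new IntWritable(next), new BytesWritable(data));
            }
        } finally {
            writer.close();
        }
    }
}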

Example 22 with Job

Use of org.apache.hadoop.mapreduce.Job in project hadoop by apache.

From the class TestCombineTextInputFormat, method testFormat:

@Test(timeout = 10000)
public void testFormat() throws Exception {
    Job job = Job.getInstance(new Configuration(defaultConf));
    Random random = new Random();
    long seed = random.nextLong();
    LOG.info("seed = " + seed);
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create files with various lengths
    createFiles(length, numFiles, random);
    // create a combined split for the files
    CombineTextInputFormat format = new CombineTextInputFormat();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / 20) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        List<InputSplit> splits = format.getSplits(job);
        LOG.info("splitting: got =        " + splits.size());
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.size());
        InputSplit split = splits.get(0);
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        LOG.debug("split= " + split);
        TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
        RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
        assertEquals("reader class is CombineFileRecordReader.", CombineFileRecordReader.class, reader.getClass());
        MapContext<LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        try {
            int count = 0;
            while (reader.nextKeyValue()) {
                LongWritable key = reader.getCurrentKey();
                assertNotNull("Key should not be null.", key);
                Text value = reader.getCurrentValue();
                final int v = Integer.parseInt(value.toString());
                LOG.debug("read " + v);
                assertFalse("Key in multiple partitions.", bits.get(v));
                bits.set(v);
                count++;
            }
            LOG.debug("split=" + split + " count=" + count);
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) BitSet(java.util.BitSet) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Text(org.apache.hadoop.io.Text) Random(java.util.Random) LongWritable(org.apache.hadoop.io.LongWritable) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
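
This variant of testFormat calls a text-flavoured createFiles(length, numFiles, random) that is also not shown. A hedged sketch under the same assumptions (the test class's workDir and localFs fields) follows; the real helper may differ. Writing one integer per line from the range 0 .. length-1 is what lets the loop above parse each value and verify that the BitSet ends up fully set.

private static void createFiles(int length, int numFiles, Random random)
    throws IOException {
    int linesPerFile = (length + numFiles - 1) / numFiles;
    int next = 0;
    for (int i = 0; i < numFiles; i++) {
        // workDir and localFs are assumed to be fields of the test class
        Path file = new Path(workDir, "test_" + i + ".txt");
        java.io.Writer writer = new java.io.OutputStreamWriter(localFs.create(file), java.nio.charset.StandardCharsets.UTF_8);
        try {
            for (int j = 0; j < linesPerFile && next < length; j++, next++) {
                // one integer per line; each value appears exactly once across all files
                writer.write(Integer.toString(next));
                writer.write("\n");
            }
        } finally {
            writer.close();
        }
    }
}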

Example 23 with Job

Use of org.apache.hadoop.mapreduce.Job in project hadoop by apache.

From the class TestCombineTextInputFormat, method testGzip:

/**
   * Test using the gzip codec for reading
   */
@Test(timeout = 10000)
public void testGzip() throws IOException, InterruptedException {
    Configuration conf = new Configuration(defaultConf);
    CompressionCodec gzip = new GzipCodec();
    ReflectionUtils.setConf(gzip, conf);
    localFs.delete(workDir, true);
    writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip, "the quick\nbrown\nfox jumped\nover\n the lazy\n dog\n");
    writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip, "this is a test\nof gzip\n");
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, workDir);
    CombineTextInputFormat format = new CombineTextInputFormat();
    List<InputSplit> splits = format.getSplits(job);
    assertEquals("compressed splits == 1", 1, splits.size());
    List<Text> results = readSplit(format, splits.get(0), job);
    assertEquals("splits[0] length", 8, results.size());
    final String[] firstList = { "the quick", "brown", "fox jumped", "over", " the lazy", " dog" };
    final String[] secondList = { "this is a test", "of gzip" };
    String first = results.get(0).toString();
    if (first.equals(firstList[0])) {
        testResults(results, firstList, secondList);
    } else if (first.equals(secondList[0])) {
        testResults(results, secondList, firstList);
    } else {
        fail("unexpected first token!");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) GzipCodec(org.apache.hadoop.io.compress.GzipCodec) Text(org.apache.hadoop.io.Text) CompressionCodec(org.apache.hadoop.io.compress.CompressionCodec) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
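
testGzip depends on two helpers, writeFile and readSplit, that the excerpt omits. The sketch below is one plausible shape for them, not the test's actual code: writeFile wraps the output stream with the supplied CompressionCodec, and readSplit drives a record reader over a single split and copies out the Text values.

private static void writeFile(FileSystem fs, Path name, CompressionCodec codec, String contents) throws IOException {
    java.io.OutputStream stm = fs.create(name);
    if (codec != null) {
        // e.g. GzipCodec: bytes are written through the compressor
        stm = codec.createOutputStream(stm);
    }
    stm.write(contents.getBytes(java.nio.charset.StandardCharsets.UTF_8));
    stm.close();
}

private static List<Text> readSplit(InputFormat<LongWritable, Text> format, InputSplit split, Job job) throws IOException, InterruptedException {
    List<Text> result = new ArrayList<>();
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
    MapContext<LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
        // copy the value: the reader reuses its Text instance between calls
        result.add(new Text(reader.getCurrentValue()));
    }
    reader.close();
    return result;
}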

Example 24 with Job

Use of org.apache.hadoop.mapreduce.Job in project hadoop by apache.

From the class TestMRCJCFileInputFormat, method testForEmptyFile:

/**
   * Test when the input file's length is 0.
   */
@Test
public void testForEmptyFile() throws Exception {
    Configuration conf = new Configuration();
    FileSystem fileSys = FileSystem.get(conf);
    Path file = new Path("test" + "/file");
    FSDataOutputStream out = fileSys.create(file, true, conf.getInt("io.file.buffer.size", 4096), (short) 1, (long) 1024);
    out.write(new byte[0]);
    out.close();
    // split it using a File input format
    DummyInputFormat inFormat = new DummyInputFormat();
    Job job = Job.getInstance(conf);
    FileInputFormat.setInputPaths(job, "test");
    List<InputSplit> splits = inFormat.getSplits(job);
    assertEquals(1, splits.size());
    FileSplit fileSplit = (FileSplit) splits.get(0);
    assertEquals(0, fileSplit.getLocations().length);
    assertEquals(file.getName(), fileSplit.getPath().getName());
    assertEquals(0, fileSplit.getStart());
    assertEquals(0, fileSplit.getLength());
    fileSys.delete(file.getParent(), true);
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) Test(org.junit.Test)
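
DummyInputFormat is not part of the excerpt. A minimal stand-in is sketched below; the real inner class in TestMRCJCFileInputFormat may be defined differently. For this test only getSplits (inherited from FileInputFormat) matters, so the record reader can be a stub.

private static class DummyInputFormat extends FileInputFormat<Text, Text> {
    @Override
    public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        // never called by testForEmptyFile, which only inspects the splits
        return null;
    }
}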

Example 25 with Job

Use of org.apache.hadoop.mapreduce.Job in project hadoop by apache.

From the class TestMRCJCFileInputFormat, method testAddInputPath:

@Test
public void testAddInputPath() throws IOException {
    final Configuration conf = new Configuration();
    conf.set("fs.defaultFS", "file:///abc/");
    final Job j = Job.getInstance(conf);
    //setup default fs
    final FileSystem defaultfs = FileSystem.get(conf);
    System.out.println("defaultfs.getUri() = " + defaultfs.getUri());
    {
        //test addInputPath
        final Path original = new Path("file:/foo");
        System.out.println("original = " + original);
        FileInputFormat.addInputPath(j, original);
        final Path[] results = FileInputFormat.getInputPaths(j);
        System.out.println("results = " + Arrays.asList(results));
        assertEquals(1, results.length);
        assertEquals(original, results[0]);
    }
    {
        //test setInputPaths
        final Path original = new Path("file:/bar");
        System.out.println("original = " + original);
        FileInputFormat.setInputPaths(j, original);
        final Path[] results = FileInputFormat.getInputPaths(j);
        System.out.println("results = " + Arrays.asList(results));
        assertEquals(1, results.length);
        assertEquals(original, results[0]);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) FileSystem(org.apache.hadoop.fs.FileSystem) Job(org.apache.hadoop.mapreduce.Job) Test(org.junit.Test)
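
The two blocks in testAddInputPath exercise the difference between the two APIs: addInputPath appends another path to the job's input list, while setInputPaths replaces whatever was configured before. A short illustrative snippet (not part of the Hadoop test) showing that difference:

Job job = Job.getInstance(new Configuration());
FileInputFormat.addInputPath(job, new Path("file:/foo"));
FileInputFormat.addInputPath(job, new Path("file:/bar"));
// both paths are now configured
System.out.println(Arrays.asList(FileInputFormat.getInputPaths(job)));

FileInputFormat.setInputPaths(job, new Path("file:/baz"));
// the earlier list is discarded; only file:/baz remains
System.out.println(Arrays.asList(FileInputFormat.getInputPaths(job)));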

Aggregations

Job (org.apache.hadoop.mapreduce.Job): 886
Path (org.apache.hadoop.fs.Path): 498
Configuration (org.apache.hadoop.conf.Configuration): 434
Test (org.junit.Test): 259
IOException (java.io.IOException): 135
FileSystem (org.apache.hadoop.fs.FileSystem): 128
File (java.io.File): 77
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 58
ArrayList (java.util.ArrayList): 55
TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 55
Scan (org.apache.hadoop.hbase.client.Scan): 45
FileStatus (org.apache.hadoop.fs.FileStatus): 44
NutchJob (org.apache.nutch.util.NutchJob): 43
JobConf (org.apache.hadoop.mapred.JobConf): 42
Text (org.apache.hadoop.io.Text): 39
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 36
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration): 35
JobContext (org.apache.hadoop.mapreduce.JobContext): 35
GenericOptionsParser (org.apache.hadoop.util.GenericOptionsParser): 35
CommandLine (org.apache.commons.cli.CommandLine): 33
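
Taken together, the counts above reflect the most common pattern in these sources: a Job built from a Configuration with Path-based input and output. A generic, minimal sketch of that pattern (not taken from any single example above; FileOutputFormat here is assumed to be org.apache.hadoop.mapreduce.lib.output.FileOutputFormat) looks like this:

public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "pass-through example");
    // identity map and reduce: records flow through unchanged
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);
    // TextInputFormat (the default) produces LongWritable offsets and Text lines
    job.setOutputKeyClass(LongWritable.class);
    job.setOutputValueClass(Text.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}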