Example 6 with TaskAttemptContext

use of org.apache.hadoop.mapreduce.TaskAttemptContext in project hadoop by apache.

the class TestLineRecordReader method testUncompressedInputDefaultDelimiterPosValue.

@Test
public void testUncompressedInputDefaultDelimiterPosValue() throws Exception {
    Configuration conf = new Configuration();
    String inputData = "1234567890\r\n12\r\n345";
    Path inputFile = createInputFile(conf, inputData);
    conf.setInt("io.file.buffer.size", 10);
    conf.setInt(org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    FileSplit split = new FileSplit(inputFile, 0, 15, (String[]) null);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    LineRecordReader reader = new LineRecordReader(null); // null recordDelimiterBytes: use the default line delimiters
    reader.initialize(split, context);
    LongWritable key;
    Text value;
    reader.nextKeyValue();
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get first record: "1234567890"
    assertEquals(10, value.getLength());
    assertEquals(0, key.get());
    reader.nextKeyValue();
    // Get second record: "12"
    assertEquals(2, value.getLength());
    // Key should be 12 right after "1234567890\r\n"
    assertEquals(12, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    assertEquals(16, key.get());
    split = new FileSplit(inputFile, 15, 4, (String[]) null);
    reader = new LineRecordReader(null);
    reader.initialize(split, context);
    // The second split begins with the trailing "\n" of the previous record, which the reader skips
    reader.nextKeyValue();
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get third record: "345"
    assertEquals(3, value.getLength());
    // Key should be 16 right after "1234567890\r\n12\r\n"
    assertEquals(16, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 19 right after "1234567890\r\n12\r\n345"
    assertEquals(19, key.get());
    inputData = "123456789\r\r\n";
    inputFile = createInputFile(conf, inputData);
    split = new FileSplit(inputFile, 0, 12, (String[]) null);
    reader = new LineRecordReader(null);
    reader.initialize(split, context);
    reader.nextKeyValue();
    key = reader.getCurrentKey();
    value = reader.getCurrentValue();
    // Get first record: "123456789"
    assertEquals(9, value.getLength());
    assertEquals(0, key.get());
    reader.nextKeyValue();
    // Get second record: ""
    assertEquals(0, value.getLength());
    // Key should be 10 right after "123456789\r"
    assertEquals(10, key.get());
    assertFalse(reader.nextKeyValue());
    // Key should be 12 right after "123456789\r\r\n"
    assertEquals(12, key.get());
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) Test(org.junit.Test)
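
The createInputFile helper is defined elsewhere in TestLineRecordReader and is not shown above. A minimal sketch of what such a helper could look like, writing the string to a file on the local FileSystem and returning its Path (the target directory is an assumption; also needs java.io.Writer and java.io.OutputStreamWriter):

// Hypothetical sketch of the createInputFile helper used above: write the
// given string to a local file and return its Path.
private Path createInputFile(Configuration conf, String data) throws IOException {
    FileSystem localFs = FileSystem.getLocal(conf);
    // assumed location; the real test presumably uses its own working directory
    Path file = new Path("target/TestLineRecordReader", "test.txt");
    try (Writer writer = new OutputStreamWriter(localFs.create(file, true))) {
        writer.write(data);
    }
    return file;
}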

Example 7 with TaskAttemptContext

use of org.apache.hadoop.mapreduce.TaskAttemptContext in project hadoop by apache.

the class TestLineRecordReader method readRecords.

// Use the LineRecordReader to read records from the file
public ArrayList<String> readRecords(URL testFileUrl, int splitSize) throws IOException {
    // Set up context
    File testFile = new File(testFileUrl.getFile());
    long testFileSize = testFile.length();
    Path testFilePath = new Path(testFile.getAbsolutePath());
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 1);
    TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    // Gather the records returned by the record reader
    ArrayList<String> records = new ArrayList<String>();
    long offset = 0;
    while (offset < testFileSize) {
        FileSplit split = new FileSplit(testFilePath, offset, splitSize, null);
        LineRecordReader reader = new LineRecordReader();
        reader.initialize(split, context);
        while (reader.nextKeyValue()) {
            records.add(reader.getCurrentValue().toString());
        }
        offset += splitSize;
    }
    return records;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) TaskAttemptID(org.apache.hadoop.mapreduce.TaskAttemptID) TaskAttemptContextImpl(org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl) ArrayList(java.util.ArrayList) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) File(java.io.File)
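
A hypothetical caller for readRecords, in the spirit of the surrounding tests: reading the same file as one big split and as many small splits should yield identical records regardless of where the boundaries fall (the resource name here is made up):

// Hypothetical usage: record contents must not depend on split boundaries.
URL url = getClass().getClassLoader().getResource("records.txt");
int fileSize = (int) new File(url.getFile()).length();
ArrayList<String> whole = readRecords(url, fileSize);
ArrayList<String> pieces = readRecords(url, 10);
assertEquals(whole, pieces);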

Example 8 with TaskAttemptContext

use of org.apache.hadoop.mapreduce.TaskAttemptContext in project hadoop by apache.

the class StreamInputFormat method createRecordReader.

@Override
public RecordReader<Text, Text> createRecordReader(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String c = conf.get("stream.recordreader.class");
    if (c == null || c.indexOf("LineRecordReader") >= 0) {
        return super.createRecordReader(genericSplit, context);
    }
    // handling non-standard record reader (likely StreamXmlRecordReader)
    FileSplit split = (FileSplit) genericSplit;
    // LOG.info("getRecordReader start.....split=" + split);
    context.setStatus(split.toString());
    context.progress();
    // Open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(conf);
    FSDataInputStream in = fs.open(split.getPath());
    // Factory dispatch: look up the configured reader class and its required
    // five-argument constructor.
    Class<?> readerClass = StreamUtil.goodClassOrNull(conf, c, null);
    if (readerClass == null) {
        throw new RuntimeException("Class not found: " + c);
    }
    Constructor<?> ctor;
    try {
        ctor = readerClass.getConstructor(FSDataInputStream.class, FileSplit.class, TaskAttemptContext.class, Configuration.class, FileSystem.class);
    } catch (NoSuchMethodException nsm) {
        throw new RuntimeException(nsm);
    }
    RecordReader<Text, Text> reader;
    try {
        reader = (RecordReader<Text, Text>) ctor.newInstance(in, split, context, conf, fs);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    return reader;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Constructor(java.lang.reflect.Constructor) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) Text(org.apache.hadoop.io.Text) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) IOException(java.io.IOException) FileSystem(org.apache.hadoop.fs.FileSystem) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)
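
The reflective lookup above pins down a five-argument constructor contract for whatever class stream.recordreader.class names. A minimal conforming reader might look like the following sketch; the class name and per-byte record logic are hypothetical, and only the constructor signature is dictated by the factory:

// Hypothetical custom reader: the factory above requires exactly this
// (FSDataInputStream, FileSplit, TaskAttemptContext, Configuration, FileSystem)
// constructor.
public class MyStreamRecordReader extends RecordReader<Text, Text> {
    private final FSDataInputStream in;
    private final long start;
    private final long end;
    private long pos;
    private final Text key = new Text();
    private final Text value = new Text();

    public MyStreamRecordReader(FSDataInputStream in, FileSplit split,
            TaskAttemptContext context, Configuration conf, FileSystem fs)
            throws IOException {
        this.in = in;
        this.start = split.getStart();
        this.end = start + split.getLength();
        this.pos = start;
        in.seek(pos);
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) {
        // already initialized in the constructor
    }

    @Override
    public boolean nextKeyValue() throws IOException {
        if (pos >= end) {
            return false;
        }
        // toy record logic: one record per byte; a real reader parses frames
        int b = in.read();
        if (b < 0) {
            return false;
        }
        key.set(Long.toString(pos));
        value.set(String.valueOf((char) b));
        pos++;
        return true;
    }

    @Override
    public Text getCurrentKey() {
        return key;
    }

    @Override
    public Text getCurrentValue() {
        return value;
    }

    @Override
    public float getProgress() {
        return end == start ? 1.0f : (pos - start) / (float) (end - start);
    }

    @Override
    public void close() throws IOException {
        in.close();
    }
}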

Example 9 with TaskAttemptContext

use of org.apache.hadoop.mapreduce.TaskAttemptContext in project hadoop by apache.

the class TestCombineSequenceFileInputFormat method testFormat.

@Test(timeout = 10000)
public void testFormat() throws IOException, InterruptedException {
    Job job = Job.getInstance(conf);
    Random random = new Random();
    long seed = random.nextLong();
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create files with a variety of lengths
    createFiles(length, numFiles, random, job);
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
    // create a combine split for the files
    InputFormat<IntWritable, BytesWritable> format = new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    for (int i = 0; i < 3; i++) {
        // numSplits is only logged; the new API's getSplits(job) takes no split-count hint
        int numSplits = random.nextInt(length / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        List<InputSplit> splits = format.getSplits(job);
        LOG.info("splitting: got =        " + splits.size());
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.size());
        InputSplit split = splits.get(0);
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check the split
        BitSet bits = new BitSet(length);
        RecordReader<IntWritable, BytesWritable> reader = format.createRecordReader(split, context);
        MapContext<IntWritable, BytesWritable, IntWritable, BytesWritable> mcontext = new MapContextImpl<IntWritable, BytesWritable, IntWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        assertEquals("reader class is CombineFileRecordReader.", CombineFileRecordReader.class, reader.getClass());
        try {
            while (reader.nextKeyValue()) {
                IntWritable key = reader.getCurrentKey();
                BytesWritable value = reader.getCurrentValue();
                assertNotNull("Value should not be null.", value);
                final int k = key.get();
                LOG.debug("read " + k);
                assertFalse("Key in multiple partitions.", bits.get(k));
                bits.set(k);
            }
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used : MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) BitSet(java.util.BitSet) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) BytesWritable(org.apache.hadoop.io.BytesWritable) Random(java.util.Random) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapreduce.InputSplit) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
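
createFiles is another helper from the test class that is not shown here. A rough sketch of what it plausibly does, spreading IntWritable keys 0..length-1 over numFiles SequenceFiles so the BitSet check above can account for every key (everything beyond the key/value types is an assumption; workDir and localFs are the test fields used earlier):

// Hypothetical sketch of the createFiles helper: write numFiles SequenceFiles
// of IntWritable/BytesWritable pairs that together cover keys 0..length-1.
private static void createFiles(int length, int numFiles, Random random, Job job)
        throws IOException {
    for (int i = 0; i < numFiles; i++) {
        Path file = new Path(workDir, "test_" + i + ".seq");
        SequenceFile.Writer writer = SequenceFile.createWriter(localFs,
                job.getConfiguration(), file, IntWritable.class, BytesWritable.class);
        try {
            for (int k = i; k < length; k += numFiles) {
                byte[] data = new byte[random.nextInt(10)];
                random.nextBytes(data);
                writer.append(new IntWritable(k), new BytesWritable(data));
            }
        } finally {
            writer.close();
        }
    }
}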

Example 10 with TaskAttemptContext

use of org.apache.hadoop.mapreduce.TaskAttemptContext in project hadoop by apache.

the class TestCombineTextInputFormat method readSplit.

private static List<Text> readSplit(InputFormat<LongWritable, Text> format, InputSplit split, Job job) throws IOException, InterruptedException {
    List<Text> result = new ArrayList<Text>();
    Configuration conf = job.getConfiguration();
    TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(conf);
    RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
    MapContext<LongWritable, Text, LongWritable, Text> mcontext = new MapContextImpl<LongWritable, Text, LongWritable, Text>(conf, context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    while (reader.nextKeyValue()) {
        result.add(new Text(reader.getCurrentValue()));
    }
    return result;
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) MapContextImpl(org.apache.hadoop.mapreduce.task.MapContextImpl) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) TaskAttemptContext(org.apache.hadoop.mapreduce.TaskAttemptContext) LongWritable(org.apache.hadoop.io.LongWritable)
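
A hypothetical caller for readSplit, following the pattern of the other examples: ask the format for its splits, then read each one in turn (workDir is assumed to be the test's input directory):

// Hypothetical usage: collect the records of every split the format produces.
Job job = Job.getInstance(new Configuration());
FileInputFormat.setInputPaths(job, workDir);
InputFormat<LongWritable, Text> format = new CombineTextInputFormat();
List<Text> all = new ArrayList<Text>();
for (InputSplit split : format.getSplits(job)) {
    all.addAll(readSplit(format, split, job));
}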

Aggregations

TaskAttemptContext (org.apache.hadoop.mapreduce.TaskAttemptContext): 110 uses
Configuration (org.apache.hadoop.conf.Configuration): 58 uses
Job (org.apache.hadoop.mapreduce.Job): 44 uses
Path (org.apache.hadoop.fs.Path): 39 uses
TaskAttemptContextImpl (org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl): 38 uses
InputSplit (org.apache.hadoop.mapreduce.InputSplit): 36 uses
Test (org.junit.Test): 35 uses
TaskAttemptID (org.apache.hadoop.mapreduce.TaskAttemptID): 33 uses
JobContext (org.apache.hadoop.mapreduce.JobContext): 28 uses
IOException (java.io.IOException): 27 uses
File (java.io.File): 22 uses
LongWritable (org.apache.hadoop.io.LongWritable): 22 uses
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl): 21 uses
RecordWriter (org.apache.hadoop.mapreduce.RecordWriter): 19 uses
MapContextImpl (org.apache.hadoop.mapreduce.task.MapContextImpl): 17 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 16 uses
OutputCommitter (org.apache.hadoop.mapreduce.OutputCommitter): 12 uses
ArrayList (java.util.ArrayList): 11 uses
BytesWritable (org.apache.hadoop.io.BytesWritable): 10 uses
MapFile (org.apache.hadoop.io.MapFile): 10 uses
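
Across all of these examples the recurring setup is the same: a TaskAttemptContext is built directly from a Configuration and a fresh TaskAttemptID, either via TaskAttemptContextImpl or through the MapReduceTestUtil.createDummyMapTaskAttemptContext convenience wrapper. As a minimal sketch:

// The common TaskAttemptContext setup shared by the examples above.
Configuration conf = new Configuration();
TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());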