use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.
the class TestMRKeyValueTextInputFormat method testSplitableCodecs.
@Test
public void testSplitableCodecs() throws Exception {
final Job job = Job.getInstance(defaultConf);
final Configuration conf = job.getConfiguration();
// Create the codec
CompressionCodec codec = null;
try {
codec = (CompressionCodec) ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
} catch (ClassNotFoundException cnfe) {
throw new IOException("Illegal codec!");
}
Path file = new Path(workDir, "test" + codec.getDefaultExtension());
int seed = new Random().nextInt();
LOG.info("seed = " + seed);
Random random = new Random(seed);
localFs.delete(workDir, true);
FileInputFormat.setInputPaths(job, workDir);
final int MAX_LENGTH = 500000;
FileInputFormat.setMaxInputSplitSize(job, MAX_LENGTH / 20);
// for a variety of lengths
for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 4) + 1) {
LOG.info("creating; entries = " + length);
// create a file with length entries
Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
try {
for (int i = 0; i < length; i++) {
writer.write(Integer.toString(i * 2));
writer.write("\t");
writer.write(Integer.toString(i));
writer.write("\n");
}
} finally {
writer.close();
}
// try splitting the file in a variety of sizes
KeyValueTextInputFormat format = new KeyValueTextInputFormat();
assertTrue("KVTIF claims not splittable", format.isSplitable(job, file));
for (int i = 0; i < 3; i++) {
int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
LOG.info("splitting: requesting = " + numSplits);
List<InputSplit> splits = format.getSplits(job);
LOG.info("splitting: got = " + splits.size());
// check each split
BitSet bits = new BitSet(length);
for (int j = 0; j < splits.size(); j++) {
LOG.debug("split[" + j + "]= " + splits.get(j));
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
RecordReader<Text, Text> reader = format.createRecordReader(splits.get(j), context);
Class<?> clazz = reader.getClass();
MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), splits.get(j));
reader.initialize(splits.get(j), mcontext);
Text key = null;
Text value = null;
try {
int count = 0;
while (reader.nextKeyValue()) {
key = reader.getCurrentKey();
value = reader.getCurrentValue();
final int k = Integer.parseInt(key.toString());
final int v = Integer.parseInt(value.toString());
assertEquals("Bad key", 0, k % 2);
assertEquals("Mismatched key/value", k / 2, v);
LOG.debug("read " + k + "," + v);
assertFalse(k + "," + v + " in multiple partitions.", bits.get(v));
bits.set(v);
count++;
}
if (count > 0) {
LOG.info("splits[" + j + "]=" + splits.get(j) + " count=" + count);
} else {
LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
}
} finally {
reader.close();
}
}
assertEquals("Some keys in no partition.", length, bits.cardinality());
}
}
}
use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.
the class TestMRSequenceFileAsBinaryInputFormat method testBinary.
@Test
public void testBinary() throws IOException, InterruptedException {
Job job = Job.getInstance();
FileSystem fs = FileSystem.getLocal(job.getConfiguration());
Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
Path file = new Path(dir, "testbinary.seq");
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
fs.delete(dir, true);
FileInputFormat.setInputPaths(job, dir);
Text tkey = new Text();
Text tval = new Text();
SequenceFile.Writer writer = new SequenceFile.Writer(fs, job.getConfiguration(), file, Text.class, Text.class);
try {
for (int i = 0; i < RECORDS; ++i) {
tkey.set(Integer.toString(r.nextInt(), 36));
tval.set(Long.toString(r.nextLong(), 36));
writer.append(tkey, tval);
}
} finally {
writer.close();
}
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
InputFormat<BytesWritable, BytesWritable> bformat = new SequenceFileAsBinaryInputFormat();
int count = 0;
r.setSeed(seed);
BytesWritable bkey = new BytesWritable();
BytesWritable bval = new BytesWritable();
Text cmpkey = new Text();
Text cmpval = new Text();
DataInputBuffer buf = new DataInputBuffer();
FileInputFormat.setInputPaths(job, file);
for (InputSplit split : bformat.getSplits(job)) {
RecordReader<BytesWritable, BytesWritable> reader = bformat.createRecordReader(split, context);
MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> mcontext = new MapContextImpl<BytesWritable, BytesWritable, BytesWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
reader.initialize(split, mcontext);
try {
while (reader.nextKeyValue()) {
bkey = reader.getCurrentKey();
bval = reader.getCurrentValue();
tkey.set(Integer.toString(r.nextInt(), 36));
tval.set(Long.toString(r.nextLong(), 36));
buf.reset(bkey.getBytes(), bkey.getLength());
cmpkey.readFields(buf);
buf.reset(bval.getBytes(), bval.getLength());
cmpval.readFields(buf);
assertTrue("Keys don't match: " + "*" + cmpkey.toString() + ":" + tkey.toString() + "*", cmpkey.toString().equals(tkey.toString()));
assertTrue("Vals don't match: " + "*" + cmpval.toString() + ":" + tval.toString() + "*", cmpval.toString().equals(tval.toString()));
++count;
}
} finally {
reader.close();
}
}
assertEquals("Some records not found", RECORDS, count);
}
use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.
the class TestMRSequenceFileAsTextInputFormat method testFormat.
@Test
public void testFormat() throws Exception {
Job job = Job.getInstance(conf);
FileSystem fs = FileSystem.getLocal(conf);
Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
Path file = new Path(dir, "test.seq");
int seed = new Random().nextInt();
Random random = new Random(seed);
fs.delete(dir, true);
FileInputFormat.setInputPaths(job, dir);
// for a variety of lengths
for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
// create a file with length entries
SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, LongWritable.class);
try {
for (int i = 0; i < length; i++) {
IntWritable key = new IntWritable(i);
LongWritable value = new LongWritable(10 * i);
writer.append(key, value);
}
} finally {
writer.close();
}
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
// try splitting the file in a variety of sizes
InputFormat<Text, Text> format = new SequenceFileAsTextInputFormat();
for (int i = 0; i < 3; i++) {
// check each split
BitSet bits = new BitSet(length);
int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
FileInputFormat.setMaxInputSplitSize(job, fs.getFileStatus(file).getLen() / numSplits);
for (InputSplit split : format.getSplits(job)) {
RecordReader<Text, Text> reader = format.createRecordReader(split, context);
MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
reader.initialize(split, mcontext);
Class<?> readerClass = reader.getClass();
assertEquals("reader class is SequenceFileAsTextRecordReader.", SequenceFileAsTextRecordReader.class, readerClass);
Text key;
try {
int count = 0;
while (reader.nextKeyValue()) {
key = reader.getCurrentKey();
int keyInt = Integer.parseInt(key.toString());
assertFalse("Key in multiple partitions.", bits.get(keyInt));
bits.set(keyInt);
count++;
}
} finally {
reader.close();
}
}
assertEquals("Some keys in no partition.", length, bits.cardinality());
}
}
}
use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.
the class TestMRSequenceFileAsBinaryOutputFormat method testBinary.
@Test
public void testBinary() throws IOException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
FileOutputFormat.setOutputPath(job, outdir);
SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
BytesWritable bkey = new BytesWritable();
BytesWritable bval = new BytesWritable();
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
OutputFormat<BytesWritable, BytesWritable> outputFormat = new SequenceFileAsBinaryOutputFormat();
OutputCommitter committer = outputFormat.getOutputCommitter(context);
committer.setupJob(job);
RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);
IntWritable iwritable = new IntWritable();
DoubleWritable dwritable = new DoubleWritable();
DataOutputBuffer outbuf = new DataOutputBuffer();
LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
try {
for (int i = 0; i < RECORDS; ++i) {
iwritable = new IntWritable(r.nextInt());
iwritable.write(outbuf);
bkey.set(outbuf.getData(), 0, outbuf.getLength());
outbuf.reset();
dwritable = new DoubleWritable(r.nextDouble());
dwritable.write(outbuf);
bval.set(outbuf.getData(), 0, outbuf.getLength());
outbuf.reset();
writer.write(bkey, bval);
}
} finally {
writer.close(context);
}
committer.commitTask(context);
committer.commitJob(job);
InputFormat<IntWritable, DoubleWritable> iformat = new SequenceFileInputFormat<IntWritable, DoubleWritable>();
int count = 0;
r.setSeed(seed);
SequenceFileInputFormat.setInputPaths(job, outdir);
LOG.info("Reading data by SequenceFileInputFormat");
for (InputSplit split : iformat.getSplits(job)) {
RecordReader<IntWritable, DoubleWritable> reader = iformat.createRecordReader(split, context);
MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext = new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
reader.initialize(split, mcontext);
try {
int sourceInt;
double sourceDouble;
while (reader.nextKeyValue()) {
sourceInt = r.nextInt();
sourceDouble = r.nextDouble();
iwritable = reader.getCurrentKey();
dwritable = reader.getCurrentValue();
assertEquals("Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*", sourceInt, iwritable.get());
assertTrue("Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*", Double.compare(dwritable.get(), sourceDouble) == 0);
++count;
}
} finally {
reader.close();
}
}
assertEquals("Some records not found", RECORDS, count);
}
use of org.apache.hadoop.mapreduce.task.MapContextImpl in project hadoop by apache.
the class TestFixedLengthInputFormat method testZeroRecordLength.
/**
* Test with record length set to 0
*/
@Test(timeout = 5000)
public void testZeroRecordLength() throws Exception {
localFs.delete(workDir, true);
Path file = new Path(workDir, new String("testFormat.txt"));
createFile(file, null, 10, 10);
Job job = Job.getInstance(defaultConf);
// Set the fixed length record length config property
FixedLengthInputFormat format = new FixedLengthInputFormat();
format.setRecordLength(job.getConfiguration(), 0);
FileInputFormat.setInputPaths(job, workDir);
List<InputSplit> splits = format.getSplits(job);
boolean exceptionThrown = false;
for (InputSplit split : splits) {
try {
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
RecordReader<LongWritable, BytesWritable> reader = format.createRecordReader(split, context);
MapContext<LongWritable, BytesWritable, LongWritable, BytesWritable> mcontext = new MapContextImpl<LongWritable, BytesWritable, LongWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
reader.initialize(split, mcontext);
} catch (IOException ioe) {
exceptionThrown = true;
LOG.info("Exception message:" + ioe.getMessage());
}
}
assertTrue("Exception for zero record length:", exceptionThrown);
}
Aggregations