use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
the class TestMRCJCFileInputFormat method testLastInputSplitAtSplitBoundary.
@Test
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testLastInputSplitAtSplitBoundary() throws Exception {
FileInputFormat fif = new FileInputFormatForTest(1024l * 1024 * 1024, 128l * 1024 * 1024);
Configuration conf = new Configuration();
JobContext jobContext = mock(JobContext.class);
when(jobContext.getConfiguration()).thenReturn(conf);
List<InputSplit> splits = fif.getSplits(jobContext);
assertEquals(8, splits.size());
for (int i = 0; i < splits.size(); i++) {
InputSplit split = splits.get(i);
assertEquals(("host" + i), split.getLocations()[0]);
}
}
use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
the class TestMRCJCFileInputFormat method testForEmptyFile.
/**
* Test when the input file's length is 0.
*/
@Test
public void testForEmptyFile() throws Exception {
Configuration conf = new Configuration();
FileSystem fileSys = FileSystem.get(conf);
Path file = new Path("test" + "/file");
FSDataOutputStream out = fileSys.create(file, true, conf.getInt("io.file.buffer.size", 4096), (short) 1, (long) 1024);
out.write(new byte[0]);
out.close();
// split it using a File input format
DummyInputFormat inFormat = new DummyInputFormat();
Job job = Job.getInstance(conf);
FileInputFormat.setInputPaths(job, "test");
List<InputSplit> splits = inFormat.getSplits(job);
assertEquals(1, splits.size());
FileSplit fileSplit = (FileSplit) splits.get(0);
assertEquals(0, fileSplit.getLocations().length);
assertEquals(file.getName(), fileSplit.getPath().getName());
assertEquals(0, fileSplit.getStart());
assertEquals(0, fileSplit.getLength());
fileSys.delete(file.getParent(), true);
}
use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
the class TestMRKeyValueTextInputFormat method testFormat.
@Test
public void testFormat() throws Exception {
Job job = Job.getInstance(new Configuration(defaultConf));
Path file = new Path(workDir, "test.txt");
int seed = new Random().nextInt();
LOG.info("seed = " + seed);
Random random = new Random(seed);
localFs.delete(workDir, true);
FileInputFormat.setInputPaths(job, workDir);
final int MAX_LENGTH = 10000;
// for a variety of lengths
for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
LOG.debug("creating; entries = " + length);
// create a file with length entries
Writer writer = new OutputStreamWriter(localFs.create(file));
try {
for (int i = 0; i < length; i++) {
writer.write(Integer.toString(i * 2));
writer.write("\t");
writer.write(Integer.toString(i));
writer.write("\n");
}
} finally {
writer.close();
}
// try splitting the file in a variety of sizes
KeyValueTextInputFormat format = new KeyValueTextInputFormat();
for (int i = 0; i < 3; i++) {
int numSplits = random.nextInt(MAX_LENGTH / 20) + 1;
LOG.debug("splitting: requesting = " + numSplits);
List<InputSplit> splits = format.getSplits(job);
LOG.debug("splitting: got = " + splits.size());
// check each split
BitSet bits = new BitSet(length);
for (int j = 0; j < splits.size(); j++) {
LOG.debug("split[" + j + "]= " + splits.get(j));
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
RecordReader<Text, Text> reader = format.createRecordReader(splits.get(j), context);
Class<?> clazz = reader.getClass();
assertEquals("reader class is KeyValueLineRecordReader.", KeyValueLineRecordReader.class, clazz);
MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), splits.get(j));
reader.initialize(splits.get(j), mcontext);
Text key = null;
Text value = null;
try {
int count = 0;
while (reader.nextKeyValue()) {
key = reader.getCurrentKey();
clazz = key.getClass();
assertEquals("Key class is Text.", Text.class, clazz);
value = reader.getCurrentValue();
clazz = value.getClass();
assertEquals("Value class is Text.", Text.class, clazz);
final int k = Integer.parseInt(key.toString());
final int v = Integer.parseInt(value.toString());
assertEquals("Bad key", 0, k % 2);
assertEquals("Mismatched key/value", k / 2, v);
LOG.debug("read " + v);
assertFalse("Key in multiple partitions.", bits.get(v));
bits.set(v);
count++;
}
LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
} finally {
reader.close();
}
}
assertEquals("Some keys in no partition.", length, bits.cardinality());
}
}
}
use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
the class TestMRKeyValueTextInputFormat method testSplitableCodecs.
@Test
public void testSplitableCodecs() throws Exception {
final Job job = Job.getInstance(defaultConf);
final Configuration conf = job.getConfiguration();
// Create the codec
CompressionCodec codec = null;
try {
codec = (CompressionCodec) ReflectionUtils.newInstance(conf.getClassByName("org.apache.hadoop.io.compress.BZip2Codec"), conf);
} catch (ClassNotFoundException cnfe) {
throw new IOException("Illegal codec!");
}
Path file = new Path(workDir, "test" + codec.getDefaultExtension());
int seed = new Random().nextInt();
LOG.info("seed = " + seed);
Random random = new Random(seed);
localFs.delete(workDir, true);
FileInputFormat.setInputPaths(job, workDir);
final int MAX_LENGTH = 500000;
FileInputFormat.setMaxInputSplitSize(job, MAX_LENGTH / 20);
// for a variety of lengths
for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 4) + 1) {
LOG.info("creating; entries = " + length);
// create a file with length entries
Writer writer = new OutputStreamWriter(codec.createOutputStream(localFs.create(file)));
try {
for (int i = 0; i < length; i++) {
writer.write(Integer.toString(i * 2));
writer.write("\t");
writer.write(Integer.toString(i));
writer.write("\n");
}
} finally {
writer.close();
}
// try splitting the file in a variety of sizes
KeyValueTextInputFormat format = new KeyValueTextInputFormat();
assertTrue("KVTIF claims not splittable", format.isSplitable(job, file));
for (int i = 0; i < 3; i++) {
int numSplits = random.nextInt(MAX_LENGTH / 2000) + 1;
LOG.info("splitting: requesting = " + numSplits);
List<InputSplit> splits = format.getSplits(job);
LOG.info("splitting: got = " + splits.size());
// check each split
BitSet bits = new BitSet(length);
for (int j = 0; j < splits.size(); j++) {
LOG.debug("split[" + j + "]= " + splits.get(j));
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
RecordReader<Text, Text> reader = format.createRecordReader(splits.get(j), context);
Class<?> clazz = reader.getClass();
MapContext<Text, Text, Text, Text> mcontext = new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), splits.get(j));
reader.initialize(splits.get(j), mcontext);
Text key = null;
Text value = null;
try {
int count = 0;
while (reader.nextKeyValue()) {
key = reader.getCurrentKey();
value = reader.getCurrentValue();
final int k = Integer.parseInt(key.toString());
final int v = Integer.parseInt(value.toString());
assertEquals("Bad key", 0, k % 2);
assertEquals("Mismatched key/value", k / 2, v);
LOG.debug("read " + k + "," + v);
assertFalse(k + "," + v + " in multiple partitions.", bits.get(v));
bits.set(v);
count++;
}
if (count > 0) {
LOG.info("splits[" + j + "]=" + splits.get(j) + " count=" + count);
} else {
LOG.debug("splits[" + j + "]=" + splits.get(j) + " count=" + count);
}
} finally {
reader.close();
}
}
assertEquals("Some keys in no partition.", length, bits.cardinality());
}
}
}
use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
the class TestMRSequenceFileAsBinaryInputFormat method testBinary.
@Test
public void testBinary() throws IOException, InterruptedException {
Job job = Job.getInstance();
FileSystem fs = FileSystem.getLocal(job.getConfiguration());
Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
Path file = new Path(dir, "testbinary.seq");
Random r = new Random();
long seed = r.nextLong();
r.setSeed(seed);
fs.delete(dir, true);
FileInputFormat.setInputPaths(job, dir);
Text tkey = new Text();
Text tval = new Text();
SequenceFile.Writer writer = new SequenceFile.Writer(fs, job.getConfiguration(), file, Text.class, Text.class);
try {
for (int i = 0; i < RECORDS; ++i) {
tkey.set(Integer.toString(r.nextInt(), 36));
tval.set(Long.toString(r.nextLong(), 36));
writer.append(tkey, tval);
}
} finally {
writer.close();
}
TaskAttemptContext context = MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
InputFormat<BytesWritable, BytesWritable> bformat = new SequenceFileAsBinaryInputFormat();
int count = 0;
r.setSeed(seed);
BytesWritable bkey = new BytesWritable();
BytesWritable bval = new BytesWritable();
Text cmpkey = new Text();
Text cmpval = new Text();
DataInputBuffer buf = new DataInputBuffer();
FileInputFormat.setInputPaths(job, file);
for (InputSplit split : bformat.getSplits(job)) {
RecordReader<BytesWritable, BytesWritable> reader = bformat.createRecordReader(split, context);
MapContext<BytesWritable, BytesWritable, BytesWritable, BytesWritable> mcontext = new MapContextImpl<BytesWritable, BytesWritable, BytesWritable, BytesWritable>(job.getConfiguration(), context.getTaskAttemptID(), reader, null, null, MapReduceTestUtil.createDummyReporter(), split);
reader.initialize(split, mcontext);
try {
while (reader.nextKeyValue()) {
bkey = reader.getCurrentKey();
bval = reader.getCurrentValue();
tkey.set(Integer.toString(r.nextInt(), 36));
tval.set(Long.toString(r.nextLong(), 36));
buf.reset(bkey.getBytes(), bkey.getLength());
cmpkey.readFields(buf);
buf.reset(bval.getBytes(), bval.getLength());
cmpval.readFields(buf);
assertTrue("Keys don't match: " + "*" + cmpkey.toString() + ":" + tkey.toString() + "*", cmpkey.toString().equals(tkey.toString()));
assertTrue("Vals don't match: " + "*" + cmpval.toString() + ":" + tval.toString() + "*", cmpval.toString().equals(tval.toString()));
++count;
}
} finally {
reader.close();
}
}
assertEquals("Some records not found", RECORDS, count);
}
Aggregations