Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestMRSequenceFileAsTextInputFormat, method testFormat.
@Test
public void testFormat() throws Exception {
  Job job = Job.getInstance(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");

  int seed = new Random().nextInt();
  Random random = new Random(seed);

  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);

  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH;
       length += random.nextInt(MAX_LENGTH / 10) + 1) {

    // create a file with length entries
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file,
        IntWritable.class, LongWritable.class);
    try {
      for (int i = 0; i < length; i++) {
        IntWritable key = new IntWritable(i);
        LongWritable value = new LongWritable(10 * i);
        writer.append(key, value);
      }
    } finally {
      writer.close();
    }

    TaskAttemptContext context =
        MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());

    // try splitting the file in a variety of sizes
    InputFormat<Text, Text> format = new SequenceFileAsTextInputFormat();
    for (int i = 0; i < 3; i++) {
      // check each split
      BitSet bits = new BitSet(length);
      int numSplits =
          random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
      FileInputFormat.setMaxInputSplitSize(job,
          fs.getFileStatus(file).getLen() / numSplits);
      for (InputSplit split : format.getSplits(job)) {
        RecordReader<Text, Text> reader = format.createRecordReader(split, context);
        MapContext<Text, Text, Text, Text> mcontext =
            new MapContextImpl<Text, Text, Text, Text>(job.getConfiguration(),
                context.getTaskAttemptID(), reader, null, null,
                MapReduceTestUtil.createDummyReporter(), split);
        reader.initialize(split, mcontext);
        Class<?> readerClass = reader.getClass();
        assertEquals("reader class is SequenceFileAsTextRecordReader.",
            SequenceFileAsTextRecordReader.class, readerClass);
        Text key;
        try {
          int count = 0;
          while (reader.nextKeyValue()) {
            key = reader.getCurrentKey();
            int keyInt = Integer.parseInt(key.toString());
            assertFalse("Key in multiple partitions.", bits.get(keyInt));
            bits.set(keyInt);
            count++;
          }
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
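The loop body above is the standard way to drive a new-API RecordReader outside a running job: get the splits, create a reader per split, wrap it in a MapContextImpl, initialize, then iterate nextKeyValue(). A minimal stand-alone sketch of that pattern follows; the class and helper names and the generic signature are illustrative assumptions, and it reuses the same MapReduceTestUtil dummy reporter as the test above.

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.MapContext;
import org.apache.hadoop.mapreduce.MapReduceTestUtil;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.task.MapContextImpl;

public class SplitReadingSketch {
  // Count every record an InputFormat produces for a job, split by split.
  static <K, V> long countRecords(InputFormat<K, V> format, Job job,
      TaskAttemptContext context) throws Exception {
    long count = 0;
    for (InputSplit split : format.getSplits(job)) {
      RecordReader<K, V> reader = format.createRecordReader(split, context);
      // A reader must be initialized with a context that carries its split;
      // outside a real task, a MapContextImpl with dummy writer/committer works.
      MapContext<K, V, K, V> mcontext = new MapContextImpl<K, V, K, V>(
          job.getConfiguration(), context.getTaskAttemptID(), reader,
          null, null, MapReduceTestUtil.createDummyReporter(), split);
      reader.initialize(split, mcontext);
      try {
        while (reader.nextKeyValue()) {
          count++;
        }
      } finally {
        reader.close();
      }
    }
    return count;
  }
}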
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestMRSequenceFileAsBinaryOutputFormat, method testBinary.
@Test
public void testBinary() throws IOException, InterruptedException {
  Configuration conf = new Configuration();
  Job job = Job.getInstance(conf);
  Path outdir = new Path(System.getProperty("test.build.data", "/tmp"), "outseq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);

  FileOutputFormat.setOutputPath(job, outdir);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);

  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();

  TaskAttemptContext context =
      MapReduceTestUtil.createDummyMapTaskAttemptContext(job.getConfiguration());
  OutputFormat<BytesWritable, BytesWritable> outputFormat =
      new SequenceFileAsBinaryOutputFormat();
  OutputCommitter committer = outputFormat.getOutputCommitter(context);
  committer.setupJob(job);
  RecordWriter<BytesWritable, BytesWritable> writer = outputFormat.getRecordWriter(context);

  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(context);
  }
  committer.commitTask(context);
  committer.commitJob(job);

  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  // Re-seed so the verification pass regenerates the same random sequence
  // that was written above.
  r.setSeed(seed);
  SequenceFileInputFormat.setInputPaths(job, outdir);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job)) {
    RecordReader<IntWritable, DoubleWritable> reader =
        iformat.createRecordReader(split, context);
    MapContext<IntWritable, DoubleWritable, BytesWritable, BytesWritable> mcontext =
        new MapContextImpl<IntWritable, DoubleWritable, BytesWritable, BytesWritable>(
            job.getConfiguration(), context.getTaskAttemptID(), reader, null, null,
            MapReduceTestUtil.createDummyReporter(), split);
    reader.initialize(split, mcontext);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.nextKeyValue()) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        iwritable = reader.getCurrentKey();
        dwritable = reader.getCurrentValue();
        assertEquals("Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue("Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
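The DataOutputBuffer-based conversion inside the write loop is the core of using SequenceFileAsBinaryOutputFormat: each key and value is handed over as its raw serialized bytes in a BytesWritable. A small stand-alone sketch of that conversion (the class and method names are illustrative assumptions, not part of the test):

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Writable;

final class WritableBytes {
  // Serialize any Writable into a fresh BytesWritable.
  static BytesWritable toBytes(Writable w) throws IOException {
    DataOutputBuffer buf = new DataOutputBuffer();
    w.write(buf); // the Writable serializes itself into the buffer
    // getData() exposes the whole backing array, which can be larger than the
    // serialized payload, so copy only the first getLength() bytes.
    return new BytesWritable(Arrays.copyOf(buf.getData(), buf.getLength()));
  }
}

The test avoids the per-record allocation by reusing one buffer instead, calling bkey.set(outbuf.getData(), 0, outbuf.getLength()) and then outbuf.reset() for the next field.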
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestCombineFileInputFormat, method testNodeDistribution.
@Test
public void testNodeDistribution() throws IOException, InterruptedException {
  DummyInputFormat inFormat = new DummyInputFormat();
  int numBlocks = 60;
  long totLength = 0;
  long blockSize = 100;
  int numNodes = 10;
  long minSizeNode = 50;
  long minSizeRack = 50;
  // 2 blocks (of 100 bytes each) per 200-byte split.
  int maxSplitSize = 200;

  String[] locations = new String[numNodes];
  for (int i = 0; i < numNodes; i++) {
    locations[i] = "h" + i;
  }
  String[] racks = new String[0];
  Path path = new Path("hdfs://file");

  OneBlockInfo[] blocks = new OneBlockInfo[numBlocks];
  int hostCountBase = 0;
  // Generate block list. Replication 3 per block.
  for (int i = 0; i < numBlocks; i++) {
    int localHostCount = hostCountBase;
    String[] blockHosts = new String[3];
    for (int j = 0; j < 3; j++) {
      int hostNum = localHostCount % numNodes;
      blockHosts[j] = "h" + hostNum;
      localHostCount++;
    }
    hostCountBase++;
    blocks[i] = new OneBlockInfo(path, i * blockSize, blockSize, blockHosts, racks);
    totLength += blockSize;
  }

  List<InputSplit> splits = new ArrayList<InputSplit>();
  HashMap<String, Set<String>> rackToNodes = new HashMap<String, Set<String>>();
  HashMap<String, List<OneBlockInfo>> rackToBlocks = new HashMap<String, List<OneBlockInfo>>();
  HashMap<OneBlockInfo, String[]> blockToNodes = new HashMap<OneBlockInfo, String[]>();
  Map<String, Set<OneBlockInfo>> nodeToBlocks = new TreeMap<String, Set<OneBlockInfo>>();

  OneFileInfo.populateBlockInfo(blocks, rackToBlocks, blockToNodes, nodeToBlocks, rackToNodes);

  inFormat.createSplits(nodeToBlocks, blockToNodes, rackToBlocks, totLength,
      maxSplitSize, minSizeNode, minSizeRack, splits);

  int expectedSplitCount = (int) (totLength / maxSplitSize);
  assertEquals(expectedSplitCount, splits.size());

  // Ensure 90+% of the splits have node local blocks.
  // 100% locality may not always be achieved.
  int numLocalSplits = 0;
  for (InputSplit inputSplit : splits) {
    assertEquals(maxSplitSize, inputSplit.getLength());
    if (inputSplit.getLocations().length == 1) {
      numLocalSplits++;
    }
  }
  assertTrue(numLocalSplits >= 0.9 * splits.size());
}
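For reference, the arithmetic behind the assertions above, restated with this test's constants (a worked calculation only, not additional test code):

public class NodeDistributionMath {
  public static void main(String[] args) {
    long totLength = 60 * 100L;                        // 60 blocks of 100 bytes = 6000 bytes
    long maxSplitSize = 200;                           // so each split holds 2 blocks
    long expectedSplitCount = totLength / maxSplitSize; // 30 splits, each of length 200
    long minLocalSplits = (long) (0.9 * expectedSplitCount); // at least 27 must be node-local
    System.out.println(expectedSplitCount + " splits, >= " + minLocalSplits + " node-local");
  }
}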
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestDelegatingInputFormat, method testSplitting.
@SuppressWarnings("unchecked")
public void testSplitting() throws Exception {
  Job job = Job.getInstance();
  MiniDFSCluster dfs = null;
  try {
    dfs = new MiniDFSCluster.Builder(job.getConfiguration()).numDataNodes(4)
        .racks(new String[] { "/rack0", "/rack0", "/rack1", "/rack1" })
        .hosts(new String[] { "host0", "host1", "host2", "host3" })
        .build();
    FileSystem fs = dfs.getFileSystem();

    Path path = getPath("/foo/bar", fs);
    Path path2 = getPath("/foo/baz", fs);
    Path path3 = getPath("/bar/bar", fs);
    Path path4 = getPath("/bar/baz", fs);

    final int numSplits = 100;
    FileInputFormat.setMaxInputSplitSize(job, fs.getFileStatus(path).getLen() / numSplits);
    MultipleInputs.addInputPath(job, path, TextInputFormat.class, MapClass.class);
    MultipleInputs.addInputPath(job, path2, TextInputFormat.class, MapClass2.class);
    MultipleInputs.addInputPath(job, path3, KeyValueTextInputFormat.class, MapClass.class);
    MultipleInputs.addInputPath(job, path4, TextInputFormat.class, MapClass2.class);
    DelegatingInputFormat inFormat = new DelegatingInputFormat();

    int[] bins = new int[3];
    for (InputSplit split : (List<InputSplit>) inFormat.getSplits(job)) {
      assertTrue(split instanceof TaggedInputSplit);
      final TaggedInputSplit tis = (TaggedInputSplit) split;
      int index = -1;
      if (tis.getInputFormatClass().equals(KeyValueTextInputFormat.class)) {
        // path3
        index = 0;
      } else if (tis.getMapperClass().equals(MapClass.class)) {
        // path
        index = 1;
      } else {
        // path2 and path4
        index = 2;
      }
      bins[index]++;
    }

    assertEquals("count is not equal to num splits", numSplits, bins[0]);
    assertEquals("count is not equal to num splits", numSplits, bins[1]);
    assertEquals("count is not equal to 2 * num splits", numSplits * 2, bins[2]);
  } finally {
    if (dfs != null) {
      dfs.shutdown();
    }
  }
}
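A hedged driver-side sketch of the wiring this test exercises: each MultipleInputs.addInputPath call records the format (and per-path mapper) for its path and switches the job over to DelegatingInputFormat and DelegatingMapper, which is why every split seen above is a TaggedInputSplit carrying its own InputFormat and Mapper class. The paths below are placeholders, and the identity Mapper stands in for the test's MapClass/MapClass2.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class MultiInputDriverSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "multi-input-sketch");
    // Each path gets its own InputFormat and Mapper; the job's actual input
    // format becomes DelegatingInputFormat, which emits TaggedInputSplits.
    MultipleInputs.addInputPath(job, new Path("/data/text"),
        TextInputFormat.class, Mapper.class);
    MultipleInputs.addInputPath(job, new Path("/data/kv"),
        KeyValueTextInputFormat.class, Mapper.class);
    // ... set output key/value classes, output path, and submit as usual.
  }
}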
Use of org.apache.hadoop.mapreduce.InputSplit in project hadoop by apache.
The class TestFixedLengthInputFormat, method testGzipWithTwoInputs.
/**
 * Test using the gzip codec with two input files.
 */
@Test(timeout = 5000)
public void testGzipWithTwoInputs() throws Exception {
  CompressionCodec gzip = new GzipCodec();
  localFs.delete(workDir, true);
  Job job = Job.getInstance(defaultConf);
  FixedLengthInputFormat format = new FixedLengthInputFormat();
  format.setRecordLength(job.getConfiguration(), 5);
  ReflectionUtils.setConf(gzip, job.getConfiguration());
  FileInputFormat.setInputPaths(job, workDir);

  // Create two gzipped files of fixed-length records, 5 bytes per record.
  writeFile(localFs, new Path(workDir, "part1.txt.gz"), gzip,
      "one  two  threefour five six  seveneightnine ten  ");
  writeFile(localFs, new Path(workDir, "part2.txt.gz"), gzip,
      "ten  nine eightsevensix  five four threetwo  one  ");

  List<InputSplit> splits = format.getSplits(job);
  assertEquals("compressed splits == 2", 2, splits.size());
  FileSplit tmp = (FileSplit) splits.get(0);
  if (tmp.getPath().getName().equals("part2.txt.gz")) {
    splits.set(0, splits.get(1));
    splits.set(1, tmp);
  }

  List<String> results = readSplit(format, splits.get(0), job);
  assertEquals("splits[0] length", 10, results.size());
  assertEquals("splits[0][5]", "six  ", results.get(5));
  results = readSplit(format, splits.get(1), job);
  assertEquals("splits[1] length", 10, results.size());
  assertEquals("splits[1][0]", "ten  ", results.get(0));
  assertEquals("splits[1][1]", "nine ", results.get(1));
}
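FixedLengthInputFormat only works once a positive record length has been configured, and with a non-splittable codec such as gzip each .gz file becomes exactly one InputSplit whose 5-byte records are carved out of the decompressed stream. A minimal configuration sketch follows; the job name and input path are placeholders, and the record length mirrors the test above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FixedLengthInputFormat;

public class FixedLengthConfigSketch {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "fixed-length-sketch");
    // The record length must be set to a positive value before reading records.
    FixedLengthInputFormat.setRecordLength(job.getConfiguration(), 5);
    job.setInputFormatClass(FixedLengthInputFormat.class);
    FileInputFormat.setInputPaths(job, new Path("/data/fixed"));
    // Compressed inputs (e.g. *.gz) are not split; each file is one InputSplit.
  }
}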