Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.
From the class TestSequenceFileAsBinaryOutputFormat, method testBinary:
@Test
public void testBinary() throws IOException {
  JobConf job = new JobConf();
  FileSystem fs = FileSystem.getLocal(job);
  Path dir = new Path(new Path(new Path(System.getProperty("test.build.data", ".")),
      FileOutputCommitter.TEMP_DIR_NAME), "_" + attempt);
  Path file = new Path(dir, "testbinary.seq");
  Random r = new Random();
  long seed = r.nextLong();
  r.setSeed(seed);
  fs.delete(dir, true);
  if (!fs.mkdirs(dir)) {
    fail("Failed to create output directory");
  }
  job.set(JobContext.TASK_ATTEMPT_ID, attempt);
  FileOutputFormat.setOutputPath(job, dir.getParent().getParent());
  FileOutputFormat.setWorkOutputPath(job, dir);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
  SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
  SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
  SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
  BytesWritable bkey = new BytesWritable();
  BytesWritable bval = new BytesWritable();
  RecordWriter<BytesWritable, BytesWritable> writer =
      new SequenceFileAsBinaryOutputFormat().getRecordWriter(fs, job, file.toString(), Reporter.NULL);
  IntWritable iwritable = new IntWritable();
  DoubleWritable dwritable = new DoubleWritable();
  DataOutputBuffer outbuf = new DataOutputBuffer();
  LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
  try {
    for (int i = 0; i < RECORDS; ++i) {
      iwritable = new IntWritable(r.nextInt());
      iwritable.write(outbuf);
      bkey.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      dwritable = new DoubleWritable(r.nextDouble());
      dwritable.write(outbuf);
      bval.set(outbuf.getData(), 0, outbuf.getLength());
      outbuf.reset();
      writer.write(bkey, bval);
    }
  } finally {
    writer.close(Reporter.NULL);
  }
  InputFormat<IntWritable, DoubleWritable> iformat =
      new SequenceFileInputFormat<IntWritable, DoubleWritable>();
  int count = 0;
  r.setSeed(seed);
  DataInputBuffer buf = new DataInputBuffer();
  final int NUM_SPLITS = 3;
  SequenceFileInputFormat.addInputPath(job, file);
  LOG.info("Reading data by SequenceFileInputFormat");
  for (InputSplit split : iformat.getSplits(job, NUM_SPLITS)) {
    RecordReader<IntWritable, DoubleWritable> reader =
        iformat.getRecordReader(split, job, Reporter.NULL);
    try {
      int sourceInt;
      double sourceDouble;
      while (reader.next(iwritable, dwritable)) {
        sourceInt = r.nextInt();
        sourceDouble = r.nextDouble();
        assertEquals("Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*",
            sourceInt, iwritable.get());
        assertTrue("Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*",
            Double.compare(dwritable.get(), sourceDouble) == 0);
        ++count;
      }
    } finally {
      reader.close();
    }
  }
  assertEquals("Some records not found", RECORDS, count);
}
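The heart of testBinary is how an IntWritable (and a DoubleWritable) is turned into raw bytes for SequenceFileAsBinaryOutputFormat: the writable is serialized into a DataOutputBuffer and the buffer's valid bytes are wrapped in a BytesWritable. A minimal standalone sketch of that pattern follows; the class and method names are illustrative only, not part of the test.

import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;

public class IntWritableToBytes {

  // Serialize an IntWritable into a BytesWritable, mirroring the
  // bkey/bval handling in testBinary above.
  public static BytesWritable toBytes(IntWritable key) throws IOException {
    DataOutputBuffer outbuf = new DataOutputBuffer();
    key.write(outbuf);                                  // writes 4 big-endian bytes
    BytesWritable bkey = new BytesWritable();
    bkey.set(outbuf.getData(), 0, outbuf.getLength());  // copy only the valid prefix
    return bkey;
  }

  public static void main(String[] args) throws IOException {
    BytesWritable raw = toBytes(new IntWritable(42));
    System.out.println("serialized length = " + raw.getLength());  // prints 4
  }
}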
Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.
From the class TestSequenceFileAsTextInputFormat, method testFormat:
@Test
public void testFormat() throws Exception {
  JobConf job = new JobConf(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");
  Reporter reporter = Reporter.NULL;
  int seed = new Random().nextInt();
  //LOG.info("seed = "+seed);
  Random random = new Random(seed);
  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
    //LOG.info("creating; entries = " + length);
    // create a file with length entries
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, file, IntWritable.class, LongWritable.class);
    try {
      for (int i = 0; i < length; i++) {
        IntWritable key = new IntWritable(i);
        LongWritable value = new LongWritable(10 * i);
        writer.append(key, value);
      }
    } finally {
      writer.close();
    }
    // try splitting the file in a variety of sizes
    InputFormat<Text, Text> format = new SequenceFileAsTextInputFormat();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
      //LOG.info("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(job, numSplits);
      //LOG.info("splitting: got = " + splits.length);
      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        RecordReader<Text, Text> reader = format.getRecordReader(splits[j], job, reporter);
        Class readerClass = reader.getClass();
        assertEquals("reader class is SequenceFileAsTextRecordReader.",
            SequenceFileAsTextRecordReader.class, readerClass);
        Text value = reader.createValue();
        Text key = reader.createKey();
        try {
          int count = 0;
          while (reader.next(key, value)) {
            // if (bits.get(key.get())) {
            //   LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
            //   LOG.info("@"+reader.getPos());
            // }
            int keyInt = Integer.parseInt(key.toString());
            assertFalse("Key in multiple partitions.", bits.get(keyInt));
            bits.set(keyInt);
            count++;
          }
          //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
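The assertions above depend on one behavior of SequenceFileAsTextInputFormat: whatever the underlying key and value classes are, the record reader hands them back as Text built from their toString() form, which is why the IntWritable key is recovered with Integer.parseInt. A tiny sketch of that round trip (the class name is illustrative):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;

public class TextKeyRoundTrip {
  public static void main(String[] args) {
    IntWritable original = new IntWritable(7);
    // SequenceFileAsTextRecordReader exposes the key as Text via toString()
    Text asText = new Text(original.toString());
    int recovered = Integer.parseInt(asText.toString());
    System.out.println(original.get() == recovered);  // true
  }
}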
Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.
From the class TestSequenceFileInputFormat, method testFormat:
@Test
public void testFormat() throws Exception {
  JobConf job = new JobConf(conf);
  FileSystem fs = FileSystem.getLocal(conf);
  Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
  Path file = new Path(dir, "test.seq");
  Reporter reporter = Reporter.NULL;
  int seed = new Random().nextInt();
  //LOG.info("seed = "+seed);
  Random random = new Random(seed);
  fs.delete(dir, true);
  FileInputFormat.setInputPaths(job, dir);
  // for a variety of lengths
  for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
    //LOG.info("creating; entries = " + length);
    // create a file with length entries
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class);
    try {
      for (int i = 0; i < length; i++) {
        IntWritable key = new IntWritable(i);
        byte[] data = new byte[random.nextInt(10)];
        random.nextBytes(data);
        BytesWritable value = new BytesWritable(data);
        writer.append(key, value);
      }
    } finally {
      writer.close();
    }
    // try splitting the file in a variety of sizes
    InputFormat<IntWritable, BytesWritable> format =
        new SequenceFileInputFormat<IntWritable, BytesWritable>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < 3; i++) {
      int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
      //LOG.info("splitting: requesting = " + numSplits);
      InputSplit[] splits = format.getSplits(job, numSplits);
      //LOG.info("splitting: got = " + splits.length);
      // check each split
      BitSet bits = new BitSet(length);
      for (int j = 0; j < splits.length; j++) {
        RecordReader<IntWritable, BytesWritable> reader =
            format.getRecordReader(splits[j], job, reporter);
        try {
          int count = 0;
          while (reader.next(key, value)) {
            // if (bits.get(key.get())) {
            //   LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
            //   LOG.info("@"+reader.getPos());
            // }
            assertFalse("Key in multiple partitions.", bits.get(key.get()));
            bits.set(key.get());
            count++;
          }
          //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
        } finally {
          reader.close();
        }
      }
      assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
  }
}
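Stripped of the split logic, the write/read pairing exercised above is plain SequenceFile I/O with IntWritable keys and BytesWritable values, reusing a single key and value instance across reader.next() calls. A minimal sketch against the local file system (the path and class name are made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

public class IntKeySeqFile {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path file = new Path(System.getProperty("java.io.tmpdir"), "intkey-demo.seq");

    // Write a few IntWritable/BytesWritable pairs.
    SequenceFile.Writer writer =
        SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class);
    try {
      writer.append(new IntWritable(1), new BytesWritable("one".getBytes()));
      writer.append(new IntWritable(2), new BytesWritable("two".getBytes()));
    } finally {
      writer.close();
    }

    // Read them back, reusing the same writable instances as the test does.
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, file, conf);
    try {
      while (reader.next(key, value)) {
        System.out.println(key.get() + " -> " + value.getLength() + " bytes");
      }
    } finally {
      reader.close();
    }
  }
}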
Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.
From the class TestDatamerge, method testNestedJoin:
@Test
public void testNestedJoin() throws Exception {
  // outer(inner(S1,...,Sn),outer(S1,...Sn))
  final int SOURCES = 3;
  final int ITEMS = (SOURCES + 1) * (SOURCES + 1);
  JobConf job = new JobConf();
  Path base = cluster.getFileSystem().makeQualified(new Path("/nested"));
  int[][] source = new int[SOURCES][];
  for (int i = 0; i < SOURCES; ++i) {
    source[i] = new int[ITEMS];
    for (int j = 0; j < ITEMS; ++j) {
      source[i][j] = (i + 2) * (j + 1);
    }
  }
  Path[] src = new Path[SOURCES];
  SequenceFile.Writer[] out = createWriters(base, job, SOURCES, src);
  IntWritable k = new IntWritable();
  for (int i = 0; i < SOURCES; ++i) {
    IntWritable v = new IntWritable();
    v.set(i);
    for (int j = 0; j < ITEMS; ++j) {
      k.set(source[i][j]);
      out[i].append(k, v);
    }
    out[i].close();
  }
  out = null;
  StringBuilder sb = new StringBuilder();
  sb.append("outer(inner(");
  for (int i = 0; i < SOURCES; ++i) {
    sb.append(CompositeInputFormat.compose(SequenceFileInputFormat.class, src[i].toString()));
    if (i + 1 != SOURCES)
      sb.append(",");
  }
  sb.append("),outer(");
  sb.append(CompositeInputFormat.compose(Fake_IF.class, "foobar"));
  sb.append(",");
  for (int i = 0; i < SOURCES; ++i) {
    sb.append(CompositeInputFormat.compose(SequenceFileInputFormat.class, src[i].toString()));
    sb.append(",");
  }
  sb.append(CompositeInputFormat.compose(Fake_IF.class, "raboof") + "))");
  job.set("mapreduce.join.expr", sb.toString());
  job.setInputFormat(CompositeInputFormat.class);
  Path outf = new Path(base, "out");
  FileOutputFormat.setOutputPath(job, outf);
  Fake_IF.setKeyClass(job, IntWritable.class);
  Fake_IF.setValClass(job, IntWritable.class);
  job.setMapperClass(IdentityMapper.class);
  job.setReducerClass(IdentityReducer.class);
  job.setNumReduceTasks(0);
  job.setOutputKeyClass(IntWritable.class);
  job.setOutputValueClass(TupleWritable.class);
  job.setOutputFormat(SequenceFileOutputFormat.class);
  JobClient.runJob(job);
  FileStatus[] outlist = cluster.getFileSystem().listStatus(outf,
      new Utils.OutputFileUtils.OutputFilesFilter());
  assertEquals(1, outlist.length);
  assertTrue(0 < outlist[0].getLen());
  SequenceFile.Reader r =
      new SequenceFile.Reader(cluster.getFileSystem(), outlist[0].getPath(), job);
  TupleWritable v = new TupleWritable();
  while (r.next(k, v)) {
    assertFalse(((TupleWritable) v.get(1)).has(0));
    assertFalse(((TupleWritable) v.get(1)).has(SOURCES + 1));
    boolean chk = true;
    int ki = k.get();
    for (int i = 2; i < SOURCES + 2; ++i) {
      if ((ki % i) == 0 && ki <= i * ITEMS) {
        assertEquals(i - 2, ((IntWritable) ((TupleWritable) v.get(1)).get((i - 1))).get());
      } else
        chk = false;
    }
    if (chk) {
      // present in all sources; chk inner
      assertTrue(v.has(0));
      for (int i = 0; i < SOURCES; ++i)
        assertTrue(((TupleWritable) v.get(0)).has(i));
    } else {
      // should not be present in inner join
      assertFalse(v.has(0));
    }
  }
  r.close();
  base.getFileSystem(job).delete(base, true);
}
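The join expression assembled by hand in testNestedJoin is the composite-join syntax that CompositeInputFormat parses from the "mapreduce.join.expr" property. For a flat (non-nested) join, the varargs form of CompositeInputFormat.compose can build the expression directly. A small sketch with made-up paths and class name:

import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.join.CompositeInputFormat;

public class JoinExprSketch {
  public static void main(String[] args) {
    JobConf job = new JobConf();
    // compose() yields something like inner(tbl(...,"/data/a"),tbl(...,"/data/b"),...)
    String expr = CompositeInputFormat.compose("inner",
        SequenceFileInputFormat.class, "/data/a", "/data/b", "/data/c");
    job.set("mapreduce.join.expr", expr);
    job.setInputFormat(CompositeInputFormat.class);
    System.out.println(expr);
  }
}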
Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.
From the class TestTupleWritable, method makeRandomWritables:
private Writable[] makeRandomWritables() {
  Random r = new Random();
  Writable[] writs = {
      new BooleanWritable(r.nextBoolean()),
      new FloatWritable(r.nextFloat()),
      new FloatWritable(r.nextFloat()),
      new IntWritable(r.nextInt()),
      new LongWritable(r.nextLong()),
      new BytesWritable("dingo".getBytes()),
      new LongWritable(r.nextLong()),
      new IntWritable(r.nextInt()),
      new BytesWritable("yak".getBytes()),
      new IntWritable(r.nextInt())
  };
  return writs;
}
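An array like this is typically wrapped in a TupleWritable, which stores the Writable[] positionally; get(i) and size() then expose the fields, which is how the join test above inspects nested tuples. A minimal sketch with illustrative values and a made-up class name:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.join.TupleWritable;

public class TupleWritableSketch {
  public static void main(String[] args) {
    Writable[] writs = { new IntWritable(1), new LongWritable(2L) };
    // TupleWritable wraps the array; positions are addressed by index.
    TupleWritable tuple = new TupleWritable(writs);
    for (int i = 0; i < tuple.size(); i++) {
      System.out.println(i + ": " + tuple.get(i));
    }
  }
}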