Example 6 with IntWritable

Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.

From the class TestSequenceFileAsBinaryOutputFormat, method testBinary:

@Test
public void testBinary() throws IOException {
    JobConf job = new JobConf();
    FileSystem fs = FileSystem.getLocal(job);
    Path dir = new Path(new Path(new Path(System.getProperty("test.build.data", ".")), FileOutputCommitter.TEMP_DIR_NAME), "_" + attempt);
    Path file = new Path(dir, "testbinary.seq");
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);
    fs.delete(dir, true);
    if (!fs.mkdirs(dir)) {
        fail("Failed to create output directory");
    }
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    FileOutputFormat.setOutputPath(job, dir.getParent().getParent());
    FileOutputFormat.setWorkOutputPath(job, dir);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();
    RecordWriter<BytesWritable, BytesWritable> writer = new SequenceFileAsBinaryOutputFormat().getRecordWriter(fs, job, file.toString(), Reporter.NULL);
    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
        for (int i = 0; i < RECORDS; ++i) {
            iwritable = new IntWritable(r.nextInt());
            iwritable.write(outbuf);
            bkey.set(outbuf.getData(), 0, outbuf.getLength());
            outbuf.reset();
            dwritable = new DoubleWritable(r.nextDouble());
            dwritable.write(outbuf);
            bval.set(outbuf.getData(), 0, outbuf.getLength());
            outbuf.reset();
            writer.write(bkey, bval);
        }
    } finally {
        writer.close(Reporter.NULL);
    }
    InputFormat<IntWritable, DoubleWritable> iformat = new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;
    r.setSeed(seed);
    DataInputBuffer buf = new DataInputBuffer();
    final int NUM_SPLITS = 3;
    SequenceFileInputFormat.addInputPath(job, file);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : iformat.getSplits(job, NUM_SPLITS)) {
        RecordReader<IntWritable, DoubleWritable> reader = iformat.getRecordReader(split, job, Reporter.NULL);
        try {
            int sourceInt;
            double sourceDouble;
            while (reader.next(iwritable, dwritable)) {
                sourceInt = r.nextInt();
                sourceDouble = r.nextDouble();
                assertEquals("Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*", sourceInt, iwritable.get());
                assertTrue("Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*", Double.compare(dwritable.get(), sourceDouble) == 0);
                ++count;
            }
        } finally {
            reader.close();
        }
    }
    assertEquals("Some records not found", RECORDS, count);
}
Also used: Path (org.apache.hadoop.fs.Path), BytesWritable (org.apache.hadoop.io.BytesWritable), DoubleWritable (org.apache.hadoop.io.DoubleWritable), DataInputBuffer (org.apache.hadoop.io.DataInputBuffer), Random (java.util.Random), FileSystem (org.apache.hadoop.fs.FileSystem), DataOutputBuffer (org.apache.hadoop.io.DataOutputBuffer), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
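
The essential move in this test is the byte-level round trip: each Writable serializes itself into a reusable DataOutputBuffer, the raw bytes are wrapped in a BytesWritable for SequenceFileAsBinaryOutputFormat, and the same bytes can later be decoded through a DataInputBuffer. A minimal standalone sketch of that round trip (the class name WritableRoundTrip is invented for illustration and is not part of the test):

import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;

public class WritableRoundTrip {
    public static void main(String[] args) throws IOException {
        IntWritable original = new IntWritable(42);
        // Serialize the Writable into a reusable byte buffer,
        // as the test does before each writer.write(bkey, bval).
        DataOutputBuffer outbuf = new DataOutputBuffer();
        original.write(outbuf);
        // Wrap the raw serialized bytes for the binary output format.
        BytesWritable raw = new BytesWritable();
        raw.set(outbuf.getData(), 0, outbuf.getLength());
        // Decode the bytes back into a fresh IntWritable.
        DataInputBuffer inbuf = new DataInputBuffer();
        inbuf.reset(raw.getBytes(), raw.getLength());
        IntWritable decoded = new IntWritable();
        decoded.readFields(inbuf);
        System.out.println(decoded.get()); // prints 42
    }
}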

Example 7 with IntWritable

Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.

From the class TestSequenceFileAsTextInputFormat, method testFormat:

@Test
public void testFormat() throws Exception {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");
    Reporter reporter = Reporter.NULL;
    int seed = new Random().nextInt();
    //LOG.info("seed = "+seed);
    Random random = new Random(seed);
    fs.delete(dir, true);
    FileInputFormat.setInputPaths(job, dir);
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
        //LOG.info("creating; entries = " + length);
        // create a file with length entries
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, LongWritable.class);
        try {
            for (int i = 0; i < length; i++) {
                IntWritable key = new IntWritable(i);
                LongWritable value = new LongWritable(10 * i);
                writer.append(key, value);
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        InputFormat<Text, Text> format = new SequenceFileAsTextInputFormat();
        for (int i = 0; i < 3; i++) {
            int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
            //LOG.info("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.getSplits(job, numSplits);
            //LOG.info("splitting: got =        " + splits.length);
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.length; j++) {
                RecordReader<Text, Text> reader = format.getRecordReader(splits[j], job, reporter);
                Class readerClass = reader.getClass();
                assertEquals("reader class is SequenceFileAsTextRecordReader.", SequenceFileAsTextRecordReader.class, readerClass);
                Text value = reader.createValue();
                Text key = reader.createKey();
                try {
                    int count = 0;
                    while (reader.next(key, value)) {
                        // if (bits.get(key.get())) {
                        // LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
                        // LOG.info("@"+reader.getPos());
                        // }
                        int keyInt = Integer.parseInt(key.toString());
                        assertFalse("Key in multiple partitions.", bits.get(keyInt));
                        bits.set(keyInt);
                        count++;
                    }
                //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), BitSet (java.util.BitSet), Text (org.apache.hadoop.io.Text), Random (java.util.Random), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), LongWritable (org.apache.hadoop.io.LongWritable), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
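
The Integer.parseInt(key.toString()) step works because SequenceFileAsTextInputFormat surfaces the underlying keys and values as Text, converted via their toString() representations (as the asserted reader class, SequenceFileAsTextRecordReader, suggests). A minimal sketch of that conversion, relying only on the Writable toString() contract (the class name AsTextConversion is invented):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class AsTextConversion {
    public static void main(String[] args) {
        // The file stores IntWritable keys and LongWritable values...
        IntWritable key = new IntWritable(7);
        LongWritable value = new LongWritable(70);
        // ...but the as-text reader hands them back as Text, which is
        // why the test parses the key back with Integer.parseInt.
        Text textKey = new Text(key.toString());
        Text textValue = new Text(value.toString());
        System.out.println(textKey + "\t" + textValue); // prints 7 and 70, tab-separated
    }
}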

Example 8 with IntWritable

Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.

From the class TestSequenceFileInputFormat, method testFormat:

@Test
public void testFormat() throws Exception {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");
    Reporter reporter = Reporter.NULL;
    int seed = new Random().nextInt();
    //LOG.info("seed = "+seed);
    Random random = new Random(seed);
    fs.delete(dir, true);
    FileInputFormat.setInputPaths(job, dir);
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
        //LOG.info("creating; entries = " + length);
        // create a file with length entries
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class);
        try {
            for (int i = 0; i < length; i++) {
                IntWritable key = new IntWritable(i);
                byte[] data = new byte[random.nextInt(10)];
                random.nextBytes(data);
                BytesWritable value = new BytesWritable(data);
                writer.append(key, value);
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        InputFormat<IntWritable, BytesWritable> format = new SequenceFileInputFormat<IntWritable, BytesWritable>();
        IntWritable key = new IntWritable();
        BytesWritable value = new BytesWritable();
        for (int i = 0; i < 3; i++) {
            int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
            //LOG.info("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.getSplits(job, numSplits);
            //LOG.info("splitting: got =        " + splits.length);
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.length; j++) {
                RecordReader<IntWritable, BytesWritable> reader = format.getRecordReader(splits[j], job, reporter);
                try {
                    int count = 0;
                    while (reader.next(key, value)) {
                        // if (bits.get(key.get())) {
                        // LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
                        // LOG.info("@"+reader.getPos());
                        // }
                        assertFalse("Key in multiple partitions.", bits.get(key.get()));
                        bits.set(key.get());
                        count++;
                    }
                //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used: Path (org.apache.hadoop.fs.Path), BitSet (java.util.BitSet), BytesWritable (org.apache.hadoop.io.BytesWritable), Random (java.util.Random), SequenceFile (org.apache.hadoop.io.SequenceFile), FileSystem (org.apache.hadoop.fs.FileSystem), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
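
Both split tests rely on the same verification idiom: a BitSet records every key seen across all splits, a repeated hit means a key landed in more than one partition, and bits.cardinality() at the end proves no key was dropped. A self-contained sketch of the idiom with simulated splits (the class name and split ranges are invented; no Hadoop types are needed):

import java.util.BitSet;

public class SplitCoverageCheck {
    public static void main(String[] args) {
        int length = 100;
        BitSet bits = new BitSet(length);
        // Pretend each "split" yields a half-open range of integer keys.
        int[][] splits = { { 0, 40 }, { 40, 75 }, { 75, 100 } };
        for (int[] split : splits) {
            for (int key = split[0]; key < split[1]; key++) {
                // A key seen twice means it appeared in two partitions.
                if (bits.get(key)) {
                    throw new AssertionError("Key in multiple partitions: " + key);
                }
                bits.set(key);
            }
        }
        // Full cardinality means every key was seen exactly once.
        if (bits.cardinality() != length) {
            throw new AssertionError("Some keys in no partition.");
        }
        System.out.println("All " + length + " keys seen exactly once.");
    }
}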

Example 9 with IntWritable

Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.

From the class TestDatamerge, method testNestedJoin:

@Test
public void testNestedJoin() throws Exception {
    // outer(inner(S1,...,Sn),outer(S1,...Sn))
    final int SOURCES = 3;
    final int ITEMS = (SOURCES + 1) * (SOURCES + 1);
    JobConf job = new JobConf();
    Path base = cluster.getFileSystem().makeQualified(new Path("/nested"));
    int[][] source = new int[SOURCES][];
    for (int i = 0; i < SOURCES; ++i) {
        source[i] = new int[ITEMS];
        for (int j = 0; j < ITEMS; ++j) {
            source[i][j] = (i + 2) * (j + 1);
        }
    }
    Path[] src = new Path[SOURCES];
    SequenceFile.Writer[] out = createWriters(base, job, SOURCES, src);
    IntWritable k = new IntWritable();
    for (int i = 0; i < SOURCES; ++i) {
        IntWritable v = new IntWritable();
        v.set(i);
        for (int j = 0; j < ITEMS; ++j) {
            k.set(source[i][j]);
            out[i].append(k, v);
        }
        out[i].close();
    }
    out = null;
    StringBuilder sb = new StringBuilder();
    sb.append("outer(inner(");
    for (int i = 0; i < SOURCES; ++i) {
        sb.append(CompositeInputFormat.compose(SequenceFileInputFormat.class, src[i].toString()));
        if (i + 1 != SOURCES)
            sb.append(",");
    }
    sb.append("),outer(");
    sb.append(CompositeInputFormat.compose(Fake_IF.class, "foobar"));
    sb.append(",");
    for (int i = 0; i < SOURCES; ++i) {
        sb.append(CompositeInputFormat.compose(SequenceFileInputFormat.class, src[i].toString()));
        sb.append(",");
    }
    sb.append(CompositeInputFormat.compose(Fake_IF.class, "raboof") + "))");
    job.set("mapreduce.join.expr", sb.toString());
    job.setInputFormat(CompositeInputFormat.class);
    Path outf = new Path(base, "out");
    FileOutputFormat.setOutputPath(job, outf);
    Fake_IF.setKeyClass(job, IntWritable.class);
    Fake_IF.setValClass(job, IntWritable.class);
    job.setMapperClass(IdentityMapper.class);
    job.setReducerClass(IdentityReducer.class);
    job.setNumReduceTasks(0);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(TupleWritable.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    JobClient.runJob(job);
    FileStatus[] outlist = cluster.getFileSystem().listStatus(outf, new Utils.OutputFileUtils.OutputFilesFilter());
    assertEquals(1, outlist.length);
    assertTrue(0 < outlist[0].getLen());
    SequenceFile.Reader r = new SequenceFile.Reader(cluster.getFileSystem(), outlist[0].getPath(), job);
    TupleWritable v = new TupleWritable();
    while (r.next(k, v)) {
        assertFalse(((TupleWritable) v.get(1)).has(0));
        assertFalse(((TupleWritable) v.get(1)).has(SOURCES + 1));
        boolean chk = true;
        int ki = k.get();
        for (int i = 2; i < SOURCES + 2; ++i) {
            if ((ki % i) == 0 && ki <= i * ITEMS) {
                assertEquals(i - 2, ((IntWritable) ((TupleWritable) v.get(1)).get((i - 1))).get());
            } else
                chk = false;
        }
        if (chk) {
            // present in all sources; chk inner
            assertTrue(v.has(0));
            for (int i = 0; i < SOURCES; ++i) assertTrue(((TupleWritable) v.get(0)).has(i));
        } else {
            // should not be present in inner join
            assertFalse(v.has(0));
        }
    }
    r.close();
    base.getFileSystem(job).delete(base, true);
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), SequenceFileInputFormat (org.apache.hadoop.mapred.SequenceFileInputFormat), RecordReader (org.apache.hadoop.mapred.RecordReader), SequenceFile (org.apache.hadoop.io.SequenceFile), JobConf (org.apache.hadoop.mapred.JobConf), IntWritable (org.apache.hadoop.io.IntWritable), Test (org.junit.Test)
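
The string assembled into mapreduce.join.expr nests an inner join of all real sources inside an outer join padded with two empty Fake_IF placeholder tables, so full inner-join hits can be distinguished from partial outer-join matches. Since the test itself builds the expression from CompositeInputFormat.compose terms, a tiny sketch can make the resulting shape visible (the paths and the class name JoinExprSketch are placeholders):

import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.join.CompositeInputFormat;

public class JoinExprSketch {
    public static void main(String[] args) {
        // compose(...) emits one tbl(<input format class>,"<path>") term.
        String s0 = CompositeInputFormat.compose(SequenceFileInputFormat.class, "/nested/s0");
        String s1 = CompositeInputFormat.compose(SequenceFileInputFormat.class, "/nested/s1");
        // Nesting inner(...) inside outer(...) mirrors the test's
        // outer(inner(S1,...,Sn),outer(...,S1,...,Sn,...)) expression.
        String expr = "outer(inner(" + s0 + "," + s1 + ")," +
                "outer(" + s0 + "," + s1 + "))";
        System.out.println(expr);
    }
}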

Example 10 with IntWritable

Use of org.apache.hadoop.io.IntWritable in project hadoop by apache.

From the class TestTupleWritable, method makeRandomWritables:

private Writable[] makeRandomWritables() {
    Random r = new Random();
    Writable[] writs = { new BooleanWritable(r.nextBoolean()), new FloatWritable(r.nextFloat()), new FloatWritable(r.nextFloat()), new IntWritable(r.nextInt()), new LongWritable(r.nextLong()), new BytesWritable("dingo".getBytes()), new LongWritable(r.nextLong()), new IntWritable(r.nextInt()), new BytesWritable("yak".getBytes()), new IntWritable(r.nextInt()) };
    return writs;
}
Also used: FloatWritable (org.apache.hadoop.io.FloatWritable), Random (java.util.Random), BooleanWritable (org.apache.hadoop.io.BooleanWritable), Writable (org.apache.hadoop.io.Writable), LongWritable (org.apache.hadoop.io.LongWritable), BytesWritable (org.apache.hadoop.io.BytesWritable), IntWritable (org.apache.hadoop.io.IntWritable)
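
Writable serialization carries no type tags, so a heterogeneous array like the one returned here can only be round-tripped when the reader already knows the element types and their order; that bookkeeping is the kind of thing a container like TupleWritable has to manage for itself. A minimal sketch of the constraint (the class name WritableArrayRoundTrip is invented):

import java.io.IOException;
import org.apache.hadoop.io.BooleanWritable;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Writable;

public class WritableArrayRoundTrip {
    public static void main(String[] args) throws IOException {
        Writable[] writs = { new BooleanWritable(true), new IntWritable(3),
                new BytesWritable("dingo".getBytes()) };
        // Serialize every element into one shared buffer; the stream
        // contains values only, with no record of what types were written.
        DataOutputBuffer out = new DataOutputBuffer();
        for (Writable w : writs) {
            w.write(out);
        }
        // Reading back requires instances of the same types in the same order.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        Writable[] copy = { new BooleanWritable(), new IntWritable(), new BytesWritable() };
        for (Writable w : copy) {
            w.readFields(in);
        }
        System.out.println(((IntWritable) copy[1]).get()); // prints 3
    }
}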

Aggregations

Usage counts across the indexed examples:

IntWritable (org.apache.hadoop.io.IntWritable): 312
Test (org.junit.Test): 116
Text (org.apache.hadoop.io.Text): 102
LongWritable (org.apache.hadoop.io.LongWritable): 70
Path (org.apache.hadoop.fs.Path): 64
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 55
FloatWritable (org.apache.hadoop.io.FloatWritable): 48
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 47
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 43
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 42
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 40
SequenceFile (org.apache.hadoop.io.SequenceFile): 39
BytesWritable (org.apache.hadoop.io.BytesWritable): 37
Writable (org.apache.hadoop.io.Writable): 35
ArrayList (java.util.ArrayList): 34
Configuration (org.apache.hadoop.conf.Configuration): 33
Random (java.util.Random): 29
TimestampWritable (org.apache.hadoop.hive.serde2.io.TimestampWritable): 29
IOException (java.io.IOException): 28
DeferredJavaObject (org.apache.hadoop.hive.ql.udf.generic.GenericUDF.DeferredJavaObject): 28