Example 61 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hive by apache.

The main method of the class CreateSequenceFile:

public static void main(String[] args) throws Exception {
    // Read parameters
    int lines = 10;
    List<String> extraArgs = new ArrayList<String>();
    for (int ai = 0; ai < args.length; ai++) {
        if (args[ai].equals("-line") && ai + 1 < args.length) {
            lines = Integer.parseInt(args[ai + 1]);
            ai++;
        } else {
            extraArgs.add(args[ai]);
        }
    }
    if (extraArgs.size() != 1) {
        usage();
    }
    JobConf conf = new JobConf(CreateSequenceFile.class);
    ThriftSerializer serializer = new ThriftSerializer();
    // Open files
    SequenceFile.Writer writer = new SequenceFile.Writer(FileSystem.get(conf), conf, new Path(extraArgs.get(0)), BytesWritable.class, BytesWritable.class);
    // write to file
    BytesWritable key = new BytesWritable();
    Random rand = new Random(20081215);
    for (int i = 0; i < lines; i++) {
        ArrayList<Integer> alist = new ArrayList<Integer>();
        alist.add(i);
        alist.add(i * 2);
        alist.add(i * 3);
        ArrayList<String> slist = new ArrayList<String>();
        slist.add("" + i * 10);
        slist.add("" + i * 100);
        slist.add("" + i * 1000);
        ArrayList<IntString> islist = new ArrayList<IntString>();
        islist.add(new IntString(i * i, "" + i * i * i, i));
        HashMap<String, String> hash = new HashMap<String, String>();
        hash.put("key_" + i, "value_" + i);
        Map<String, Map<String, Map<String, PropValueUnion>>> unionMap = new HashMap<String, Map<String, Map<String, PropValueUnion>>>();
        Map<String, Map<String, PropValueUnion>> erMap = new HashMap<String, Map<String, PropValueUnion>>();
        Map<String, PropValueUnion> attrMap = new HashMap<String, PropValueUnion>();
        erMap.put("erVal" + i, attrMap);
        attrMap.put("value_" + i, PropValueUnion.doubleValue(1.0));
        unionMap.put("key_" + i, erMap);
        Complex complex = new Complex(rand.nextInt(), "record_" + i, alist, slist, islist, hash, unionMap, PropValueUnion.stringValue("test" + i), PropValueUnion.unionMStringString(hash), PropValueUnion.lString(slist));
        Writable value = serializer.serialize(complex);
        writer.append(key, value);
    }
    // Add an all-null record
    Complex complex = new Complex(0, null, null, null, null, null, null, null, null, null);
    Writable value = serializer.serialize(complex);
    writer.append(key, value);
    // Close files
    writer.close();
}
Also used: HashMap(java.util.HashMap), ArrayList(java.util.ArrayList), PropValueUnion(org.apache.hadoop.hive.serde2.thrift.test.PropValueUnion), Writable(org.apache.hadoop.io.Writable), BytesWritable(org.apache.hadoop.io.BytesWritable), IntString(org.apache.hadoop.hive.serde2.thrift.test.IntString), Complex(org.apache.hadoop.hive.serde2.thrift.test.Complex), SequenceFile(org.apache.hadoop.io.SequenceFile), Random(java.util.Random), JobConf(org.apache.hadoop.mapred.JobConf), Path(org.apache.hadoop.fs.Path), Map(java.util.Map)
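
For context, a minimal sketch of reading the file back; this class is not part of the Hive source (the class name is an assumption), and it uses the same deprecated SequenceFile API as the writer above:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;

// Hypothetical reader for the file written above; args[0] is the file path.
public class ReadCreatedSequenceFile {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(args[0]), conf);
        try {
            BytesWritable key = new BytesWritable();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // Each value holds one Thrift-serialized Complex record.
                System.out.println("record bytes = " + value.getLength());
            }
        } finally {
            reader.close();
        }
    }
}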

Example 62 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

The writeObject method of the class BinaryProtocol:

/**
   * Write the given object to the stream. If it is a Text or BytesWritable,
   * write it directly. Otherwise, write it to a buffer and then write the
   * length and data to the stream.
   * @param obj the object to write
   * @throws IOException
   */
private void writeObject(Writable obj) throws IOException {
    // For Text and BytesWritable, encode them directly, so that they end up
    // in C++ as the natural translations.
    if (obj instanceof Text) {
        Text t = (Text) obj;
        int len = t.getLength();
        WritableUtils.writeVInt(stream, len);
        stream.write(t.getBytes(), 0, len);
    } else if (obj instanceof BytesWritable) {
        BytesWritable b = (BytesWritable) obj;
        int len = b.getLength();
        WritableUtils.writeVInt(stream, len);
        stream.write(b.getBytes(), 0, len);
    } else {
        buffer.reset();
        obj.write(buffer);
        int length = buffer.getLength();
        WritableUtils.writeVInt(stream, length);
        stream.write(buffer.getData(), 0, length);
    }
}
Also used: Text(org.apache.hadoop.io.Text), BytesWritable(org.apache.hadoop.io.BytesWritable)
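
A minimal sketch of the matching read side, assuming a DataInput positioned at one of the frames produced above; this is illustrative, not the actual BinaryProtocol reader:

import java.io.DataInput;
import java.io.IOException;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.WritableUtils;

public class FrameReader {

    // Illustrative counterpart to writeObject: read the VInt length, then that
    // many bytes. The method name and the BytesWritable return type are
    // assumptions, not part of the Hadoop Pipes API.
    static BytesWritable readFrame(DataInput in) throws IOException {
        int len = WritableUtils.readVInt(in);
        byte[] buf = new byte[len];
        in.readFully(buf, 0, len);
        return new BytesWritable(buf);
    }
}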

Example 63 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

The testBinary method of the class TestSequenceFileAsBinaryInputFormat:

@Test
public void testBinary() throws IOException {
    JobConf job = new JobConf();
    FileSystem fs = FileSystem.getLocal(job);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "testbinary.seq");
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);
    fs.delete(dir, true);
    FileInputFormat.setInputPaths(job, dir);
    Text tkey = new Text();
    Text tval = new Text();
    SequenceFile.Writer writer = new SequenceFile.Writer(fs, job, file, Text.class, Text.class);
    try {
        for (int i = 0; i < RECORDS; ++i) {
            tkey.set(Integer.toString(r.nextInt(), 36));
            tval.set(Long.toString(r.nextLong(), 36));
            writer.append(tkey, tval);
        }
    } finally {
        writer.close();
    }
    InputFormat<BytesWritable, BytesWritable> bformat = new SequenceFileAsBinaryInputFormat();
    int count = 0;
    r.setSeed(seed);
    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();
    Text cmpkey = new Text();
    Text cmpval = new Text();
    DataInputBuffer buf = new DataInputBuffer();
    final int NUM_SPLITS = 3;
    FileInputFormat.setInputPaths(job, file);
    for (InputSplit split : bformat.getSplits(job, NUM_SPLITS)) {
        RecordReader<BytesWritable, BytesWritable> reader = bformat.getRecordReader(split, job, Reporter.NULL);
        try {
            while (reader.next(bkey, bval)) {
                tkey.set(Integer.toString(r.nextInt(), 36));
                tval.set(Long.toString(r.nextLong(), 36));
                buf.reset(bkey.getBytes(), bkey.getLength());
                cmpkey.readFields(buf);
                buf.reset(bval.getBytes(), bval.getLength());
                cmpval.readFields(buf);
                assertTrue("Keys don't match: " + "*" + cmpkey.toString() + ":" + tkey.toString() + "*", cmpkey.toString().equals(tkey.toString()));
                assertTrue("Vals don't match: " + "*" + cmpval.toString() + ":" + tval.toString() + "*", cmpval.toString().equals(tval.toString()));
                ++count;
            }
        } finally {
            reader.close();
        }
    }
    assertEquals("Some records not found", RECORDS, count);
}
Also used: Path(org.apache.hadoop.fs.Path), Text(org.apache.hadoop.io.Text), BytesWritable(org.apache.hadoop.io.BytesWritable), DataInputBuffer(org.apache.hadoop.io.DataInputBuffer), Random(java.util.Random), SequenceFile(org.apache.hadoop.io.SequenceFile), FileSystem(org.apache.hadoop.fs.FileSystem), Test(org.junit.Test)
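
The assertions above rely on Writable serialization round-tripping cleanly: the raw key and value bytes handed out as BytesWritable deserialize back into the original Text. A standalone sketch of that round trip (the class name is an assumption):

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class TextRoundTrip {

    public static void main(String[] args) throws Exception {
        // Serialize a Text: a VInt length followed by the UTF-8 bytes.
        DataOutputBuffer out = new DataOutputBuffer();
        new Text("hello").write(out);
        // Deserialize from the raw bytes, as the test does with bkey and bval.
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), out.getLength());
        Text copy = new Text();
        copy.readFields(in);
        System.out.println("hello".equals(copy.toString())); // prints true
    }
}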

Example 64 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

The testBinary method of the class TestSequenceFileAsBinaryOutputFormat:

@Test
public void testBinary() throws IOException {
    JobConf job = new JobConf();
    FileSystem fs = FileSystem.getLocal(job);
    Path dir = new Path(new Path(new Path(System.getProperty("test.build.data", ".")), FileOutputCommitter.TEMP_DIR_NAME), "_" + attempt);
    Path file = new Path(dir, "testbinary.seq");
    Random r = new Random();
    long seed = r.nextLong();
    r.setSeed(seed);
    fs.delete(dir, true);
    if (!fs.mkdirs(dir)) {
        fail("Failed to create output directory");
    }
    job.set(JobContext.TASK_ATTEMPT_ID, attempt);
    FileOutputFormat.setOutputPath(job, dir.getParent().getParent());
    FileOutputFormat.setWorkOutputPath(job, dir);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputKeyClass(job, IntWritable.class);
    SequenceFileAsBinaryOutputFormat.setSequenceFileOutputValueClass(job, DoubleWritable.class);
    SequenceFileAsBinaryOutputFormat.setCompressOutput(job, true);
    SequenceFileAsBinaryOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK);
    BytesWritable bkey = new BytesWritable();
    BytesWritable bval = new BytesWritable();
    RecordWriter<BytesWritable, BytesWritable> writer = new SequenceFileAsBinaryOutputFormat().getRecordWriter(fs, job, file.toString(), Reporter.NULL);
    IntWritable iwritable = new IntWritable();
    DoubleWritable dwritable = new DoubleWritable();
    DataOutputBuffer outbuf = new DataOutputBuffer();
    LOG.info("Creating data by SequenceFileAsBinaryOutputFormat");
    try {
        for (int i = 0; i < RECORDS; ++i) {
            iwritable = new IntWritable(r.nextInt());
            iwritable.write(outbuf);
            bkey.set(outbuf.getData(), 0, outbuf.getLength());
            outbuf.reset();
            dwritable = new DoubleWritable(r.nextDouble());
            dwritable.write(outbuf);
            bval.set(outbuf.getData(), 0, outbuf.getLength());
            outbuf.reset();
            writer.write(bkey, bval);
        }
    } finally {
        writer.close(Reporter.NULL);
    }
    InputFormat<IntWritable, DoubleWritable> iformat = new SequenceFileInputFormat<IntWritable, DoubleWritable>();
    int count = 0;
    r.setSeed(seed);
    DataInputBuffer buf = new DataInputBuffer();
    final int NUM_SPLITS = 3;
    SequenceFileInputFormat.addInputPath(job, file);
    LOG.info("Reading data by SequenceFileInputFormat");
    for (InputSplit split : iformat.getSplits(job, NUM_SPLITS)) {
        RecordReader<IntWritable, DoubleWritable> reader = iformat.getRecordReader(split, job, Reporter.NULL);
        try {
            int sourceInt;
            double sourceDouble;
            while (reader.next(iwritable, dwritable)) {
                sourceInt = r.nextInt();
                sourceDouble = r.nextDouble();
                assertEquals("Keys don't match: " + "*" + iwritable.get() + ":" + sourceInt + "*", sourceInt, iwritable.get());
                assertTrue("Vals don't match: " + "*" + dwritable.get() + ":" + sourceDouble + "*", Double.compare(dwritable.get(), sourceDouble) == 0);
                ++count;
            }
        } finally {
            reader.close();
        }
    }
    assertEquals("Some records not found", RECORDS, count);
}
Also used: Path(org.apache.hadoop.fs.Path), BytesWritable(org.apache.hadoop.io.BytesWritable), DoubleWritable(org.apache.hadoop.io.DoubleWritable), DataInputBuffer(org.apache.hadoop.io.DataInputBuffer), Random(java.util.Random), FileSystem(org.apache.hadoop.fs.FileSystem), DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer), IntWritable(org.apache.hadoop.io.IntWritable), Test(org.junit.Test)
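
The packing step in the write loop can be isolated: an IntWritable is serialized into a buffer and its raw bytes become the BytesWritable key that SequenceFileAsBinaryOutputFormat writes verbatim. A minimal sketch of packing and unpacking (the class name is an assumption):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.IntWritable;

public class PackIntWritable {

    public static void main(String[] args) throws Exception {
        // Pack: serialize the IntWritable and wrap its bytes.
        DataOutputBuffer outbuf = new DataOutputBuffer();
        new IntWritable(42).write(outbuf);
        BytesWritable bkey = new BytesWritable();
        bkey.set(outbuf.getData(), 0, outbuf.getLength());

        // Unpack: read the IntWritable back from the raw bytes.
        DataInputBuffer inbuf = new DataInputBuffer();
        inbuf.reset(bkey.getBytes(), bkey.getLength());
        IntWritable iw = new IntWritable();
        iw.readFields(inbuf);
        System.out.println(iw.get()); // prints 42
    }
}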

Example 65 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

The testFormat method of the class TestSequenceFileInputFormat:

@Test
public void testFormat() throws Exception {
    JobConf job = new JobConf(conf);
    FileSystem fs = FileSystem.getLocal(conf);
    Path dir = new Path(System.getProperty("test.build.data", ".") + "/mapred");
    Path file = new Path(dir, "test.seq");
    Reporter reporter = Reporter.NULL;
    int seed = new Random().nextInt();
    //LOG.info("seed = "+seed);
    Random random = new Random(seed);
    fs.delete(dir, true);
    FileInputFormat.setInputPaths(job, dir);
    // for a variety of lengths
    for (int length = 0; length < MAX_LENGTH; length += random.nextInt(MAX_LENGTH / 10) + 1) {
        //LOG.info("creating; entries = " + length);
        // create a file with length entries
        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, BytesWritable.class);
        try {
            for (int i = 0; i < length; i++) {
                IntWritable key = new IntWritable(i);
                byte[] data = new byte[random.nextInt(10)];
                random.nextBytes(data);
                BytesWritable value = new BytesWritable(data);
                writer.append(key, value);
            }
        } finally {
            writer.close();
        }
        // try splitting the file in a variety of sizes
        InputFormat<IntWritable, BytesWritable> format = new SequenceFileInputFormat<IntWritable, BytesWritable>();
        IntWritable key = new IntWritable();
        BytesWritable value = new BytesWritable();
        for (int i = 0; i < 3; i++) {
            int numSplits = random.nextInt(MAX_LENGTH / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
            //LOG.info("splitting: requesting = " + numSplits);
            InputSplit[] splits = format.getSplits(job, numSplits);
            //LOG.info("splitting: got =        " + splits.length);
            // check each split
            BitSet bits = new BitSet(length);
            for (int j = 0; j < splits.length; j++) {
                RecordReader<IntWritable, BytesWritable> reader = format.getRecordReader(splits[j], job, reporter);
                try {
                    int count = 0;
                    while (reader.next(key, value)) {
                        // if (bits.get(key.get())) {
                        // LOG.info("splits["+j+"]="+splits[j]+" : " + key.get());
                        // LOG.info("@"+reader.getPos());
                        // }
                        assertFalse("Key in multiple partitions.", bits.get(key.get()));
                        bits.set(key.get());
                        count++;
                    }
                //LOG.info("splits["+j+"]="+splits[j]+" count=" + count);
                } finally {
                    reader.close();
                }
            }
            assertEquals("Some keys in no partition.", length, bits.cardinality());
        }
    }
}
Also used: Path(org.apache.hadoop.fs.Path), BitSet(java.util.BitSet), BytesWritable(org.apache.hadoop.io.BytesWritable), Random(java.util.Random), SequenceFile(org.apache.hadoop.io.SequenceFile), FileSystem(org.apache.hadoop.fs.FileSystem), IntWritable(org.apache.hadoop.io.IntWritable), Test(org.junit.Test)
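
The BitSet bookkeeping that drives the assertions can be seen in isolation; a minimal sketch with an invented split layout, checking that every key lands in exactly one partition:

import java.util.BitSet;

public class CoverageCheck {

    public static void main(String[] args) {
        // Invented data: each inner array is the set of keys one RecordReader
        // returned for its split.
        int[][] splits = { { 0, 1, 2 }, { 3, 4 }, { 5, 6, 7, 8, 9 } };
        int length = 10;
        BitSet bits = new BitSet(length);
        for (int[] split : splits) {
            for (int key : split) {
                if (bits.get(key)) {
                    throw new AssertionError("Key in multiple partitions: " + key);
                }
                bits.set(key);
            }
        }
        if (bits.cardinality() != length) {
            throw new AssertionError("Some keys in no partition.");
        }
        System.out.println("All " + length + " keys seen exactly once.");
    }
}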

Aggregations

BytesWritable (org.apache.hadoop.io.BytesWritable): 275
Text (org.apache.hadoop.io.Text): 73
LongWritable (org.apache.hadoop.io.LongWritable): 59
Test (org.junit.Test): 53
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 46
IntWritable (org.apache.hadoop.io.IntWritable): 44
ArrayList (java.util.ArrayList): 39
Path (org.apache.hadoop.fs.Path): 38
IOException (java.io.IOException): 36
Configuration (org.apache.hadoop.conf.Configuration): 33
FloatWritable (org.apache.hadoop.io.FloatWritable): 33
Writable (org.apache.hadoop.io.Writable): 32
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 31
List (java.util.List): 30
SequenceFile (org.apache.hadoop.io.SequenceFile): 27
Random (java.util.Random): 24
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 24
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 23
ByteWritable (org.apache.hadoop.hive.serde2.io.ByteWritable): 22
FileSystem (org.apache.hadoop.fs.FileSystem): 21