Example 21 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

From class TestFixedLengthInputFormat, method testNoRecordLength.

/**
   * Test with no record length set.
   */
@Test(timeout = 5000)
public void testNoRecordLength() throws IOException {
    localFs.delete(workDir, true);
    Path file = new Path(workDir, "testFormat.txt");
    createFile(file, null, 10, 10);
    // The fixed length record length config property is intentionally not set
    JobConf job = new JobConf(defaultConf);
    FileInputFormat.setInputPaths(job, workDir);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.configure(job);
    InputSplit[] splits = format.getSplits(job, 1);
    boolean exceptionThrown = false;
    for (InputSplit split : splits) {
        try {
            RecordReader<LongWritable, BytesWritable> reader = format.getRecordReader(split, job, voidReporter);
        } catch (IOException ioe) {
            exceptionThrown = true;
            LOG.info("Exception message:" + ioe.getMessage());
        }
    }
    assertTrue("Exception for not setting record length:", exceptionThrown);
}
Also used : Path(org.apache.hadoop.fs.Path) BytesWritable(org.apache.hadoop.io.BytesWritable) LongWritable(org.apache.hadoop.io.LongWritable) IOException(java.io.IOException) Test(org.junit.Test)
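
For contrast, here is a minimal sketch, not taken from the Hadoop test above, of configuring the old-API (mapred) FixedLengthInputFormat with a positive record length so that getRecordReader succeeds. The class name, input path handling, and the 10-byte record length are illustrative assumptions.

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FixedLengthInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;

// Hypothetical helper class, sketched for illustration only.
public class FixedLengthReaderSketch {

    public static RecordReader<LongWritable, BytesWritable> openReader(Path inputDir) throws IOException {
        JobConf job = new JobConf();
        FileInputFormat.setInputPaths(job, inputDir);
        FixedLengthInputFormat format = new FixedLengthInputFormat();
        // Assumed layout: every record in the input is exactly 10 bytes long.
        format.setRecordLength(job, 10);
        format.configure(job);
        InputSplit[] splits = format.getSplits(job, 1);
        // With a positive record length configured, this call no longer throws IOException.
        return format.getRecordReader(splits[0], job, Reporter.NULL);
    }
}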

Example 22 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

From class TestFixedLengthInputFormat, method readSplit.

private static List<String> readSplit(FixedLengthInputFormat format, InputSplit split, JobConf job) throws IOException {
    List<String> result = new ArrayList<String>();
    RecordReader<LongWritable, BytesWritable> reader = format.getRecordReader(split, job, voidReporter);
    LongWritable key = reader.createKey();
    BytesWritable value = reader.createValue();
    try {
        while (reader.next(key, value)) {
            result.add(new String(value.getBytes(), 0, value.getLength()));
        }
    } finally {
        reader.close();
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) BytesWritable(org.apache.hadoop.io.BytesWritable) LongWritable(org.apache.hadoop.io.LongWritable)
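
A hedged usage sketch for the readSplit helper above: a hypothetical method that could live in the same test class, configure the format, and collect every record from every split into one list. It assumes the test class's defaultConf and workDir fields, and the 5-byte record length is an illustrative value.

// Hypothetical companion to readSplit; not part of the original test class.
private static List<String> readAllRecords(JobConf defaultConf, Path workDir) throws IOException {
    JobConf job = new JobConf(defaultConf);
    FileInputFormat.setInputPaths(job, workDir);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    // Assumed record length; must match how the input file was written.
    format.setRecordLength(job, 5);
    format.configure(job);
    List<String> records = new ArrayList<String>();
    for (InputSplit split : format.getSplits(job, 1)) {
        records.addAll(readSplit(format, split, job));
    }
    // Each entry is one fixed-length record rendered as a String.
    return records;
}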

Example 23 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

From class TestFixedLengthInputFormat, method testZeroRecordLength.

/**
   * Test with record length set to 0
   */
@Test(timeout = 5000)
public void testZeroRecordLength() throws IOException {
    localFs.delete(workDir, true);
    Path file = new Path(workDir, "testFormat.txt");
    createFile(file, null, 10, 10);
    // Set the fixed length record length config property to zero
    JobConf job = new JobConf(defaultConf);
    FileInputFormat.setInputPaths(job, workDir);
    FixedLengthInputFormat format = new FixedLengthInputFormat();
    format.setRecordLength(job, 0);
    format.configure(job);
    InputSplit[] splits = format.getSplits(job, 1);
    boolean exceptionThrown = false;
    for (InputSplit split : splits) {
        try {
            RecordReader<LongWritable, BytesWritable> reader = format.getRecordReader(split, job, voidReporter);
        } catch (IOException ioe) {
            exceptionThrown = true;
            LOG.info("Exception message:" + ioe.getMessage());
        }
    }
    assertTrue("Exception for zero record length:", exceptionThrown);
}
Also used : Path(org.apache.hadoop.fs.Path) BytesWritable(org.apache.hadoop.io.BytesWritable) LongWritable(org.apache.hadoop.io.LongWritable) IOException(java.io.IOException) Test(org.junit.Test)
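
As an aside, setRecordLength stores the length in the job configuration; to the best of my recollection the underlying property key is fixedlengthinputformat.record.length, so an equivalent way to configure it would be:

// Assumed to be equivalent to format.setRecordLength(job, 10); the property name is
// stated from memory, not from the snippet above.
job.setInt("fixedlengthinputformat.record.length", 10);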

Example 24 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hadoop by apache.

From class TestCombineSequenceFileInputFormat, method testFormat.

@Test(timeout = 10000)
public void testFormat() throws Exception {
    JobConf job = new JobConf(conf);
    Reporter reporter = Reporter.NULL;
    Random random = new Random();
    long seed = random.nextLong();
    LOG.info("seed = " + seed);
    random.setSeed(seed);
    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, workDir);
    final int length = 10000;
    final int numFiles = 10;
    // create a file with various lengths
    createFiles(length, numFiles, random);
    // create a combine split for the files
    InputFormat<IntWritable, BytesWritable> format = new CombineSequenceFileInputFormat<IntWritable, BytesWritable>();
    IntWritable key = new IntWritable();
    BytesWritable value = new BytesWritable();
    for (int i = 0; i < 3; i++) {
        int numSplits = random.nextInt(length / (SequenceFile.SYNC_INTERVAL / 20)) + 1;
        LOG.info("splitting: requesting = " + numSplits);
        InputSplit[] splits = format.getSplits(job, numSplits);
        LOG.info("splitting: got =        " + splits.length);
        // we should have a single split as the length is comfortably smaller than
        // the block size
        assertEquals("We got more than one splits!", 1, splits.length);
        InputSplit split = splits[0];
        assertEquals("It should be CombineFileSplit", CombineFileSplit.class, split.getClass());
        // check each split
        BitSet bits = new BitSet(length);
        RecordReader<IntWritable, BytesWritable> reader = format.getRecordReader(split, job, reporter);
        try {
            while (reader.next(key, value)) {
                assertFalse("Key in multiple partitions.", bits.get(key.get()));
                bits.set(key.get());
            }
        } finally {
            reader.close();
        }
        assertEquals("Some keys in no partition.", length, bits.cardinality());
    }
}
Also used : CombineSequenceFileInputFormat(org.apache.hadoop.mapred.lib.CombineSequenceFileInputFormat) BitSet(java.util.BitSet) BytesWritable(org.apache.hadoop.io.BytesWritable) Random(java.util.Random) IntWritable(org.apache.hadoop.io.IntWritable) Test(org.junit.Test)
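
The test above depends on a createFiles helper that is not shown. The following is a hedged sketch of what such a helper might look like: it writes a local SequenceFile of IntWritable keys and BytesWritable values that CombineSequenceFileInputFormat can later read back. The class name, payload sizes, and use of the local file system are illustrative assumptions.

import java.io.IOException;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

// Hypothetical helper class, sketched for illustration only.
public class SequenceFileWriterSketch {

    public static void writeFile(Configuration conf, Path file, int numRecords, Random random)
            throws IOException {
        FileSystem fs = FileSystem.getLocal(conf);
        SequenceFile.Writer writer = SequenceFile.createWriter(
            fs, conf, file, IntWritable.class, BytesWritable.class);
        try {
            for (int i = 0; i < numRecords; i++) {
                byte[] payload = new byte[random.nextInt(10) + 1];
                random.nextBytes(payload);
                // The key doubles as a record index, which is what the bit-set check in the test relies on.
                writer.append(new IntWritable(i), new BytesWritable(payload));
            }
        } finally {
            writer.close();
        }
    }
}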

Example 25 with BytesWritable

Use of org.apache.hadoop.io.BytesWritable in project hive by apache.

From class TestLazyBinaryFast, method testLazyBinaryFast.

private void testLazyBinaryFast(SerdeRandomRowSource source, Object[][] rows, AbstractSerDe serde, StructObjectInspector rowOI, AbstractSerDe serde_fewer, StructObjectInspector writeRowOI, PrimitiveTypeInfo[] primitiveTypeInfos, boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable {
    int rowCount = rows.length;
    int columnCount = primitiveTypeInfos.length;
    boolean[] columnsToInclude = null;
    if (useIncludeColumns) {
        columnsToInclude = new boolean[columnCount];
        for (int i = 0; i < columnCount; i++) {
            columnsToInclude[i] = r.nextBoolean();
        }
    }
    int writeColumnCount = columnCount;
    PrimitiveTypeInfo[] writePrimitiveTypeInfos = primitiveTypeInfos;
    if (doWriteFewerColumns) {
        writeColumnCount = writeRowOI.getAllStructFieldRefs().size();
        writePrimitiveTypeInfos = Arrays.copyOf(primitiveTypeInfos, writeColumnCount);
    }
    LazyBinarySerializeWrite lazyBinarySerializeWrite = new LazyBinarySerializeWrite(writeColumnCount);
    // Try to serialize
    BytesWritable[] serializeWriteBytes = new BytesWritable[rowCount];
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        Output output = new Output();
        lazyBinarySerializeWrite.set(output);
        for (int index = 0; index < writeColumnCount; index++) {
            Writable writable = (Writable) row[index];
            VerifyFast.serializeWrite(lazyBinarySerializeWrite, primitiveTypeInfos[index], writable);
        }
        BytesWritable bytesWritable = new BytesWritable();
        bytesWritable.set(output.getData(), 0, output.getLength());
        serializeWriteBytes[i] = bytesWritable;
    }
    // Try to deserialize
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        // Specifying the right number of type infos tells LazyBinaryDeserializeRead
        // which column is the last.
        LazyBinaryDeserializeRead lazyBinaryDeserializeRead =
            new LazyBinaryDeserializeRead(writePrimitiveTypeInfos, /* useExternalBuffer */ false);
        BytesWritable bytesWritable = serializeWriteBytes[i];
        lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
        for (int index = 0; index < columnCount; index++) {
            if (useIncludeColumns && !columnsToInclude[index]) {
                lazyBinaryDeserializeRead.skipNextField();
            } else if (index >= writeColumnCount) {
                // Should come back a null.
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null);
            } else {
                Writable writable = (Writable) row[index];
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable);
            }
        }
        if (writeColumnCount == columnCount) {
            TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached());
        }
    }
    // Try to deserialize, using the SerDe class, the Writable row objects created by SerializeWrite.
    for (int i = 0; i < rowCount; i++) {
        BytesWritable bytesWritable = serializeWriteBytes[i];
        LazyBinaryStruct lazyBinaryStruct;
        if (doWriteFewerColumns) {
            lazyBinaryStruct = (LazyBinaryStruct) serde_fewer.deserialize(bytesWritable);
        } else {
            lazyBinaryStruct = (LazyBinaryStruct) serde.deserialize(bytesWritable);
        }
        Object[] row = rows[i];
        for (int index = 0; index < writeColumnCount; index++) {
            PrimitiveTypeInfo primitiveTypeInfo = primitiveTypeInfos[index];
            Writable writable = (Writable) row[index];
            Object object = lazyBinaryStruct.getField(index);
            if (writable == null || object == null) {
                if (writable != null || object != null) {
                    fail("SerDe deserialized NULL column mismatch");
                }
            } else {
                if (!object.equals(writable)) {
                    fail("SerDe deserialized value does not match");
                }
            }
        }
    }
    // One Writable per row.
    BytesWritable[] serdeBytes = new BytesWritable[rowCount];
    // Serialize using the SerDe, then below deserialize using DeserializeRead.
    Object[] serdeRow = new Object[writeColumnCount];
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        // LazyBinary seems to work better with a row object array than with a single Java object.
        for (int index = 0; index < writeColumnCount; index++) {
            serdeRow[index] = row[index];
        }
        BytesWritable serialized;
        if (doWriteFewerColumns) {
            serialized = (BytesWritable) serde_fewer.serialize(serdeRow, writeRowOI);
        } else {
            serialized = (BytesWritable) serde.serialize(serdeRow, rowOI);
        }
        BytesWritable bytesWritable = new BytesWritable(Arrays.copyOfRange(serialized.getBytes(), 0, serialized.getLength()));
        byte[] bytes1 = bytesWritable.getBytes();
        BytesWritable lazySerializedWriteBytes = serializeWriteBytes[i];
        byte[] bytes2 = Arrays.copyOfRange(lazySerializedWriteBytes.getBytes(), 0, lazySerializedWriteBytes.getLength());
        if (bytes1.length != bytes2.length) {
            fail("SerializeWrite length " + bytes2.length + " and " + "SerDe serialization length " + bytes1.length + " do not match (" + Arrays.toString(primitiveTypeInfos) + ")");
        }
        if (!Arrays.equals(bytes1, bytes2)) {
            fail("SerializeWrite and SerDe serialization does not match (" + Arrays.toString(primitiveTypeInfos) + ")");
        }
        serdeBytes[i] = bytesWritable;
    }
    // Try to deserialize, using DeserializeRead, the Writable row objects created by the SerDe.
    for (int i = 0; i < rowCount; i++) {
        Object[] row = rows[i];
        // When doWriteFewerColumns is set, try to read more fields than exist in the buffer.
        LazyBinaryDeserializeRead lazyBinaryDeserializeRead =
            new LazyBinaryDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false);
        BytesWritable bytesWritable = serdeBytes[i];
        lazyBinaryDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
        for (int index = 0; index < columnCount; index++) {
            if (useIncludeColumns && !columnsToInclude[index]) {
                lazyBinaryDeserializeRead.skipNextField();
            } else if (index >= writeColumnCount) {
                // Should come back a null.
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], null);
            } else {
                Writable writable = (Writable) row[index];
                VerifyFast.verifyDeserializeRead(lazyBinaryDeserializeRead, primitiveTypeInfos[index], writable);
            }
        }
        if (writeColumnCount == columnCount) {
            TestCase.assertTrue(lazyBinaryDeserializeRead.isEndOfInputReached());
        }
    }
}
Also used : LazyBinarySerializeWrite(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite) Writable(org.apache.hadoop.io.Writable) BytesWritable(org.apache.hadoop.io.BytesWritable) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) Output(org.apache.hadoop.hive.serde2.ByteStream.Output) LazyBinaryDeserializeRead(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead)
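
A minimal round-trip sketch distilled from the test above: serialize a single int column with LazyBinarySerializeWrite, wrap the bytes in a BytesWritable, and read the value back with LazyBinaryDeserializeRead. The read-back accessors used here (readNextField and the currentInt field) are assumptions about the Hive fast-serde API of this era; the test itself goes through the VerifyFast helper instead.

import java.io.IOException;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.hadoop.io.BytesWritable;

// Hypothetical round-trip example, sketched for illustration only.
public class LazyBinaryRoundTripSketch {

    public static void main(String[] args) throws IOException {
        // Serialize one int column into an Output buffer.
        LazyBinarySerializeWrite serializeWrite = new LazyBinarySerializeWrite(1);
        Output output = new Output();
        serializeWrite.set(output);
        serializeWrite.writeInt(42);
        BytesWritable bytes = new BytesWritable();
        bytes.set(output.getData(), 0, output.getLength());

        // Deserialize it again; readNextField() returning false would indicate a NULL field
        // (method and field names assumed, as noted above).
        LazyBinaryDeserializeRead deserializeRead = new LazyBinaryDeserializeRead(
            new PrimitiveTypeInfo[] { TypeInfoFactory.intTypeInfo }, /* useExternalBuffer */ false);
        deserializeRead.set(bytes.getBytes(), 0, bytes.getLength());
        if (deserializeRead.readNextField()) {
            System.out.println("round-tripped value = " + deserializeRead.currentInt);
        }
    }
}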

Aggregations

BytesWritable (org.apache.hadoop.io.BytesWritable): 339
Test (org.junit.Test): 92
Text (org.apache.hadoop.io.Text): 81
LongWritable (org.apache.hadoop.io.LongWritable): 66
IntWritable (org.apache.hadoop.io.IntWritable): 54
ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector): 51
ArrayList (java.util.ArrayList): 48
List (java.util.List): 48
Path (org.apache.hadoop.fs.Path): 47
IOException (java.io.IOException): 42
Configuration (org.apache.hadoop.conf.Configuration): 41
FloatWritable (org.apache.hadoop.io.FloatWritable): 37
Writable (org.apache.hadoop.io.Writable): 36
BooleanWritable (org.apache.hadoop.io.BooleanWritable): 35
FileSystem (org.apache.hadoop.fs.FileSystem): 28
SequenceFile (org.apache.hadoop.io.SequenceFile): 27
DoubleWritable (org.apache.hadoop.hive.serde2.io.DoubleWritable): 26
ShortWritable (org.apache.hadoop.hive.serde2.io.ShortWritable): 26
HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable): 25
Random (java.util.Random): 24