Search in sources :

Example 6 with LazySimpleDeserializeRead

Use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in the Apache Hive project.

From class TestVectorSerDeRow, method innerTestVectorSerializeRow.

/**
 * Exercises {@code VectorSerializeRow} for one serialization format using random rows.
 *
 * <p>2000 random rows are generated (NULLs and Unicode disabled), assigned into a
 * {@code VectorizedRowBatch} one at a time, and every full batch -- plus the trailing
 * partial batch, if any -- is handed to {@code serializeBatch} together with the matching
 * deserializer and the source rows.
 *
 * @param r random generator passed to the random row source
 * @param serializationType selects which SerializeWrite/DeserializeRead pair is used
 * @throws Error if {@code serializationType} is not one of the handled formats
 */
void innerTestVectorSerializeRow(Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException {
    // This test passes no scratch column type names to the batch context.
    String[] noScratchTypeNames = new String[0];

    // FUTURE: try NULLs and UNICODE.
    VectorRandomRowSource source = new VectorRandomRowSource();
    source.init(r, VectorRandomRowSource.SupportedTypes.ALL, 4, /* allowNulls */ false, /* isUnicodeOk */ false);

    VectorizedRowBatchCtx batchCtx = new VectorizedRowBatchCtx();
    batchCtx.init(source.rowStructObjectInspector(), noScratchTypeNames);
    VectorizedRowBatch batch = batchCtx.createVectorizedRowBatch();

    VectorAssignRow rowAssigner = new VectorAssignRow();
    rowAssigner.init(source.typeNames());

    int fieldCount = source.typeNames().size();

    // Pick the reader/writer pair that belongs to the requested format.
    DeserializeRead reader;
    SerializeWrite writer;
    switch (serializationType) {
        case BINARY_SORTABLE:
            reader = BinarySortableDeserializeRead.ascendingNullsFirst(source.typeInfos(), false);
            writer = new BinarySortableSerializeWrite(fieldCount);
            break;
        case LAZY_BINARY:
            reader = new LazyBinaryDeserializeRead(source.typeInfos(), /* useExternalBuffer */ false);
            writer = new LazyBinarySerializeWrite(fieldCount);
            break;
        case LAZY_SIMPLE: {
            StructObjectInspector rowInspector = source.rowStructObjectInspector();
            // Use different separator values.
            byte[] separators = { 9, 2, 3, 4, 5, 6, 7, 8 };
            LazySerDeParameters lazyParams = getSerDeParams(rowInspector, separators);
            reader = new LazySimpleDeserializeRead(source.typeInfos(), /* useExternalBuffer */ false, lazyParams);
            writer = new LazySimpleSerializeWrite(fieldCount, lazyParams);
            break;
        }
        default:
            throw new Error("Unknown serialization type " + serializationType);
    }

    VectorSerializeRow vectorSerializeRow = new VectorSerializeRow(writer);
    vectorSerializeRow.init(source.typeNames());

    Object[][] randomRows = source.randomRows(2000);

    // batchStartRow is the index in randomRows of the first row held by the current batch.
    int batchStartRow = 0;
    int rowsAssigned = 0;
    for (Object[] randomRow : randomRows) {
        rowAssigner.assignRow(batch, batch.size, randomRow);
        batch.size++;
        rowsAssigned++;
        if (batch.size == batch.DEFAULT_SIZE) {
            // Batch is full: process it, then start a fresh batch at the next row.
            serializeBatch(batch, vectorSerializeRow, reader, source, randomRows, batchStartRow);
            batchStartRow = rowsAssigned;
            batch.reset();
        }
    }

    // Process whatever is left in the final, partially filled batch.
    if (batch.size > 0) {
        serializeBatch(batch, vectorSerializeRow, reader, source, randomRows, batchStartRow);
    }
}
Also used : LazySerDeParameters(org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters) DeserializeRead(org.apache.hadoop.hive.serde2.fast.DeserializeRead) LazyBinaryDeserializeRead(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead) BinarySortableDeserializeRead(org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead) LazySimpleDeserializeRead(org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead) LazyBinarySerializeWrite(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite) LazySimpleDeserializeRead(org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead) BinarySortableSerializeWrite(org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite) LazySimpleSerializeWrite(org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleSerializeWrite) UnionObject(org.apache.hadoop.hive.serde2.objectinspector.UnionObject) LazyBinaryDeserializeRead(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead) SerializeWrite(org.apache.hadoop.hive.serde2.fast.SerializeWrite) BinarySortableSerializeWrite(org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite) LazyBinarySerializeWrite(org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite) LazySimpleSerializeWrite(org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleSerializeWrite) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)

Example 7 with LazySimpleDeserializeRead

Use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in the Apache Hive project.

From class TestLazySimpleDeserializeRead, method testEscaping.

/**
 * Test for escaping.
 *
 * <p>Configures a '|' field delimiter, a backslash escape character and CR/LF escaping,
 * then reads the first field of a row containing escaped {@code \n} and {@code \r}
 * sequences through {@code LazySimpleDeserializeRead} with an external buffer, and checks
 * that the copied-out bytes equal the unescaped text.
 */
@Test
public void testEscaping() throws Exception {
    HiveConf hconf = new HiveConf();
    // set the escaping related properties
    Properties props = new Properties();
    props.setProperty(serdeConstants.FIELD_DELIM, "|");
    props.setProperty(serdeConstants.ESCAPE_CHAR, "\\");
    props.setProperty(serdeConstants.SERIALIZATION_ESCAPE_CRLF, "true");
    LazySerDeParameters lazyParams = new LazySerDeParameters(hconf, props, LazySimpleSerDe.class.getName());
    TypeInfo[] typeInfos = new TypeInfo[2];
    typeInfos[0] = TypeInfoFactory.getPrimitiveTypeInfo("string");
    typeInfos[1] = TypeInfoFactory.getPrimitiveTypeInfo("string");
    LazySimpleDeserializeRead deserializeRead = new LazySimpleDeserializeRead(typeInfos, null, true, lazyParams);
    // set and parse the row; the first field holds escaped (two-character) CR/LF sequences
    String s = "This\\nis\\rthe first\\r\\nmulti-line field\\n|field1-2";
    Text row = new Text(s.getBytes("UTF-8"));
    deserializeRead.set(row.getBytes(), 0, row.getLength());
    assertTrue(deserializeRead.readNextField());
    assertTrue(deserializeRead.currentExternalBufferNeeded);
    int externalBufferLen = deserializeRead.currentExternalBufferNeededLen;
    // 36 is the byte length of the unescaped first field (see the expected string below).
    // JUnit's assertEquals takes (message, expected, actual); keeping that order makes a
    // failure report the right value as "expected".
    assertEquals("Wrong external buffer length", 36, externalBufferLen);
    byte[] externalBuffer = new byte[externalBufferLen];
    deserializeRead.copyToExternalBuffer(externalBuffer, 0);
    Text field = new Text();
    field.set(externalBuffer, 0, externalBufferLen);
    // Expected first field once every escape sequence is replaced by the real control character.
    String f = "This\nis\rthe first\r\nmulti-line field\n";
    Text escaped = new Text(f.getBytes("UTF-8"));
    assertTrue("The escaped result is incorrect", field.compareTo(escaped) == 0);
}
Also used : LazySerDeParameters(org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Text(org.apache.hadoop.io.Text) Properties(java.util.Properties) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) Test(org.junit.Test)

Example 8 with LazySimpleDeserializeRead

Use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in the Apache Hive project.

From class TestLazySimpleFast, method testLazySimpleFast.

/**
 * Cross-checks the fast LazySimple reader/writer against {@code LazySimpleSerDe}.
 *
 * <p>The rows go through four passes:
 * <ol>
 *   <li>each row is serialized with {@code LazySimpleSerializeWrite};</li>
 *   <li>those bytes are read back with {@code LazySimpleDeserializeRead};</li>
 *   <li>those bytes are deserialized with {@code serde} and compared via {@code VerifyLazy};</li>
 *   <li>each row is serialized with {@code serde}, the bytes must equal the pass-1 bytes,
 *       and the SerDe bytes are read back with {@code LazySimpleDeserializeRead}.</li>
 * </ol>
 *
 * <p>When {@code useIncludeColumns} is set, a random subset of columns is read and the rest
 * are skipped. When {@code doWriteFewerColumns} is set, the readers are built with only the
 * first {@code writeRowOI} field count of type infos and the remaining columns are expected
 * to read back as null.
 *
 * <p>{@code source}, {@code serde_fewer} and {@code serdeParams_fewer} are not used here.
 */
private void testLazySimpleFast(SerdeRandomRowSource source, Object[][] rows, LazySimpleSerDe serde, StructObjectInspector rowOI, LazySimpleSerDe serde_fewer, StructObjectInspector writeRowOI, LazySerDeParameters serdeParams, LazySerDeParameters serdeParams_fewer, TypeInfo[] typeInfos, boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable {
    final int rowCount = rows.length;
    final int columnCount = typeInfos.length;

    // Optionally pick a random subset of columns to read; the others get skipped.
    boolean[] columnsToInclude = null;
    if (useIncludeColumns) {
        columnsToInclude = new boolean[columnCount];
        for (int col = 0; col < columnCount; col++) {
            columnsToInclude[col] = r.nextBoolean();
        }
    }

    final int writeColumnCount = doWriteFewerColumns ? writeRowOI.getAllStructFieldRefs().size() : columnCount;
    final TypeInfo[] writeTypeInfos = doWriteFewerColumns ? Arrays.copyOf(typeInfos, writeColumnCount) : typeInfos;

    // Pass 1: serialize every row with SerializeWrite.
    BytesWritable[] fastSerialized = new BytesWritable[rowCount];
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
        Object[] row = rows[rowIndex];
        Output output = new Output();
        LazySimpleSerializeWrite writer = new LazySimpleSerializeWrite(columnCount, serdeParams);
        writer.set(output);
        for (int col = 0; col < columnCount; col++) {
            VerifyFast.serializeWrite(writer, typeInfos[col], row[col]);
        }
        BytesWritable written = new BytesWritable();
        written.set(output.getData(), 0, output.getLength());
        fastSerialized[rowIndex] = written;
    }

    // Pass 2: read the SerializeWrite bytes back with DeserializeRead.
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
        Object[] expectedRow = rows[rowIndex];
        LazySimpleDeserializeRead reader = new LazySimpleDeserializeRead(writeTypeInfos, /* useExternalBuffer */ false, serdeParams);
        BytesWritable written = fastSerialized[rowIndex];
        byte[] data = written.getBytes();
        int dataLength = written.getLength();
        reader.set(data, 0, dataLength);
        for (int col = 0; col < columnCount; col++) {
            boolean skipColumn = useIncludeColumns && !columnsToInclude[col];
            if (skipColumn) {
                reader.skipNextField();
            } else if (col < writeColumnCount) {
                verifyRead(reader, typeInfos[col], expectedRow[col]);
            } else {
                // Should come back a null.
                verifyReadNull(reader, typeInfos[col]);
            }
        }
        if (writeColumnCount == columnCount) {
            assertTrue(reader.isEndOfInputReached());
        }
    }

    // Pass 3: deserialize the SerializeWrite bytes with the SerDe class.
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
        BytesWritable written = fastSerialized[rowIndex];
        LazyStruct lazyRow = (LazyStruct) serde.deserialize(written);
        Object[] expectedRow = rows[rowIndex];
        for (int col = 0; col < columnCount; col++) {
            TypeInfo typeInfo = typeInfos[col];
            Object expected = expectedRow[col];
            Object actual = lazyRow.getField(col);
            boolean expectedIsNull = expected == null;
            boolean actualIsNull = actual == null;
            if (expectedIsNull != actualIsNull) {
                // Exactly one side is null.
                fail("SerDe deserialized NULL column mismatch");
            } else if (!expectedIsNull && !VerifyLazy.lazyCompare(typeInfo, actual, expected)) {
                fail("SerDe deserialized value does not match");
            }
        }
    }

    // Pass 4a: serialize with the SerDe and require byte-for-byte agreement with pass 1.
    // One byte array per row.
    byte[][] serdeBytes = new byte[rowCount][];
    Object[] serdeRow = new Object[columnCount];
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
        Object[] row = rows[rowIndex];
        // LazySimple seems to work better with an row object array instead of a Java object...
        for (int col = 0; col < columnCount; col++) {
            serdeRow[col] = row[col];
        }
        Text serialized = (Text) serde.serialize(serdeRow, rowOI);
        byte[] fromSerDe = Arrays.copyOf(serialized.getBytes(), serialized.getLength());
        BytesWritable written = fastSerialized[rowIndex];
        byte[] fromSerializeWrite = Arrays.copyOf(written.getBytes(), written.getLength());
        if (!Arrays.equals(fromSerDe, fromSerializeWrite)) {
            fail("SerializeWrite and SerDe serialization does not match");
        }
        serdeBytes[rowIndex] = copyBytes(serialized);
    }

    // Pass 4b: read the SerDe-produced bytes back with DeserializeRead.
    for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
        Object[] expectedRow = rows[rowIndex];
        LazySimpleDeserializeRead reader = new LazySimpleDeserializeRead(writeTypeInfos, /* useExternalBuffer */ false, serdeParams);
        byte[] data = serdeBytes[rowIndex];
        reader.set(data, 0, data.length);
        for (int col = 0; col < columnCount; col++) {
            boolean skipColumn = useIncludeColumns && !columnsToInclude[col];
            if (skipColumn) {
                reader.skipNextField();
            } else if (col < writeColumnCount) {
                verifyRead(reader, typeInfos[col], expectedRow[col]);
            } else {
                // Should come back a null.
                verifyReadNull(reader, typeInfos[col]);
            }
        }
        if (writeColumnCount == columnCount) {
            assertTrue(reader.isEndOfInputReached());
        }
    }
}
Also used : LazySimpleDeserializeRead(org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead) BytesWritable(org.apache.hadoop.io.BytesWritable) Text(org.apache.hadoop.io.Text) LazySimpleSerializeWrite(org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleSerializeWrite) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) Output(org.apache.hadoop.hive.serde2.ByteStream.Output) UnionObject(org.apache.hadoop.hive.serde2.objectinspector.UnionObject)

Example 9 with LazySimpleDeserializeRead

Use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in the Apache Hive project.

From class TestLazySimpleFast, method testLazySimpleDeserializeRowEmptyArray.

/**
 * Reads the row {@code ","} -- two empty fields separated by the configured comma
 * delimiter -- as a list of int and a list of list of string, and expects both columns to
 * verify against an empty list with the reader positioned at end of input afterwards.
 */
@Test
public void testLazySimpleDeserializeRowEmptyArray() throws Throwable {
    HiveConf hconf = new HiveConf();
    // Use a comma as the field delimiter; no other serde properties are set.
    Properties props = new Properties();
    props.setProperty(serdeConstants.FIELD_DELIM, ",");
    LazySerDeParameters lazyParams = new LazySerDeParameters(hconf, props, LazySimpleSerDe.class.getName());
    TypeInfo[] typeInfos = new TypeInfo[] { TypeInfoFactory.getListTypeInfo(TypeInfoFactory.intTypeInfo), TypeInfoFactory.getListTypeInfo(TypeInfoFactory.getListTypeInfo(TypeInfoFactory.stringTypeInfo)) };
    LazySimpleDeserializeRead deserializeRead = new LazySimpleDeserializeRead(typeInfos, null, true, lazyParams);
    // Encode with an explicit charset so the input bytes do not depend on the platform default.
    byte[] bytes = ",".getBytes("UTF-8");
    deserializeRead.set(bytes, 0, bytes.length);
    verifyRead(deserializeRead, typeInfos[0], Collections.emptyList());
    verifyRead(deserializeRead, typeInfos[1], Collections.emptyList());
    assertTrue(deserializeRead.isEndOfInputReached());
}
Also used : LazySimpleDeserializeRead(org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Properties(java.util.Properties) StructTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) Test(org.junit.Test)

Aggregations

- LazySimpleDeserializeRead (org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead): 8
- LazySimpleSerializeWrite (org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleSerializeWrite): 7
- LazySerDeParameters (org.apache.hadoop.hive.serde2.lazy.LazySerDeParameters): 6
- Properties (java.util.Properties): 5
- Output (org.apache.hadoop.hive.serde2.ByteStream.Output): 5
- BinarySortableDeserializeRead (org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead): 5
- BinarySortableSerializeWrite (org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite): 5
- DeserializeRead (org.apache.hadoop.hive.serde2.fast.DeserializeRead): 5
- SerializeWrite (org.apache.hadoop.hive.serde2.fast.SerializeWrite): 5
- LazyBinaryDeserializeRead (org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead): 5
- LazyBinarySerializeWrite (org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinarySerializeWrite): 5
- StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector): 5
- TypeInfo (org.apache.hadoop.hive.serde2.typeinfo.TypeInfo): 5
- UnionObject (org.apache.hadoop.hive.serde2.objectinspector.UnionObject): 4
- IOException (java.io.IOException): 3
- Configuration (org.apache.hadoop.conf.Configuration): 3
- HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 3
- SerDeException (org.apache.hadoop.hive.serde2.SerDeException): 3
- Text (org.apache.hadoop.io.Text): 3
- HiveConf (org.apache.hadoop.hive.conf.HiveConf): 2