use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in project hive by apache.
the class TestVectorSerDeRow method innerTestVectorSerializeRow.
/**
 * Drives random rows through a {@link VectorizedRowBatch} for one serialization format.
 *
 * <p>Builds a random row source, picks the {@code DeserializeRead} / {@code SerializeWrite} pair
 * that matches {@code serializationType}, then assigns 2000 random rows into the batch one at a
 * time. Every time the batch fills up (and once more for a trailing partial batch) the batch is
 * handed to {@code serializeBatch} together with the index of the first random row it holds.
 *
 * @param r random generator used to initialize the row source
 * @param serializationType the serialization format to exercise
 * @throws HiveException declared by this method; raised by the helpers invoked here
 * @throws IOException declared by this method; raised by the helpers invoked here
 * @throws SerDeException declared by this method; raised by the helpers invoked here
 */
void innerTestVectorSerializeRow(Random r, SerializationType serializationType) throws HiveException, IOException, SerDeException {
  String[] emptyScratchTypeNames = new String[0];

  VectorRandomRowSource source = new VectorRandomRowSource();
  // FUTURE: try NULLs and UNICODE.
  source.init(r, VectorRandomRowSource.SupportedTypes.ALL, 4, /* allowNulls */ false, /* isUnicodeOk */ false);

  VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx();
  batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames);
  VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();

  VectorAssignRow vectorAssignRow = new VectorAssignRow();
  vectorAssignRow.init(source.typeNames());

  int fieldCount = source.typeNames().size();
  DeserializeRead deserializeRead;
  SerializeWrite serializeWrite;
  switch (serializationType) {
    case BINARY_SORTABLE:
      deserializeRead = BinarySortableDeserializeRead.ascendingNullsFirst(source.typeInfos(), false);
      serializeWrite = new BinarySortableSerializeWrite(fieldCount);
      break;
    case LAZY_BINARY:
      deserializeRead = new LazyBinaryDeserializeRead(source.typeInfos(), /* useExternalBuffer */ false);
      serializeWrite = new LazyBinarySerializeWrite(fieldCount);
      break;
    case LAZY_SIMPLE: {
      StructObjectInspector rowObjectInspector = source.rowStructObjectInspector();
      // Use different separator values.
      byte[] separators = new byte[] { (byte) 9, (byte) 2, (byte) 3, (byte) 4, (byte) 5, (byte) 6, (byte) 7, (byte) 8 };
      LazySerDeParameters lazySerDeParams = getSerDeParams(rowObjectInspector, separators);
      deserializeRead = new LazySimpleDeserializeRead(source.typeInfos(), /* useExternalBuffer */ false, lazySerDeParams);
      serializeWrite = new LazySimpleSerializeWrite(fieldCount, lazySerDeParams);
      break;
    }
    default:
      throw new Error("Unknown serialization type " + serializationType);
  }

  VectorSerializeRow vectorSerializeRow = new VectorSerializeRow(serializeWrite);
  vectorSerializeRow.init(source.typeNames());

  Object[][] randomRows = source.randomRows(2000);
  // Index into randomRows of the first row held by the batch currently being filled.
  int firstRandomRowIndex = 0;
  for (int i = 0; i < randomRows.length; i++) {
    Object[] row = randomRows[i];
    vectorAssignRow.assignRow(batch, batch.size, row);
    batch.size++;
    // DEFAULT_SIZE is a static constant, so qualify it with the class rather than the instance.
    if (batch.size == VectorizedRowBatch.DEFAULT_SIZE) {
      serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex);
      firstRandomRowIndex = i + 1;
      batch.reset();
    }
  }
  // Flush the trailing, partially filled batch.
  if (batch.size > 0) {
    serializeBatch(batch, vectorSerializeRow, deserializeRead, source, randomRows, firstRandomRowIndex);
  }
}
use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in project hive by apache.
the class TestLazySimpleDeserializeRead method testEscaping.
/**
 * Test for escaping.
 *
 * <p>Parses a '|'-delimited row whose first field contains backslash-escaped CR and LF
 * sequences, copies that field to an external buffer, and checks that the copied bytes equal the
 * same text with real CR / LF characters.
 */
@Test
public void testEscaping() throws Exception {
  HiveConf hconf = new HiveConf();

  // set the escaping related properties
  Properties props = new Properties();
  props.setProperty(serdeConstants.FIELD_DELIM, "|");
  props.setProperty(serdeConstants.ESCAPE_CHAR, "\\");
  props.setProperty(serdeConstants.SERIALIZATION_ESCAPE_CRLF, "true");
  LazySerDeParameters lazyParams = new LazySerDeParameters(hconf, props, LazySimpleSerDe.class.getName());

  TypeInfo[] typeInfos = new TypeInfo[2];
  typeInfos[0] = TypeInfoFactory.getPrimitiveTypeInfo("string");
  typeInfos[1] = TypeInfoFactory.getPrimitiveTypeInfo("string");
  LazySimpleDeserializeRead deserializeRead = new LazySimpleDeserializeRead(typeInfos, null, true, lazyParams);

  // set and parse the row
  String s = "This\\nis\\rthe first\\r\\nmulti-line field\\n|field1-2";
  Text row = new Text(s.getBytes("UTF-8"));
  deserializeRead.set(row.getBytes(), 0, row.getLength());

  assertTrue(deserializeRead.readNextField());
  assertTrue(deserializeRead.currentExternalBufferNeeded);

  int externalBufferLen = deserializeRead.currentExternalBufferNeededLen;
  // JUnit order is (message, expected, actual); 36 is the byte length of the unescaped field below.
  assertEquals("Wrong external buffer length", 36, externalBufferLen);

  byte[] externalBuffer = new byte[externalBufferLen];
  deserializeRead.copyToExternalBuffer(externalBuffer, 0);
  Text field = new Text();
  field.set(externalBuffer, 0, externalBufferLen);

  // The first field with every escape sequence replaced by the character it stands for.
  String f = "This\nis\rthe first\r\nmulti-line field\n";
  Text expectedField = new Text(f.getBytes("UTF-8"));
  assertTrue("The escaped result is incorrect", field.compareTo(expectedField) == 0);
}
use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in project hive by apache.
the class TestLazySimpleFast method testLazySimpleFast.
/**
 * Cross-checks {@link LazySimpleSerializeWrite} / {@link LazySimpleDeserializeRead} against
 * {@link LazySimpleSerDe} for the given rows.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>serialize every row with {@code LazySimpleSerializeWrite};</li>
 *   <li>read those bytes back with {@code LazySimpleDeserializeRead} and verify each column;</li>
 *   <li>read those bytes back with {@code serde} and compare each field;</li>
 *   <li>serialize every row with {@code serde} and require byte-for-byte equality with step 1;</li>
 *   <li>read the SerDe bytes back with {@code LazySimpleDeserializeRead} and verify each column.</li>
 * </ol>
 *
 * <p>NOTE(review): {@code source}, {@code serde_fewer} and {@code serdeParams_fewer} are not
 * referenced in this method body; they are kept for signature compatibility with callers.
 *
 * @param source not used here
 * @param rows the rows to round-trip; each row is indexed by column
 * @param serde SerDe used for the reference serialize / deserialize passes
 * @param rowOI object inspector passed to {@code serde.serialize}
 * @param serde_fewer not used here
 * @param writeRowOI when {@code doWriteFewerColumns} is set, its field count becomes the write
 *     column count
 * @param serdeParams parameters for the fast serialize / deserialize classes
 * @param serdeParams_fewer not used here
 * @param typeInfos one type per column
 * @param useIncludeColumns when true, a random subset of columns is skipped while reading
 * @param doWriteFewerColumns when true, the reader is given only the leading
 *     {@code writeRowOI} columns' types and the remaining columns are expected to read as null
 * @param r random generator used to choose the included columns
 * @throws Throwable any failure raised by the verification helpers or assertions
 */
private void testLazySimpleFast(SerdeRandomRowSource source, Object[][] rows, LazySimpleSerDe serde, StructObjectInspector rowOI, LazySimpleSerDe serde_fewer, StructObjectInspector writeRowOI, LazySerDeParameters serdeParams, LazySerDeParameters serdeParams_fewer, TypeInfo[] typeInfos, boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable {
  int rowCount = rows.length;
  int columnCount = typeInfos.length;

  // Non-null only when useIncludeColumns is set; a false entry means "skip this column".
  boolean[] columnsToInclude = null;
  if (useIncludeColumns) {
    columnsToInclude = new boolean[columnCount];
    for (int i = 0; i < columnCount; i++) {
      columnsToInclude[i] = r.nextBoolean();
    }
  }

  int writeColumnCount = columnCount;
  TypeInfo[] writeTypeInfos = typeInfos;
  if (doWriteFewerColumns) {
    writeColumnCount = writeRowOI.getAllStructFieldRefs().size();
    writeTypeInfos = Arrays.copyOf(typeInfos, writeColumnCount);
  }

  // Try to serialize
  BytesWritable[] serializeWriteBytes = new BytesWritable[rowCount];
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    Output output = new Output();
    LazySimpleSerializeWrite lazySimpleSerializeWrite = new LazySimpleSerializeWrite(columnCount, serdeParams);
    lazySimpleSerializeWrite.set(output);
    for (int index = 0; index < columnCount; index++) {
      VerifyFast.serializeWrite(lazySimpleSerializeWrite, typeInfos[index], row[index]);
    }
    BytesWritable bytesWritable = new BytesWritable();
    bytesWritable.set(output.getData(), 0, output.getLength());
    serializeWriteBytes[i] = bytesWritable;
  }

  // Try to deserialize
  for (int i = 0; i < rowCount; i++) {
    BytesWritable bytesWritable = serializeWriteBytes[i];
    verifyLazySimpleDeserializeRead(bytesWritable.getBytes(), bytesWritable.getLength(), rows[i], typeInfos, writeTypeInfos, writeColumnCount, columnsToInclude, serdeParams);
  }

  // Try to deserialize using SerDe class our Writable row objects created by SerializeWrite.
  for (int rowIndex = 0; rowIndex < rowCount; rowIndex++) {
    BytesWritable bytesWritable = serializeWriteBytes[rowIndex];
    LazyStruct lazySimpleStruct = (LazyStruct) serde.deserialize(bytesWritable);
    Object[] row = rows[rowIndex];
    for (int index = 0; index < columnCount; index++) {
      TypeInfo typeInfo = typeInfos[index];
      Object expectedObject = row[index];
      Object object = lazySimpleStruct.getField(index);
      if (expectedObject == null || object == null) {
        // At least one side is null: both must be.
        if (expectedObject != null || object != null) {
          fail("SerDe deserialized NULL column mismatch");
        }
      } else {
        if (!VerifyLazy.lazyCompare(typeInfo, object, expectedObject)) {
          fail("SerDe deserialized value does not match");
        }
      }
    }
  }

  // One Writable per row.
  byte[][] serdeBytes = new byte[rowCount][];

  // Serialize using the SerDe, then below deserialize using DeserializeRead.
  Object[] serdeRow = new Object[columnCount];
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    // LazySimple seems to work better with an row object array instead of a Java object...
    System.arraycopy(row, 0, serdeRow, 0, columnCount);
    Text serialized = (Text) serde.serialize(serdeRow, rowOI);
    // Trim both backing arrays to their valid lengths before comparing.
    byte[] bytes1 = Arrays.copyOf(serialized.getBytes(), serialized.getLength());
    byte[] bytes2 = Arrays.copyOf(serializeWriteBytes[i].getBytes(), serializeWriteBytes[i].getLength());
    if (!Arrays.equals(bytes1, bytes2)) {
      fail("SerializeWrite and SerDe serialization does not match");
    }
    serdeBytes[i] = copyBytes(serialized);
  }

  // Try to deserialize using DeserializeRead our Writable row objects created by SerDe.
  for (int i = 0; i < rowCount; i++) {
    byte[] bytes = serdeBytes[i];
    verifyLazySimpleDeserializeRead(bytes, bytes.length, rows[i], typeInfos, writeTypeInfos, writeColumnCount, columnsToInclude, serdeParams);
  }
}

/**
 * Reads one serialized row with a fresh {@link LazySimpleDeserializeRead} and verifies every column.
 *
 * <p>Columns excluded by {@code columnsToInclude} are skipped, columns at or beyond
 * {@code writeColumnCount} must read as null, and all others must match {@code row}. When the
 * reader was given all columns, the input must be fully consumed afterwards.
 *
 * @param bytes buffer holding the serialized row starting at offset 0
 * @param length number of valid bytes in {@code bytes}
 * @param row expected column values
 * @param typeInfos one type per column
 * @param writeTypeInfos the column types handed to the reader
 * @param writeColumnCount number of columns the reader was given types for
 * @param columnsToInclude per-column include flags, or null to read every column
 * @param serdeParams parameters for the reader
 * @throws Throwable mirrors the calling test, which also declares Throwable
 */
private void verifyLazySimpleDeserializeRead(byte[] bytes, int length, Object[] row, TypeInfo[] typeInfos, TypeInfo[] writeTypeInfos, int writeColumnCount, boolean[] columnsToInclude, LazySerDeParameters serdeParams) throws Throwable {
  int columnCount = typeInfos.length;
  LazySimpleDeserializeRead lazySimpleDeserializeRead = new LazySimpleDeserializeRead(writeTypeInfos, /* useExternalBuffer */ false, serdeParams);
  lazySimpleDeserializeRead.set(bytes, 0, length);
  for (int index = 0; index < columnCount; index++) {
    if (columnsToInclude != null && !columnsToInclude[index]) {
      lazySimpleDeserializeRead.skipNextField();
    } else if (index >= writeColumnCount) {
      // Should come back a null.
      verifyReadNull(lazySimpleDeserializeRead, typeInfos[index]);
    } else {
      verifyRead(lazySimpleDeserializeRead, typeInfos[index], row[index]);
    }
  }
  if (writeColumnCount == columnCount) {
    assertTrue(lazySimpleDeserializeRead.isEndOfInputReached());
  }
}
use of org.apache.hadoop.hive.serde2.lazy.fast.LazySimpleDeserializeRead in project hive by apache.
the class TestLazySimpleFast method testLazySimpleDeserializeRowEmptyArray.
/**
 * Checks that a row consisting of a single field delimiter reads back as two empty lists when
 * both columns are declared as list types (a list of int and a list of list of string).
 */
@Test
public void testLazySimpleDeserializeRowEmptyArray() throws Throwable {
  HiveConf hconf = new HiveConf();

  // Only the field delimiter is set here.
  Properties props = new Properties();
  props.setProperty(serdeConstants.FIELD_DELIM, ",");
  LazySerDeParameters lazyParams = new LazySerDeParameters(hconf, props, LazySimpleSerDe.class.getName());

  TypeInfo[] typeInfos = new TypeInfo[] {
      TypeInfoFactory.getListTypeInfo(TypeInfoFactory.intTypeInfo),
      TypeInfoFactory.getListTypeInfo(TypeInfoFactory.getListTypeInfo(TypeInfoFactory.stringTypeInfo))
  };
  LazySimpleDeserializeRead deserializeRead = new LazySimpleDeserializeRead(typeInfos, null, true, lazyParams);

  // Encode with an explicit charset so the input bytes do not depend on the platform default.
  byte[] bytes = ",".getBytes("UTF-8");
  deserializeRead.set(bytes, 0, bytes.length);

  // Each of the two fields is expected to come back as an empty list.
  verifyRead(deserializeRead, typeInfos[0], Collections.emptyList());
  verifyRead(deserializeRead, typeInfos[1], Collections.emptyList());
  assertTrue(deserializeRead.isEndOfInputReached());
}
Aggregations