Use of org.apache.hadoop.hive.serde2.fast.SerializeWrite in project hive by apache.
The class TestBinarySortableFast, method testBinarySortableFast.
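As context for the test method below, here is a minimal sketch (not taken from the Hive code base) of the SerializeWrite pattern the test exercises: bind an Output buffer, write each column in order, and wrap the result in a BytesWritable. The two-column schema and the class/method names are invented for illustration, and the writeInt/writeString/writeNull calls are assumed to be part of the SerializeWrite interface.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableSerializeWrite;
import org.apache.hadoop.io.BytesWritable;

public class SerializeWriteSketch {
  // Serializes one hypothetical (int, string) row in ascending binary-sortable order.
  public static BytesWritable serializeRow(int id, String name) throws IOException {
    BinarySortableSerializeWrite writer = new BinarySortableSerializeWrite(2);  // 2 columns, ascending
    Output output = new Output();
    writer.set(output);                 // bind the destination buffer
    writer.writeInt(id);                // column 0 (writeInt assumed from the SerializeWrite interface)
    if (name == null) {
      writer.writeNull();               // null columns are written explicitly
    } else {
      byte[] utf8 = name.getBytes(StandardCharsets.UTF_8);
      writer.writeString(utf8, 0, utf8.length);  // column 1
    }
    BytesWritable result = new BytesWritable();
    result.set(output.getData(), 0, output.getLength());  // copy out the serialized row
    return result;
  }
}

The test method itself layers randomized sort orders, null markers, and column subsets on top of this basic pattern: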
private void testBinarySortableFast(SerdeRandomRowSource source, Object[][] rows,
    boolean[] columnSortOrderIsDesc, byte[] columnNullMarker, byte[] columnNotNullMarker,
    AbstractSerDe serde, StructObjectInspector rowOI,
    AbstractSerDe serde_fewer, StructObjectInspector writeRowOI,
    boolean ascending, PrimitiveTypeInfo[] primitiveTypeInfos,
    boolean useIncludeColumns, boolean doWriteFewerColumns, Random r) throws Throwable {

  int rowCount = rows.length;
  int columnCount = primitiveTypeInfos.length;

  boolean[] columnsToInclude = null;
  if (useIncludeColumns) {
    columnsToInclude = new boolean[columnCount];
    for (int i = 0; i < columnCount; i++) {
      columnsToInclude[i] = r.nextBoolean();
    }
  }

  int writeColumnCount = columnCount;
  if (doWriteFewerColumns) {
    writeColumnCount = writeRowOI.getAllStructFieldRefs().size();
  }

  BinarySortableSerializeWrite binarySortableSerializeWrite =
      new BinarySortableSerializeWrite(columnSortOrderIsDesc, columnNullMarker, columnNotNullMarker);

  // Try to serialize
  // One Writable per row.
  BytesWritable[] serializeWriteBytes = new BytesWritable[rowCount];
  int[][] perFieldWriteLengthsArray = new int[rowCount][];
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    Output output = new Output();
    binarySortableSerializeWrite.set(output);

    int[] perFieldWriteLengths = new int[columnCount];
    for (int index = 0; index < writeColumnCount; index++) {
      Writable writable = (Writable) row[index];
      VerifyFast.serializeWrite(binarySortableSerializeWrite, primitiveTypeInfos[index], writable);
      perFieldWriteLengths[index] = output.getLength();
    }
    perFieldWriteLengthsArray[i] = perFieldWriteLengths;

    BytesWritable bytesWritable = new BytesWritable();
    bytesWritable.set(output.getData(), 0, output.getLength());
    serializeWriteBytes[i] = bytesWritable;
    if (i > 0) {
      int compareResult = serializeWriteBytes[i - 1].compareTo(serializeWriteBytes[i]);
      if ((compareResult < 0 && !ascending) || (compareResult > 0 && ascending)) {
        System.out.println("Test failed in " + (ascending ? "ascending" : "descending") + " order with " + (i - 1) + " and " + i);
        System.out.println("serialized data [" + (i - 1) + "] = " + TestBinarySortableSerDe.hexString(serializeWriteBytes[i - 1]));
        System.out.println("serialized data [" + i + "] = " + TestBinarySortableSerDe.hexString(serializeWriteBytes[i]));
        fail("Sort order of serialized " + (i - 1) + " and " + i + " are reversed!");
      }
    }
  }
  // Try to deserialize, using DeserializeRead, the Writable row objects created by SerializeWrite
  // (a standalone DeserializeRead sketch follows this method).
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    BinarySortableDeserializeRead binarySortableDeserializeRead =
        new BinarySortableDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);
    BytesWritable bytesWritable = serializeWriteBytes[i];
    binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
    for (int index = 0; index < columnCount; index++) {
      if (useIncludeColumns && !columnsToInclude[index]) {
        binarySortableDeserializeRead.skipNextField();
      } else if (index >= writeColumnCount) {
        // Should come back as null.
        VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], null);
      } else {
        Writable writable = (Writable) row[index];
        VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], writable);
      }
    }
    if (writeColumnCount == columnCount) {
      TestCase.assertTrue(binarySortableDeserializeRead.isEndOfInputReached());
    }

    /*
     * Clip off one byte and expect to get an EOFException on the last field written.
     */
    BinarySortableDeserializeRead binarySortableDeserializeRead2 =
        new BinarySortableDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);
    binarySortableDeserializeRead2.set(bytesWritable.getBytes(), 0, bytesWritable.getLength() - 1);  // One fewer byte.
    for (int index = 0; index < writeColumnCount; index++) {
      Writable writable = (Writable) row[index];
      if (index == writeColumnCount - 1) {
        boolean threw = false;
        try {
          VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
        } catch (EOFException e) {
          // debugDetailedReadPositionString = binarySortableDeserializeRead2.getDetailedReadPositionString();
          // debugStackTrace = e.getStackTrace();
          threw = true;
        }
        TestCase.assertTrue(threw);
      } else {
        if (useIncludeColumns && !columnsToInclude[index]) {
          binarySortableDeserializeRead2.skipNextField();
        } else {
          VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead2, primitiveTypeInfos[index], writable);
        }
      }
    }
  }
  // Try to deserialize, using the SerDe class, the Writable row objects created by SerializeWrite.
  for (int i = 0; i < rowCount; i++) {
    BytesWritable bytesWritable = serializeWriteBytes[i];
    // Note that the regular SerDe doesn't tolerate fewer columns.
    List<Object> deserializedRow;
    if (doWriteFewerColumns) {
      deserializedRow = (List<Object>) serde_fewer.deserialize(bytesWritable);
    } else {
      deserializedRow = (List<Object>) serde.deserialize(bytesWritable);
    }
    Object[] row = rows[i];
    for (int index = 0; index < writeColumnCount; index++) {
      Object expected = row[index];
      Object object = deserializedRow.get(index);
      if (expected == null || object == null) {
        if (expected != null || object != null) {
          fail("SerDe deserialized NULL column mismatch");
        }
      } else {
        if (!object.equals(expected)) {
          fail("SerDe deserialized value does not match (expected " + expected.getClass().getName() + " " + expected.toString() + ", actual " + object.getClass().getName() + " " + object.toString() + ")");
        }
      }
    }
  }
  // One Writable per row.
  BytesWritable[] serdeBytes = new BytesWritable[rowCount];

  // Serialize using the SerDe, then below deserialize using DeserializeRead.
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];

    // Since the SerDe reuses memory, we will need to make a copy.
    BytesWritable serialized;
    if (doWriteFewerColumns) {
      serialized = (BytesWritable) serde_fewer.serialize(row, rowOI);
    } else {
      serialized = (BytesWritable) serde.serialize(row, rowOI);
    }
    BytesWritable bytesWritable = new BytesWritable();
    bytesWritable.set(serialized);
    byte[] serDeOutput = Arrays.copyOfRange(bytesWritable.getBytes(), 0, bytesWritable.getLength());
    byte[] serializeWriteExpected = Arrays.copyOfRange(serializeWriteBytes[i].getBytes(), 0, serializeWriteBytes[i].getLength());
    if (!Arrays.equals(serDeOutput, serializeWriteExpected)) {
      int mismatchPos = -1;
      if (serDeOutput.length != serializeWriteExpected.length) {
        for (int b = 0; b < Math.min(serDeOutput.length, serializeWriteExpected.length); b++) {
          if (serDeOutput[b] != serializeWriteExpected[b]) {
            mismatchPos = b;
            break;
          }
        }
        fail("Different byte array lengths: serDeOutput.length " + serDeOutput.length + ", serializeWriteExpected.length " + serializeWriteExpected.length + " mismatchPos " + mismatchPos + " perFieldWriteLengths " + Arrays.toString(perFieldWriteLengthsArray[i]));
      }
      List<Integer> differentPositions = new ArrayList<Integer>();
      for (int b = 0; b < serDeOutput.length; b++) {
        if (serDeOutput[b] != serializeWriteExpected[b]) {
          differentPositions.add(b);
        }
      }
      if (differentPositions.size() > 0) {
        List<String> serializeWriteExpectedFields = new ArrayList<String>();
        List<String> serDeFields = new ArrayList<String>();
        int f = 0;
        int lastBegin = 0;
        for (int b = 0; b < serDeOutput.length; b++) {
          int writeLength = perFieldWriteLengthsArray[i][f];
          if (b + 1 == writeLength) {
            serializeWriteExpectedFields.add(displayBytes(serializeWriteExpected, lastBegin, writeLength - lastBegin));
            serDeFields.add(displayBytes(serDeOutput, lastBegin, writeLength - lastBegin));
            f++;
            lastBegin = b + 1;
          }
        }
        fail("SerializeWrite and SerDe serialization does not match at positions " + differentPositions.toString() + "\n(SerializeWrite: " + serializeWriteExpectedFields.toString() + "\nSerDe: " + serDeFields.toString() + "\nperFieldWriteLengths " + Arrays.toString(perFieldWriteLengthsArray[i]) + "\nprimitiveTypeInfos " + Arrays.toString(primitiveTypeInfos) + "\nrow " + Arrays.toString(row));
      }
    }
    serdeBytes[i] = bytesWritable;
  }
  // Try to deserialize, using DeserializeRead, the Writable row objects created by the SerDe.
  for (int i = 0; i < rowCount; i++) {
    Object[] row = rows[i];
    BinarySortableDeserializeRead binarySortableDeserializeRead =
        new BinarySortableDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false, columnSortOrderIsDesc);
    BytesWritable bytesWritable = serdeBytes[i];
    binarySortableDeserializeRead.set(bytesWritable.getBytes(), 0, bytesWritable.getLength());
    for (int index = 0; index < columnCount; index++) {
      if (useIncludeColumns && !columnsToInclude[index]) {
        binarySortableDeserializeRead.skipNextField();
      } else if (index >= writeColumnCount) {
        // Should come back as null.
        VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], null);
      } else {
        Writable writable = (Writable) row[index];
        VerifyFast.verifyDeserializeRead(binarySortableDeserializeRead, primitiveTypeInfos[index], writable);
      }
    }
    if (writeColumnCount == columnCount) {
      TestCase.assertTrue(binarySortableDeserializeRead.isEndOfInputReached());
    }
  }
}
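For reference, a minimal sketch of reading such bytes back directly through BinarySortableDeserializeRead rather than through the VerifyFast helper used above. The readNextField() null convention and the current* fields are assumptions about the fast DeserializeRead API and may differ between Hive versions; the (int, string) columns match the hypothetical serialization sketch near the top.

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.hive.serde2.binarysortable.fast.BinarySortableDeserializeRead;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;

public class DeserializeReadSketch {
  // Reads back a hypothetical (int, string) row serialized in ascending order.
  public static void readRow(byte[] bytes, int length, PrimitiveTypeInfo[] primitiveTypeInfos)
      throws IOException {
    BinarySortableDeserializeRead reader =
        new BinarySortableDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false);
    reader.set(bytes, 0, length);
    if (reader.readNextField()) {          // assumed: returns false when the field is NULL
      System.out.println("id = " + reader.currentInt);
    }
    if (reader.readNextField()) {
      String name = new String(reader.currentBytes, reader.currentBytesStart,
          reader.currentBytesLength, StandardCharsets.UTF_8);
      System.out.println("name = " + name);
    }
  }
}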
Use of org.apache.hadoop.hive.serde2.fast.SerializeWrite in project hive by apache.
The class TestVectorSerDeRow, method testVectorDeserializeRow.
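Before the full test body, a condensed sketch of the round trip it drives: a SerializeWrite implementation produces the row bytes, and VectorDeserializeRow reads them straight into a VectorizedRowBatch. The calls below mirror ones that appear in the test; the wrapper class, method name, and parameters are invented for illustration, and the input Output is assumed to already hold one LAZY_BINARY-serialized row.

import java.io.IOException;
import org.apache.hadoop.hive.ql.exec.vector.VectorDeserializeRow;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx;
import org.apache.hadoop.hive.ql.metadata.HiveException;
import org.apache.hadoop.hive.serde2.ByteStream.Output;
import org.apache.hadoop.hive.serde2.lazybinary.fast.LazyBinaryDeserializeRead;
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;

public class VectorDeserializeRowSketch {
  // Deserializes one already-serialized row into the first row of a new batch.
  public static VectorizedRowBatch deserializeOneRow(Output serializedRow,
      StructObjectInspector rowInspector, PrimitiveTypeInfo[] primitiveTypeInfos)
      throws HiveException, IOException {
    VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx();
    batchContext.init(rowInspector, new String[0]);            // no scratch columns
    VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();

    LazyBinaryDeserializeRead deserializeRead =
        new LazyBinaryDeserializeRead(primitiveTypeInfos, /* useExternalBuffer */ false);
    VectorDeserializeRow vectorDeserializeRow = new VectorDeserializeRow(deserializeRead);
    vectorDeserializeRow.init();

    vectorDeserializeRow.setBytes(serializedRow.getData(), 0, serializedRow.getLength());
    vectorDeserializeRow.deserialize(batch, batch.size);       // fill the row at index batch.size
    batch.size++;
    return batch;
  }
}

The test method generalizes this to 100000 random rows and three serialization formats: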
void testVectorDeserializeRow(Random r, SerializationType serializationType,
    boolean alternate1, boolean alternate2, boolean useExternalBuffer)
    throws HiveException, IOException, SerDeException {

  String[] emptyScratchTypeNames = new String[0];

  VectorRandomRowSource source = new VectorRandomRowSource();
  source.init(r);

  VectorizedRowBatchCtx batchContext = new VectorizedRowBatchCtx();
  batchContext.init(source.rowStructObjectInspector(), emptyScratchTypeNames);
  VectorizedRowBatch batch = batchContext.createVectorizedRowBatch();

  // junk the destination for the 1st pass
  for (ColumnVector cv : batch.cols) {
    Arrays.fill(cv.isNull, true);
  }

  PrimitiveTypeInfo[] primitiveTypeInfos = source.primitiveTypeInfos();
  int fieldCount = source.typeNames().size();
  DeserializeRead deserializeRead;
  SerializeWrite serializeWrite;
  switch (serializationType) {
  case BINARY_SORTABLE:
    boolean useColumnSortOrderIsDesc = alternate1;
    if (!useColumnSortOrderIsDesc) {
      deserializeRead = new BinarySortableDeserializeRead(source.primitiveTypeInfos(), useExternalBuffer);
      serializeWrite = new BinarySortableSerializeWrite(fieldCount);
    } else {
      boolean[] columnSortOrderIsDesc = new boolean[fieldCount];
      for (int i = 0; i < fieldCount; i++) {
        columnSortOrderIsDesc[i] = r.nextBoolean();
      }
      deserializeRead = new BinarySortableDeserializeRead(source.primitiveTypeInfos(), useExternalBuffer, columnSortOrderIsDesc);

      byte[] columnNullMarker = new byte[fieldCount];
      byte[] columnNotNullMarker = new byte[fieldCount];
      for (int i = 0; i < fieldCount; i++) {
        if (columnSortOrderIsDesc[i]) {
          // Descending
          // Null last (default for descending order)
          columnNullMarker[i] = BinarySortableSerDe.ZERO;
          columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        } else {
          // Ascending
          // Null first (default for ascending order)
          columnNullMarker[i] = BinarySortableSerDe.ZERO;
          columnNotNullMarker[i] = BinarySortableSerDe.ONE;
        }
      }
      serializeWrite = new BinarySortableSerializeWrite(columnSortOrderIsDesc, columnNullMarker, columnNotNullMarker);
    }
    boolean useBinarySortableCharsNeedingEscape = alternate2;
    if (useBinarySortableCharsNeedingEscape) {
      source.addBinarySortableAlphabets();
    }
    break;
  case LAZY_BINARY:
    deserializeRead = new LazyBinaryDeserializeRead(source.primitiveTypeInfos(), useExternalBuffer);
    serializeWrite = new LazyBinarySerializeWrite(fieldCount);
    break;
  case LAZY_SIMPLE:
    {
      StructObjectInspector rowObjectInspector = source.rowStructObjectInspector();
      Configuration conf = new Configuration();
      Properties tbl = new Properties();
      tbl.setProperty(serdeConstants.FIELD_DELIM, "\t");
      tbl.setProperty(serdeConstants.LINE_DELIM, "\n");
      byte separator = (byte) '\t';
      boolean useLazySimpleEscapes = alternate1;
      if (useLazySimpleEscapes) {
        tbl.setProperty(serdeConstants.QUOTE_CHAR, "'");
        String escapeString = "\\";
        tbl.setProperty(serdeConstants.ESCAPE_CHAR, escapeString);
      }
      LazySerDeParameters lazySerDeParams = getSerDeParams(conf, tbl, rowObjectInspector);
      if (useLazySimpleEscapes) {
        // LazySimple seems to throw away everything but \n and \r.
        boolean[] needsEscape = lazySerDeParams.getNeedsEscape();
        StringBuilder sb = new StringBuilder();
        if (needsEscape['\n']) {
          sb.append('\n');
        }
        if (needsEscape['\r']) {
          sb.append('\r');
        }
        // for (int i = 0; i < needsEscape.length; i++) {
        //   if (needsEscape[i]) {
        //     sb.append((char) i);
        //   }
        // }
        String needsEscapeStr = sb.toString();
        if (needsEscapeStr.length() > 0) {
          source.addEscapables(needsEscapeStr);
        }
      }
      deserializeRead = new LazySimpleDeserializeRead(source.primitiveTypeInfos(), useExternalBuffer, separator, lazySerDeParams);
      serializeWrite = new LazySimpleSerializeWrite(fieldCount, separator, lazySerDeParams);
    }
    break;
  default:
    throw new Error("Unknown serialization type " + serializationType);
  }
  VectorDeserializeRow vectorDeserializeRow = new VectorDeserializeRow(deserializeRead);
  vectorDeserializeRow.init();

  // junk the destination for the 1st pass
  for (ColumnVector cv : batch.cols) {
    Arrays.fill(cv.isNull, true);
    cv.noNulls = false;
  }

  VectorExtractRow vectorExtractRow = new VectorExtractRow();
  vectorExtractRow.init(source.typeNames());

  Object[][] randomRows = source.randomRows(100000);
  int firstRandomRowIndex = 0;
  for (int i = 0; i < randomRows.length; i++) {
    Object[] row = randomRows[i];

    Output output = serializeRow(row, source, serializeWrite);
    vectorDeserializeRow.setBytes(output.getData(), 0, output.getLength());
    try {
      vectorDeserializeRow.deserialize(batch, batch.size);
    } catch (Exception e) {
      throw new HiveException("\nDeserializeRead details: " + vectorDeserializeRow.getDetailedReadPositionString(), e);
    }
    batch.size++;
    if (batch.size == batch.DEFAULT_SIZE) {
      examineBatch(batch, vectorExtractRow, primitiveTypeInfos, randomRows, firstRandomRowIndex);
      firstRandomRowIndex = i + 1;
      batch.reset();
    }
  }
  if (batch.size > 0) {
    examineBatch(batch, vectorExtractRow, primitiveTypeInfos, randomRows, firstRandomRowIndex);
  }
}