use of org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector in project hive by apache.
the class TestOrcSerDeStats method testStringAndBinaryStatistics.
@Test
public void testStringAndBinaryStatistics() throws Exception {
ObjectInspector inspector;
synchronized (TestOrcSerDeStats.class) {
inspector = ObjectInspectorFactory.getReflectionObjectInspector(SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
}
Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000));
writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
writer.addRow(new SimpleStruct(null, "hi"));
writer.close();
assertEquals(4, writer.getNumberOfRows());
assertEquals(273, writer.getRawDataSize());
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(4, reader.getNumberOfRows());
assertEquals(289, reader.getRawDataSize());
assertEquals(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
assertEquals(258, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
assertEquals(273, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));
// check the stats
ColumnStatistics[] stats = reader.getStatistics();
assertEquals(4, stats[0].getNumberOfValues());
assertEquals("count: 4 hasNull: false", stats[0].toString());
assertEquals(3, stats[1].getNumberOfValues());
assertEquals(15, ((BinaryColumnStatistics) stats[1]).getSum());
assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", stats[1].toString());
assertEquals(3, stats[2].getNumberOfValues());
assertEquals("bar", ((StringColumnStatistics) stats[2]).getMinimum());
assertEquals("hi", ((StringColumnStatistics) stats[2]).getMaximum());
assertEquals(8, ((StringColumnStatistics) stats[2]).getSum());
assertEquals("count: 3 hasNull: true bytesOnDisk: 22 min: bar max: hi sum: 8", stats[2].toString());
// check the inspectors
StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector();
assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory());
assertEquals("struct<bytes1:binary,string1:string>", readerInspector.getTypeName());
List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
StringObjectInspector st = (StringObjectInspector) readerInspector.getStructFieldRef("string1").getFieldObjectInspector();
RecordReader rows = reader.rows();
Object row = rows.next(null);
assertNotNull(row);
// check the contents of the first row
assertEquals(bytes(0, 1, 2, 3, 4), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals("foo", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(1))));
// check the contents of second row
assertEquals(true, rows.hasNext());
row = rows.next(row);
assertEquals(bytes(0, 1, 2, 3), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals("bar", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(1))));
// check the contents of second row
assertEquals(true, rows.hasNext());
row = rows.next(row);
assertEquals(bytes(0, 1, 2, 3, 4, 5), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
assertNull(st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(1))));
// check the contents of second row
assertEquals(true, rows.hasNext());
row = rows.next(row);
assertNull(bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
assertEquals("hi", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(1))));
// handle the close up
assertEquals(false, rows.hasNext());
rows.close();
}
use of org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector in project hive by apache.
the class LazyBinarySerDe method serialize.
/**
* A recursive function that serialize an object to a byte buffer based on its
* object inspector.
*
* @param byteStream
* the byte stream storing the serialization data
* @param obj
* the object to serialize
* @param objInspector
* the object inspector
* @param skipLengthPrefix a boolean indicating whether length prefix is
* needed for list/map/struct
* @param warnedOnceNullMapKey a boolean indicating whether a warning
* has been issued once already when encountering null map keys
*/
public static void serialize(RandomAccessOutput byteStream, Object obj, ObjectInspector objInspector, boolean skipLengthPrefix, BooleanRef warnedOnceNullMapKey) throws SerDeException {
// do nothing for null object
if (null == obj) {
return;
}
switch(objInspector.getCategory()) {
case PRIMITIVE:
{
PrimitiveObjectInspector poi = (PrimitiveObjectInspector) objInspector;
switch(poi.getPrimitiveCategory()) {
case VOID:
{
return;
}
case BOOLEAN:
{
boolean v = ((BooleanObjectInspector) poi).get(obj);
byteStream.write((byte) (v ? 1 : 0));
return;
}
case BYTE:
{
ByteObjectInspector boi = (ByteObjectInspector) poi;
byte v = boi.get(obj);
byteStream.write(v);
return;
}
case SHORT:
{
ShortObjectInspector spoi = (ShortObjectInspector) poi;
short v = spoi.get(obj);
byteStream.write((byte) (v >> 8));
byteStream.write((byte) (v));
return;
}
case INT:
{
IntObjectInspector ioi = (IntObjectInspector) poi;
int v = ioi.get(obj);
LazyBinaryUtils.writeVInt(byteStream, v);
return;
}
case LONG:
{
LongObjectInspector loi = (LongObjectInspector) poi;
long v = loi.get(obj);
LazyBinaryUtils.writeVLong(byteStream, v);
return;
}
case FLOAT:
{
FloatObjectInspector foi = (FloatObjectInspector) poi;
int v = Float.floatToIntBits(foi.get(obj));
byteStream.write((byte) (v >> 24));
byteStream.write((byte) (v >> 16));
byteStream.write((byte) (v >> 8));
byteStream.write((byte) (v));
return;
}
case DOUBLE:
{
DoubleObjectInspector doi = (DoubleObjectInspector) poi;
LazyBinaryUtils.writeDouble(byteStream, doi.get(obj));
return;
}
case STRING:
{
StringObjectInspector soi = (StringObjectInspector) poi;
Text t = soi.getPrimitiveWritableObject(obj);
serializeText(byteStream, t, skipLengthPrefix);
return;
}
case CHAR:
{
HiveCharObjectInspector hcoi = (HiveCharObjectInspector) poi;
Text t = hcoi.getPrimitiveWritableObject(obj).getTextValue();
serializeText(byteStream, t, skipLengthPrefix);
return;
}
case VARCHAR:
{
HiveVarcharObjectInspector hcoi = (HiveVarcharObjectInspector) poi;
Text t = hcoi.getPrimitiveWritableObject(obj).getTextValue();
serializeText(byteStream, t, skipLengthPrefix);
return;
}
case BINARY:
{
BinaryObjectInspector baoi = (BinaryObjectInspector) poi;
BytesWritable bw = baoi.getPrimitiveWritableObject(obj);
int length = bw.getLength();
if (!skipLengthPrefix) {
LazyBinaryUtils.writeVInt(byteStream, length);
} else {
if (length == 0) {
throw new RuntimeException("LazyBinaryColumnarSerde cannot serialize a non-null zero " + "length binary field. Consider using either LazyBinarySerde or ColumnarSerde.");
}
}
byteStream.write(bw.getBytes(), 0, length);
return;
}
case DATE:
{
DateWritableV2 d = ((DateObjectInspector) poi).getPrimitiveWritableObject(obj);
writeDateToByteStream(byteStream, d);
return;
}
case TIMESTAMP:
{
TimestampObjectInspector toi = (TimestampObjectInspector) poi;
TimestampWritableV2 t = toi.getPrimitiveWritableObject(obj);
t.writeToByteStream(byteStream);
return;
}
case TIMESTAMPLOCALTZ:
{
TimestampLocalTZWritable t = ((TimestampLocalTZObjectInspector) poi).getPrimitiveWritableObject(obj);
t.writeToByteStream(byteStream);
return;
}
case INTERVAL_YEAR_MONTH:
{
HiveIntervalYearMonthWritable intervalYearMonth = ((HiveIntervalYearMonthObjectInspector) poi).getPrimitiveWritableObject(obj);
intervalYearMonth.writeToByteStream(byteStream);
return;
}
case INTERVAL_DAY_TIME:
{
HiveIntervalDayTimeWritable intervalDayTime = ((HiveIntervalDayTimeObjectInspector) poi).getPrimitiveWritableObject(obj);
intervalDayTime.writeToByteStream(byteStream);
return;
}
case DECIMAL:
{
HiveDecimalObjectInspector bdoi = (HiveDecimalObjectInspector) poi;
HiveDecimalWritable t = bdoi.getPrimitiveWritableObject(obj);
if (t == null) {
return;
}
writeToByteStream(byteStream, t);
return;
}
default:
{
throw new RuntimeException("Unrecognized type: " + poi.getPrimitiveCategory());
}
}
}
case LIST:
{
ListObjectInspector loi = (ListObjectInspector) objInspector;
ObjectInspector eoi = loi.getListElementObjectInspector();
int byteSizeStart = 0;
int listStart = 0;
if (!skipLengthPrefix) {
// 1/ reserve spaces for the byte size of the list
// which is a integer and takes four bytes
byteSizeStart = byteStream.getLength();
byteStream.reserve(4);
listStart = byteStream.getLength();
}
// 2/ write the size of the list as a VInt
int size = loi.getListLength(obj);
LazyBinaryUtils.writeVInt(byteStream, size);
// 3/ write the null bytes
byte nullByte = 0;
for (int eid = 0; eid < size; eid++) {
// set the bit to 1 if an element is not null
if (null != loi.getListElement(obj, eid)) {
nullByte |= 1 << (eid % 8);
}
// if this is the last element
if (7 == eid % 8 || eid == size - 1) {
byteStream.write(nullByte);
nullByte = 0;
}
}
// 4/ write element by element from the list
for (int eid = 0; eid < size; eid++) {
serialize(byteStream, loi.getListElement(obj, eid), eoi, false, warnedOnceNullMapKey);
}
if (!skipLengthPrefix) {
// 5/ update the list byte size
int listEnd = byteStream.getLength();
int listSize = listEnd - listStart;
writeSizeAtOffset(byteStream, byteSizeStart, listSize);
}
return;
}
case MAP:
{
MapObjectInspector moi = (MapObjectInspector) objInspector;
ObjectInspector koi = moi.getMapKeyObjectInspector();
ObjectInspector voi = moi.getMapValueObjectInspector();
Map<?, ?> map = moi.getMap(obj);
int byteSizeStart = 0;
int mapStart = 0;
if (!skipLengthPrefix) {
// 1/ reserve spaces for the byte size of the map
// which is a integer and takes four bytes
byteSizeStart = byteStream.getLength();
byteStream.reserve(4);
mapStart = byteStream.getLength();
}
// 2/ write the size of the map which is a VInt
int size = map.size();
LazyBinaryUtils.writeVInt(byteStream, size);
// 3/ write the null bytes
int b = 0;
byte nullByte = 0;
for (Map.Entry<?, ?> entry : map.entrySet()) {
// set the bit to 1 if a key is not null
if (null != entry.getKey()) {
nullByte |= 1 << (b % 8);
} else if (warnedOnceNullMapKey != null) {
if (!warnedOnceNullMapKey.value) {
LOG.warn("Null map key encountered! Ignoring similar problems.");
}
warnedOnceNullMapKey.value = true;
}
b++;
// set the bit to 1 if a value is not null
if (null != entry.getValue()) {
nullByte |= 1 << (b % 8);
}
b++;
// or if this is the last key-value pair
if (0 == b % 8 || b == size * 2) {
byteStream.write(nullByte);
nullByte = 0;
}
}
// 4/ write key-value pairs one by one
for (Map.Entry<?, ?> entry : map.entrySet()) {
serialize(byteStream, entry.getKey(), koi, false, warnedOnceNullMapKey);
serialize(byteStream, entry.getValue(), voi, false, warnedOnceNullMapKey);
}
if (!skipLengthPrefix) {
// 5/ update the byte size of the map
int mapEnd = byteStream.getLength();
int mapSize = mapEnd - mapStart;
writeSizeAtOffset(byteStream, byteSizeStart, mapSize);
}
return;
}
case STRUCT:
case UNION:
{
int byteSizeStart = 0;
int typeStart = 0;
if (!skipLengthPrefix) {
// 1/ reserve spaces for the byte size of the struct
// which is a integer and takes four bytes
byteSizeStart = byteStream.getLength();
byteStream.reserve(4);
typeStart = byteStream.getLength();
}
if (ObjectInspector.Category.STRUCT.equals(objInspector.getCategory())) {
// 2/ serialize the struct
serializeStruct(byteStream, obj, (StructObjectInspector) objInspector, warnedOnceNullMapKey);
} else {
// 2/ serialize the union
serializeUnion(byteStream, obj, (UnionObjectInspector) objInspector, warnedOnceNullMapKey);
}
if (!skipLengthPrefix) {
// 3/ update the byte size of the struct
int typeEnd = byteStream.getLength();
int typeSize = typeEnd - typeStart;
writeSizeAtOffset(byteStream, byteSizeStart, typeSize);
}
return;
}
default:
{
throw new RuntimeException("Unrecognized type: " + objInspector.getCategory());
}
}
}
use of org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector in project hive by apache.
the class ObjectInspectorUtils method hashCode.
public static int hashCode(Object o, ObjectInspector objIns) {
if (o == null) {
return 0;
}
switch(objIns.getCategory()) {
case PRIMITIVE:
{
PrimitiveObjectInspector poi = ((PrimitiveObjectInspector) objIns);
switch(poi.getPrimitiveCategory()) {
case VOID:
return 0;
case BOOLEAN:
return ((BooleanObjectInspector) poi).get(o) ? 1 : 0;
case BYTE:
return ((ByteObjectInspector) poi).get(o);
case SHORT:
return ((ShortObjectInspector) poi).get(o);
case INT:
return ((IntObjectInspector) poi).get(o);
case LONG:
{
long a = ((LongObjectInspector) poi).get(o);
return (int) ((a >>> 32) ^ a);
}
case FLOAT:
return Float.floatToIntBits(((FloatObjectInspector) poi).get(o));
case DOUBLE:
{
// This hash function returns the same result as Double.hashCode()
// while DoubleWritable.hashCode returns a different result.
long a = Double.doubleToLongBits(((DoubleObjectInspector) poi).get(o));
return (int) ((a >>> 32) ^ a);
}
case STRING:
{
// This hash function returns the same result as String.hashCode() when
// all characters are ASCII, while Text.hashCode() always returns a
// different result.
Text t = ((StringObjectInspector) poi).getPrimitiveWritableObject(o);
int r = 0;
for (int i = 0; i < t.getLength(); i++) {
r = r * 31 + t.getBytes()[i];
}
return r;
}
case CHAR:
return ((HiveCharObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
case VARCHAR:
return ((HiveVarcharObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
case BINARY:
return ((BinaryObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
case DATE:
return ((DateObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
case TIMESTAMP:
// Use old timestamp writable hash code for backwards compatibility
TimestampWritable ts = new TimestampWritable(java.sql.Timestamp.valueOf(((TimestampObjectInspector) poi).getPrimitiveWritableObject(o).toString()));
return ts.hashCode();
case TIMESTAMPLOCALTZ:
TimestampLocalTZWritable tstz = ((TimestampLocalTZObjectInspector) poi).getPrimitiveWritableObject(o);
return tstz.hashCode();
case INTERVAL_YEAR_MONTH:
HiveIntervalYearMonthWritable intervalYearMonth = ((HiveIntervalYearMonthObjectInspector) poi).getPrimitiveWritableObject(o);
return intervalYearMonth.hashCode();
case INTERVAL_DAY_TIME:
HiveIntervalDayTimeWritable intervalDayTime = ((HiveIntervalDayTimeObjectInspector) poi).getPrimitiveWritableObject(o);
return intervalDayTime.hashCode();
case DECIMAL:
// compatible hash code.
return ((HiveDecimalObjectInspector) poi).getPrimitiveWritableObject(o).hashCode();
default:
{
throw new RuntimeException("Unknown type: " + poi.getPrimitiveCategory());
}
}
}
case LIST:
{
int r = 0;
ListObjectInspector listOI = (ListObjectInspector) objIns;
ObjectInspector elemOI = listOI.getListElementObjectInspector();
for (int ii = 0; ii < listOI.getListLength(o); ++ii) {
r = 31 * r + hashCode(listOI.getListElement(o, ii), elemOI);
}
return r;
}
case MAP:
{
int r = 0;
MapObjectInspector mapOI = (MapObjectInspector) objIns;
ObjectInspector keyOI = mapOI.getMapKeyObjectInspector();
ObjectInspector valueOI = mapOI.getMapValueObjectInspector();
Map<?, ?> map = mapOI.getMap(o);
for (Map.Entry<?, ?> entry : map.entrySet()) {
r += hashCode(entry.getKey(), keyOI) ^ hashCode(entry.getValue(), valueOI);
}
return r;
}
case STRUCT:
int r = 0;
StructObjectInspector structOI = (StructObjectInspector) objIns;
List<? extends StructField> fields = structOI.getAllStructFieldRefs();
for (StructField field : fields) {
r = 31 * r + hashCode(structOI.getStructFieldData(o, field), field.getFieldObjectInspector());
}
return r;
case UNION:
UnionObjectInspector uOI = (UnionObjectInspector) objIns;
byte tag = uOI.getTag(o);
return hashCode(uOI.getField(o), uOI.getObjectInspectors().get(tag));
default:
throw new RuntimeException("Unknown type: " + objIns.getTypeName());
}
}
Aggregations