Use of org.apache.orc.DoubleColumnStatistics in the Apache Hive project.
Class TestOrcSerDeStats, method testOrcSerDeStatsComplexOldFormat.
/**
 * Writes two {@code BigRow} records with ORC file version 0.11, then checks the row count, the
 * raw data size (whole file, single columns and column combinations) and the file-level column
 * statistics reported by a reader opened on the written file.
 *
 * @throws Exception if writing or reading the ORC file fails
 */
@Test
public void testOrcSerDeStatsComplexOldFormat() throws Exception {
  ObjectInspector inspector;
  // NOTE(review): presumably the lock guards the shared reflection inspector factory against
  // concurrent test classes -- confirm.
  synchronized (TestOrcSerDeStats.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).version(OrcFile.Version.V_0_11).bufferSize(10000));
  // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
  writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536, Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi", new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list(inner(3, "good"), inner(4, "bad")), map(), Timestamp.valueOf("2000-03-12 15:00:00"), HiveDecimal.create("12345678.6547456")));
  // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
  // 97
  writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536, Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye", new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.valueOf("2000-03-11 15:00:00"), HiveDecimal.create("12345678.6547452")));
  writer.close();

  // writer-side totals are queried after close()
  long rowCount = writer.getNumberOfRows();
  long rawDataSize = writer.getRawDataSize();
  assertEquals(2, rowCount);
  assertEquals(1740, rawDataSize);

  // reader-side totals must match what the writer reported
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  assertEquals(2, reader.getNumberOfRows());
  assertEquals(1740, reader.getRawDataSize());

  // raw data size of individual columns
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
  assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
  assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
  assertEquals(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
  assertEquals(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
  assertEquals(455, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
  assertEquals(368, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
  assertEquals(364, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
  assertEquals(80, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
  assertEquals(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));

  // raw data size of column combinations; all columns together must equal the file total
  assertEquals(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
  assertEquals(1195, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
  assertEquals(185, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
  assertEquals(rawDataSize, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1", "byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", "list", "map", "middle", "ts", "decimal1")));

  // check the stats
  ColumnStatistics[] stats = reader.getStatistics();
  assertEquals(2, stats[1].getNumberOfValues());
  assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
  assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
  assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
  assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
  assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
  assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
  assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
  assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072", stats[3].toString());
  // both rows hold Long.MAX_VALUE; the test expects the sum to be reported as undefined
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
  assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
  assertEquals("count: 2 hasNull: false min: 9223372036854775807 max: 9223372036854775807", stats[5].toString());
  // doubles are compared with an explicit tolerance, like the sum below
  assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.00001);
  assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.00001);
  assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
  assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0", stats[7].toString());
  assertEquals(5, ((BinaryColumnStatistics) stats[8]).getSum());
  assertEquals("count: 2 hasNull: false sum: 5", stats[8].toString());
  assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum());
  assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum());
  assertEquals(5, ((StringColumnStatistics) stats[9]).getSum());
  assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
}
Use of org.apache.orc.DoubleColumnStatistics in the Apache Hive project.
Class TestOrcFile, method test1.
/**
 * Writes two {@code BigRow} records to an ORC file and reads them back, checking column
 * inclusion masks computed from the writer schema, file-level and stripe-level column
 * statistics, the reader's object inspector type name, and the contents of both rows.
 *
 * @throws Exception if writing or reading the ORC file fails
 */
@Test
public void test1() throws Exception {
  ObjectInspector inspector;
  // NOTE(review): presumably the lock guards the shared reflection inspector factory against
  // concurrent test classes -- confirm.
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000));
  writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536, Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi", new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list(inner(3, "good"), inner(4, "bad")), map()));
  writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536, Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye", new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), map(inner(5, "chani"), inner(1, "mauddib"))));
  writer.close();
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));

  // check the column inclusion masks: one flag per type id, 0..23
  TypeDescription schema = writer.getSchema();
  assertEquals(23, schema.getMaximumId());
  // empty selection: nothing included
  boolean[] expected = new boolean[] { false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false };
  boolean[] included = OrcUtils.includeColumns("", schema);
  assertEquals(true, Arrays.equals(expected, included));
  // a subset of top-level columns; the expected mask also flags ids following the selected
  // complex columns (presumably their nested children -- confirm against the schema)
  expected = new boolean[] { false, true, false, false, false, false, false, false, false, true, true, true, true, true, true, false, false, false, false, true, true, true, true, true };
  included = OrcUtils.includeColumns("boolean1,string1,middle,map", schema);
  assertEquals(true, Arrays.equals(expected, included));
  // every top-level column selected: everything but id 0 is flagged
  expected = new boolean[] { false, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true };
  included = OrcUtils.includeColumns("boolean1,byte1,short1,int1,long1,float1,double1,bytes1,string1,middle,list,map", schema);
  assertEquals(true, Arrays.equals(expected, included));

  // check the stats
  ColumnStatistics[] stats = reader.getStatistics();
  assertEquals(2, stats[1].getNumberOfValues());
  assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
  assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
  assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
  assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
  assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
  assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
  assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
  assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072", stats[3].toString());

  // statistics of the first stripe
  StripeStatistics ss = reader.getStripeStatistics().get(0);
  assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues());
  assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount());
  assertEquals(1024, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMinimum());
  assertEquals(2048, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getMaximum());
  assertEquals(3072, ((IntegerColumnStatistics) ss.getColumnStatistics()[3]).getSum());

  // doubles are compared with an explicit tolerance, like the sum below
  assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.00001);
  assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.00001);
  assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
  assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0", stats[7].toString());
  assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());

  // check the inspectors
  StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector();
  assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory());
  assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint," + "int1:int,long1:bigint,float1:float,double1:double,bytes1:" + "binary,string1:string,middle:struct<list:array<struct<int1:int," + "string1:string>>>,list:array<struct<int1:int,string1:string>>," + "map:map<string,struct<int1:int,string1:string>>>", readerInspector.getTypeName());
  List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
  BooleanObjectInspector bo = (BooleanObjectInspector) readerInspector.getStructFieldRef("boolean1").getFieldObjectInspector();
  ByteObjectInspector by = (ByteObjectInspector) readerInspector.getStructFieldRef("byte1").getFieldObjectInspector();
  ShortObjectInspector sh = (ShortObjectInspector) readerInspector.getStructFieldRef("short1").getFieldObjectInspector();
  IntObjectInspector in = (IntObjectInspector) readerInspector.getStructFieldRef("int1").getFieldObjectInspector();
  LongObjectInspector lo = (LongObjectInspector) readerInspector.getStructFieldRef("long1").getFieldObjectInspector();
  FloatObjectInspector fl = (FloatObjectInspector) readerInspector.getStructFieldRef("float1").getFieldObjectInspector();
  DoubleObjectInspector dbl = (DoubleObjectInspector) readerInspector.getStructFieldRef("double1").getFieldObjectInspector();
  BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
  StringObjectInspector st = (StringObjectInspector) readerInspector.getStructFieldRef("string1").getFieldObjectInspector();
  StructObjectInspector mid = (StructObjectInspector) readerInspector.getStructFieldRef("middle").getFieldObjectInspector();
  List<? extends StructField> midFields = mid.getAllStructFieldRefs();
  ListObjectInspector midli = (ListObjectInspector) midFields.get(0).getFieldObjectInspector();
  StructObjectInspector inner = (StructObjectInspector) midli.getListElementObjectInspector();
  List<? extends StructField> inFields = inner.getAllStructFieldRefs();
  ListObjectInspector li = (ListObjectInspector) readerInspector.getStructFieldRef("list").getFieldObjectInspector();
  MapObjectInspector ma = (MapObjectInspector) readerInspector.getStructFieldRef("map").getFieldObjectInspector();
  StringObjectInspector mk = (StringObjectInspector) ma.getMapKeyObjectInspector();

  RecordReader rows = reader.rows();
  try {
    Object row = rows.next(null);
    assertNotNull(row);
    // check the contents of the first row
    assertEquals(false, bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
    assertEquals(1, by.get(readerInspector.getStructFieldData(row, fields.get(1))));
    assertEquals(1024, sh.get(readerInspector.getStructFieldData(row, fields.get(2))));
    assertEquals(65536, in.get(readerInspector.getStructFieldData(row, fields.get(3))));
    assertEquals(Long.MAX_VALUE, lo.get(readerInspector.getStructFieldData(row, fields.get(4))));
    assertEquals(1.0, fl.get(readerInspector.getStructFieldData(row, fields.get(5))), 0.00001);
    assertEquals(-15.0, dbl.get(readerInspector.getStructFieldData(row, fields.get(6))), 0.00001);
    assertEquals(bytes(0, 1, 2, 3, 4), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(7))));
    assertEquals("hi", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(8))));
    List<?> midRow = midli.getList(mid.getStructFieldData(readerInspector.getStructFieldData(row, fields.get(9)), midFields.get(0)));
    assertNotNull(midRow);
    assertEquals(2, midRow.size());
    assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0), inFields.get(0))));
    assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(0), inFields.get(1))));
    assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1), inFields.get(0))));
    assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(1), inFields.get(1))));
    List<?> list = li.getList(readerInspector.getStructFieldData(row, fields.get(10)));
    assertEquals(2, list.size());
    assertEquals(3, in.get(inner.getStructFieldData(list.get(0), inFields.get(0))));
    assertEquals("good", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(0), inFields.get(1))));
    assertEquals(4, in.get(inner.getStructFieldData(list.get(1), inFields.get(0))));
    assertEquals("bad", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(1), inFields.get(1))));
    Map<?, ?> map = ma.getMap(readerInspector.getStructFieldData(row, fields.get(11)));
    assertEquals(0, map.size());

    // check the contents of second row
    assertEquals(true, rows.hasNext());
    row = rows.next(row);
    assertEquals(true, bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
    assertEquals(100, by.get(readerInspector.getStructFieldData(row, fields.get(1))));
    assertEquals(2048, sh.get(readerInspector.getStructFieldData(row, fields.get(2))));
    assertEquals(65536, in.get(readerInspector.getStructFieldData(row, fields.get(3))));
    assertEquals(Long.MAX_VALUE, lo.get(readerInspector.getStructFieldData(row, fields.get(4))));
    assertEquals(2.0, fl.get(readerInspector.getStructFieldData(row, fields.get(5))), 0.00001);
    assertEquals(-5.0, dbl.get(readerInspector.getStructFieldData(row, fields.get(6))), 0.00001);
    assertEquals(bytes(), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(7))));
    assertEquals("bye", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(8))));
    midRow = midli.getList(mid.getStructFieldData(readerInspector.getStructFieldData(row, fields.get(9)), midFields.get(0)));
    assertNotNull(midRow);
    assertEquals(2, midRow.size());
    assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0), inFields.get(0))));
    assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(0), inFields.get(1))));
    assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1), inFields.get(0))));
    assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(1), inFields.get(1))));
    list = li.getList(readerInspector.getStructFieldData(row, fields.get(10)));
    assertEquals(3, list.size());
    assertEquals(100000000, in.get(inner.getStructFieldData(list.get(0), inFields.get(0))));
    assertEquals("cat", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(0), inFields.get(1))));
    assertEquals(-100000, in.get(inner.getStructFieldData(list.get(1), inFields.get(0))));
    assertEquals("in", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(1), inFields.get(1))));
    assertEquals(1234, in.get(inner.getStructFieldData(list.get(2), inFields.get(0))));
    assertEquals("hat", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(2), inFields.get(1))));
    map = ma.getMap(readerInspector.getStructFieldData(row, fields.get(11)));
    assertEquals(2, map.size());
    // map iteration order is not relied on: each expected key must be seen exactly once
    boolean[] found = new boolean[2];
    for (Object key : map.keySet()) {
      String str = mk.getPrimitiveJavaObject(key);
      if (str.equals("chani")) {
        assertEquals(false, found[0]);
        assertEquals(5, in.get(inner.getStructFieldData(map.get(key), inFields.get(0))));
        assertEquals(str, st.getPrimitiveJavaObject(inner.getStructFieldData(map.get(key), inFields.get(1))));
        found[0] = true;
      } else if (str.equals("mauddib")) {
        assertEquals(false, found[1]);
        assertEquals(1, in.get(inner.getStructFieldData(map.get(key), inFields.get(0))));
        assertEquals(str, st.getPrimitiveJavaObject(inner.getStructFieldData(map.get(key), inFields.get(1))));
        found[1] = true;
      } else {
        throw new IllegalArgumentException("Unknown key " + str);
      }
    }
    assertEquals(true, found[0]);
    assertEquals(true, found[1]);
    // no rows may remain after the second one
    assertEquals(false, rows.hasNext());
  } finally {
    // close the record reader even when an assertion above fails
    rows.close();
  }
}
Use of org.apache.orc.DoubleColumnStatistics in the Apache Hive project.
Class TestOrcFile, method testReadFormat_0_11.
/**
 * Reads the classpath resource {@code orc-file-11-format.orc} and checks the stripe layout,
 * the file-level column statistics, the reader's object inspector type name, and the contents
 * of the first row and of row index 7499 (the last row, since no rows remain after it).
 *
 * @throws Exception if the resource cannot be located or read
 */
@Test
public void testReadFormat_0_11() throws Exception {
  Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
  Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs));

  // walk the stripes: each stripe must begin exactly where the previous one ended
  int stripeCount = 0;
  // long, so that accumulating per-stripe row counts never narrows silently
  long rowCount = 0;
  long currentOffset = -1;
  for (StripeInformation stripe : reader.getStripes()) {
    stripeCount += 1;
    rowCount += stripe.getNumberOfRows();
    if (currentOffset < 0) {
      // first stripe: remember where it ends
      currentOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
    } else {
      assertEquals(currentOffset, stripe.getOffset());
      currentOffset += stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
    }
  }
  assertEquals(reader.getNumberOfRows(), rowCount);
  assertEquals(2, stripeCount);

  // check the stats
  ColumnStatistics[] stats = reader.getStatistics();
  assertEquals(7500, stats[1].getNumberOfValues());
  assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
  assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
  assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
  assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
  assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
  assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
  assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
  assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000", stats[3].toString());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
  assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
  assertEquals("count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807", stats[5].toString());
  // doubles are compared with an explicit tolerance, like the sum below
  assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.00001);
  assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.00001);
  assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
  assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", stats[7].toString());
  assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());

  // check the inspectors
  StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector();
  assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory());
  assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint," + "int1:int,long1:bigint,float1:float,double1:double,bytes1:" + "binary,string1:string,middle:struct<list:array<struct<int1:int," + "string1:string>>>,list:array<struct<int1:int,string1:string>>," + "map:map<string,struct<int1:int,string1:string>>,ts:timestamp," + "decimal1:decimal(38,18)>", readerInspector.getTypeName());
  List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
  BooleanObjectInspector bo = (BooleanObjectInspector) readerInspector.getStructFieldRef("boolean1").getFieldObjectInspector();
  ByteObjectInspector by = (ByteObjectInspector) readerInspector.getStructFieldRef("byte1").getFieldObjectInspector();
  ShortObjectInspector sh = (ShortObjectInspector) readerInspector.getStructFieldRef("short1").getFieldObjectInspector();
  IntObjectInspector in = (IntObjectInspector) readerInspector.getStructFieldRef("int1").getFieldObjectInspector();
  LongObjectInspector lo = (LongObjectInspector) readerInspector.getStructFieldRef("long1").getFieldObjectInspector();
  FloatObjectInspector fl = (FloatObjectInspector) readerInspector.getStructFieldRef("float1").getFieldObjectInspector();
  DoubleObjectInspector dbl = (DoubleObjectInspector) readerInspector.getStructFieldRef("double1").getFieldObjectInspector();
  BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
  StringObjectInspector st = (StringObjectInspector) readerInspector.getStructFieldRef("string1").getFieldObjectInspector();
  StructObjectInspector mid = (StructObjectInspector) readerInspector.getStructFieldRef("middle").getFieldObjectInspector();
  List<? extends StructField> midFields = mid.getAllStructFieldRefs();
  ListObjectInspector midli = (ListObjectInspector) midFields.get(0).getFieldObjectInspector();
  StructObjectInspector inner = (StructObjectInspector) midli.getListElementObjectInspector();
  List<? extends StructField> inFields = inner.getAllStructFieldRefs();
  ListObjectInspector li = (ListObjectInspector) readerInspector.getStructFieldRef("list").getFieldObjectInspector();
  MapObjectInspector ma = (MapObjectInspector) readerInspector.getStructFieldRef("map").getFieldObjectInspector();
  TimestampObjectInspector tso = (TimestampObjectInspector) readerInspector.getStructFieldRef("ts").getFieldObjectInspector();
  HiveDecimalObjectInspector dco = (HiveDecimalObjectInspector) readerInspector.getStructFieldRef("decimal1").getFieldObjectInspector();
  StringObjectInspector mk = (StringObjectInspector) ma.getMapKeyObjectInspector();

  RecordReader rows = reader.rows();
  try {
    Object row = rows.next(null);
    assertNotNull(row);
    // check the contents of the first row
    assertEquals(false, bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
    assertEquals(1, by.get(readerInspector.getStructFieldData(row, fields.get(1))));
    assertEquals(1024, sh.get(readerInspector.getStructFieldData(row, fields.get(2))));
    assertEquals(65536, in.get(readerInspector.getStructFieldData(row, fields.get(3))));
    assertEquals(Long.MAX_VALUE, lo.get(readerInspector.getStructFieldData(row, fields.get(4))));
    assertEquals(1.0, fl.get(readerInspector.getStructFieldData(row, fields.get(5))), 0.00001);
    assertEquals(-15.0, dbl.get(readerInspector.getStructFieldData(row, fields.get(6))), 0.00001);
    assertEquals(bytes(0, 1, 2, 3, 4), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(7))));
    assertEquals("hi", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(8))));
    List<?> midRow = midli.getList(mid.getStructFieldData(readerInspector.getStructFieldData(row, fields.get(9)), midFields.get(0)));
    assertNotNull(midRow);
    assertEquals(2, midRow.size());
    assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0), inFields.get(0))));
    assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(0), inFields.get(1))));
    assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1), inFields.get(0))));
    assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(1), inFields.get(1))));
    List<?> list = li.getList(readerInspector.getStructFieldData(row, fields.get(10)));
    assertEquals(2, list.size());
    assertEquals(3, in.get(inner.getStructFieldData(list.get(0), inFields.get(0))));
    assertEquals("good", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(0), inFields.get(1))));
    assertEquals(4, in.get(inner.getStructFieldData(list.get(1), inFields.get(0))));
    assertEquals("bad", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(1), inFields.get(1))));
    Map<?, ?> map = ma.getMap(readerInspector.getStructFieldData(row, fields.get(11)));
    assertEquals(0, map.size());
    assertEquals(Timestamp.valueOf("2000-03-12 15:00:00"), tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(12))));
    assertEquals(HiveDecimal.create("12345678.6547456"), dco.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(13))));

    // jump to row index 7499 (the last row) and check its contents
    assertEquals(true, rows.hasNext());
    rows.seekToRow(7499);
    row = rows.next(null);
    assertEquals(true, bo.get(readerInspector.getStructFieldData(row, fields.get(0))));
    assertEquals(100, by.get(readerInspector.getStructFieldData(row, fields.get(1))));
    assertEquals(2048, sh.get(readerInspector.getStructFieldData(row, fields.get(2))));
    assertEquals(65536, in.get(readerInspector.getStructFieldData(row, fields.get(3))));
    assertEquals(Long.MAX_VALUE, lo.get(readerInspector.getStructFieldData(row, fields.get(4))));
    assertEquals(2.0, fl.get(readerInspector.getStructFieldData(row, fields.get(5))), 0.00001);
    assertEquals(-5.0, dbl.get(readerInspector.getStructFieldData(row, fields.get(6))), 0.00001);
    assertEquals(bytes(), bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(7))));
    assertEquals("bye", st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(8))));
    midRow = midli.getList(mid.getStructFieldData(readerInspector.getStructFieldData(row, fields.get(9)), midFields.get(0)));
    assertNotNull(midRow);
    assertEquals(2, midRow.size());
    assertEquals(1, in.get(inner.getStructFieldData(midRow.get(0), inFields.get(0))));
    assertEquals("bye", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(0), inFields.get(1))));
    assertEquals(2, in.get(inner.getStructFieldData(midRow.get(1), inFields.get(0))));
    assertEquals("sigh", st.getPrimitiveJavaObject(inner.getStructFieldData(midRow.get(1), inFields.get(1))));
    list = li.getList(readerInspector.getStructFieldData(row, fields.get(10)));
    assertEquals(3, list.size());
    assertEquals(100000000, in.get(inner.getStructFieldData(list.get(0), inFields.get(0))));
    assertEquals("cat", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(0), inFields.get(1))));
    assertEquals(-100000, in.get(inner.getStructFieldData(list.get(1), inFields.get(0))));
    assertEquals("in", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(1), inFields.get(1))));
    assertEquals(1234, in.get(inner.getStructFieldData(list.get(2), inFields.get(0))));
    assertEquals("hat", st.getPrimitiveJavaObject(inner.getStructFieldData(list.get(2), inFields.get(1))));
    map = ma.getMap(readerInspector.getStructFieldData(row, fields.get(11)));
    assertEquals(2, map.size());
    // map iteration order is not relied on: each expected key must be seen exactly once
    boolean[] found = new boolean[2];
    for (Object key : map.keySet()) {
      String str = mk.getPrimitiveJavaObject(key);
      if (str.equals("chani")) {
        assertEquals(false, found[0]);
        assertEquals(5, in.get(inner.getStructFieldData(map.get(key), inFields.get(0))));
        assertEquals(str, st.getPrimitiveJavaObject(inner.getStructFieldData(map.get(key), inFields.get(1))));
        found[0] = true;
      } else if (str.equals("mauddib")) {
        assertEquals(false, found[1]);
        assertEquals(1, in.get(inner.getStructFieldData(map.get(key), inFields.get(0))));
        assertEquals(str, st.getPrimitiveJavaObject(inner.getStructFieldData(map.get(key), inFields.get(1))));
        found[1] = true;
      } else {
        throw new IllegalArgumentException("Unknown key " + str);
      }
    }
    assertEquals(true, found[0]);
    assertEquals(true, found[1]);
    assertEquals(Timestamp.valueOf("2000-03-12 15:00:01"), tso.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(12))));
    assertEquals(HiveDecimal.create("12345678.6547457"), dco.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, fields.get(13))));
    // no rows may remain after row 7499
    assertEquals(false, rows.hasNext());
  } finally {
    // close the record reader even when an assertion above fails
    rows.close();
  }
}
Use of org.apache.orc.DoubleColumnStatistics in the Apache Hive project.
Class TestOrcSerDeStats, method testSerdeStatsOldFormat.
/**
 * Reads the classpath resource {@code orc-file-11-format.orc} and checks its stripe layout, raw
 * data size and file-level column statistics, then verifies that the statistics of the binary
 * column cannot be cast to {@code BinaryColumnStatistics}.
 *
 * <p>The expected {@link ClassCastException} is caught around that final cast only, so a cast
 * failure on any other column fails the test instead of satisfying it.
 *
 * @throws Exception if the resource cannot be located or read
 */
@Test
public void testSerdeStatsOldFormat() throws Exception {
  Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
  Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs));

  // walk the stripes: each stripe must begin exactly where the previous one ended
  int stripeCount = 0;
  // long, so that accumulating per-stripe row counts never narrows silently
  long rowCount = 0;
  long currentOffset = -1;
  for (StripeInformation stripe : reader.getStripes()) {
    stripeCount += 1;
    rowCount += stripe.getNumberOfRows();
    if (currentOffset < 0) {
      // first stripe: remember where it ends
      currentOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
    } else {
      assertEquals(currentOffset, stripe.getOffset());
      currentOffset += stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
    }
  }
  assertEquals(reader.getNumberOfRows(), rowCount);
  assertEquals(6300000, reader.getRawDataSize());
  assertEquals(2, stripeCount);

  // check the stats
  ColumnStatistics[] stats = reader.getStatistics();
  assertEquals(7500, stats[1].getNumberOfValues());
  assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
  assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
  assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
  assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
  assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
  assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
  assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
  assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000", stats[3].toString());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
  assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
  assertEquals("count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807", stats[5].toString());
  // doubles are compared with an explicit tolerance, like the sum below
  assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.00001);
  assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.00001);
  assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
  assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", stats[7].toString());
  assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum());
  assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum());
  assertEquals(0, ((StringColumnStatistics) stats[9]).getSum());
  assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
  // old orc format will not have binary statistics. toString() will show only
  // the general column statistics
  assertEquals("count: 7500 hasNull: true", stats[8].toString());

  // since old orc format doesn't support binary statistics,
  // this cast (and only this cast) should throw ClassCastException
  boolean castFailed = false;
  try {
    ((BinaryColumnStatistics) stats[8]).getSum();
  } catch (ClassCastException expected) {
    castFailed = true;
  }
  if (!castFailed) {
    throw new AssertionError("Expected ClassCastException when casting stats[8] to BinaryColumnStatistics");
  }
}
Use of org.apache.orc.DoubleColumnStatistics in the Apache Hive project.
Class TestOrcSerDeStats, method testOrcSerDeStatsComplex.
/**
 * Writes two {@code BigRow} records with the writer's default file version, then checks the row
 * count, the raw data size (whole file, single columns and column combinations) and the
 * file-level column statistics reported by a reader opened on the written file.
 *
 * @throws Exception if writing or reading the ORC file fails
 */
@Test
public void testOrcSerDeStatsComplex() throws Exception {
  ObjectInspector inspector;
  // NOTE(review): presumably the lock guards the shared reflection inspector factory against
  // concurrent test classes -- confirm.
  synchronized (TestOrcSerDeStats.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector(BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000));
  // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
  writer.addRow(new BigRow(false, (byte) 1, (short) 1024, 65536, Long.MAX_VALUE, (float) 1.0, -15.0, bytes(0, 1, 2, 3, 4), "hi", new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list(inner(3, "good"), inner(4, "bad")), map(), Timestamp.valueOf("2000-03-12 15:00:00"), HiveDecimal.create("12345678.6547456")));
  // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
  // 97
  writer.addRow(new BigRow(true, (byte) 100, (short) 2048, 65536, Long.MAX_VALUE, (float) 2.0, -5.0, bytes(), "bye", new MiddleStruct(inner(1, "bye"), inner(2, "sigh")), list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")), map(inner(5, "chani"), inner(1, "mauddib")), Timestamp.valueOf("2000-03-11 15:00:00"), HiveDecimal.create("12345678.6547452")));
  writer.close();

  // writer-side totals are queried after close()
  long rowCount = writer.getNumberOfRows();
  long rawDataSize = writer.getRawDataSize();
  assertEquals(2, rowCount);
  assertEquals(1740, rawDataSize);

  // reader-side totals must match what the writer reported
  Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
  assertEquals(2, reader.getNumberOfRows());
  assertEquals(1740, reader.getRawDataSize());

  // raw data size of individual columns
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
  assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
  assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
  assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
  assertEquals(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
  assertEquals(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
  assertEquals(455, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
  assertEquals(368, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
  assertEquals(364, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
  assertEquals(80, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
  assertEquals(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));

  // raw data size of column combinations; all columns together must equal the file total
  assertEquals(88, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts", "int1")));
  assertEquals(1195, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle", "list", "map", "float1")));
  assertEquals(185, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "byte1", "string1")));
  assertEquals(rawDataSize, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1", "byte1", "short1", "int1", "long1", "float1", "double1", "bytes1", "string1", "list", "map", "middle", "ts", "decimal1")));

  // check the stats
  ColumnStatistics[] stats = reader.getStatistics();
  assertEquals(2, stats[1].getNumberOfValues());
  assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount());
  assertEquals(1, ((BooleanColumnStatistics) stats[1]).getTrueCount());
  assertEquals("count: 2 hasNull: false true: 1", stats[1].toString());
  assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
  assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
  assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
  assertEquals(3072, ((IntegerColumnStatistics) stats[3]).getSum());
  assertEquals("count: 2 hasNull: false min: 1024 max: 2048 sum: 3072", stats[3].toString());
  // both rows hold Long.MAX_VALUE; the test expects the sum to be reported as undefined
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
  assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
  assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
  assertEquals("count: 2 hasNull: false min: 9223372036854775807 max: 9223372036854775807", stats[5].toString());
  // doubles are compared with an explicit tolerance, like the sum below
  assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum(), 0.00001);
  assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum(), 0.00001);
  assertEquals(-20.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
  assertEquals("count: 2 hasNull: false min: -15.0 max: -5.0 sum: -20.0", stats[7].toString());
  assertEquals("count: 2 hasNull: false min: bye max: hi sum: 5", stats[9].toString());
}
Aggregations