Search in sources :

Example 1 with DecimalColumnStatistics

use of org.apache.orc.DecimalColumnStatistics in project hive by apache.

the class TestOrcFile method testHiveDecimalIsNullReset.

@Test
public void testHiveDecimalIsNullReset() throws Exception {
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = ObjectInspectorFactory.getReflectionObjectInspector(DecimalStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(100000).bufferSize(10000));
    // orc creates 1000 batch size to make memory check align with 5000 instead of 5120
    for (int i = 0; i < 1000; i++) {
        writer.addRow(new DecimalStruct(null));
    }
    writer.addRow(new DecimalStruct(new HiveDecimalWritable("1.00")));
    writer.addRow(new DecimalStruct(new HiveDecimalWritable("2.00")));
    writer.addRow(new DecimalStruct(new HiveDecimalWritable("3.00")));
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector();
    List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
    HiveDecimalObjectInspector doi = (HiveDecimalObjectInspector) readerInspector.getStructFieldRef("dec").getFieldObjectInspector();
    RecordReader rows = reader.rows();
    int idx = 0;
    while (rows.hasNext()) {
        Object row = rows.next(null);
        if (idx < 1000) {
            assertEquals(null, doi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
        }
        if (idx == 1000) {
            assertEquals(new HiveDecimalWritable(1), doi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
        }
        if (idx == 1001) {
            assertEquals(new HiveDecimalWritable(2), doi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
        }
        if (idx == 10002) {
            assertEquals(new HiveDecimalWritable(3), doi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, fields.get(0))));
        }
        idx++;
    }
    // check the stats
    ColumnStatistics[] stats = reader.getStatistics();
    assertEquals(1003, stats[0].getNumberOfValues());
    assertEquals(3, stats[1].getNumberOfValues());
    assertEquals(HiveDecimal.create(3), ((DecimalColumnStatistics) stats[1]).getMaximum());
    assertEquals(HiveDecimal.create(1), ((DecimalColumnStatistics) stats[1]).getMinimum());
    assertEquals(HiveDecimal.create(6), ((DecimalColumnStatistics) stats[1]).getSum());
    assertEquals(true, stats[1].hasNull());
}
Also used : DecimalColumnStatistics(org.apache.orc.DecimalColumnStatistics) BooleanColumnStatistics(org.apache.orc.BooleanColumnStatistics) StringColumnStatistics(org.apache.orc.StringColumnStatistics) DoubleColumnStatistics(org.apache.orc.DoubleColumnStatistics) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) ColumnStatistics(org.apache.orc.ColumnStatistics) BinaryColumnStatistics(org.apache.orc.BinaryColumnStatistics) HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) BooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) ShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) FloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) ByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) TimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)

Example 2 with DecimalColumnStatistics

use of org.apache.orc.DecimalColumnStatistics in project hive by apache.

the class TestOrcFile method testUnionAndTimestamp.

/**
     * We test union, timestamp, and decimal separately since we need to make the
     * object inspector manually. (The Hive reflection-based doesn't handle
     * them properly.)
     */
@Test
public void testUnionAndTimestamp() throws Exception {
    List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT).addFieldNames("time").addFieldNames("union").addFieldNames("decimal").addSubtypes(1).addSubtypes(2).addSubtypes(5).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.TIMESTAMP).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.UNION).addSubtypes(3).addSubtypes(4).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.DECIMAL).build());
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = OrcStruct.createObjectInspector(0, types);
    }
    HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(1000).compress(CompressionKind.NONE).batchSize(1000).bufferSize(100).blockPadding(false));
    OrcStruct row = new OrcStruct(3);
    OrcUnion union = new OrcUnion();
    row.setFieldValue(1, union);
    row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("2000-03-12 15:00:00")));
    HiveDecimal value = HiveDecimal.create("12345678.6547456");
    row.setFieldValue(2, new HiveDecimalWritable(value));
    union.set((byte) 0, new IntWritable(42));
    writer.addRow(row);
    row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")));
    union.set((byte) 1, new Text("hello"));
    value = HiveDecimal.create("-5643.234");
    row.setFieldValue(2, new HiveDecimalWritable(value));
    writer.addRow(row);
    row.setFieldValue(0, null);
    row.setFieldValue(1, null);
    row.setFieldValue(2, null);
    writer.addRow(row);
    row.setFieldValue(1, union);
    union.set((byte) 0, null);
    writer.addRow(row);
    union.set((byte) 1, null);
    writer.addRow(row);
    union.set((byte) 0, new IntWritable(200000));
    row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("1970-01-01 00:00:00")));
    value = HiveDecimal.create("10000000000000000000");
    row.setFieldValue(2, new HiveDecimalWritable(value));
    writer.addRow(row);
    Random rand = new Random(42);
    for (int i = 1970; i < 2038; ++i) {
        row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf(i + "-05-05 12:34:56." + i)));
        if ((i & 1) == 0) {
            union.set((byte) 0, new IntWritable(i * i));
        } else {
            union.set((byte) 1, new Text(Integer.toString(i * i)));
        }
        value = HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
        row.setFieldValue(2, new HiveDecimalWritable(value));
        if (maxValue.compareTo(value) < 0) {
            maxValue = value;
        }
        writer.addRow(row);
    }
    // let's add a lot of constant rows to test the rle
    row.setFieldValue(0, null);
    union.set((byte) 0, new IntWritable(1732050807));
    row.setFieldValue(2, null);
    for (int i = 0; i < 5000; ++i) {
        writer.addRow(row);
    }
    union.set((byte) 0, new IntWritable(0));
    writer.addRow(row);
    union.set((byte) 0, new IntWritable(10));
    writer.addRow(row);
    union.set((byte) 0, new IntWritable(138));
    writer.addRow(row);
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    TypeDescription schema = writer.getSchema();
    assertEquals(5, schema.getMaximumId());
    boolean[] expected = new boolean[] { false, false, false, false, false, false };
    boolean[] included = OrcUtils.includeColumns("", schema);
    assertEquals(true, Arrays.equals(expected, included));
    expected = new boolean[] { false, true, false, false, false, true };
    included = OrcUtils.includeColumns("time,decimal", schema);
    assertEquals(true, Arrays.equals(expected, included));
    expected = new boolean[] { false, false, true, true, true, false };
    included = OrcUtils.includeColumns("union", schema);
    assertEquals(true, Arrays.equals(expected, included));
    assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
    assertEquals(5077, reader.getNumberOfRows());
    DecimalColumnStatistics stats = (DecimalColumnStatistics) reader.getStatistics()[5];
    assertEquals(71, stats.getNumberOfValues());
    assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
    assertEquals(maxValue, stats.getMaximum());
    // TODO: fix this
    //    assertEquals(null,stats.getSum());
    int stripeCount = 0;
    int rowCount = 0;
    long currentOffset = -1;
    for (StripeInformation stripe : reader.getStripes()) {
        stripeCount += 1;
        rowCount += stripe.getNumberOfRows();
        if (currentOffset < 0) {
            currentOffset = stripe.getOffset() + stripe.getLength();
        } else {
            assertEquals(currentOffset, stripe.getOffset());
            currentOffset += stripe.getLength();
        }
    }
    assertEquals(reader.getNumberOfRows(), rowCount);
    assertEquals(2, stripeCount);
    assertEquals(reader.getContentLength(), currentOffset);
    RecordReader rows = reader.rows();
    assertEquals(0, rows.getRowNumber());
    assertEquals(0.0, rows.getProgress(), 0.000001);
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(1, rows.getRowNumber());
    inspector = reader.getObjectInspector();
    assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>", inspector.getTypeName());
    assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-12 15:00:00")), row.getFieldValue(0));
    union = (OrcUnion) row.getFieldValue(1);
    assertEquals(0, union.getTag());
    assertEquals(new IntWritable(42), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")), row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(2, rows.getRowNumber());
    assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
    assertEquals(1, union.getTag());
    assertEquals(new Text("hello"), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(null, row.getFieldValue(0));
    assertEquals(null, row.getFieldValue(1));
    assertEquals(null, row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(null, row.getFieldValue(0));
    union = (OrcUnion) row.getFieldValue(1);
    assertEquals(0, union.getTag());
    assertEquals(null, union.getObject());
    assertEquals(null, row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(null, row.getFieldValue(0));
    assertEquals(1, union.getTag());
    assertEquals(null, union.getObject());
    assertEquals(null, row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(new TimestampWritable(Timestamp.valueOf("1970-01-01 00:00:00")), row.getFieldValue(0));
    assertEquals(new IntWritable(200000), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("10000000000000000000")), row.getFieldValue(2));
    rand = new Random(42);
    for (int i = 1970; i < 2038; ++i) {
        row = (OrcStruct) rows.next(row);
        assertEquals(new TimestampWritable(Timestamp.valueOf(i + "-05-05 12:34:56." + i)), row.getFieldValue(0));
        if ((i & 1) == 0) {
            assertEquals(0, union.getTag());
            assertEquals(new IntWritable(i * i), union.getObject());
        } else {
            assertEquals(1, union.getTag());
            assertEquals(new Text(Integer.toString(i * i)), union.getObject());
        }
        assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18))), row.getFieldValue(2));
    }
    for (int i = 0; i < 5000; ++i) {
        row = (OrcStruct) rows.next(row);
        assertEquals(new IntWritable(1732050807), union.getObject());
    }
    row = (OrcStruct) rows.next(row);
    assertEquals(new IntWritable(0), union.getObject());
    row = (OrcStruct) rows.next(row);
    assertEquals(new IntWritable(10), union.getObject());
    row = (OrcStruct) rows.next(row);
    assertEquals(new IntWritable(138), union.getObject());
    assertEquals(false, rows.hasNext());
    assertEquals(1.0, rows.getProgress(), 0.00001);
    assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
    rows.seekToRow(1);
    row = (OrcStruct) rows.next(row);
    assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
    assertEquals(1, union.getTag());
    assertEquals(new Text("hello"), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
    rows.close();
}
Also used : HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) BooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) ShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) FloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) ByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) TimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) OrcProto(org.apache.orc.OrcProto) DecimalColumnStatistics(org.apache.orc.DecimalColumnStatistics) ArrayList(java.util.ArrayList) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) Text(org.apache.hadoop.io.Text) Random(java.util.Random) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) BigInteger(java.math.BigInteger) TypeDescription(org.apache.orc.TypeDescription) IntWritable(org.apache.hadoop.io.IntWritable) StripeInformation(org.apache.orc.StripeInformation) Test(org.junit.Test)

Aggregations

HiveDecimalWritable (org.apache.hadoop.hive.serde2.io.HiveDecimalWritable)2 ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector)2 MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector)2 ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector)2 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)2 BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector)2 BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector)2 ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector)2 DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector)2 FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector)2 HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector)2 IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector)2 LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector)2 ShortObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector)2 StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector)2 TimestampObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector)2 DecimalColumnStatistics (org.apache.orc.DecimalColumnStatistics)2 Test (org.junit.Test)2 BigInteger (java.math.BigInteger)1 ArrayList (java.util.ArrayList)1