Search in sources :

Example 26 with TypeDescription

use of org.apache.orc.TypeDescription in project hive by apache.

the class TestOrcFile method testUnionAndTimestamp.

/**
 * We test union, timestamp, and decimal separately since we need to make the
 * object inspector manually. (The Hive reflection-based doesn't handle
 * them properly.)
 */
@Test
public void testUnionAndTimestamp() throws Exception {
    List<OrcProto.Type> types = new ArrayList<OrcProto.Type>();
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRUCT).addFieldNames("time").addFieldNames("union").addFieldNames("decimal").addSubtypes(1).addSubtypes(2).addSubtypes(5).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.TIMESTAMP).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.UNION).addSubtypes(3).addSubtypes(4).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.INT).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.STRING).build());
    types.add(OrcProto.Type.newBuilder().setKind(OrcProto.Type.Kind.DECIMAL).build());
    ObjectInspector inspector;
    synchronized (TestOrcFile.class) {
        inspector = OrcStruct.createObjectInspector(0, types);
    }
    HiveDecimal maxValue = HiveDecimal.create("10000000000000000000");
    Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf).inspector(inspector).stripeSize(1000).compress(CompressionKind.NONE).batchSize(1000).bufferSize(100).blockPadding(false));
    OrcStruct row = new OrcStruct(3);
    OrcUnion union = new OrcUnion();
    row.setFieldValue(1, union);
    row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("2000-03-12 15:00:00")));
    HiveDecimal value = HiveDecimal.create("12345678.6547456");
    row.setFieldValue(2, new HiveDecimalWritable(value));
    union.set((byte) 0, new IntWritable(42));
    writer.addRow(row);
    row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")));
    union.set((byte) 1, new Text("hello"));
    value = HiveDecimal.create("-5643.234");
    row.setFieldValue(2, new HiveDecimalWritable(value));
    writer.addRow(row);
    row.setFieldValue(0, null);
    row.setFieldValue(1, null);
    row.setFieldValue(2, null);
    writer.addRow(row);
    row.setFieldValue(1, union);
    union.set((byte) 0, null);
    writer.addRow(row);
    union.set((byte) 1, null);
    writer.addRow(row);
    union.set((byte) 0, new IntWritable(200000));
    row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf("1970-01-01 00:00:00")));
    value = HiveDecimal.create("10000000000000000000");
    row.setFieldValue(2, new HiveDecimalWritable(value));
    writer.addRow(row);
    Random rand = new Random(42);
    for (int i = 1970; i < 2038; ++i) {
        row.setFieldValue(0, new TimestampWritable(Timestamp.valueOf(i + "-05-05 12:34:56." + i)));
        if ((i & 1) == 0) {
            union.set((byte) 0, new IntWritable(i * i));
        } else {
            union.set((byte) 1, new Text(Integer.toString(i * i)));
        }
        value = HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18));
        row.setFieldValue(2, new HiveDecimalWritable(value));
        if (maxValue.compareTo(value) < 0) {
            maxValue = value;
        }
        writer.addRow(row);
    }
    // let's add a lot of constant rows to test the rle
    row.setFieldValue(0, null);
    union.set((byte) 0, new IntWritable(1732050807));
    row.setFieldValue(2, null);
    for (int i = 0; i < 5000; ++i) {
        writer.addRow(row);
    }
    union.set((byte) 0, new IntWritable(0));
    writer.addRow(row);
    union.set((byte) 0, new IntWritable(10));
    writer.addRow(row);
    union.set((byte) 0, new IntWritable(138));
    writer.addRow(row);
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    TypeDescription schema = writer.getSchema();
    assertEquals(5, schema.getMaximumId());
    boolean[] expected = new boolean[] { false, false, false, false, false, false };
    boolean[] included = OrcUtils.includeColumns("", schema);
    assertEquals(true, Arrays.equals(expected, included));
    expected = new boolean[] { false, true, false, false, false, true };
    included = OrcUtils.includeColumns("time,decimal", schema);
    assertEquals(true, Arrays.equals(expected, included));
    expected = new boolean[] { false, false, true, true, true, false };
    included = OrcUtils.includeColumns("union", schema);
    assertEquals(true, Arrays.equals(expected, included));
    assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
    assertEquals(5077, reader.getNumberOfRows());
    DecimalColumnStatistics stats = (DecimalColumnStatistics) reader.getStatistics()[5];
    assertEquals(71, stats.getNumberOfValues());
    assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum());
    assertEquals(maxValue, stats.getMaximum());
    // TODO: fix this
    // assertEquals(null,stats.getSum());
    int stripeCount = 0;
    int rowCount = 0;
    long currentOffset = -1;
    for (StripeInformation stripe : reader.getStripes()) {
        stripeCount += 1;
        rowCount += stripe.getNumberOfRows();
        if (currentOffset < 0) {
            currentOffset = stripe.getOffset() + stripe.getLength();
        } else {
            assertEquals(currentOffset, stripe.getOffset());
            currentOffset += stripe.getLength();
        }
    }
    assertEquals(reader.getNumberOfRows(), rowCount);
    assertEquals(2, stripeCount);
    assertEquals(reader.getContentLength(), currentOffset);
    RecordReader rows = reader.rows();
    assertEquals(0, rows.getRowNumber());
    assertEquals(0.0, rows.getProgress(), 0.000001);
    assertEquals(true, rows.hasNext());
    row = (OrcStruct) rows.next(null);
    assertEquals(1, rows.getRowNumber());
    inspector = reader.getObjectInspector();
    assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>", inspector.getTypeName());
    assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-12 15:00:00")), row.getFieldValue(0));
    union = (OrcUnion) row.getFieldValue(1);
    assertEquals(0, union.getTag());
    assertEquals(new IntWritable(42), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("12345678.6547456")), row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(2, rows.getRowNumber());
    assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
    assertEquals(1, union.getTag());
    assertEquals(new Text("hello"), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(null, row.getFieldValue(0));
    assertEquals(null, row.getFieldValue(1));
    assertEquals(null, row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(null, row.getFieldValue(0));
    union = (OrcUnion) row.getFieldValue(1);
    assertEquals(0, union.getTag());
    assertEquals(null, union.getObject());
    assertEquals(null, row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(null, row.getFieldValue(0));
    assertEquals(1, union.getTag());
    assertEquals(null, union.getObject());
    assertEquals(null, row.getFieldValue(2));
    row = (OrcStruct) rows.next(row);
    assertEquals(new TimestampWritable(Timestamp.valueOf("1970-01-01 00:00:00")), row.getFieldValue(0));
    assertEquals(new IntWritable(200000), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("10000000000000000000")), row.getFieldValue(2));
    rand = new Random(42);
    for (int i = 1970; i < 2038; ++i) {
        row = (OrcStruct) rows.next(row);
        assertEquals(new TimestampWritable(Timestamp.valueOf(i + "-05-05 12:34:56." + i)), row.getFieldValue(0));
        if ((i & 1) == 0) {
            assertEquals(0, union.getTag());
            assertEquals(new IntWritable(i * i), union.getObject());
        } else {
            assertEquals(1, union.getTag());
            assertEquals(new Text(Integer.toString(i * i)), union.getObject());
        }
        assertEquals(new HiveDecimalWritable(HiveDecimal.create(new BigInteger(64, rand), rand.nextInt(18))), row.getFieldValue(2));
    }
    for (int i = 0; i < 5000; ++i) {
        row = (OrcStruct) rows.next(row);
        assertEquals(new IntWritable(1732050807), union.getObject());
    }
    row = (OrcStruct) rows.next(row);
    assertEquals(new IntWritable(0), union.getObject());
    row = (OrcStruct) rows.next(row);
    assertEquals(new IntWritable(10), union.getObject());
    row = (OrcStruct) rows.next(row);
    assertEquals(new IntWritable(138), union.getObject());
    assertEquals(false, rows.hasNext());
    assertEquals(1.0, rows.getProgress(), 0.00001);
    assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
    rows.seekToRow(1);
    row = (OrcStruct) rows.next(row);
    assertEquals(new TimestampWritable(Timestamp.valueOf("2000-03-20 12:00:00.123456789")), row.getFieldValue(0));
    assertEquals(1, union.getTag());
    assertEquals(new Text("hello"), union.getObject());
    assertEquals(new HiveDecimalWritable(HiveDecimal.create("-5643.234")), row.getFieldValue(2));
    rows.close();
}
Also used : HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) BooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) ShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) FloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) ByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) TimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector) HiveDecimalWritable(org.apache.hadoop.hive.serde2.io.HiveDecimalWritable) OrcProto(org.apache.orc.OrcProto) DecimalColumnStatistics(org.apache.orc.DecimalColumnStatistics) ArrayList(java.util.ArrayList) TimestampWritable(org.apache.hadoop.hive.serde2.io.TimestampWritable) Text(org.apache.hadoop.io.Text) Random(java.util.Random) HiveDecimal(org.apache.hadoop.hive.common.type.HiveDecimal) BigInteger(java.math.BigInteger) TypeDescription(org.apache.orc.TypeDescription) IntWritable(org.apache.hadoop.io.IntWritable) StripeInformation(org.apache.orc.StripeInformation) Test(org.junit.Test)

Aggregations

TypeDescription (org.apache.orc.TypeDescription)26 Test (org.junit.Test)7 ArrayList (java.util.ArrayList)6 BytesColumnVector (org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector)6 LongColumnVector (org.apache.hadoop.hive.ql.exec.vector.LongColumnVector)6 VectorizedRowBatch (org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch)6 StructColumnVector (org.apache.hadoop.hive.ql.exec.vector.StructColumnVector)5 ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector)4 MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector)4 ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector)4 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)4 BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector)4 BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector)4 ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector)4 DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector)4 FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector)4 Configuration (org.apache.hadoop.conf.Configuration)3 Path (org.apache.hadoop.fs.Path)3 ColumnVector (org.apache.hadoop.hive.ql.exec.vector.ColumnVector)3 ListColumnVector (org.apache.hadoop.hive.ql.exec.vector.ListColumnVector)3