Search in sources :

Example 1 with StringColumnStatistics

use of org.apache.orc.StringColumnStatistics in project hive by apache.

the class TestOrcSerDeStats method testOrcSerDeStatsComplexOldFormat.

/**
 * Writes two {@code BigRow} records using ORC file version {@code V_0_11}, then checks the
 * row count, the raw data sizes reported by the writer and by the reader (whole file, single
 * columns and column groups), and the statistics of selected columns.
 */
@Test
public void testOrcSerDeStatsComplexOldFormat() throws Exception {
    ObjectInspector rowInspector;
    synchronized (TestOrcSerDeStats.class) {
        rowInspector = ObjectInspectorFactory.getReflectionObjectInspector(
            BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf)
            .inspector(rowInspector)
            .stripeSize(100000)
            .version(OrcFile.Version.V_0_11)
            .bufferSize(10000));

    // 1 + 2 + 4 + 8 + 4 + 8 + 5 + 2 + 4 + 3 + 4 + 4 + 4 + 4 + 4 + 3 = 64
    writer.addRow(new BigRow(
        false, (byte) 1, (short) 1024, 65536, Long.MAX_VALUE, (float) 1.0, -15.0,
        bytes(0, 1, 2, 3, 4),
        "hi",
        new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
        list(inner(3, "good"), inner(4, "bad")),
        map(),
        Timestamp.valueOf("2000-03-12 15:00:00"),
        HiveDecimal.create("12345678.6547456")));
    // 1 + 2 + 4 + 8 + 4 + 8 + 3 + 4 + 3 + 4 + 4 + 4 + 3 + 4 + 2 + 4 + 3 + 5 + 4 + 5 + 7 + 4 + 7 =
    // 97
    writer.addRow(new BigRow(
        true, (byte) 100, (short) 2048, 65536, Long.MAX_VALUE, (float) 2.0, -5.0,
        bytes(),
        "bye",
        new MiddleStruct(inner(1, "bye"), inner(2, "sigh")),
        list(inner(100000000, "cat"), inner(-100000, "in"), inner(1234, "hat")),
        map(inner(5, "chani"), inner(1, "mauddib")),
        Timestamp.valueOf("2000-03-11 15:00:00"),
        HiveDecimal.create("12345678.6547452")));
    writer.close();

    // totals as seen by the writer after close
    long writtenRows = writer.getNumberOfRows();
    long writerRawSize = writer.getRawDataSize();
    assertEquals(2, writtenRows);
    assertEquals(1668, writerRawSize);

    // totals as seen by a reader of the same file
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    assertEquals(2, reader.getNumberOfRows());
    assertEquals(1760, reader.getRawDataSize());

    // raw data size of each column on its own
    assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("boolean1")));
    assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("byte1")));
    assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("short1")));
    assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("int1")));
    assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("long1")));
    assertEquals(8, reader.getRawDataSizeOfColumns(Lists.newArrayList("float1")));
    assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("double1")));
    assertEquals(5, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    assertEquals(172, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    assertEquals(483, reader.getRawDataSizeOfColumns(Lists.newArrayList("list")));
    assertEquals(384, reader.getRawDataSizeOfColumns(Lists.newArrayList("map")));
    assertEquals(396, reader.getRawDataSizeOfColumns(Lists.newArrayList("middle")));
    assertEquals(16, reader.getRawDataSizeOfColumns(Lists.newArrayList("ts")));
    assertEquals(224, reader.getRawDataSizeOfColumns(Lists.newArrayList("decimal1")));

    // raw data size of column groups
    assertEquals(24, reader.getRawDataSizeOfColumns(
        Lists.newArrayList("ts", "int1")));
    assertEquals(1271, reader.getRawDataSizeOfColumns(
        Lists.newArrayList("middle", "list", "map", "float1")));
    assertEquals(185, reader.getRawDataSizeOfColumns(
        Lists.newArrayList("bytes1", "byte1", "string1")));
    assertEquals(1752, reader.getRawDataSizeOfColumns(
        Lists.newArrayList("boolean1", "byte1", "short1", "int1", "long1", "float1", "double1",
            "bytes1", "string1", "list", "map", "middle", "ts", "decimal1")));

    // check the stats; each cast is performed right where the original first needed it
    ColumnStatistics[] columnStats = reader.getStatistics();

    assertEquals(2, columnStats[1].getNumberOfValues());
    BooleanColumnStatistics booleanStats = (BooleanColumnStatistics) columnStats[1];
    assertEquals(1, booleanStats.getFalseCount());
    assertEquals(1, booleanStats.getTrueCount());
    assertEquals("count: 2 hasNull: false bytesOnDisk: 5 true: 1", booleanStats.toString());

    IntegerColumnStatistics shortStats = (IntegerColumnStatistics) columnStats[3];
    assertEquals(2048, shortStats.getMaximum());
    assertEquals(1024, shortStats.getMinimum());
    assertEquals(true, shortStats.isSumDefined());
    assertEquals(3072, shortStats.getSum());
    assertEquals("count: 2 hasNull: false bytesOnDisk: 8 min: 1024 max: 2048 sum: 3072", shortStats.toString());

    // both rows hold Long.MAX_VALUE in this column and the test expects the sum to be undefined
    IntegerColumnStatistics longStats = (IntegerColumnStatistics) columnStats[5];
    assertEquals(Long.MAX_VALUE, longStats.getMaximum());
    assertEquals(Long.MAX_VALUE, longStats.getMinimum());
    assertEquals(false, longStats.isSumDefined());
    assertEquals("count: 2 hasNull: false bytesOnDisk: 12 min: 9223372036854775807 max: 9223372036854775807", longStats.toString());

    DoubleColumnStatistics doubleStats = (DoubleColumnStatistics) columnStats[7];
    assertEquals(-15.0, doubleStats.getMinimum());
    assertEquals(-5.0, doubleStats.getMaximum());
    assertEquals(-20.0, doubleStats.getSum(), 0.00001);
    assertEquals("count: 2 hasNull: false bytesOnDisk: 15 min: -15.0 max: -5.0 sum: -20.0", doubleStats.toString());

    BinaryColumnStatistics binaryStats = (BinaryColumnStatistics) columnStats[8];
    assertEquals(5, binaryStats.getSum());
    assertEquals("count: 2 hasNull: false bytesOnDisk: 14 sum: 5", binaryStats.toString());

    StringColumnStatistics stringStats = (StringColumnStatistics) columnStats[9];
    assertEquals("bye", stringStats.getMinimum());
    assertEquals("hi", stringStats.getMaximum());
    assertEquals(5, stringStats.getSum());
    assertEquals("count: 2 hasNull: false bytesOnDisk: 20 min: bye max: hi sum: 5", stringStats.toString());

    reader.close();
}
Also used : DoubleColumnStatistics(org.apache.orc.DoubleColumnStatistics) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) BooleanColumnStatistics(org.apache.orc.BooleanColumnStatistics) StringColumnStatistics(org.apache.orc.StringColumnStatistics) ColumnStatistics(org.apache.orc.ColumnStatistics) BinaryColumnStatistics(org.apache.orc.BinaryColumnStatistics) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) Test(org.junit.Test)

Example 2 with StringColumnStatistics

use of org.apache.orc.StringColumnStatistics in project hive by apache.

the class TestOrcFile method columnProjection.

/**
 * Writes 21000 rows of seeded pseudo-random (int, hex string) pairs, then verifies the file
 * statistics and type list, and finally reads the data back through two record readers that
 * are opened with different {@code boolean[]} column masks.
 */
@Test
public void columnProjection() throws Exception {
    ObjectInspector rowInspector;
    synchronized (TestOrcFile.class) {
        rowInspector = ObjectInspectorFactory.getReflectionObjectInspector(
            InnerStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf)
            .inspector(rowInspector)
            .stripeSize(1000)
            .compress(CompressionKind.NONE)
            .bufferSize(100)
            .rowIndexStride(1000));

    // fixed seeds so the exact same sequence can be regenerated when reading back
    Random intSource = new Random(1);
    Random textSource = new Random(2);
    int minInt = 0;
    int maxInt = 0;
    String minStr = null;
    String maxStr = null;
    for (int rowIdx = 0; rowIdx < 21000; ++rowIdx) {
        int number = intSource.nextInt();
        String text = Long.toHexString(textSource.nextLong());
        if (rowIdx == 0) {
            // the first row seeds every running extreme
            minInt = number;
            maxInt = number;
            minStr = text;
            maxStr = text;
        } else {
            minInt = Math.min(minInt, number);
            maxInt = Math.max(maxInt, number);
            if (text.compareTo(minStr) < 0) {
                minStr = text;
            }
            if (text.compareTo(maxStr) > 0) {
                maxStr = text;
            }
        }
        writer.addRow(inner(number, text));
    }
    writer.close();

    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));

    // check out the statistics: one entry per type id, each covering all rows
    ColumnStatistics[] columnStats = reader.getStatistics();
    assertEquals(3, columnStats.length);
    for (ColumnStatistics entry : columnStats) {
        assertEquals(21000, entry.getNumberOfValues());
        if (entry instanceof IntegerColumnStatistics) {
            IntegerColumnStatistics intStats = (IntegerColumnStatistics) entry;
            assertEquals(minInt, intStats.getMinimum());
            assertEquals(maxInt, intStats.getMaximum());
        } else if (entry instanceof StringColumnStatistics) {
            StringColumnStatistics textStats = (StringColumnStatistics) entry;
            assertEquals(maxStr, textStats.getMaximum());
            assertEquals(minStr, textStats.getMinimum());
        }
    }

    // check out the types: a struct root with an int child and a string child
    List<OrcProto.Type> types = reader.getTypes();
    assertEquals(3, types.size());
    OrcProto.Type rootType = types.get(0);
    assertEquals(OrcProto.Type.Kind.STRUCT, rootType.getKind());
    assertEquals(2, rootType.getSubtypesCount());
    assertEquals(1, rootType.getSubtypes(0));
    assertEquals(2, rootType.getSubtypes(1));
    assertEquals(OrcProto.Type.Kind.INT, types.get(1).getKind());
    assertEquals(0, types.get(1).getSubtypesCount());
    assertEquals(OrcProto.Type.Kind.STRING, types.get(2).getKind());
    assertEquals(0, types.get(2).getSubtypesCount());

    // read the contents and make sure they match; only field 0 is inspected on the first
    // reader's rows and only field 1 on the second reader's rows
    RecordReader intRows = reader.rows(new boolean[] { true, true, false });
    RecordReader textRows = reader.rows(new boolean[] { true, false, true });
    Random expectedInts = new Random(1);
    Random expectedTexts = new Random(2);
    OrcStruct intRow = null;
    OrcStruct textRow = null;
    for (int rowIdx = 0; rowIdx < 21000; ++rowIdx) {
        assertEquals(true, intRows.hasNext());
        assertEquals(true, textRows.hasNext());
        intRow = (OrcStruct) intRows.next(intRow);
        textRow = (OrcStruct) textRows.next(textRow);
        assertEquals(expectedInts.nextInt(), ((IntWritable) intRow.getFieldValue(0)).get());
        assertEquals(Long.toHexString(expectedTexts.nextLong()), textRow.getFieldValue(1).toString());
    }
    assertEquals(false, intRows.hasNext());
    assertEquals(false, textRows.hasNext());
    intRows.close();
    textRows.close();
}
Also used : DecimalColumnStatistics(org.apache.orc.DecimalColumnStatistics) BooleanColumnStatistics(org.apache.orc.BooleanColumnStatistics) StringColumnStatistics(org.apache.orc.StringColumnStatistics) DoubleColumnStatistics(org.apache.orc.DoubleColumnStatistics) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) ColumnStatistics(org.apache.orc.ColumnStatistics) BinaryColumnStatistics(org.apache.orc.BinaryColumnStatistics) HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) BooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) ShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) FloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) ByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) TimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) Random(java.util.Random) 
StringColumnStatistics(org.apache.orc.StringColumnStatistics) Test(org.junit.Test)

Example 3 with StringColumnStatistics

use of org.apache.orc.StringColumnStatistics in project hive by apache.

the class TestOrcFile method testStringAndBinaryStatistics.

/**
 * Writes four {@code SimpleStruct} rows (one with a null string, one with null bytes), then
 * checks {@code OrcUtils.includeColumns} against the writer schema, the binary and string
 * column statistics, the reader's object inspector, and the content of every row.
 */
@Test
public void testStringAndBinaryStatistics() throws Exception {
    ObjectInspector rowInspector;
    synchronized (TestOrcFile.class) {
        rowInspector = ObjectInspectorFactory.getReflectionObjectInspector(
            SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf)
            .inspector(rowInspector)
            .stripeSize(100000)
            .bufferSize(10000));
    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
    writer.addRow(new SimpleStruct(null, "hi"));
    writer.close();
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));

    // column inclusion masks computed from the writer schema
    TypeDescription schema = writer.getSchema();
    assertEquals(2, schema.getMaximumId());
    assertEquals(true, Arrays.equals(
        new boolean[] { false, false, true }, OrcUtils.includeColumns("string1", schema)));
    assertEquals(true, Arrays.equals(
        new boolean[] { false, false, false }, OrcUtils.includeColumns("", schema)));
    assertEquals(true, Arrays.equals(
        new boolean[] { false, false, false }, OrcUtils.includeColumns(null, schema)));

    // check the stats
    ColumnStatistics[] columnStats = reader.getStatistics();
    assertEquals(4, columnStats[0].getNumberOfValues());
    assertEquals("count: 4 hasNull: false", columnStats[0].toString());

    assertEquals(3, columnStats[1].getNumberOfValues());
    BinaryColumnStatistics binaryStats = (BinaryColumnStatistics) columnStats[1];
    assertEquals(15, binaryStats.getSum());
    assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", binaryStats.toString());

    assertEquals(3, columnStats[2].getNumberOfValues());
    StringColumnStatistics stringStats = (StringColumnStatistics) columnStats[2];
    assertEquals("bar", stringStats.getMinimum());
    assertEquals("hi", stringStats.getMaximum());
    assertEquals(8, stringStats.getSum());
    assertEquals("count: 3 hasNull: true bytesOnDisk: 22 min: bar max: hi sum: 8", stringStats.toString());

    // check the inspectors
    StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector();
    assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory());
    assertEquals("struct<bytes1:binary,string1:string>", readerInspector.getTypeName());
    List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
    BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
    StringObjectInspector st = (StringObjectInspector) readerInspector.getStructFieldRef("string1").getFieldObjectInspector();
    StructField bytesField = fields.get(0);
    StructField stringField = fields.get(1);

    RecordReader rows = reader.rows();
    Object row = rows.next(null);
    assertNotNull(row);
    // first row: five bytes and "foo"
    assertEquals(bytes(0, 1, 2, 3, 4),
        bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertEquals("foo",
        st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // second row: four bytes and "bar"
    assertEquals(true, rows.hasNext());
    row = rows.next(row);
    assertEquals(bytes(0, 1, 2, 3),
        bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertEquals("bar",
        st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // third row: six bytes and a null string
    assertEquals(true, rows.hasNext());
    row = rows.next(row);
    assertEquals(bytes(0, 1, 2, 3, 4, 5),
        bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertNull(st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // fourth row: null bytes and "hi"
    assertEquals(true, rows.hasNext());
    row = rows.next(row);
    assertNull(bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertEquals("hi",
        st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // no further rows; release the record reader
    assertEquals(false, rows.hasNext());
    rows.close();
}
Also used : DecimalColumnStatistics(org.apache.orc.DecimalColumnStatistics) BooleanColumnStatistics(org.apache.orc.BooleanColumnStatistics) StringColumnStatistics(org.apache.orc.StringColumnStatistics) DoubleColumnStatistics(org.apache.orc.DoubleColumnStatistics) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) ColumnStatistics(org.apache.orc.ColumnStatistics) BinaryColumnStatistics(org.apache.orc.BinaryColumnStatistics) HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) BooleanObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector) ShortObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ShortObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) MapObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) FloatObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) ListObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector) IntObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) ByteObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) TimestampObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.TimestampObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) 
StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) TypeDescription(org.apache.orc.TypeDescription) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)

Example 4 with StringColumnStatistics

use of org.apache.orc.StringColumnStatistics in project hive by apache.

the class TestOrcSerDeStats method testStringAndBinaryStatistics.

/**
 * Writes four {@code SimpleStruct} rows (one with a null string, one with null bytes), then
 * checks the row count and raw data sizes reported by the writer and the reader, the binary
 * and string column statistics, the reader's object inspector, and the content of every row.
 */
@Test
public void testStringAndBinaryStatistics() throws Exception {
    ObjectInspector rowInspector;
    synchronized (TestOrcSerDeStats.class) {
        rowInspector = ObjectInspectorFactory.getReflectionObjectInspector(
            SimpleStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
    }
    Writer writer = OrcFile.createWriter(testFilePath,
        OrcFile.writerOptions(conf)
            .inspector(rowInspector)
            .stripeSize(100000)
            .bufferSize(10000));
    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4), "foo"));
    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3), "bar"));
    writer.addRow(new SimpleStruct(bytes(0, 1, 2, 3, 4, 5), null));
    writer.addRow(new SimpleStruct(null, "hi"));
    writer.close();

    // totals as seen by the writer after close
    assertEquals(4, writer.getNumberOfRows());
    assertEquals(273, writer.getRawDataSize());

    // totals and per-column raw sizes as seen by a reader of the same file
    Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    assertEquals(4, reader.getNumberOfRows());
    assertEquals(289, reader.getRawDataSize());
    assertEquals(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1")));
    assertEquals(258, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1")));
    assertEquals(273, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1")));

    // check the stats
    ColumnStatistics[] columnStats = reader.getStatistics();
    assertEquals(4, columnStats[0].getNumberOfValues());
    assertEquals("count: 4 hasNull: false", columnStats[0].toString());

    assertEquals(3, columnStats[1].getNumberOfValues());
    BinaryColumnStatistics binaryStats = (BinaryColumnStatistics) columnStats[1];
    assertEquals(15, binaryStats.getSum());
    assertEquals("count: 3 hasNull: true bytesOnDisk: 28 sum: 15", binaryStats.toString());

    assertEquals(3, columnStats[2].getNumberOfValues());
    StringColumnStatistics stringStats = (StringColumnStatistics) columnStats[2];
    assertEquals("bar", stringStats.getMinimum());
    assertEquals("hi", stringStats.getMaximum());
    assertEquals(8, stringStats.getSum());
    assertEquals("count: 3 hasNull: true bytesOnDisk: 22 min: bar max: hi sum: 8", stringStats.toString());

    // check the inspectors
    StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector();
    assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory());
    assertEquals("struct<bytes1:binary,string1:string>", readerInspector.getTypeName());
    List<? extends StructField> fields = readerInspector.getAllStructFieldRefs();
    BinaryObjectInspector bi = (BinaryObjectInspector) readerInspector.getStructFieldRef("bytes1").getFieldObjectInspector();
    StringObjectInspector st = (StringObjectInspector) readerInspector.getStructFieldRef("string1").getFieldObjectInspector();
    StructField bytesField = fields.get(0);
    StructField stringField = fields.get(1);

    RecordReader rows = reader.rows();
    Object row = rows.next(null);
    assertNotNull(row);
    // first row: five bytes and "foo"
    assertEquals(bytes(0, 1, 2, 3, 4),
        bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertEquals("foo",
        st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // second row: four bytes and "bar"
    assertEquals(true, rows.hasNext());
    row = rows.next(row);
    assertEquals(bytes(0, 1, 2, 3),
        bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertEquals("bar",
        st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // third row: six bytes and a null string
    assertEquals(true, rows.hasNext());
    row = rows.next(row);
    assertEquals(bytes(0, 1, 2, 3, 4, 5),
        bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertNull(st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // fourth row: null bytes and "hi"
    assertEquals(true, rows.hasNext());
    row = rows.next(row);
    assertNull(bi.getPrimitiveWritableObject(readerInspector.getStructFieldData(row, bytesField)));
    assertEquals("hi",
        st.getPrimitiveJavaObject(readerInspector.getStructFieldData(row, stringField)));

    // no further rows; release the record reader
    assertEquals(false, rows.hasNext());
    rows.close();
}
Also used : DoubleColumnStatistics(org.apache.orc.DoubleColumnStatistics) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) BooleanColumnStatistics(org.apache.orc.BooleanColumnStatistics) StringColumnStatistics(org.apache.orc.StringColumnStatistics) ColumnStatistics(org.apache.orc.ColumnStatistics) BinaryColumnStatistics(org.apache.orc.BinaryColumnStatistics) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) BinaryObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) Test(org.junit.Test)

Example 5 with StringColumnStatistics

use of org.apache.orc.StringColumnStatistics in project hive by apache.

the class TestOrcSerDeStats method testSerdeStatsOldFormat.

/**
 * Reads the bundled {@code orc-file-11-format.orc} file and verifies its stripe layout, row
 * count, raw data size and column statistics, including that the statistics object of the
 * binary column (index 8) cannot be cast to {@link BinaryColumnStatistics}.
 *
 * <p>The expected {@link ClassCastException} is caught only around that one cast instead of
 * being declared via {@code @Test(expected = ...)}. With the annotation form, a
 * ClassCastException raised by any earlier cast in this method would also have made the test
 * pass, and the reader was never closed because the exception skipped the trailing
 * {@code reader.close()}.
 */
@Test
public void testSerdeStatsOldFormat() throws Exception {
    Path oldFilePath = new Path(HiveTestUtils.getFileFromClasspath("orc-file-11-format.orc"));
    Reader reader = OrcFile.createReader(oldFilePath, OrcFile.readerOptions(conf).filesystem(fs));
    try {
        int stripeCount = 0;
        int rowCount = 0;
        long currentOffset = -1;
        // each stripe after the first must start exactly where the previous one ended
        for (StripeInformation stripe : reader.getStripes()) {
            stripeCount += 1;
            rowCount += stripe.getNumberOfRows();
            if (currentOffset < 0) {
                currentOffset = stripe.getOffset() + stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
            } else {
                assertEquals(currentOffset, stripe.getOffset());
                currentOffset += stripe.getIndexLength() + stripe.getDataLength() + stripe.getFooterLength();
            }
        }
        assertEquals(reader.getNumberOfRows(), rowCount);
        assertEquals(6615000, reader.getRawDataSize());
        assertEquals(2, stripeCount);
        // check the stats
        ColumnStatistics[] stats = reader.getStatistics();
        assertEquals(7500, stats[1].getNumberOfValues());
        assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount());
        assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getTrueCount());
        assertEquals("count: 7500 hasNull: true true: 3750", stats[1].toString());
        assertEquals(2048, ((IntegerColumnStatistics) stats[3]).getMaximum());
        assertEquals(1024, ((IntegerColumnStatistics) stats[3]).getMinimum());
        assertEquals(true, ((IntegerColumnStatistics) stats[3]).isSumDefined());
        assertEquals(11520000, ((IntegerColumnStatistics) stats[3]).getSum());
        assertEquals("count: 7500 hasNull: true min: 1024 max: 2048 sum: 11520000", stats[3].toString());
        assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMaximum());
        assertEquals(Long.MAX_VALUE, ((IntegerColumnStatistics) stats[5]).getMinimum());
        assertEquals(false, ((IntegerColumnStatistics) stats[5]).isSumDefined());
        assertEquals("count: 7500 hasNull: true min: 9223372036854775807 max: 9223372036854775807", stats[5].toString());
        assertEquals(-15.0, ((DoubleColumnStatistics) stats[7]).getMinimum());
        assertEquals(-5.0, ((DoubleColumnStatistics) stats[7]).getMaximum());
        assertEquals(-75000.0, ((DoubleColumnStatistics) stats[7]).getSum(), 0.00001);
        assertEquals("count: 7500 hasNull: true min: -15.0 max: -5.0 sum: -75000.0", stats[7].toString());
        assertEquals("bye", ((StringColumnStatistics) stats[9]).getMinimum());
        assertEquals("hi", ((StringColumnStatistics) stats[9]).getMaximum());
        assertEquals(0, ((StringColumnStatistics) stats[9]).getSum());
        assertEquals("count: 7500 hasNull: true min: bye max: hi sum: 0", stats[9].toString());
        // old orc format will not have binary statistics. toString() will show only
        // the general column statistics
        assertEquals("count: 7500 hasNull: true", stats[8].toString());
        // since old orc format doesn't support binary statistics,
        // this cast should throw ClassCastException
        boolean castRejected = false;
        try {
            ((BinaryColumnStatistics) stats[8]).getSum();
        } catch (ClassCastException expected) {
            castRejected = true;
        }
        assertEquals(true, castRejected);
    } finally {
        // always release the reader, whether the checks above passed or failed
        reader.close();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) DoubleColumnStatistics(org.apache.orc.DoubleColumnStatistics) IntegerColumnStatistics(org.apache.orc.IntegerColumnStatistics) BooleanColumnStatistics(org.apache.orc.BooleanColumnStatistics) StringColumnStatistics(org.apache.orc.StringColumnStatistics) ColumnStatistics(org.apache.orc.ColumnStatistics) BinaryColumnStatistics(org.apache.orc.BinaryColumnStatistics) StripeInformation(org.apache.orc.StripeInformation) Test(org.junit.Test)

Aggregations

BinaryColumnStatistics (org.apache.orc.BinaryColumnStatistics)5 BooleanColumnStatistics (org.apache.orc.BooleanColumnStatistics)5 ColumnStatistics (org.apache.orc.ColumnStatistics)5 DoubleColumnStatistics (org.apache.orc.DoubleColumnStatistics)5 IntegerColumnStatistics (org.apache.orc.IntegerColumnStatistics)5 StringColumnStatistics (org.apache.orc.StringColumnStatistics)5 Test (org.junit.Test)5 ObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector)4 StructObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)4 BinaryObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BinaryObjectInspector)4 StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector)4 ListObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.ListObjectInspector)2 MapObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.MapObjectInspector)2 BooleanObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.BooleanObjectInspector)2 ByteObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.ByteObjectInspector)2 DoubleObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector)2 FloatObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.FloatObjectInspector)2 HiveDecimalObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector)2 IntObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.IntObjectInspector)2 LongObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector)2