
Example 1 with IntegerStatistics

Use of io.prestosql.orc.metadata.statistics.IntegerStatistics in project hetu-core by openlookeng.

From the class TestOrcReaderPositions, method testRowGroupSkipping.

@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // create a single-stripe file with multiple row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);
        // test reading two row groups from middle of file
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == rowCount) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return (stats.getMin() == 50_000) || (stats.getMin() == 60_000);
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), rowCount);
            assertEquals(reader.getReaderRowCount(), rowCount);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            long position = 50_000;
            while (true) {
                Page page = reader.nextPage();
                if (page == null) {
                    break;
                }
                page = page.getLoadedPage();
                Block block = page.getBlock(0);
                for (int i = 0; i < block.getPositionCount(); i++) {
                    assertEquals(BIGINT.getLong(block, i), position + i);
                }
                assertEquals(reader.getFilePosition(), position);
                assertEquals(reader.getReaderPosition(), position);
                position += page.getPositionCount();
            }
            assertEquals(position, 70_000);
            assertEquals(reader.getFilePosition(), rowCount);
            assertEquals(reader.getReaderPosition(), rowCount);
        }
    }
}
Also used: Footer(io.prestosql.orc.metadata.Footer) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Slice(io.airlift.slice.Slice) Assert.assertNull(org.testng.Assert.assertNull) MEGABYTE(io.airlift.units.DataSize.Unit.MEGABYTE) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) OrcTester.createOrcRecordWriter(io.prestosql.orc.OrcTester.createOrcRecordWriter) OrcColumnId(io.prestosql.orc.metadata.OrcColumnId) OrcTester.createSettableStructObjectInspector(io.prestosql.orc.OrcTester.createSettableStructObjectInspector) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) MAX_BLOCK_SIZE(io.prestosql.orc.OrcTester.MAX_BLOCK_SIZE) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) INITIAL_BATCH_SIZE(io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) MAX_BATCH_SIZE(io.prestosql.orc.OrcReader.MAX_BATCH_SIZE) ORC_12(io.prestosql.orc.OrcTester.Format.ORC_12) VARCHAR(io.prestosql.spi.type.VarcharType.VARCHAR) OrcTester.createCustomOrcRecordReader(io.prestosql.orc.OrcTester.createCustomOrcRecordReader) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) BATCH_SIZE_GROWTH_FACTOR(io.prestosql.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) BIGINT(io.prestosql.spi.type.BigintType.BIGINT) Block(io.prestosql.spi.block.Block) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) Assert.fail(org.testng.Assert.fail) Page(io.prestosql.spi.Page) IOException(java.io.IOException) Math.min(java.lang.Math.min) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) File(java.io.File) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) DataSize(io.airlift.units.DataSize) Serializer(org.apache.hadoop.hive.serde2.Serializer) CompressionKind(io.prestosql.orc.metadata.CompressionKind) Assert.assertTrue(org.testng.Assert.assertTrue) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField)
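
The helper createSequentialFile is not shown on this page. Judging from the imports listed above (OrcSerde, FileSinkOperator, OrcTester.createOrcRecordWriter, OrcTester.createSettableStructObjectInspector), it plausibly writes the values 0, 1, 2, ... as a single-column BIGINT file; with the writer's default row-group size of 10,000 rows, the predicate's min values of 50_000 and 60_000 then select exactly the two row groups covering positions 50,000 through 69,999, which is what the loop asserts. A minimal sketch under those assumptions (not the exact hetu-core implementation):

private static void createSequentialFile(File file, int count) throws Exception {
    // Hypothetical reconstruction: write `count` sequential BIGINT values into one ORC file
    // using the OrcTester helpers imported above.
    Serializer serde = new OrcSerde();
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, BIGINT);
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);
    for (int i = 0; i < count; i++) {
        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }
    writer.close(false);
}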

Example 2 with IntegerStatistics

Use of io.prestosql.orc.metadata.statistics.IntegerStatistics in project hetu-core by openlookeng.

From the class TestOrcReaderPositions, method testStripeSkipping.

@Test
public void testStripeSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());
        // test reading second and fourth stripes
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == 100) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return ((stats.getMin() == 60) && (stats.getMax() == 117)) || ((stats.getMin() == 180) && (stats.getMax() == 237));
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), 100);
            assertEquals(reader.getReaderRowCount(), 40);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            // second stripe
            Page page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFilePosition(), 20);
            assertCurrentBatch(page, 1);
            // fourth stripe
            page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 20);
            assertEquals(reader.getFilePosition(), 60);
            assertCurrentBatch(page, 3);
            page = reader.nextPage();
            assertNull(page);
            assertEquals(reader.getReaderPosition(), 40);
            assertEquals(reader.getFilePosition(), 100);
        }
    }
}
Also used: the same imports as in Example 1 (both snippets come from TestOrcReaderPositions).
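
The constants in this predicate follow from how the test file is laid out: the stripe statistics imply the file stores the sequence 0, 3, 6, ... in five 20-row stripes, so the second stripe spans 60..117 and the fourth spans 180..237, leaving 40 of the 100 rows. Under that assumption, the assertCurrentBatch helper referenced above would look roughly like this (a sketch, not necessarily the exact hetu-core code):

private static void assertCurrentBatch(Page page, int stripe) {
    // Assuming 20 rows per stripe and values in steps of 3, the n-th stripe (0-based)
    // should contain ((n * 20) + i) * 3 for i in [0, 20).
    Block block = page.getBlock(0);
    for (int i = 0; i < block.getPositionCount(); i++) {
        assertEquals(BIGINT.getLong(block, i), ((stripe * 20L) + i) * 3);
    }
}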

Example 3 with IntegerStatistics

Use of io.prestosql.orc.metadata.statistics.IntegerStatistics in project hetu-core by openlookeng.

From the class TestOrcBloomFilters, method testMatches.

// Simulate a query on two columns where one is used in the WHERE clause, with and without a bloom filter.
@Test
public void testMatches() {
    TupleDomainOrcPredicate predicate = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(true).addColumn(ROOT_COLUMN, Domain.singleValue(BIGINT, 1234L)).build();
    TupleDomainOrcPredicate emptyPredicate = TupleDomainOrcPredicate.builder().build();
    // assemble a matching and a non-matching bloom filter
    HashableBloomFilter bloomFilter = new HashableBloomFilter(1000, 0.01);
    OrcProto.BloomFilter emptyOrcBloomFilter = toOrcBloomFilter(bloomFilter);
    bloomFilter.addLong(1234);
    OrcProto.BloomFilter orcBloomFilter = toOrcBloomFilter(bloomFilter);
    ColumnMetadata<ColumnStatistics> matchingStatisticsByColumnIndex = new ColumnMetadata<>(ImmutableList.of(new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, toBloomFilter(orcBloomFilter))));
    ColumnMetadata<ColumnStatistics> nonMatchingStatisticsByColumnIndex = new ColumnMetadata<>(ImmutableList.of(new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, toBloomFilter(emptyOrcBloomFilter))));
    ColumnMetadata<ColumnStatistics> withoutBloomFilterStatisticsByColumnIndex = new ColumnMetadata<>(ImmutableList.of(new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, null)));
    assertTrue(predicate.matches(1L, matchingStatisticsByColumnIndex));
    assertTrue(predicate.matches(1L, withoutBloomFilterStatisticsByColumnIndex));
    assertFalse(predicate.matches(1L, nonMatchingStatisticsByColumnIndex));
    assertTrue(emptyPredicate.matches(1L, matchingStatisticsByColumnIndex));
}
Also used: ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) OrcProto(io.prestosql.orc.proto.OrcProto) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)
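
The IntegerStatistics range [10, 2000] cannot rule out the predicate value 1234, so it is the bloom filter alone that distinguishes the matching and non-matching cases above. As an illustrative extension (not part of the original test, but built only from the constructors already shown), statistics whose range excludes 1234 should be rejected even when the bloom filter matches:

    // Hypothetical extra assertion: the min/max range [10, 100] excludes 1234, so the
    // predicate can skip this data regardless of the bloom filter contents.
    ColumnMetadata<ColumnStatistics> outOfRangeStatisticsByColumnIndex = new ColumnMetadata<>(ImmutableList.of(new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 100L, null), null, null, null, null, null, toBloomFilter(orcBloomFilter))));
    assertFalse(predicate.matches(1L, outOfRangeStatisticsByColumnIndex));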

Example 4 with IntegerStatistics

Use of io.prestosql.orc.metadata.statistics.IntegerStatistics in project hetu-core by openlookeng.

From the class OrcWriteValidation, method validateColumnStatisticsEquivalent.

private static void validateColumnStatisticsEquivalent(OrcDataSourceId orcDataSourceId, String name, ColumnStatistics actualColumnStatistics, ColumnStatistics expectedColumnStatistics) throws OrcCorruptionException {
    requireNonNull(name, "name is null");
    requireNonNull(actualColumnStatistics, "actualColumnStatistics is null");
    requireNonNull(expectedColumnStatistics, "expectedColumnStatistics is null");
    if (actualColumnStatistics.getNumberOfValues() != expectedColumnStatistics.getNumberOfValues()) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected number of values in %s statistics", name);
    }
    if (!Objects.equals(actualColumnStatistics.getBooleanStatistics(), expectedColumnStatistics.getBooleanStatistics())) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected boolean counts in %s statistics", name);
    }
    if (!Objects.equals(actualColumnStatistics.getIntegerStatistics(), expectedColumnStatistics.getIntegerStatistics())) {
        IntegerStatistics actualIntegerStatistics = actualColumnStatistics.getIntegerStatistics();
        IntegerStatistics expectedIntegerStatistics = expectedColumnStatistics.getIntegerStatistics();
        // Ignore the validation of sum if one of the two sums is null.
        if (actualIntegerStatistics == null || expectedIntegerStatistics == null || !Objects.equals(actualIntegerStatistics.getMin(), expectedIntegerStatistics.getMin()) || !Objects.equals(actualIntegerStatistics.getMax(), expectedIntegerStatistics.getMax()) || (actualIntegerStatistics.getSum() != null && expectedIntegerStatistics.getSum() != null && !Objects.equals(actualIntegerStatistics.getSum(), expectedIntegerStatistics.getSum()))) {
            throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected integer range in %s statistics", name);
        }
    }
    if (!Objects.equals(actualColumnStatistics.getDoubleStatistics(), expectedColumnStatistics.getDoubleStatistics())) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected double range in %s statistics", name);
    }
    StringStatistics expectedStringStatistics = expectedColumnStatistics.getStringStatistics();
    if (expectedStringStatistics != null) {
        expectedStringStatistics = new StringStatistics(minStringTruncateToValidRange(expectedStringStatistics.getMin(), HiveWriterVersion.ORC_HIVE_8732), maxStringTruncateToValidRange(expectedStringStatistics.getMax(), HiveWriterVersion.ORC_HIVE_8732), expectedStringStatistics.getSum());
    }
    StringStatistics actualStringStatistics = actualColumnStatistics.getStringStatistics();
    if (!Objects.equals(actualColumnStatistics.getStringStatistics(), expectedStringStatistics) && expectedStringStatistics != null) {
        // Merging row group stats can produce nulls, given the string statistics length limit.
        if (actualStringStatistics == null || actualStringStatistics.getSum() != expectedStringStatistics.getSum() || (expectedStringStatistics.getMax() != null && !Objects.equals(actualStringStatistics.getMax(), expectedStringStatistics.getMax())) || (expectedStringStatistics.getMin() != null && !Objects.equals(actualStringStatistics.getMin(), expectedStringStatistics.getMin()))) {
            throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected string range in %s statistics", name);
        }
    }
    if (!Objects.equals(actualColumnStatistics.getDateStatistics(), expectedColumnStatistics.getDateStatistics())) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected date range in %s statistics", name);
    }
    if (!Objects.equals(actualColumnStatistics.getDecimalStatistics(), expectedColumnStatistics.getDecimalStatistics())) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected decimal range in %s statistics", name);
    }
    if (!Objects.equals(actualColumnStatistics.getBloomFilter(), expectedColumnStatistics.getBloomFilter())) {
        throw new OrcCorruptionException(orcDataSourceId, "Write validation failed: unexpected bloom filter in %s statistics", name);
    }
}
Also used: StringStatistics(io.prestosql.orc.metadata.statistics.StringStatistics) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics)
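
The integer branch above only compares sums when both sides actually recorded one; a missing sum on either side never fails validation. That rule reduces to a small predicate, sketched here in isolation (a hypothetical helper, not part of OrcWriteValidation):

// Returns true only when both sums are present and disagree.
private static boolean integerSumMismatch(Long actualSum, Long expectedSum) {
    return actualSum != null && expectedSum != null && !actualSum.equals(expectedSum);
}

For example, integerSumMismatch(100L, null) is false (the comparison is skipped), while integerSumMismatch(100L, 101L) is true and would trigger the OrcCorruptionException.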

Aggregations

IntegerStatistics (io.prestosql.orc.metadata.statistics.IntegerStatistics) — 4 examples
Test (org.testng.annotations.Test) — 3 examples
ImmutableMap (com.google.common.collect.ImmutableMap) — 2 examples
Maps (com.google.common.collect.Maps) — 2 examples
Slice (io.airlift.slice.Slice) — 2 examples
DataSize (io.airlift.units.DataSize) — 2 examples
MEGABYTE (io.airlift.units.DataSize.Unit.MEGABYTE) — 2 examples
BATCH_SIZE_GROWTH_FACTOR (io.prestosql.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) — 2 examples
INITIAL_BATCH_SIZE (io.prestosql.orc.OrcReader.INITIAL_BATCH_SIZE) — 2 examples
MAX_BATCH_SIZE (io.prestosql.orc.OrcReader.MAX_BATCH_SIZE) — 2 examples
ORC_12 (io.prestosql.orc.OrcTester.Format.ORC_12) — 2 examples
MAX_BLOCK_SIZE (io.prestosql.orc.OrcTester.MAX_BLOCK_SIZE) — 2 examples
OrcTester.createCustomOrcRecordReader (io.prestosql.orc.OrcTester.createCustomOrcRecordReader) — 2 examples
OrcTester.createOrcRecordWriter (io.prestosql.orc.OrcTester.createOrcRecordWriter) — 2 examples
OrcTester.createSettableStructObjectInspector (io.prestosql.orc.OrcTester.createSettableStructObjectInspector) — 2 examples
CompressionKind (io.prestosql.orc.metadata.CompressionKind) — 2 examples
Footer (io.prestosql.orc.metadata.Footer) — 2 examples
OrcColumnId (io.prestosql.orc.metadata.OrcColumnId) — 2 examples
Page (io.prestosql.spi.Page) — 2 examples
Block (io.prestosql.spi.block.Block) — 2 examples