Search in sources :

Example 1 with IntegerStatistics

use of io.trino.orc.metadata.statistics.IntegerStatistics in project trino by trinodb.

the class TestOrcReaderPositions method testRowGroupSkipping.

/**
 * Checks reader and file positions when a predicate rejects most row groups.
 *
 * <p>The predicate accepts statistics whose row count equals the file row count and,
 * otherwise, only statistics whose column 1 integer minimum is 50,000 or 60,000.
 * The test then expects the returned values and positions to run from 50,000 up to 70,000.
 */
@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // create single stripe file with multiple row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);

        // test reading two row groups from middle of file
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows != rowCount) {
                // statistics for less than the whole file: keep only the two wanted minimums
                IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
                long minimum = stats.getMin();
                return minimum == 50_000 || minimum == 60_000;
            }
            // statistics covering every row of the file are always accepted
            return true;
        };

        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            // before the first read: full row counts and both positions at zero
            assertEquals(reader.getFileRowCount(), rowCount);
            assertEquals(reader.getReaderRowCount(), rowCount);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);

            long position = 50_000;
            for (Page page = reader.nextPage(); page != null; page = reader.nextPage()) {
                Page loadedPage = page.getLoadedPage();
                Block block = loadedPage.getBlock(0);

                // every value must equal its expected sequential position
                for (int i = 0; i < block.getPositionCount(); i++) {
                    assertEquals(BIGINT.getLong(block, i), position + i);
                }

                // positions reported by the reader refer to the start of the current page
                assertEquals(reader.getFilePosition(), position);
                assertEquals(reader.getReaderPosition(), position);
                position += loadedPage.getPositionCount();
            }

            // after the last page: 20,000 rows were read and the reader sits at end of file
            assertEquals(position, 70_000);
            assertEquals(reader.getFilePosition(), rowCount);
            assertEquals(reader.getReaderPosition(), rowCount);
        }
    }
}
Also used : OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Slice(io.airlift.slice.Slice) Assert.assertNull(org.testng.Assert.assertNull) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Page(io.trino.spi.Page) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) BATCH_SIZE_GROWTH_FACTOR(io.trino.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) Block(io.trino.spi.block.Block) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) READER_OPTIONS(io.trino.orc.OrcTester.READER_OPTIONS) ORC_12(io.trino.orc.OrcTester.Format.ORC_12) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) Assert.fail(org.testng.Assert.fail) IOException(java.io.IOException) OrcTester.createCustomOrcRecordReader(io.trino.orc.OrcTester.createCustomOrcRecordReader) OrcTester.createSettableStructObjectInspector(io.trino.orc.OrcTester.createSettableStructObjectInspector) Math.min(java.lang.Math.min) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) CompressionKind(io.trino.orc.metadata.CompressionKind) File(java.io.File) Footer(io.trino.orc.metadata.Footer) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) MAX_BATCH_SIZE(io.trino.orc.OrcReader.MAX_BATCH_SIZE) BIGINT(io.trino.spi.type.BigintType.BIGINT) Serializer(org.apache.hadoop.hive.serde2.Serializer) OrcTester.createOrcRecordWriter(io.trino.orc.OrcTester.createOrcRecordWriter) 
Assert.assertTrue(org.testng.Assert.assertTrue) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) OrcColumnId(io.trino.orc.metadata.OrcColumnId) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) OrcColumnId(io.trino.orc.metadata.OrcColumnId) Block(io.trino.spi.block.Block) Page(io.trino.spi.Page) OrcTester.createCustomOrcRecordReader(io.trino.orc.OrcTester.createCustomOrcRecordReader) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 2 with IntegerStatistics

use of io.trino.orc.metadata.statistics.IntegerStatistics in project trino by trinodb.

the class TestOrcReaderPositions method testStripeSkipping.

/**
 * Checks reader and file positions when a predicate selects only two stripes.
 *
 * <p>The predicate accepts statistics with a row count of 100 and, otherwise, only
 * statistics whose column 1 integer range is [60, 117] or [180, 237]. The test expects
 * two pages of 20 rows each, followed by the end of the file.
 */
@Test
public void testStripeSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());

        // test reading second and fourth stripes
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows != 100) {
                IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
                if ((stats.getMin() == 60) && (stats.getMax() == 117)) {
                    return true;
                }
                return (stats.getMin() == 180) && (stats.getMax() == 237);
            }
            // statistics with a row count of 100 are always accepted
            return true;
        };

        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            // the file holds 100 rows, of which the reader is expected to return 40
            assertEquals(reader.getFileRowCount(), 100);
            assertEquals(reader.getReaderRowCount(), 40);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);

            // second stripe
            Page secondStripePage = reader.nextPage().getLoadedPage();
            assertEquals(secondStripePage.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFilePosition(), 20);
            assertCurrentBatch(secondStripePage, 1);

            // fourth stripe
            Page fourthStripePage = reader.nextPage().getLoadedPage();
            assertEquals(fourthStripePage.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 20);
            assertEquals(reader.getFilePosition(), 60);
            assertCurrentBatch(fourthStripePage, 3);

            // no further pages; both positions are at their end values
            Page endOfData = reader.nextPage();
            assertNull(endOfData);
            assertEquals(reader.getReaderPosition(), 40);
            assertEquals(reader.getFilePosition(), 100);
        }
    }
}
Also used : OrcFile(org.apache.hadoop.hive.ql.io.orc.OrcFile) Slice(io.airlift.slice.Slice) Assert.assertNull(org.testng.Assert.assertNull) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Page(io.trino.spi.Page) Assert.assertEquals(org.testng.Assert.assertEquals) Writable(org.apache.hadoop.io.Writable) Test(org.testng.annotations.Test) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) ByteBuffer(java.nio.ByteBuffer) Writer(org.apache.hadoop.hive.ql.io.orc.Writer) BATCH_SIZE_GROWTH_FACTOR(io.trino.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) VARCHAR(io.trino.spi.type.VarcharType.VARCHAR) Block(io.trino.spi.block.Block) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) READER_OPTIONS(io.trino.orc.OrcTester.READER_OPTIONS) ORC_12(io.trino.orc.OrcTester.Format.ORC_12) NullMemoryManager(org.apache.orc.NullMemoryManager) ImmutableMap(com.google.common.collect.ImmutableMap) UTF_8(java.nio.charset.StandardCharsets.UTF_8) SNAPPY(org.apache.hadoop.hive.ql.io.orc.CompressionKind.SNAPPY) INITIAL_BATCH_SIZE(io.trino.orc.OrcReader.INITIAL_BATCH_SIZE) Assert.fail(org.testng.Assert.fail) IOException(java.io.IOException) OrcTester.createCustomOrcRecordReader(io.trino.orc.OrcTester.createCustomOrcRecordReader) OrcTester.createSettableStructObjectInspector(io.trino.orc.OrcTester.createSettableStructObjectInspector) Math.min(java.lang.Math.min) Field(java.lang.reflect.Field) Maps(com.google.common.collect.Maps) CompressionKind(io.trino.orc.metadata.CompressionKind) File(java.io.File) Footer(io.trino.orc.metadata.Footer) SettableStructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.SettableStructObjectInspector) MAX_BATCH_SIZE(io.trino.orc.OrcReader.MAX_BATCH_SIZE) BIGINT(io.trino.spi.type.BigintType.BIGINT) Serializer(org.apache.hadoop.hive.serde2.Serializer) OrcTester.createOrcRecordWriter(io.trino.orc.OrcTester.createOrcRecordWriter) 
Assert.assertTrue(org.testng.Assert.assertTrue) OrcOutputFormat(org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) OrcColumnId(io.trino.orc.metadata.OrcColumnId) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) OrcColumnId(io.trino.orc.metadata.OrcColumnId) Page(io.trino.spi.Page) OrcTester.createCustomOrcRecordReader(io.trino.orc.OrcTester.createCustomOrcRecordReader) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 3 with IntegerStatistics

use of io.trino.orc.metadata.statistics.IntegerStatistics in project trino by trinodb.

the class TestOrcBloomFilters method testMatchesExpandedRange.

/**
 * Checks a range predicate over [1233, 1235] with bloom filters enabled and a domain
 * compaction threshold of 100: statistics whose bloom filter contains 1234 must match,
 * statistics whose bloom filter contains only 9876 must not.
 */
@Test
public void testMatchesExpandedRange() {
    Range range = Range.range(BIGINT, 1233L, true, 1235L, true);
    TupleDomainOrcPredicate predicate = TupleDomainOrcPredicate.builder()
            .setBloomFiltersEnabled(true)
            .addColumn(ROOT_COLUMN, Domain.create(ValueSet.ofRanges(range), false))
            .setDomainCompactionThreshold(100)
            .build();

    // integer statistics [10, 2000] with a bloom filter holding a value inside the range
    ColumnMetadata<ColumnStatistics> matching = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null,
                    0,
                    null,
                    new IntegerStatistics(10L, 2000L, null),
                    null,
                    null,
                    null,
                    null,
                    null,
                    null,
                    new Utf8BloomFilterBuilder(1000, 0.01).addLong(1234L).buildBloomFilter())));

    // same integer statistics, but the bloom filter holds only a value outside the range
    ColumnMetadata<ColumnStatistics> nonMatching = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null,
                    0,
                    null,
                    new IntegerStatistics(10L, 2000L, null),
                    null,
                    null,
                    null,
                    null,
                    null,
                    null,
                    new Utf8BloomFilterBuilder(1000, 0.01).addLong(9876L).buildBloomFilter())));

    assertTrue(predicate.matches(1L, matching));
    assertFalse(predicate.matches(1L, nonMatching));
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) Utf8BloomFilterBuilder(io.trino.orc.metadata.statistics.Utf8BloomFilterBuilder) Range(io.trino.spi.predicate.Range) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 4 with IntegerStatistics

use of io.trino.orc.metadata.statistics.IntegerStatistics in project trino by trinodb.

the class TestOrcBloomFilters method testMatches.

/**
 * Simulates a query on 2 columns where 1 is used as part of the where, with and without bloom filter.
 */
@Test
public void testMatches() {
    TupleDomainOrcPredicate predicate = TupleDomainOrcPredicate.builder()
            .setBloomFiltersEnabled(true)
            .addColumn(ROOT_COLUMN, Domain.singleValue(BIGINT, 1234L))
            .build();
    TupleDomainOrcPredicate emptyPredicate = TupleDomainOrcPredicate.builder().build();

    // bloom filter that contains the value the predicate asks for
    ColumnMetadata<ColumnStatistics> withMatchingFilter = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null,
                    0,
                    null,
                    new IntegerStatistics(10L, 2000L, null),
                    null,
                    null,
                    null,
                    null,
                    null,
                    null,
                    new Utf8BloomFilterBuilder(1000, 0.01).addLong(1234L).buildBloomFilter())));

    // bloom filter to which no value was added
    ColumnMetadata<ColumnStatistics> withEmptyFilter = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null,
                    0,
                    null,
                    new IntegerStatistics(10L, 2000L, null),
                    null,
                    null,
                    null,
                    null,
                    null,
                    null,
                    new Utf8BloomFilterBuilder(1000, 0.01).buildBloomFilter())));

    // no bloom filter at all, only the integer statistics [10, 2000]
    ColumnMetadata<ColumnStatistics> withoutFilter = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null,
                    0,
                    null,
                    new IntegerStatistics(10L, 2000L, null),
                    null,
                    null,
                    null,
                    null,
                    null,
                    null,
                    null)));

    assertTrue(predicate.matches(1L, withMatchingFilter));
    assertTrue(predicate.matches(1L, withoutFilter));
    assertFalse(predicate.matches(1L, withEmptyFilter));
    assertTrue(emptyPredicate.matches(1L, withMatchingFilter));
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) Utf8BloomFilterBuilder(io.trino.orc.metadata.statistics.Utf8BloomFilterBuilder) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Example 5 with IntegerStatistics

use of io.trino.orc.metadata.statistics.IntegerStatistics in project trino by trinodb.

the class TestOrcBloomFilters method testMatchesNonExpandedRange.

@Test
public void testMatchesNonExpandedRange() {
    ColumnMetadata<ColumnStatistics> matchingStatisticsByColumnIndex = new ColumnMetadata<>(ImmutableList.of(new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, null, new Utf8BloomFilterBuilder(1000, 0.01).addLong(1500L).buildBloomFilter())));
    Range range = Range.range(BIGINT, 1233L, true, 1235L, true);
    TupleDomainOrcPredicate.TupleDomainOrcPredicateBuilder builder = TupleDomainOrcPredicate.builder().setBloomFiltersEnabled(true).addColumn(ROOT_COLUMN, Domain.create(ValueSet.ofRanges(range), false));
    // Domain expansion doesn't take place -> no bloom filtering -> ranges overlap
    assertTrue(builder.setDomainCompactionThreshold(1).build().matches(1L, matchingStatisticsByColumnIndex));
    assertFalse(builder.setDomainCompactionThreshold(100).build().matches(1L, matchingStatisticsByColumnIndex));
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) Utf8BloomFilterBuilder(io.trino.orc.metadata.statistics.Utf8BloomFilterBuilder) Range(io.trino.spi.predicate.Range) IntegerStatistics(io.trino.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Aggregations

IntegerStatistics (io.trino.orc.metadata.statistics.IntegerStatistics): 7, Test (org.testng.annotations.Test): 5, Slice (io.airlift.slice.Slice): 3, ColumnMetadata (io.trino.orc.metadata.ColumnMetadata): 3, ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics): 3, Utf8BloomFilterBuilder (io.trino.orc.metadata.statistics.Utf8BloomFilterBuilder): 3, ImmutableMap (com.google.common.collect.ImmutableMap): 2, Maps (com.google.common.collect.Maps): 2, BATCH_SIZE_GROWTH_FACTOR (io.trino.orc.OrcReader.BATCH_SIZE_GROWTH_FACTOR): 2, INITIAL_BATCH_SIZE (io.trino.orc.OrcReader.INITIAL_BATCH_SIZE): 2, MAX_BATCH_SIZE (io.trino.orc.OrcReader.MAX_BATCH_SIZE): 2, ORC_12 (io.trino.orc.OrcTester.Format.ORC_12): 2, READER_OPTIONS (io.trino.orc.OrcTester.READER_OPTIONS): 2, OrcTester.createCustomOrcRecordReader (io.trino.orc.OrcTester.createCustomOrcRecordReader): 2, OrcTester.createOrcRecordWriter (io.trino.orc.OrcTester.createOrcRecordWriter): 2, OrcTester.createSettableStructObjectInspector (io.trino.orc.OrcTester.createSettableStructObjectInspector): 2, CompressionKind (io.trino.orc.metadata.CompressionKind): 2, Footer (io.trino.orc.metadata.Footer): 2, OrcColumnId (io.trino.orc.metadata.OrcColumnId): 2, StringStatistics (io.trino.orc.metadata.statistics.StringStatistics): 2