
Example 1 with MAX_BATCH_SIZE

Use of io.trino.orc.OrcReader.MAX_BATCH_SIZE in the trino project by trinodb.

From the class TestOrcReaderPositions, method testRowGroupSkipping:

@Test
public void testRowGroupSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        // create a single-stripe file with multiple row groups
        int rowCount = 142_000;
        createSequentialFile(tempFile.getFile(), rowCount);
        // test reading two row groups from the middle of the file
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == rowCount) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return (stats.getMin() == 50_000) || (stats.getMin() == 60_000);
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), rowCount);
            assertEquals(reader.getReaderRowCount(), rowCount);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            long position = 50_000;
            while (true) {
                Page page = reader.nextPage();
                if (page == null) {
                    break;
                }
                page = page.getLoadedPage();
                Block block = page.getBlock(0);
                for (int i = 0; i < block.getPositionCount(); i++) {
                    assertEquals(BIGINT.getLong(block, i), position + i);
                }
                assertEquals(reader.getFilePosition(), position);
                assertEquals(reader.getReaderPosition(), position);
                position += page.getPositionCount();
            }
            assertEquals(position, 70_000);
            assertEquals(reader.getFilePosition(), rowCount);
            assertEquals(reader.getReaderPosition(), rowCount);
        }
    }
}
Also used: io.trino.orc.OrcReader.MAX_BATCH_SIZE, INITIAL_BATCH_SIZE and BATCH_SIZE_GROWTH_FACTOR; io.trino.orc.OrcTester.createCustomOrcRecordReader, createOrcRecordWriter, createSettableStructObjectInspector, READER_OPTIONS and Format.ORC_12; io.trino.orc.metadata.OrcColumnId, CompressionKind, Footer and statistics.IntegerStatistics; io.trino.spi.Page, block.Block, type.BigintType.BIGINT and type.VarcharType.VARCHAR; io.airlift.slice.Slice; the Hive ORC writer classes (OrcFile, OrcSerde, OrcOutputFormat, Writer, CompressionKind.SNAPPY, FileSinkOperator, Serializer, SerDeException, SettableStructObjectInspector, StructField, Writable, NullMemoryManager, Configuration, Path); standard Java utilities (File, IOException, ByteBuffer, UTF_8, Map, ImmutableMap, Maps, Math.min, java.lang.reflect.Field); and the TestNG annotations and assertions (Test, assertEquals, assertNull, assertTrue, fail).
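With a 142,000-row file and the Hive ORC writer's default 10,000-row index stride, the predicate above selects only the two row groups whose minimum values are 50,000 and 60,000, so the reader sees exactly rows 50,000 through 69,999 and the loop ends at position 70,000. The test also relies on a createSequentialFile helper that this page does not show. The following is only a minimal sketch of what that helper might look like, assuming it simply writes count sequential BIGINT values using the writer utilities listed in the imports above (createOrcRecordWriter, createSettableStructObjectInspector, OrcSerde); the real helper in TestOrcReaderPositions may differ in details.

// Hypothetical sketch of the createSequentialFile helper referenced above: writes
// `count` sequential BIGINT values (0, 1, 2, ...) into a single ORC file.
private static void createSequentialFile(File file, int count)
        throws IOException, SerDeException
{
    FileSinkOperator.RecordWriter writer = createOrcRecordWriter(file, ORC_12, CompressionKind.NONE, BIGINT);

    Serializer serde = new OrcSerde();
    SettableStructObjectInspector objectInspector = createSettableStructObjectInspector("test", BIGINT);
    Object row = objectInspector.create();
    StructField field = objectInspector.getAllStructFieldRefs().get(0);

    for (int i = 0; i < count; i++) {
        // each row carries its own row number as a BIGINT value
        objectInspector.setStructFieldData(row, field, (long) i);
        Writable record = serde.serialize(row, objectInspector);
        writer.write(record);
    }

    writer.close(false);
}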

Example 2 with MAX_BATCH_SIZE

Use of io.trino.orc.OrcReader.MAX_BATCH_SIZE in the trino project by trinodb.

From the class TestOrcReaderPositions, method testStripeSkipping:

@Test
public void testStripeSkipping() throws Exception {
    try (TempFile tempFile = new TempFile()) {
        createMultiStripeFile(tempFile.getFile());
        // test reading second and fourth stripes
        OrcPredicate predicate = (numberOfRows, allColumnStatistics) -> {
            if (numberOfRows == 100) {
                return true;
            }
            IntegerStatistics stats = allColumnStatistics.get(new OrcColumnId(1)).getIntegerStatistics();
            return ((stats.getMin() == 60) && (stats.getMax() == 117)) || ((stats.getMin() == 180) && (stats.getMax() == 237));
        };
        try (OrcRecordReader reader = createCustomOrcRecordReader(tempFile, predicate, BIGINT, MAX_BATCH_SIZE)) {
            assertEquals(reader.getFileRowCount(), 100);
            assertEquals(reader.getReaderRowCount(), 40);
            assertEquals(reader.getFilePosition(), 0);
            assertEquals(reader.getReaderPosition(), 0);
            // second stripe
            Page page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 0);
            assertEquals(reader.getFilePosition(), 20);
            assertCurrentBatch(page, 1);
            // fourth stripe
            page = reader.nextPage().getLoadedPage();
            assertEquals(page.getPositionCount(), 20);
            assertEquals(reader.getReaderPosition(), 20);
            assertEquals(reader.getFilePosition(), 60);
            assertCurrentBatch(page, 3);
            page = reader.nextPage();
            assertNull(page);
            assertEquals(reader.getReaderPosition(), 40);
            assertEquals(reader.getFilePosition(), 100);
        }
    }
}
Also used: the same imports as in Example 1 (both test methods belong to TestOrcReaderPositions).
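The stripe arithmetic explains the magic numbers in the predicate: a readerRowCount of 40 against a fileRowCount of 100 implies five 20-row stripes of which two are read, and the min/max pairs (60, 117) and (180, 237) are consistent with row i of the file carrying the value i * 3, so the second stripe (rows 20-39) spans 60-117 and the fourth (rows 60-79) spans 180-237. Under that assumption about createMultiStripeFile (which is not shown on this page), the assertCurrentBatch helper used above could be sketched roughly as follows.

// Hypothetical sketch of assertCurrentBatch, assuming 20 rows per stripe and that
// row i of the file holds the value i * 3 (matching the min/max bounds the predicate
// checks for the second and fourth stripes).
private static void assertCurrentBatch(Page page, int stripe)
{
    Block block = page.getBlock(0);
    for (int i = 0; i < 20; i++) {
        assertEquals(BIGINT.getLong(block, i), (stripe * 20L + i) * 3);
    }
}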
