Search in sources:

Example 1 with StripeInformation

use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.

the class TestCachingOrcDataSource method doIntegration.

/**
 * Opens {@code orcDataSource} with the given merge-distance and tiny-stripe settings,
 * checks that the stripes qualify for caching, and then reads every page to verify
 * the total number of positions.
 *
 * @param orcDataSource data source whose read count is inspected
 * @param maxMergeDistance value passed to the reader options and to the cache wrapper
 * @param tinyStripeThreshold value passed to the reader options and to the cache wrapper
 * @throws IOException if reading a page fails
 */
private void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize tinyStripeThreshold) throws IOException {
    OrcReaderOptions readerOptions = new OrcReaderOptions()
            .withMaxMergeDistance(maxMergeDistance)
            .withTinyStripeThreshold(tinyStripeThreshold)
            .withMaxReadBlockSize(DataSize.of(1, Unit.MEGABYTE));
    OrcReader reader = OrcReader.createOrcReader(orcDataSource, readerOptions)
            .orElseThrow(() -> new RuntimeException("File is empty"));

    // Creating the reader should have cost exactly one read: the file footer
    assertEquals(orcDataSource.getReadCount(), 1);

    List<StripeInformation> stripeList = reader.getFooter().getStripes();
    // Sanity check number of stripes. This can be three or higher because of orc writer low memory mode.
    assertGreaterThanOrEqual(stripeList.size(), 3);

    // With these settings the data source must come back wrapped in a CachingOrcDataSource
    assertInstanceOf(
            wrapWithCacheIfTinyStripes(orcDataSource, stripeList, maxMergeDistance, tinyStripeThreshold),
            CachingOrcDataSource.class);

    OrcRecordReader recordReader = reader.createRecordReader(
            reader.getRootColumn().getNestedColumns(),
            ImmutableList.of(VARCHAR),
            (numberOfRows, statisticsByColumnIndex) -> true,
            HIVE_STORAGE_TIME_ZONE,
            newSimpleAggregatedMemoryContext(),
            INITIAL_BATCH_SIZE,
            RuntimeException::new);

    // Drain the reader; a null page marks the end of the data
    int totalPositions = 0;
    for (Page page = recordReader.nextPage(); page != null; page = recordReader.nextPage()) {
        Block firstColumn = page.getLoadedPage().getBlock(0);
        totalPositions += firstColumn.getPositionCount();
    }
    assertEquals(totalPositions, POSITION_COUNT);
}
Also used : Block(io.trino.spi.block.Block) Page(io.trino.spi.Page) StripeInformation(io.trino.orc.metadata.StripeInformation)

Example 2 with StripeInformation

use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.

the class TestCachingOrcDataSource method testWrapWithCacheIfTinyStripes.

/**
 * Verifies when {@code wrapWithCacheIfTinyStripes} returns a {@code CachingOrcDataSource}:
 * four stripe layouts are expected to be wrapped, and a final layout whose last stripe
 * is one byte larger is expected not to be.
 */
@Test
public void testWrapWithCacheIfTinyStripes() {
    DataSize maxMergeDistance = DataSize.of(1, Unit.MEGABYTE);
    DataSize tinyStripeThreshold = DataSize.of(8, Unit.MEGABYTE);

    // No stripes at all
    ImmutableList<StripeInformation> noStripes = ImmutableList.of();
    OrcDataSource forNoStripes = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, noStripes, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(forNoStripes, CachingOrcDataSource.class);

    // A single small stripe
    ImmutableList<StripeInformation> oneStripe = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10));
    OrcDataSource forOneStripe = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, oneStripe, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(forOneStripe, CachingOrcDataSource.class);

    // Three small stripes
    ImmutableList<StripeInformation> threeSmallStripes = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 10, 10, 10));
    OrcDataSource forThreeSmall = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, threeSmallStripes, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(forThreeSmall, CachingOrcDataSource.class);

    // Last stripe sized so that it is still expected to be wrapped.
    // NOTE(review): presumably the last three constructor arguments are section lengths
    // summing to exactly 8MB here -- confirm against StripeInformation.
    ImmutableList<StripeInformation> lastStripeAtLimit = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10));
    OrcDataSource forAtLimit = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, lastStripeAtLimit, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(forAtLimit, CachingOrcDataSource.class);

    // One more byte in the last stripe: no caching wrapper is expected
    ImmutableList<StripeInformation> lastStripeOverLimit = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20 + 1, 10, 10));
    OrcDataSource forOverLimit = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, lastStripeOverLimit, maxMergeDistance, tinyStripeThreshold);
    assertNotInstanceOf(forOverLimit, CachingOrcDataSource.class);
}
Also used : DataSize(io.airlift.units.DataSize) StripeInformation(io.trino.orc.metadata.StripeInformation) Test(org.testng.annotations.Test)

Example 3 with StripeInformation

use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.

the class TestCachingOrcDataSource method testTinyStripesReadCacheAt.

/**
 * Verifies which disk ranges a {@code CachingOrcDataSource} built from
 * {@code createTinyStripesRangeFinder} requests from the underlying data source
 * when {@code readCacheAt} is called at various positions.
 *
 * @throws IOException if a cache read fails
 */
@Test
public void testTinyStripesReadCacheAt() throws IOException {
    DataSize maxMergeDistance = DataSize.of(1, Unit.MEGABYTE);
    DataSize tinyStripeThreshold = DataSize.of(8, Unit.MEGABYTE);

    // Scenario 1: reads positioned at the offsets of the first and the third stripe
    TestingOrcDataSource stripeStartSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
    ImmutableList<StripeInformation> stripeStartLayout = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10));
    CachingOrcDataSource stripeStartCache = new CachingOrcDataSource(
            stripeStartSource,
            createTinyStripesRangeFinder(stripeStartLayout, maxMergeDistance, tinyStripeThreshold));
    stripeStartCache.readCacheAt(3);
    assertEquals(stripeStartSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 60)));
    stripeStartCache.readCacheAt(63);
    assertEquals(stripeStartSource.getLastReadRanges(), ImmutableList.of(new DiskRange(63, 8 * 1048576)));

    // Scenario 2: same layout, but the first read is at the end of a stripe
    TestingOrcDataSource stripeEndSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
    ImmutableList<StripeInformation> stripeEndLayout = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10));
    CachingOrcDataSource stripeEndCache = new CachingOrcDataSource(
            stripeEndSource,
            createTinyStripesRangeFinder(stripeEndLayout, maxMergeDistance, tinyStripeThreshold));
    stripeEndCache.readCacheAt(62);
    assertEquals(stripeEndSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 60)));
    stripeEndCache.readCacheAt(63);
    assertEquals(stripeEndSource.getLastReadRanges(), ImmutableList.of(new DiskRange(63, 8 * 1048576)));

    // Scenario 3: a one-byte-section stripe followed by two multi-megabyte stripes
    TestingOrcDataSource mixedSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
    ImmutableList<StripeInformation> mixedLayout = ImmutableList.of(
            new StripeInformation(123, 3, 1, 1, 1),
            new StripeInformation(123, 4, 1048576, 1048576, 1048576 * 3),
            new StripeInformation(123, 4 + 1048576 * 5, 1048576, 1048576, 1048576));
    CachingOrcDataSource mixedCache = new CachingOrcDataSource(
            mixedSource,
            createTinyStripesRangeFinder(mixedLayout, maxMergeDistance, tinyStripeThreshold));
    mixedCache.readCacheAt(3);
    assertEquals(mixedSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 1 + 1048576 * 5)));
    mixedCache.readCacheAt(4 + 1048576 * 5);
    assertEquals(mixedSource.getLastReadRanges(), ImmutableList.of(new DiskRange(4 + 1048576 * 5, 3 * 1048576)));
}
Also used : DataSize(io.airlift.units.DataSize) StripeInformation(io.trino.orc.metadata.StripeInformation) Test(org.testng.annotations.Test)

Example 4 with StripeInformation

use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.

the class TestSliceDictionaryColumnReader method testDictionaryReaderUpdatesRetainedSize.

/**
 * Verifies that {@code SliceDictionaryColumnReader} keeps its memory context in sync
 * with its retained size.
 *
 * <p>The test writes a single VARCHAR column to a temporary ORC file, loads the file
 * into a {@code MemoryOrcDataSource}, and reads one block per row group of every stripe.
 * After each read the bytes tracked by the memory context must equal
 * {@code getRetainedSizeInBytes()}, and after {@code close()} the context must report zero.
 *
 * @throws Exception if writing or reading the ORC file fails
 */
@Test
public void testDictionaryReaderUpdatesRetainedSize() throws Exception {
    // create orc file
    List<String> values = createValues();
    File temporaryDirectory = createTempDir();
    File orcFile = new File(temporaryDirectory, randomUUID().toString());
    writeOrcColumnTrino(orcFile, NONE, VARCHAR, values.iterator(), new OrcWriterStats());
    // prepare for read
    OrcDataSource dataSource = new MemoryOrcDataSource(new OrcDataSourceId(orcFile.getPath()), Slices.wrappedBuffer(readAllBytes(orcFile.toPath())));
    OrcReader orcReader = OrcReader.createOrcReader(dataSource, new OrcReaderOptions()).orElseThrow(() -> new RuntimeException("File is empty"));
    Footer footer = orcReader.getFooter();
    List<OrcColumn> columns = orcReader.getRootColumn().getNestedColumns();
    // assertEquals (rather than assertTrue on ==) reports the actual column count on failure
    assertEquals(columns.size(), 1);
    StripeReader stripeReader = new StripeReader(dataSource, UTC, Optional.empty(), footer.getTypes(), ImmutableSet.copyOf(columns), footer.getRowsInRowGroup(), OrcPredicate.TRUE, ORIGINAL, new OrcMetadataReader(), Optional.empty());
    AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
    SliceDictionaryColumnReader columnReader = new SliceDictionaryColumnReader(columns.get(0), memoryContext.newLocalMemoryContext(TestSliceDictionaryColumnReader.class.getSimpleName()), -1, false);
    List<StripeInformation> stripeInformations = footer.getStripes();
    for (StripeInformation stripeInformation : stripeInformations) {
        // The stripe itself is read under a separate memory context, so memoryContext
        // only reflects what the column reader accounts for
        Stripe stripe = stripeReader.readStripe(stripeInformation, newSimpleAggregatedMemoryContext());
        List<RowGroup> rowGroups = stripe.getRowGroups();
        columnReader.startStripe(stripe.getFileTimeZone(), stripe.getDictionaryStreamSources(), stripe.getColumnEncodings());
        for (RowGroup rowGroup : rowGroups) {
            columnReader.startRowGroup(rowGroup.getStreamSources());
            columnReader.prepareNextRead(1000);
            columnReader.readBlock();
            // memory usage check
            assertEquals(memoryContext.getBytes(), columnReader.getRetainedSizeInBytes());
        }
    }
    columnReader.close();
    // After close nothing should remain accounted; assertEquals reports the leftover byte count on failure
    assertEquals(memoryContext.getBytes(), 0L);
}
Also used : SliceDictionaryColumnReader(io.trino.orc.reader.SliceDictionaryColumnReader) OrcMetadataReader(io.trino.orc.metadata.OrcMetadataReader) AggregatedMemoryContext.newSimpleAggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext) AggregatedMemoryContext(io.trino.memory.context.AggregatedMemoryContext) Footer(io.trino.orc.metadata.Footer) File(java.io.File) StripeInformation(io.trino.orc.metadata.StripeInformation) Test(org.testng.annotations.Test)

Example 5 with StripeInformation

use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.

the class OrcRecordReader method advanceToNextStripe.

/**
 * Moves this reader on to the next stripe, if there is one.
 *
 * <p>The current stripe memory context is closed and replaced, and {@code rowGroups}
 * is reset to an empty iterator. If a stripe was being read and stripe statistics
 * validation is present, the collected statistics are validated and reset. When
 * another stripe remains, it is read, every non-null column reader is started on it,
 * and {@code rowGroups} is pointed at its row groups. If {@code readStripe} returns
 * {@code null}, {@code rowGroups} stays empty.
 *
 * @throws IOException propagated from the calls made here (for example {@code readStripe})
 */
private void advanceToNextStripe() throws IOException {
    // Start the upcoming stripe with a fresh memory context and no row groups
    currentStripeMemoryContext.close();
    currentStripeMemoryContext = memoryUsage.newAggregatedMemoryContext();
    rowGroups = ImmutableList.<RowGroup>of().iterator();

    // Validate statistics for the stripe being left; skipped while currentStripe is still negative
    if (currentStripe >= 0 && stripeStatisticsValidation.isPresent()) {
        StatisticsValidation collectedStatistics = stripeStatisticsValidation.get();
        long finishedStripeOffset = stripes.get(currentStripe).getOffset();
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), finishedStripeOffset, collectedStatistics.build().get());
        collectedStatistics.reset();
    }

    currentStripe++;
    if (currentStripe >= stripes.size()) {
        // No stripes left
        return;
    }

    // Advance the running position by the row count of the stripe just left
    if (currentStripe > 0) {
        currentStripePosition += stripes.get(currentStripe - 1).getNumberOfRows();
    }

    StripeInformation upcomingStripe = stripes.get(currentStripe);
    validateWriteStripe(upcomingStripe.getNumberOfRows());
    Stripe loadedStripe = stripeReader.readStripe(upcomingStripe, currentStripeMemoryContext);
    if (loadedStripe != null) {
        // Give readers access to dictionary streams
        InputStreamSources dictionarySources = loadedStripe.getDictionaryStreamSources();
        ColumnMetadata<ColumnEncoding> encodings = loadedStripe.getColumnEncodings();
        ZoneId stripeTimeZone = loadedStripe.getFileTimeZone();
        for (ColumnReader columnReader : columnReaders) {
            if (columnReader == null) {
                continue;
            }
            columnReader.startStripe(stripeTimeZone, dictionarySources, encodings);
        }
        rowGroups = loadedStripe.getRowGroups().iterator();
    }

    // Refresh the data source's retained size in the memory accounting
    orcDataSourceMemoryUsage.setBytes(orcDataSource.getRetainedSize());
}
Also used : ColumnEncoding(io.trino.orc.metadata.ColumnEncoding) InputStreamSources(io.trino.orc.stream.InputStreamSources) ZoneId(java.time.ZoneId) StatisticsValidation(io.trino.orc.OrcWriteValidation.StatisticsValidation) ColumnReader(io.trino.orc.reader.ColumnReader) ColumnReaders.createColumnReader(io.trino.orc.reader.ColumnReaders.createColumnReader) StripeInformation(io.trino.orc.metadata.StripeInformation)

Aggregations

StripeInformation (io.trino.orc.metadata.StripeInformation): 7; Test (org.testng.annotations.Test): 4; Slice (io.airlift.slice.Slice): 2; DataSize (io.airlift.units.DataSize): 2; ColumnEncoding (io.trino.orc.metadata.ColumnEncoding): 2; Footer (io.trino.orc.metadata.Footer): 2; OrcMetadataReader (io.trino.orc.metadata.OrcMetadataReader): 2; Stream (io.trino.orc.metadata.Stream): 2; StripeFooter (io.trino.orc.metadata.StripeFooter): 2; Page (io.trino.spi.Page): 2; Block (io.trino.spi.block.Block): 2; Slices.utf8Slice (io.airlift.slice.Slices.utf8Slice): 1; AggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext): 1; AggregatedMemoryContext.newSimpleAggregatedMemoryContext (io.trino.memory.context.AggregatedMemoryContext.newSimpleAggregatedMemoryContext): 1; OrcWriteValidationMode (io.trino.orc.OrcWriteValidation.OrcWriteValidationMode): 1; StatisticsValidation (io.trino.orc.OrcWriteValidation.StatisticsValidation): 1; StripeReader.isIndexStream (io.trino.orc.StripeReader.isIndexStream): 1; OrcColumnId (io.trino.orc.metadata.OrcColumnId): 1; OrcType (io.trino.orc.metadata.OrcType): 1; ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics): 1