use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.
the class TestCachingOrcDataSource method doIntegration.
/**
 * Opens {@code orcDataSource} with the given merge distance and tiny stripe threshold,
 * checks the bookkeeping done while opening the file, and then reads every page to
 * confirm that the total number of positions equals {@code POSITION_COUNT}.
 *
 * @param orcDataSource the instrumented data source to read from
 * @param maxMergeDistance the merge distance passed to the reader options and to the cache wrapper
 * @param tinyStripeThreshold the tiny stripe threshold passed to the reader options and to the cache wrapper
 * @throws IOException if reading from the data source fails
 */
private void doIntegration(TestingOrcDataSource orcDataSource, DataSize maxMergeDistance, DataSize tinyStripeThreshold) throws IOException {
    OrcReaderOptions options = new OrcReaderOptions()
            .withMaxMergeDistance(maxMergeDistance)
            .withTinyStripeThreshold(tinyStripeThreshold)
            .withMaxReadBlockSize(DataSize.of(1, Unit.MEGABYTE));
    OrcReader orcReader = OrcReader.createOrcReader(orcDataSource, options)
            .orElseThrow(() -> new RuntimeException("File is empty"));

    // Opening the reader is expected to cost exactly one read: the file footer.
    assertEquals(orcDataSource.getReadCount(), 1);

    // Sanity check on the stripe count. It can be three or higher because of the orc writer low memory mode.
    List<StripeInformation> stripes = orcReader.getFooter().getStripes();
    assertGreaterThanOrEqual(stripes.size(), 3);

    // With these settings the data source must end up wrapped in a CachingOrcDataSource.
    OrcDataSource wrapped = wrapWithCacheIfTinyStripes(orcDataSource, stripes, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(wrapped, CachingOrcDataSource.class);

    OrcRecordReader orcRecordReader = orcReader.createRecordReader(
            orcReader.getRootColumn().getNestedColumns(),
            ImmutableList.of(VARCHAR),
            (numberOfRows, statisticsByColumnIndex) -> true,
            HIVE_STORAGE_TIME_ZONE,
            newSimpleAggregatedMemoryContext(),
            INITIAL_BATCH_SIZE,
            RuntimeException::new);

    // Drain the reader; nextPage() returning null marks the end of the data.
    int positionCount = 0;
    for (Page page = orcRecordReader.nextPage(); page != null; page = orcRecordReader.nextPage()) {
        Block firstColumn = page.getLoadedPage().getBlock(0);
        positionCount += firstColumn.getPositionCount();
    }
    assertEquals(positionCount, POSITION_COUNT);
}
use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.
the class TestCachingOrcDataSource method testWrapWithCacheIfTinyStripes.
/**
 * Checks for which stripe layouts {@code wrapWithCacheIfTinyStripes} returns a
 * {@link CachingOrcDataSource}, using a 1 MB merge distance and an 8 MB tiny stripe threshold.
 */
@Test
public void testWrapWithCacheIfTinyStripes() {
    DataSize maxMergeDistance = DataSize.of(1, Unit.MEGABYTE);
    DataSize tinyStripeThreshold = DataSize.of(8, Unit.MEGABYTE);

    // No stripes at all.
    List<StripeInformation> noStripes = ImmutableList.of();
    OrcDataSource actual = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, noStripes, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(actual, CachingOrcDataSource.class);

    // A single small stripe.
    List<StripeInformation> singleStripe = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10));
    actual = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, singleStripe, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(actual, CachingOrcDataSource.class);

    // Three small stripes.
    List<StripeInformation> threeSmallStripes = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 10, 10, 10));
    actual = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, threeSmallStripes, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(actual, CachingOrcDataSource.class);

    // The last three constructor arguments of the final stripe add up to exactly 8 * 1048576.
    List<StripeInformation> lastStripeAtThreshold = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10));
    actual = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, lastStripeAtThreshold, maxMergeDistance, tinyStripeThreshold);
    assertInstanceOf(actual, CachingOrcDataSource.class);

    // One byte more than the previous case: the data source must no longer be wrapped.
    List<StripeInformation> lastStripeOverThreshold = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20 + 1, 10, 10));
    actual = wrapWithCacheIfTinyStripes(FakeOrcDataSource.INSTANCE, lastStripeOverThreshold, maxMergeDistance, tinyStripeThreshold);
    assertNotInstanceOf(actual, CachingOrcDataSource.class);
}
use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.
the class TestCachingOrcDataSource method testTinyStripesReadCacheAt.
/**
 * Checks which disk ranges a {@link CachingOrcDataSource} built on the tiny stripes range
 * finder requests from the underlying data source when {@code readCacheAt} is called at
 * various positions. Uses a 1 MB merge distance and an 8 MB tiny stripe threshold.
 *
 * @throws IOException if a cache read fails
 */
@Test
public void testTinyStripesReadCacheAt() throws IOException {
    DataSize maxMergeDistance = DataSize.of(1, Unit.MEGABYTE);
    DataSize tinyStripeThreshold = DataSize.of(8, Unit.MEGABYTE);

    // Scenario 1: read at the start of the first stripe, then at the start of the large third stripe.
    List<StripeInformation> smallThenLarge = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10));
    TestingOrcDataSource dataSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
    CachingOrcDataSource cache = new CachingOrcDataSource(
            dataSource,
            createTinyStripesRangeFinder(smallThenLarge, maxMergeDistance, tinyStripeThreshold));
    cache.readCacheAt(3);
    assertEquals(dataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 60)));
    cache.readCacheAt(63);
    assertEquals(dataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(63, 8 * 1048576)));

    // Scenario 2: same stripe layout on a fresh data source, but the first read is at the end of a stripe.
    List<StripeInformation> smallThenLargeAgain = ImmutableList.of(
            new StripeInformation(123, 3, 10, 10, 10),
            new StripeInformation(123, 33, 10, 10, 10),
            new StripeInformation(123, 63, 1048576 * 8 - 20, 10, 10));
    dataSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
    cache = new CachingOrcDataSource(
            dataSource,
            createTinyStripesRangeFinder(smallThenLargeAgain, maxMergeDistance, tinyStripeThreshold));
    cache.readCacheAt(62);
    assertEquals(dataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 60)));
    cache.readCacheAt(63);
    assertEquals(dataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(63, 8 * 1048576)));

    // Scenario 3: a one-byte-section stripe followed by two multi-megabyte stripes.
    List<StripeInformation> megabyteStripes = ImmutableList.of(
            new StripeInformation(123, 3, 1, 1, 1),
            new StripeInformation(123, 4, 1048576, 1048576, 1048576 * 3),
            new StripeInformation(123, 4 + 1048576 * 5, 1048576, 1048576, 1048576));
    dataSource = new TestingOrcDataSource(FakeOrcDataSource.INSTANCE);
    cache = new CachingOrcDataSource(
            dataSource,
            createTinyStripesRangeFinder(megabyteStripes, maxMergeDistance, tinyStripeThreshold));
    cache.readCacheAt(3);
    assertEquals(dataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(3, 1 + 1048576 * 5)));
    cache.readCacheAt(4 + 1048576 * 5);
    assertEquals(dataSource.getLastReadRanges(), ImmutableList.of(new DiskRange(4 + 1048576 * 5, 3 * 1048576)));
}
use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.
the class TestSliceDictionaryColumnReader method testDictionaryReaderUpdatesRetainedSize.
/**
 * Verifies that the local memory context given to a {@link SliceDictionaryColumnReader}
 * reports the same number of bytes as the reader's retained size after every block read,
 * and that it reports zero bytes once the reader has been closed.
 *
 * @throws Exception if writing or reading the temporary ORC file fails
 */
@Test
public void testDictionaryReaderUpdatesRetainedSize() throws Exception {
    // create orc file
    // NOTE(review): nothing in this method removes temporaryDirectory or orcFile afterwards — confirm whether cleanup is handled elsewhere.
    List<String> values = createValues();
    File temporaryDirectory = createTempDir();
    File orcFile = new File(temporaryDirectory, randomUUID().toString());
    writeOrcColumnTrino(orcFile, NONE, VARCHAR, values.iterator(), new OrcWriterStats());

    // prepare for read: the whole file is loaded into memory and served from there
    OrcDataSource dataSource = new MemoryOrcDataSource(new OrcDataSourceId(orcFile.getPath()), Slices.wrappedBuffer(readAllBytes(orcFile.toPath())));
    OrcReader orcReader = OrcReader.createOrcReader(dataSource, new OrcReaderOptions()).orElseThrow(() -> new RuntimeException("File is empty"));
    Footer footer = orcReader.getFooter();
    List<OrcColumn> columns = orcReader.getRootColumn().getNestedColumns();
    // assertEquals (rather than assertTrue on ==) so a failure reports the actual column count
    assertEquals(columns.size(), 1);

    StripeReader stripeReader = new StripeReader(dataSource, UTC, Optional.empty(), footer.getTypes(), ImmutableSet.copyOf(columns), footer.getRowsInRowGroup(), OrcPredicate.TRUE, ORIGINAL, new OrcMetadataReader(), Optional.empty());
    AggregatedMemoryContext memoryContext = newSimpleAggregatedMemoryContext();
    SliceDictionaryColumnReader columnReader = new SliceDictionaryColumnReader(columns.get(0), memoryContext.newLocalMemoryContext(TestSliceDictionaryColumnReader.class.getSimpleName()), -1, false);

    List<StripeInformation> stripeInformations = footer.getStripes();
    for (StripeInformation stripeInformation : stripeInformations) {
        // Each stripe is read with its own throwaway memory context so that memoryContext only tracks the column reader.
        Stripe stripe = stripeReader.readStripe(stripeInformation, newSimpleAggregatedMemoryContext());
        List<RowGroup> rowGroups = stripe.getRowGroups();
        columnReader.startStripe(stripe.getFileTimeZone(), stripe.getDictionaryStreamSources(), stripe.getColumnEncodings());
        for (RowGroup rowGroup : rowGroups) {
            columnReader.startRowGroup(rowGroup.getStreamSources());
            columnReader.prepareNextRead(1000);
            columnReader.readBlock();
            // memory usage check
            assertEquals(memoryContext.getBytes(), columnReader.getRetainedSizeInBytes());
        }
    }

    columnReader.close();
    // assertEquals so a failure reports how many bytes are still accounted for
    assertEquals(memoryContext.getBytes(), 0L);
}
use of io.trino.orc.metadata.StripeInformation in project trino by trinodb.
the class OrcRecordReader method advanceToNextStripe.
/**
 * Moves this reader on to the next stripe.
 *
 * <p>The memory context of the current stripe is closed and replaced with a new one, and
 * {@code rowGroups} is reset to an empty iterator. If a stripe was already current and stripe
 * statistics validation is present, the collected statistics are validated against that stripe
 * and the collector is reset. After advancing, the method returns early when there are no more
 * stripes; otherwise it reads the next stripe and, when the stripe reader returns a non-null
 * stripe, starts that stripe on every non-null column reader and exposes its row groups.
 *
 * @throws IOException if reading the stripe fails
 */
private void advanceToNextStripe() throws IOException {
    // Swap in a new memory context for the upcoming stripe and clear the row group iterator.
    currentStripeMemoryContext.close();
    currentStripeMemoryContext = memoryUsage.newAggregatedMemoryContext();
    rowGroups = ImmutableList.<RowGroup>of().iterator();

    // Validate the statistics gathered for the stripe being left, if there was one.
    if (currentStripe >= 0 && stripeStatisticsValidation.isPresent()) {
        StatisticsValidation validation = stripeStatisticsValidation.get();
        long stripeOffset = stripes.get(currentStripe).getOffset();
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), stripeOffset, validation.build().get());
        validation.reset();
    }

    currentStripe++;
    if (currentStripe >= stripes.size()) {
        // No stripes left; rowGroups stays empty.
        return;
    }

    // Account for the rows of the stripe that was just finished.
    if (currentStripe > 0) {
        currentStripePosition += stripes.get(currentStripe - 1).getNumberOfRows();
    }

    StripeInformation nextStripe = stripes.get(currentStripe);
    validateWriteStripe(nextStripe.getNumberOfRows());
    Stripe stripe = stripeReader.readStripe(nextStripe, currentStripeMemoryContext);
    if (stripe != null) {
        // Give readers access to dictionary streams
        InputStreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources();
        ColumnMetadata<ColumnEncoding> columnEncodings = stripe.getColumnEncodings();
        ZoneId fileTimeZone = stripe.getFileTimeZone();
        for (ColumnReader reader : columnReaders) {
            if (reader == null) {
                continue;
            }
            reader.startStripe(fileTimeZone, dictionaryStreamSources, columnEncodings);
        }
        rowGroups = stripe.getRowGroups().iterator();
    }

    // Refreshed whether or not a stripe was produced.
    orcDataSourceMemoryUsage.setBytes(orcDataSource.getRetainedSize());
}
Aggregations