Usage of io.trino.orc.reader.ColumnReader in the Trino project (trinodb):
class OrcRecordReader, method advanceToNextStripe.
/**
 * Closes out the stripe currently being read and positions the reader at the
 * next stripe: validates the finished stripe's statistics when write
 * validation is enabled, reads the next stripe, and hands its dictionary
 * streams to every column reader.
 *
 * @throws IOException if reading the next stripe from the data source fails
 */
private void advanceToNextStripe() throws IOException {
// release memory retained by the previous stripe before reading the next one
currentStripeMemoryContext.close();
currentStripeMemoryContext = memoryUsage.newAggregatedMemoryContext();
// clear row groups; repopulated below once the new stripe has been read
rowGroups = ImmutableList.<RowGroup>of().iterator();
// currentStripe >= 0 means we just finished reading a stripe: validate its
// accumulated statistics against the writer's, if validation is enabled
if (currentStripe >= 0) {
if (stripeStatisticsValidation.isPresent()) {
StatisticsValidation statisticsValidation = stripeStatisticsValidation.get();
long offset = stripes.get(currentStripe).getOffset();
writeValidation.get().validateStripeStatistics(orcDataSource.getId(), offset, statisticsValidation.build().get());
// reset so the validator accumulates only the next stripe's rows
statisticsValidation.reset();
}
}
currentStripe++;
// past the last stripe: nothing more to read
if (currentStripe >= stripes.size()) {
return;
}
// accumulate the row offset of all stripes before the current one
if (currentStripe > 0) {
currentStripePosition += stripes.get(currentStripe - 1).getNumberOfRows();
}
StripeInformation stripeInformation = stripes.get(currentStripe);
validateWriteStripe(stripeInformation.getNumberOfRows());
Stripe stripe = stripeReader.readStripe(stripeInformation, currentStripeMemoryContext);
// NOTE(review): readStripe may return null — presumably when the stripe has
// no readable row groups (e.g. all filtered out); confirm against StripeReader
if (stripe != null) {
// Give readers access to dictionary streams
InputStreamSources dictionaryStreamSources = stripe.getDictionaryStreamSources();
ColumnMetadata<ColumnEncoding> columnEncodings = stripe.getColumnEncodings();
ZoneId fileTimeZone = stripe.getFileTimeZone();
// entries may be null for columns that are not read
for (ColumnReader column : columnReaders) {
if (column != null) {
column.startStripe(fileTimeZone, dictionaryStreamSources, columnEncodings);
}
}
rowGroups = stripe.getRowGroups().iterator();
}
// record the data source's retained memory after the stripe read
orcDataSourceMemoryUsage.setBytes(orcDataSource.getRetainedSize());
}
Usage of io.trino.orc.reader.ColumnReader in the Trino project (trinodb):
class OrcRecordReader, method createColumnReaders.
/**
 * Builds one {@link ColumnReader} per projected column. The i-th reader is
 * constructed from the i-th entries of the parallel {@code columns},
 * {@code readTypes}, and {@code readLayouts} lists.
 *
 * @throws OrcCorruptionException if a reader cannot be created for a column
 */
private static ColumnReader[] createColumnReaders(List<OrcColumn> columns, List<Type> readTypes, List<OrcReader.ProjectedLayout> readLayouts, AggregatedMemoryContext memoryContext, OrcBlockFactory blockFactory, FieldMapperFactory fieldMapperFactory) throws OrcCorruptionException {
    int columnCount = columns.size();
    ColumnReader[] readers = new ColumnReader[columnCount];
    for (int i = 0; i < columnCount; i++) {
        readers[i] = createColumnReader(
                readTypes.get(i),
                columns.get(i),
                readLayouts.get(i),
                memoryContext,
                blockFactory,
                fieldMapperFactory);
    }
    return readers;
}
Usage of io.trino.orc.reader.ColumnReader in the Trino project (trinodb):
class OrcRecordReader, method nextPage.
/**
 * Returns the next page of rows as lazily-loaded blocks, or {@code null}
 * when the file is exhausted. Batch size grows geometrically between calls
 * (see the comment below) and is capped by the rows remaining in the
 * current row group.
 *
 * @return the next page, or {@code null} at end of file
 * @throws IOException if advancing to the next row group fails
 */
public Page nextPage() throws IOException {
// update position for current row group (advancing resets them)
filePosition += currentBatchSize;
currentPosition += currentBatchSize;
currentBatchSize = 0;
// if next row is within the current group return
if (nextRowInGroup >= currentGroupRowCount) {
// attempt to advance to next row group
if (!advanceToNextRowGroup()) {
// end of file: pin positions to the totals and signal completion
filePosition = fileRowCount;
currentPosition = totalRowCount;
return null;
}
}
// We will grow currentBatchSize by BATCH_SIZE_GROWTH_FACTOR starting from initialBatchSize to maxBatchSize or
// the number of rows left in this rowgroup, whichever is smaller. maxBatchSize is adjusted according to the
// block size for every batch and never exceed MAX_BATCH_SIZE. But when the number of rows in the last batch in
// the current rowgroup is smaller than min(nextBatchSize, maxBatchSize), the nextBatchSize for next batch in
// the new rowgroup should be grown based on min(nextBatchSize, maxBatchSize) but not by the number of rows in
// the last batch, i.e. currentGroupRowCount - nextRowInGroup. For example, if the number of rows read for
// single fixed width column are: 1, 16, 256, 1024, 1024,..., 1024, 256 and the 256 was because there is only
// 256 rows left in this row group, then the nextBatchSize should be 1024 instead of 512. So we need to grow the
// nextBatchSize before limiting the currentBatchSize by currentGroupRowCount - nextRowInGroup.
currentBatchSize = min(nextBatchSize, maxBatchSize);
nextBatchSize = min(currentBatchSize * BATCH_SIZE_GROWTH_FACTOR, MAX_BATCH_SIZE);
currentBatchSize = toIntExact(min(currentBatchSize, currentGroupRowCount - nextRowInGroup));
// tell each reader (null entries are unread columns) how many rows to produce
for (ColumnReader column : columnReaders) {
if (column != null) {
column.prepareNextRead(currentBatchSize);
}
}
nextRowInGroup += currentBatchSize;
// create a lazy page
blockFactory.nextPage();
Arrays.fill(currentBytesPerCell, 0);
Block[] blocks = new Block[columnReaders.length];
for (int i = 0; i < columnReaders.length; i++) {
// copy to an effectively-final local for capture by the lambdas below
int columnIndex = i;
blocks[columnIndex] = blockFactory.createBlock(currentBatchSize, columnReaders[columnIndex]::readBlock, false);
// track per-column bytes as blocks are actually materialized
listenForLoads(blocks[columnIndex], block -> blockLoaded(columnIndex, block));
}
Page page = new Page(currentBatchSize, blocks);
validateWritePageChecksum(page);
return page;
}
Usage of io.trino.orc.reader.ColumnReader in the Trino project (trinodb):
class OrcRecordReader, method close.
@Override
public void close() throws IOException {
    // Close the data source and every non-null column reader; Closer guarantees
    // all registered resources are closed even if one of them throws.
    try (Closer resourceCloser = Closer.create()) {
        resourceCloser.register(orcDataSource);
        for (ColumnReader reader : columnReaders) {
            if (reader == null) {
                continue;
            }
            resourceCloser.register(reader::close);
        }
    }

    // When write validation is enabled, compare the checksums accumulated while
    // reading against the ones recorded by the writer.
    if (writeChecksumBuilder.isPresent()) {
        WriteChecksum actualChecksum = writeChecksumBuilder.get().build();
        validateWrite(validation -> validation.getChecksum().getTotalRowCount() == actualChecksum.getTotalRowCount(), "Invalid row count");
        List<Long> columnHashes = actualChecksum.getColumnHashes();
        for (int index = 0; index < columnHashes.size(); index++) {
            // effectively-final copy of the loop variable for lambda capture
            int column = index;
            validateWrite(validation -> validation.getChecksum().getColumnHashes().get(column).equals(columnHashes.get(column)), "Invalid checksum for column %s", column);
        }
        validateWrite(validation -> validation.getChecksum().getStripeHash() == actualChecksum.getStripeHash(), "Invalid stripes checksum");
    }

    // Validate whole-file statistics against the writer's, if enabled.
    if (fileStatisticsValidation.isPresent()) {
        Optional<ColumnMetadata<ColumnStatistics>> fileStatistics = fileStatisticsValidation.get().build();
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), fileStatistics);
    }
}
Usage of io.trino.orc.reader.ColumnReader in the Trino project (trinodb):
class OrcRecordReader, method advanceToNextRowGroup.
/**
 * Advances to the next row group, reading further stripes as needed, and
 * starts the column readers on the new row group's data streams.
 *
 * <p>Fix: the original declared a local {@code RowGroup currentRowGroup}
 * that shadowed the {@code int currentRowGroup} field incremented two lines
 * earlier; the local is renamed to {@code rowGroup} to remove the shadowing.
 *
 * @return {@code true} if positioned on a new row group, {@code false} when
 *         the file has no more row groups
 * @throws IOException if reading a stripe fails
 */
private boolean advanceToNextRowGroup() throws IOException {
    nextRowInGroup = 0;

    // currentRowGroup >= 0 means we just finished a row group: validate its
    // accumulated statistics against the writer's, if validation is enabled
    if (currentRowGroup >= 0) {
        if (rowGroupStatisticsValidation.isPresent()) {
            StatisticsValidation statisticsValidation = rowGroupStatisticsValidation.get();
            long offset = stripes.get(currentStripe).getOffset();
            writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), offset, currentRowGroup, statisticsValidation.build().get());
            // reset so the validator accumulates only the next row group's rows
            statisticsValidation.reset();
        }
    }

    // keep reading stripes until one yields row groups or we run out of stripes
    while (!rowGroups.hasNext() && currentStripe < stripes.size()) {
        advanceToNextStripe();
        currentRowGroup = -1;
    }

    if (!rowGroups.hasNext()) {
        currentGroupRowCount = 0;
        return false;
    }

    currentRowGroup++;
    // local renamed from "currentRowGroup" to avoid shadowing the int field above
    RowGroup rowGroup = rowGroups.next();
    currentGroupRowCount = rowGroup.getRowCount();
    // shrink the batch cap so a batch of wide rows stays under maxBlockBytes
    if (rowGroup.getMinAverageRowBytes() > 0) {
        maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxBlockBytes / rowGroup.getMinAverageRowBytes())));
    }

    currentPosition = currentStripePosition + rowGroup.getRowOffset();
    filePosition = stripeFilePositions.get(currentStripe) + rowGroup.getRowOffset();

    // give reader data streams from row group (null entries are unread columns)
    InputStreamSources rowGroupStreamSources = rowGroup.getStreamSources();
    for (ColumnReader column : columnReaders) {
        if (column != null) {
            column.startRowGroup(rowGroupStreamSources);
        }
    }

    return true;
}
Aggregations