Search in sources:

Example 1 with ColumnReader

use of io.trino.orc.reader.ColumnReader in project trino by trinodb.

the class OrcRecordReader method advanceToNextStripe.

/**
 * Moves the reader to the next stripe.
 *
 * <p>Replaces the per-stripe memory context, validates the statistics collected for the
 * stripe being left (when stripe statistics validation is enabled), increments
 * {@code currentStripe}, and, if another stripe exists, reads it and hands its dictionary
 * streams and column encodings to every non-null column reader. When there are no more
 * stripes, or the stripe reader returns {@code null}, {@code rowGroups} is left as an
 * empty iterator.
 *
 * @throws IOException propagated from the calls made while validating or reading the stripe
 */
private void advanceToNextStripe() throws IOException {
    // Swap in a fresh memory context for the stripe about to be read.
    currentStripeMemoryContext.close();
    currentStripeMemoryContext = memoryUsage.newAggregatedMemoryContext();
    // Start with no row groups; replaced below only if a stripe is actually loaded.
    rowGroups = ImmutableList.<RowGroup>of().iterator();

    // A stripe has already been processed: check its accumulated statistics, then reset them.
    if (currentStripe >= 0 && stripeStatisticsValidation.isPresent()) {
        StatisticsValidation finishedStripeStatistics = stripeStatisticsValidation.get();
        long finishedStripeOffset = stripes.get(currentStripe).getOffset();
        writeValidation.get().validateStripeStatistics(orcDataSource.getId(), finishedStripeOffset, finishedStripeStatistics.build().get());
        finishedStripeStatistics.reset();
    }

    currentStripe++;
    if (currentStripe >= stripes.size()) {
        // Ran off the end of the stripe list; nothing more to load.
        return;
    }

    // Accumulate the row count of the stripe just finished into the stripe start position.
    if (currentStripe > 0) {
        currentStripePosition += stripes.get(currentStripe - 1).getNumberOfRows();
    }

    StripeInformation nextStripeInformation = stripes.get(currentStripe);
    validateWriteStripe(nextStripeInformation.getNumberOfRows());

    Stripe loadedStripe = stripeReader.readStripe(nextStripeInformation, currentStripeMemoryContext);
    if (loadedStripe != null) {
        // Readers need the stripe-level dictionary streams, encodings and time zone
        // before any row group of this stripe is started.
        InputStreamSources dictionaryStreams = loadedStripe.getDictionaryStreamSources();
        ColumnMetadata<ColumnEncoding> encodings = loadedStripe.getColumnEncodings();
        ZoneId stripeTimeZone = loadedStripe.getFileTimeZone();
        for (int readerIndex = 0; readerIndex < columnReaders.length; readerIndex++) {
            ColumnReader reader = columnReaders[readerIndex];
            if (reader == null) {
                continue;
            }
            reader.startStripe(stripeTimeZone, dictionaryStreams, encodings);
        }
        rowGroups = loadedStripe.getRowGroups().iterator();
    }

    // Publish the data source's current retained size to the memory accounting.
    orcDataSourceMemoryUsage.setBytes(orcDataSource.getRetainedSize());
}
Also used : ColumnEncoding(io.trino.orc.metadata.ColumnEncoding) InputStreamSources(io.trino.orc.stream.InputStreamSources) ZoneId(java.time.ZoneId) StatisticsValidation(io.trino.orc.OrcWriteValidation.StatisticsValidation) ColumnReader(io.trino.orc.reader.ColumnReader) ColumnReaders.createColumnReader(io.trino.orc.reader.ColumnReaders.createColumnReader) StripeInformation(io.trino.orc.metadata.StripeInformation)

Example 2 with ColumnReader

use of io.trino.orc.reader.ColumnReader in project trino by trinodb.

the class OrcRecordReader method createColumnReaders.

/**
 * Builds one {@link ColumnReader} per entry of {@code columns}.
 *
 * <p>The three lists are read in parallel: position {@code i} of {@code readTypes} and
 * {@code readLayouts} is paired with position {@code i} of {@code columns}.
 *
 * @param columns the ORC columns to create readers for
 * @param readTypes the type to read each column as, indexed like {@code columns}
 * @param readLayouts the projected layout for each column, indexed like {@code columns}
 * @param memoryContext passed through to each created reader
 * @param blockFactory passed through to each created reader
 * @param fieldMapperFactory passed through to each created reader
 * @return an array with the reader for {@code columns.get(i)} at index {@code i}
 * @throws OrcCorruptionException propagated from {@code createColumnReader}
 */
private static ColumnReader[] createColumnReaders(List<OrcColumn> columns, List<Type> readTypes, List<OrcReader.ProjectedLayout> readLayouts, AggregatedMemoryContext memoryContext, OrcBlockFactory blockFactory, FieldMapperFactory fieldMapperFactory) throws OrcCorruptionException {
    int readerCount = columns.size();
    ColumnReader[] readers = new ColumnReader[readerCount];
    for (int i = 0; i < readerCount; i++) {
        readers[i] = createColumnReader(readTypes.get(i), columns.get(i), readLayouts.get(i), memoryContext, blockFactory, fieldMapperFactory);
    }
    return readers;
}
Also used : Type(io.trino.spi.type.Type) OrcType(io.trino.orc.metadata.OrcType) ColumnReader(io.trino.orc.reader.ColumnReader) ColumnReaders.createColumnReader(io.trino.orc.reader.ColumnReaders.createColumnReader)

Example 3 with ColumnReader

use of io.trino.orc.reader.ColumnReader in project trino by trinodb.

the class OrcRecordReader method nextPage.

/**
 * Produces the next page of rows, or {@code null} once no further row group is available.
 *
 * <p>The positions are first advanced past the previously returned batch. If the current
 * row group is exhausted the reader moves to the next one; when that fails the positions
 * are pinned to the end-of-file values and {@code null} is returned.
 *
 * @return a page holding one block per column reader, or {@code null} at end of data
 * @throws IOException propagated from advancing to the next row group or preparing readers
 */
public Page nextPage() throws IOException {
    // Account for the batch handed out by the previous call (advancing resets these positions).
    filePosition += currentBatchSize;
    currentPosition += currentBatchSize;
    currentBatchSize = 0;

    // The current row group is used up: move on, or report end of data if there is none left.
    if (nextRowInGroup >= currentGroupRowCount && !advanceToNextRowGroup()) {
        filePosition = fileRowCount;
        currentPosition = totalRowCount;
        return null;
    }

    // Batch sizing. The batch grows by BATCH_SIZE_GROWTH_FACTOR per call, bounded by maxBatchSize
    // (itself adjusted per row group and never above MAX_BATCH_SIZE) and by the rows remaining in
    // this row group. The growth for the following batch must be derived from
    // min(nextBatchSize, maxBatchSize) and NOT from the possibly truncated tail of a row group:
    // with reads of 1, 16, 256, 1024, ..., 1024, 256 where the last 256 is merely what was left
    // in the group, the following batch should be 1024 rather than 512. Hence nextBatchSize is
    // computed before currentBatchSize is clipped to the remaining row count.
    currentBatchSize = min(nextBatchSize, maxBatchSize);
    nextBatchSize = min(currentBatchSize * BATCH_SIZE_GROWTH_FACTOR, MAX_BATCH_SIZE);
    currentBatchSize = toIntExact(min(currentBatchSize, currentGroupRowCount - nextRowInGroup));

    // Tell every reader how many rows the upcoming read covers.
    for (int readerIndex = 0; readerIndex < columnReaders.length; readerIndex++) {
        ColumnReader reader = columnReaders[readerIndex];
        if (reader == null) {
            continue;
        }
        reader.prepareNextRead(currentBatchSize);
    }
    nextRowInGroup += currentBatchSize;

    // Assemble the page: each block is created by the block factory from the reader's readBlock.
    blockFactory.nextPage();
    Arrays.fill(currentBytesPerCell, 0);
    Block[] blocks = new Block[columnReaders.length];
    for (int i = 0; i < columnReaders.length; i++) {
        // Effectively-final copy so the load listener can capture the column position.
        int columnIndex = i;
        ColumnReader reader = columnReaders[columnIndex];
        Block block = blockFactory.createBlock(currentBatchSize, reader::readBlock, false);
        blocks[columnIndex] = block;
        listenForLoads(block, loadedBlock -> blockLoaded(columnIndex, loadedBlock));
    }

    Page result = new Page(currentBatchSize, blocks);
    validateWritePageChecksum(result);
    return result;
}
Also used : Block(io.trino.spi.block.Block) Page(io.trino.spi.Page) ColumnReader(io.trino.orc.reader.ColumnReader) ColumnReaders.createColumnReader(io.trino.orc.reader.ColumnReaders.createColumnReader)

Example 4 with ColumnReader

use of io.trino.orc.reader.ColumnReader in project trino by trinodb.

the class OrcRecordReader method close.

/**
 * Closes the data source and all non-null column readers, then runs the write validations
 * that are enabled: the accumulated checksum (row count, per-column hashes, stripe hash)
 * and the file-level column statistics.
 *
 * @throws IOException propagated from closing the registered resources or from validation
 */
@Override
public void close() throws IOException {
    // Everything registered here is closed when the try block exits.
    try (Closer resources = Closer.create()) {
        resources.register(orcDataSource);
        for (int readerIndex = 0; readerIndex < columnReaders.length; readerIndex++) {
            ColumnReader reader = columnReaders[readerIndex];
            if (reader == null) {
                continue;
            }
            resources.register(reader::close);
        }
    }

    // Compare the checksum computed while reading against the one recorded by the writer.
    if (writeChecksumBuilder.isPresent()) {
        WriteChecksum computedChecksum = writeChecksumBuilder.get().build();
        validateWrite(expected -> expected.getChecksum().getTotalRowCount() == computedChecksum.getTotalRowCount(), "Invalid row count");

        List<Long> computedColumnHashes = computedChecksum.getColumnHashes();
        for (int i = 0; i < computedColumnHashes.size(); i++) {
            // Effectively-final copy for capture by the lambda.
            int column = i;
            validateWrite(expected -> expected.getChecksum().getColumnHashes().get(column).equals(computedColumnHashes.get(column)), "Invalid checksum for column %s", column);
        }

        validateWrite(expected -> expected.getChecksum().getStripeHash() == computedChecksum.getStripeHash(), "Invalid stripes checksum");
    }

    // Validate the statistics accumulated over the whole file, if that validation is enabled.
    if (fileStatisticsValidation.isPresent()) {
        Optional<ColumnMetadata<ColumnStatistics>> fileColumnStatistics = fileStatisticsValidation.get().build();
        writeValidation.get().validateFileStatistics(orcDataSource.getId(), fileColumnStatistics);
    }
}
Also used : Closer(com.google.common.io.Closer) ColumnMetadata(io.trino.orc.metadata.ColumnMetadata) Comparator.comparingLong(java.util.Comparator.comparingLong) ColumnReader(io.trino.orc.reader.ColumnReader) ColumnReaders.createColumnReader(io.trino.orc.reader.ColumnReaders.createColumnReader) WriteChecksum(io.trino.orc.OrcWriteValidation.WriteChecksum)

Example 5 with ColumnReader

use of io.trino.orc.reader.ColumnReader in project trino by trinodb.

the class OrcRecordReader method advanceToNextRowGroup.

/**
 * Moves the reader to the next row group, crossing into following stripes as needed.
 *
 * <p>Before moving, the statistics gathered for the row group being left are validated
 * when row group statistics validation is enabled. On success the positions are updated
 * for the new row group and each non-null column reader receives its stream sources.
 *
 * @return {@code true} if a row group was started, {@code false} if none remain
 * @throws IOException propagated from validation or from advancing to the next stripe
 */
private boolean advanceToNextRowGroup() throws IOException {
    nextRowInGroup = 0;

    // A row group has already been processed: check its accumulated statistics, then reset them.
    if (this.currentRowGroup >= 0 && rowGroupStatisticsValidation.isPresent()) {
        StatisticsValidation finishedGroupStatistics = rowGroupStatisticsValidation.get();
        long stripeOffset = stripes.get(currentStripe).getOffset();
        writeValidation.get().validateRowGroupStatistics(orcDataSource.getId(), stripeOffset, this.currentRowGroup, finishedGroupStatistics.build().get());
        finishedGroupStatistics.reset();
    }

    // Keep loading stripes until one yields a row group or the stripes are exhausted.
    while (!rowGroups.hasNext() && currentStripe < stripes.size()) {
        advanceToNextStripe();
        this.currentRowGroup = -1;
    }

    if (!rowGroups.hasNext()) {
        currentGroupRowCount = 0;
        return false;
    }

    this.currentRowGroup++;
    RowGroup rowGroup = rowGroups.next();
    currentGroupRowCount = rowGroup.getRowCount();

    // Lower the batch size cap so that a batch of this row group stays within maxBlockBytes,
    // but never below a single row.
    if (rowGroup.getMinAverageRowBytes() > 0) {
        maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxBlockBytes / rowGroup.getMinAverageRowBytes())));
    }

    currentPosition = currentStripePosition + rowGroup.getRowOffset();
    filePosition = stripeFilePositions.get(currentStripe) + rowGroup.getRowOffset();

    // Hand the row group's data streams to the readers.
    InputStreamSources groupStreamSources = rowGroup.getStreamSources();
    for (int readerIndex = 0; readerIndex < columnReaders.length; readerIndex++) {
        ColumnReader reader = columnReaders[readerIndex];
        if (reader == null) {
            continue;
        }
        reader.startRowGroup(groupStreamSources);
    }
    return true;
}
Also used : InputStreamSources(io.trino.orc.stream.InputStreamSources) StatisticsValidation(io.trino.orc.OrcWriteValidation.StatisticsValidation) ColumnReader(io.trino.orc.reader.ColumnReader) ColumnReaders.createColumnReader(io.trino.orc.reader.ColumnReaders.createColumnReader)

Aggregations

ColumnReader (io.trino.orc.reader.ColumnReader)5 ColumnReaders.createColumnReader (io.trino.orc.reader.ColumnReaders.createColumnReader)5 StatisticsValidation (io.trino.orc.OrcWriteValidation.StatisticsValidation)2 InputStreamSources (io.trino.orc.stream.InputStreamSources)2 Closer (com.google.common.io.Closer)1 WriteChecksum (io.trino.orc.OrcWriteValidation.WriteChecksum)1 ColumnEncoding (io.trino.orc.metadata.ColumnEncoding)1 ColumnMetadata (io.trino.orc.metadata.ColumnMetadata)1 OrcType (io.trino.orc.metadata.OrcType)1 StripeInformation (io.trino.orc.metadata.StripeInformation)1 Page (io.trino.spi.Page)1 Block (io.trino.spi.block.Block)1 Type (io.trino.spi.type.Type)1 ZoneId (java.time.ZoneId)1 Comparator.comparingLong (java.util.Comparator.comparingLong)1