
Example 1 with OffsetRange

Use of com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange in project presto by prestodb.

From the class ParquetReader, the method concatRanges:

private List<OffsetRange> concatRanges(List<OffsetRange> offsetRanges) {
    List<OffsetRange> pageRanges = new ArrayList<>();
    OffsetRange currentParts = null;
    for (OffsetRange range : offsetRanges) {
        long startPosition = range.getOffset();
        // first range, or not contiguous with the current one => start a new merged range
        if (currentParts == null || currentParts.endPos() != startPosition) {
            currentParts = new OffsetRange(startPosition, 0);
            pageRanges.add(currentParts);
        }
        // contiguous => extend the current merged range
        currentParts.extendLength(range.getLength());
    }
    return pageRanges;
}
Also used: OffsetRange (com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange), IntArrayList (it.unimi.dsi.fastutil.ints.IntArrayList), ArrayList (java.util.ArrayList), BooleanArrayList (it.unimi.dsi.fastutil.booleans.BooleanArrayList)
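
The merging above relies on OffsetRange exposing its start offset, its end position, and a way to grow its length. As a rough illustration of the same coalescing idea, here is a self-contained sketch; SimpleOffsetRange and mergeConsecutive are hypothetical stand-ins written for this example, not the Presto classes.

import java.util.ArrayList;
import java.util.List;

public class OffsetRangeMergeSketch {
    // Hypothetical stand-in for ColumnIndexFilterUtils.OffsetRange: a start offset plus a growable length.
    static final class SimpleOffsetRange {
        private final long offset;
        private long length;

        SimpleOffsetRange(long offset, long length) {
            this.offset = offset;
            this.length = length;
        }

        long getOffset() { return offset; }

        long getLength() { return length; }

        long endPos() { return offset + length; }

        void extendLength(long delta) { length += delta; }
    }

    // Same shape as concatRanges above: start a new output range when the next input
    // range is not contiguous with the current one, otherwise extend the current range.
    static List<SimpleOffsetRange> mergeConsecutive(List<SimpleOffsetRange> ranges) {
        List<SimpleOffsetRange> merged = new ArrayList<>();
        SimpleOffsetRange current = null;
        for (SimpleOffsetRange range : ranges) {
            if (current == null || current.endPos() != range.getOffset()) {
                current = new SimpleOffsetRange(range.getOffset(), 0);
                merged.add(current);
            }
            current.extendLength(range.getLength());
        }
        return merged;
    }

    public static void main(String[] args) {
        List<SimpleOffsetRange> input = List.of(
                new SimpleOffsetRange(0, 100),
                new SimpleOffsetRange(100, 50),  // contiguous with the previous range => merged
                new SimpleOffsetRange(400, 25)); // gap => starts a new range
        for (SimpleOffsetRange range : mergeConsecutive(input)) {
            // Prints: offset=0 length=150, then offset=400 length=25
            System.out.println("offset=" + range.getOffset() + " length=" + range.getLength());
        }
    }
}

Coalescing contiguous ranges this way means each entry in the result can be satisfied by one sequential read, which is what the readPrimitive example below takes advantage of.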

Example 2 with OffsetRange

Use of com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange in project presto by prestodb.

From the class ParquetReader, the method readPrimitive:

private ColumnChunk readPrimitive(PrimitiveField field) throws IOException {
    ColumnDescriptor columnDescriptor = field.getDescriptor();
    int fieldId = field.getId();
    ColumnReader columnReader = columnReaders[fieldId];
    if (!columnReader.isInitialized()) {
        validateParquet(currentBlockMetadata.getRowCount() > 0, "Row group has 0 rows");
        ColumnChunkMetaData metadata = getColumnChunkMetaData(columnDescriptor);
        long startingPosition = metadata.getStartingPos();
        int totalSize = toIntExact(metadata.getTotalSize());
        if (shouldUseColumnIndex(metadata.getPath())) {
            // Column index path: only the pages whose row ranges survive the filter are read.
            OffsetIndex offsetIndex = blockIndexStores.get(currentBlock).getOffsetIndex(metadata.getPath());
            OffsetIndex filteredOffsetIndex = ColumnIndexFilterUtils.filterOffsetIndex(offsetIndex, currentGroupRowRanges, blocks.get(currentBlock).getRowCount());
            List<OffsetRange> offsetRanges = ColumnIndexFilterUtils.calculateOffsetRanges(filteredOffsetIndex, metadata, offsetIndex.getOffset(0), startingPosition);
            // Merge contiguous ranges (concatRanges above) so each merged range needs a single read.
            List<OffsetRange> consecutiveRanges = concatRanges(offsetRanges);
            List<ByteBuffer> buffers = allocateBlocks(consecutiveRanges);
            for (int i = 0; i < consecutiveRanges.size(); i++) {
                ByteBuffer buffer = buffers.get(i);
                dataSource.readFully(startingPosition + consecutiveRanges.get(i).getOffset(), buffer.array());
            }
            PageReader pageReader = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
            columnReader.init(pageReader, field, currentGroupRowRanges);
            if (enableVerification) {
                ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
                PageReader pageReaderVerification = createPageReader(buffers, totalSize, metadata, columnDescriptor, filteredOffsetIndex);
                verificationColumnReader.init(pageReaderVerification, field, currentGroupRowRanges);
            }
        } else {
            byte[] buffer = allocateBlock(totalSize);
            dataSource.readFully(startingPosition, buffer);
            PageReader pageReader = createPageReader(buffer, totalSize, metadata, columnDescriptor);
            columnReader.init(pageReader, field, null);
            if (enableVerification) {
                ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
                PageReader pageReaderVerification = createPageReader(buffer, totalSize, metadata, columnDescriptor);
                verificationColumnReader.init(pageReaderVerification, field, null);
            }
        }
    }
    ColumnChunk columnChunk = columnReader.readNext();
    columnChunk = typeCoercion(columnChunk, field.getDescriptor().getPrimitiveType().getPrimitiveTypeName(), field.getType());
    if (enableVerification) {
        ColumnReader verificationColumnReader = verificationColumnReaders[field.getId()];
        ColumnChunk expected = verificationColumnReader.readNext();
        ParquetResultVerifierUtils.verifyColumnChunks(columnChunk, expected, columnDescriptor.getPath().length > 1, field, dataSource.getId());
    }
    // update max size per primitive column chunk
    long bytesPerCell = columnChunk.getBlock().getSizeInBytes() / batchSize;
    if (maxBytesPerCell[fieldId] < bytesPerCell) {
        // update batch size
        maxCombinedBytesPerRow = maxCombinedBytesPerRow - maxBytesPerCell[fieldId] + bytesPerCell;
        maxBatchSize = toIntExact(min(maxBatchSize, max(1, maxReadBlockBytes / maxCombinedBytesPerRow)));
        maxBytesPerCell[fieldId] = bytesPerCell;
    }
    return columnChunk;
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ByteBuffer (java.nio.ByteBuffer), OffsetRange (com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange), ColumnReader (com.facebook.presto.parquet.ColumnReader), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex)
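
To make the shape of the column-index branch easier to follow, here is a self-contained sketch of the same read pattern under simplified assumptions: Page, the selected array, and the long[] {offset, length} pairs are hypothetical stand-ins for what OffsetIndex, the row-range filter, and OffsetRange provide in the Presto code, and a plain byte array plays the role of dataSource.

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.List;

public class ColumnIndexReadSketch {
    // Hypothetical page descriptor: the page's offset relative to the start of the
    // column chunk plus its compressed size (roughly what an offset index provides).
    record Page(long relativeOffset, int compressedSize) {}

    public static void main(String[] args) {
        byte[] columnChunk = new byte[1000];            // stands in for the bytes behind dataSource
        long startingPosition = 0;                      // metadata.getStartingPos() in the real code
        List<Page> pages = List.of(new Page(0, 100), new Page(100, 100), new Page(200, 100), new Page(700, 100));
        boolean[] selected = {true, true, false, true}; // pages kept by the row-range filter

        // 1. Turn each selected page into an {offset, length} range (calculateOffsetRanges in the real code).
        List<long[]> ranges = new ArrayList<>();
        for (int i = 0; i < pages.size(); i++) {
            if (selected[i]) {
                ranges.add(new long[] {pages.get(i).relativeOffset(), pages.get(i).compressedSize()});
            }
        }

        // 2. Merge contiguous ranges so each merged range becomes a single read (concatRanges in the real code).
        List<long[]> merged = new ArrayList<>();
        for (long[] range : ranges) {
            long[] last = merged.isEmpty() ? null : merged.get(merged.size() - 1);
            if (last != null && last[0] + last[1] == range[0]) {
                last[1] += range[1];
            }
            else {
                merged.add(new long[] {range[0], range[1]});
            }
        }

        // 3. Allocate one buffer per merged range and fill it, mimicking
        //    dataSource.readFully(startingPosition + range.getOffset(), buffer.array()).
        List<ByteBuffer> buffers = new ArrayList<>();
        for (long[] range : merged) {
            ByteBuffer buffer = ByteBuffer.allocate((int) range[1]);
            System.arraycopy(columnChunk, (int) (startingPosition + range[0]), buffer.array(), 0, (int) range[1]);
            buffers.add(buffer);
            // Prints: "read 200 bytes at offset 0" then "read 100 bytes at offset 700"
            System.out.println("read " + range[1] + " bytes at offset " + (startingPosition + range[0]));
        }
    }
}

Merging before allocating means each surviving range maps to exactly one buffer and one readFully-style call, so skipping filtered pages does not turn the column chunk into many small reads.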

Aggregations

OffsetRange (com.facebook.presto.parquet.reader.ColumnIndexFilterUtils.OffsetRange) 2
ColumnReader (com.facebook.presto.parquet.ColumnReader) 1
RichColumnDescriptor (com.facebook.presto.parquet.RichColumnDescriptor) 1
BooleanArrayList (it.unimi.dsi.fastutil.booleans.BooleanArrayList) 1
IntArrayList (it.unimi.dsi.fastutil.ints.IntArrayList) 1
ByteBuffer (java.nio.ByteBuffer) 1
ArrayList (java.util.ArrayList) 1
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor) 1
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) 1
OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex) 1