
Example 1 with OffsetRange

Use of org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange in project parquet-mr by apache.

From the class ParquetFileReader, the method internalReadFilteredRowGroup. It reads only the pages of a row group that overlap the row ranges selected by column-index filtering, using OffsetRange to describe the byte ranges to fetch and coalescing adjacent ranges so they can be read in a single scan:

private ColumnChunkPageReadStore internalReadFilteredRowGroup(BlockMetaData block, RowRanges rowRanges, ColumnIndexStore ciStore) throws IOException {
    ColumnChunkPageReadStore rowGroup = new ColumnChunkPageReadStore(rowRanges);
    // prepare the list of consecutive parts to read them in one scan
    ChunkListBuilder builder = new ChunkListBuilder(block.getRowCount());
    List<ConsecutivePartList> allParts = new ArrayList<>();
    ConsecutivePartList currentParts = null;
    for (ColumnChunkMetaData mc : block.getColumns()) {
        ColumnPath pathKey = mc.getPath();
        ColumnDescriptor columnDescriptor = paths.get(pathKey);
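        // columns without a descriptor in 'paths' are not part of the requested projection and are skipped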
        if (columnDescriptor != null) {
            OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
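            // narrow the offset index to pages overlapping the selected row ranges,
            // then compute the byte ranges (OffsetRange = offset + length) that must be read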
            OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
            for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
                BenchmarkCounter.incrementTotalBytes(range.getLength());
                long startingPos = range.getOffset();
                // first part or not consecutive => new list
                if (currentParts == null || currentParts.endPos() != startingPos) {
                    currentParts = new ConsecutivePartList(startingPos);
                    allParts.add(currentParts);
                }
                ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos, range.getLength());
                currentParts.addChunk(chunkDescriptor);
                builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
            }
        }
    }
    // actually read all the chunks
    for (ConsecutivePartList consecutiveChunks : allParts) {
        consecutiveChunks.readAll(f, builder);
    }
    for (Chunk chunk : builder.build()) {
        readChunkPages(chunk, block, rowGroup);
    }
    return rowGroup;
}
Also used: ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData), ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor), ArrayList (java.util.ArrayList), ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath), OffsetRange (org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange), OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex), ColumnIndexFilterUtils.filterOffsetIndex (org.apache.parquet.hadoop.ColumnIndexFilterUtils.filterOffsetIndex)
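
The key point of this example is how the OffsetRange values (a byte offset plus a length) returned by calculateOffsetRanges are coalesced: whenever a range starts exactly where the previous ConsecutivePartList ends, it is added to that list instead of starting a new one, so adjacent pages are fetched in one scan. Below is a minimal standalone sketch of that coalescing step, assuming nothing from parquet-mr; RangeCoalescingSketch, ByteRange, and coalesce are hypothetical names introduced only for illustration.

import java.util.ArrayList;
import java.util.List;

public class RangeCoalescingSketch {

    // Hypothetical stand-in for ColumnIndexFilterUtils.OffsetRange: a byte range in the file.
    record ByteRange(long offset, long length) {
        long endPos() { return offset + length; }
    }

    // Merge ranges that are directly consecutive, mirroring the
    // "currentParts.endPos() != startingPos" check in the method above.
    static List<ByteRange> coalesce(List<ByteRange> ranges) {
        List<ByteRange> merged = new ArrayList<>();
        ByteRange current = null;
        for (ByteRange r : ranges) {
            if (current == null || current.endPos() != r.offset()) {
                // first range, or a gap before this one => start a new read
                current = r;
                merged.add(current);
            } else {
                // directly consecutive => extend the current read
                current = new ByteRange(current.offset(), current.length() + r.length());
                merged.set(merged.size() - 1, current);
            }
        }
        return merged;
    }

    public static void main(String[] args) {
        List<ByteRange> ranges = List.of(
                new ByteRange(0, 100),   // first page group
                new ByteRange(100, 50),  // starts exactly where the previous one ends => merged
                new ByteRange(400, 25)); // gap before it => separate read
        // Prints two ranges: one of length 150 at offset 0, one of length 25 at offset 400
        System.out.println(coalesce(ranges));
    }
}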

Aggregations

ArrayList (java.util.ArrayList): 1
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 1
OffsetRange (org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange): 1
ColumnIndexFilterUtils.filterOffsetIndex (org.apache.parquet.hadoop.ColumnIndexFilterUtils.filterOffsetIndex): 1
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 1
ColumnPath (org.apache.parquet.hadoop.metadata.ColumnPath): 1
OffsetIndex (org.apache.parquet.internal.column.columnindex.OffsetIndex): 1