Use of org.apache.parquet.hadoop.ColumnIndexFilterUtils.OffsetRange in the Apache parquet-mr project, as seen in the ParquetFileReader class, method internalReadFilteredRowGroup:
private ColumnChunkPageReadStore internalReadFilteredRowGroup(BlockMetaData block, RowRanges rowRanges, ColumnIndexStore ciStore) throws IOException {
  ColumnChunkPageReadStore rowGroup = new ColumnChunkPageReadStore(rowRanges);
  // prepare the list of consecutive parts to read them in one scan
  ChunkListBuilder builder = new ChunkListBuilder(block.getRowCount());
  List<ConsecutivePartList> allParts = new ArrayList<>();
  ConsecutivePartList currentParts = null;
  for (ColumnChunkMetaData mc : block.getColumns()) {
    ColumnPath pathKey = mc.getPath();
    ColumnDescriptor columnDescriptor = paths.get(pathKey);
    // skip columns that are not part of the projected schema
    if (columnDescriptor != null) {
      OffsetIndex offsetIndex = ciStore.getOffsetIndex(mc.getPath());
      // keep only the pages whose rows overlap the selected row ranges
      OffsetIndex filteredOffsetIndex = filterOffsetIndex(offsetIndex, rowRanges, block.getRowCount());
      // translate the remaining pages into byte ranges within the column chunk
      for (OffsetRange range : calculateOffsetRanges(filteredOffsetIndex, mc, offsetIndex.getOffset(0))) {
        BenchmarkCounter.incrementTotalBytes(range.getLength());
        long startingPos = range.getOffset();
        // first part or not consecutive => new list
        if (currentParts == null || currentParts.endPos() != startingPos) {
          currentParts = new ConsecutivePartList(startingPos);
          allParts.add(currentParts);
        }
        ChunkDescriptor chunkDescriptor = new ChunkDescriptor(columnDescriptor, mc, startingPos, range.getLength());
        currentParts.addChunk(chunkDescriptor);
        builder.setOffsetIndex(chunkDescriptor, filteredOffsetIndex);
      }
    }
  }
  // actually read all the chunks
  for (ConsecutivePartList consecutiveChunks : allParts) {
    consecutiveChunks.readAll(f, builder);
  }
  for (Chunk chunk : builder.build()) {
    readChunkPages(chunk, block, rowGroup);
  }
  return rowGroup;
}
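
For context, here is a minimal sketch of how this filtering path is typically reached through the public reader API: a record filter is supplied via ParquetReadOptions with the column-index filter enabled, and readNextFilteredRowGroup() then delegates to internalReadFilteredRowGroup() when pages can be skipped. The file path "data.parquet" and the column name "id" are placeholder assumptions, not part of the original code.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.ParquetReadOptions;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.filter2.compat.FilterCompat;
import org.apache.parquet.filter2.predicate.FilterApi;
import org.apache.parquet.filter2.predicate.FilterPredicate;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class FilteredRowGroupExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // placeholder input file and column name
    HadoopInputFile file = HadoopInputFile.fromPath(new Path("data.parquet"), conf);
    FilterPredicate predicate = FilterApi.gtEq(FilterApi.longColumn("id"), 100L);
    ParquetReadOptions options = ParquetReadOptions.builder()
        .withRecordFilter(FilterCompat.get(predicate))
        .useColumnIndexFilter(true)
        .build();
    try (ParquetFileReader reader = ParquetFileReader.open(file, options)) {
      PageReadStore pages;
      // readNextFilteredRowGroup() applies the column-index filter; when row ranges
      // can be narrowed, it ends up in internalReadFilteredRowGroup() shown above
      while ((pages = reader.readNextFilteredRowGroup()) != null) {
        System.out.println("row group with " + pages.getRowCount() + " matching rows");
      }
    }
  }
}

Note that the OffsetRange values produced by calculateOffsetRanges describe byte ranges of surviving pages, which internalReadFilteredRowGroup coalesces into ConsecutivePartList entries so that adjacent ranges are read in a single scan.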