Use of io.trino.orc.metadata.statistics.BloomFilter in the trino project by trinodb.
From the class SliceDictionaryColumnWriter, method getBloomFilters.
/**
 * Encodes the bloom filters recorded for this column's row groups into a single
 * bloom filter stream.
 *
 * @param metadataWriter writer used to encode the collected bloom filters into a slice
 * @return a one-element list holding the {@code BLOOM_FILTER_UTF8} stream, or an empty
 *         list when none of the row groups has a bloom filter
 * @throws IOException propagated from the metadata writer
 */
@Override
public List<StreamDataOutput> getBloomFilters(CompressedMetadataWriter metadataWriter) throws IOException {
    // Row groups whose statistics carry no bloom filter are skipped.
    List<BloomFilter> presentFilters = rowGroups.stream()
            .map(rowGroup -> rowGroup.getColumnStatistics().getBloomFilter())
            .filter(candidate -> candidate != null)
            .collect(toImmutableList());

    if (presentFilters.isEmpty()) {
        // Nothing to write, so no stream is produced at all.
        return ImmutableList.of();
    }

    Slice encoded = metadataWriter.writeBloomFilters(presentFilters);
    Stream bloomStream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, encoded.length(), false);
    return ImmutableList.of(new StreamDataOutput(encoded, bloomStream));
}
Use of io.trino.orc.metadata.statistics.BloomFilter in the trino project by trinodb.
From the class SliceDirectColumnWriter, method getBloomFilters.
/**
 * Encodes the bloom filters found in the per-row-group column statistics into a
 * single bloom filter stream.
 *
 * @param metadataWriter writer used to encode the collected bloom filters into a slice
 * @return a one-element list holding the {@code BLOOM_FILTER_UTF8} stream, or an empty
 *         list when no row-group statistics entry has a bloom filter
 * @throws IOException propagated from the metadata writer
 */
@Override
public List<StreamDataOutput> getBloomFilters(CompressedMetadataWriter metadataWriter) throws IOException {
    // Gather only the statistics entries that actually carry a bloom filter.
    ImmutableList.Builder<BloomFilter> collected = ImmutableList.builder();
    for (ColumnStatistics statistics : rowGroupColumnStatistics) {
        BloomFilter candidate = statistics.getBloomFilter();
        if (candidate != null) {
            collected.add(candidate);
        }
    }
    List<BloomFilter> presentFilters = collected.build();

    if (presentFilters.isEmpty()) {
        // Nothing to write, so no stream is produced at all.
        return ImmutableList.of();
    }

    Slice encoded = metadataWriter.writeBloomFilters(presentFilters);
    Stream bloomStream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, encoded.length(), false);
    return ImmutableList.of(new StreamDataOutput(encoded, bloomStream));
}
Use of io.trino.orc.metadata.statistics.BloomFilter in the trino project by trinodb.
From the class StripeReader, method readColumnIndexes.
/**
 * Reads the row-group indexes of every {@code ROW_INDEX} stream and, when bloom filters
 * are available for the stream's column, attaches them to the row-group column statistics.
 *
 * <p>Bloom filters are matched to row groups by list position. A bloom filter list that
 * is shorter than the row-group index list makes {@code List.get} throw.
 *
 * @param streams all streams keyed by stream id; only {@code ROW_INDEX} streams are processed
 * @param streamsData chunk loaders keyed by stream id, used to open each index stream
 * @param bloomFilterIndexes bloom filters keyed by column id; a missing or empty entry
 *        leaves the row-group indexes as read
 * @return row-group indexes keyed by the id of the {@code ROW_INDEX} stream they came from
 * @throws IOException propagated from reading the row indexes
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, Map<OrcColumnId, List<BloomFilter>> bloomFilterIndexes) throws IOException {
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> indexesByStream = ImmutableMap.builder();

    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        if (streamEntry.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }

        StreamId streamId = streamEntry.getKey();
        OrcInputStream indexInput = new OrcInputStream(streamsData.get(streamId));
        List<BloomFilter> columnBloomFilters = bloomFilterIndexes.get(streamId.getColumnId());
        List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);

        boolean hasBloomFilters = columnBloomFilters != null && !columnBloomFilters.isEmpty();
        if (hasBloomFilters) {
            // Rebuild each index entry with the bloom filter at the same position.
            ImmutableList.Builder<RowGroupIndex> enriched = ImmutableList.builder();
            int position = 0;
            for (RowGroupIndex original : rowGroupIndexes) {
                ColumnStatistics statisticsWithFilter = original.getColumnStatistics()
                        .withBloomFilter(columnBloomFilters.get(position));
                enriched.add(new RowGroupIndex(original.getPositions(), statisticsWithFilter));
                position++;
            }
            rowGroupIndexes = enriched.build();
        }

        indexesByStream.put(streamId, rowGroupIndexes);
    }

    return indexesByStream.buildOrThrow();
}
Use of io.trino.orc.metadata.statistics.BloomFilter in the trino project by trinodb.
From the class TupleDomainOrcPredicate, method columnOverlaps.
/**
 * Decides whether the predicate on a column can match data described by the given
 * column statistics.
 *
 * <p>The domain derived from the statistics is checked first. When bloom filters are
 * enabled, a bloom filter is present, and the predicate consists of discrete values,
 * the bloom filter is then consulted to rule out sections containing none of them.
 *
 * @param predicateDomain domain of values the predicate accepts for the column
 * @param numberOfRows row count passed through to the statistics-based domain computation
 * @param columnStatistics statistics (and optional bloom filter) for the column
 * @return {@code false} when the section can be skipped, {@code true} otherwise
 */
private boolean columnOverlaps(Domain predicateDomain, long numberOfRows, ColumnStatistics columnStatistics) {
    Domain statisticsDomain = getDomain(predicateDomain.getType(), numberOfRows, columnStatistics);

    // The statistics alone already prove the predicate cannot match.
    if (!statisticsDomain.overlaps(predicateDomain)) {
        return false;
    }

    // Without bloom filters the range overlap cannot be narrowed further, and an
    // overlap in null values is something a bloom filter cannot eliminate.
    if (!orcBloomFiltersEnabled || (predicateDomain.isNullAllowed() && statisticsDomain.isNullAllowed())) {
        return true;
    }

    // Bloom filters can only be probed with discrete predicate values.
    Optional<Collection<Object>> candidates = extractDiscreteValues(predicateDomain.getValues());
    if (!candidates.isPresent()) {
        return true;
    }

    // No bloom filter recorded, so this section cannot be excluded.
    BloomFilter filter = columnStatistics.getBloomFilter();
    if (filter == null) {
        return true;
    }

    // One possible hit is enough to keep the section; if no value is found in the
    // bloom filter there is no overlap and the section is skipped.
    for (Object candidate : candidates.get()) {
        if (checkInBloomFilter(filter, candidate, statisticsDomain.getType())) {
            return true;
        }
    }
    return false;
}
Aggregations