Use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.
The class StripeReader, method readColumnIndexes.
/**
 * Builds the row group indexes for every {@code ROW_INDEX} stream of a stripe.
 *
 * <p>Streams of any other kind are ignored. For each row index stream, the
 * matching input data and the bloom filters registered for the stream's
 * column (possibly {@code null} when the column has none in the map) are
 * handed to {@code stripeMetadataSource.getRowIndexes}.
 *
 * @param streams the stripe's streams keyed by stream id
 * @param streamsData the input data for the streams, keyed by stream id
 * @param stripeId the stripe being read, forwarded to the metadata source
 * @return an immutable map from row index stream id to its row group indexes
 * @throws IOException if reading the bloom filters or the row indexes fails
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, StripeId stripeId)
        throws IOException
{
    // bloom filters are loaded up front; they are looked up by column below
    Map<Integer, List<HiveBloomFilter>> bloomFiltersByColumn = readBloomFilterIndexes(streams, streamsData);

    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> result = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        if (streamEntry.getValue().getStreamKind() != ROW_INDEX) {
            // only row index streams contribute to the result
            continue;
        }

        StreamId rowIndexStreamId = streamEntry.getKey();
        List<RowGroupIndex> rowGroupIndexes = stripeMetadataSource.getRowIndexes(
                metadataReader,
                hiveWriterVersion,
                stripeId,
                rowIndexStreamId,
                streamsData.get(rowIndexStreamId),
                bloomFiltersByColumn.get(rowIndexStreamId.getColumn()),
                runtimeStats);
        result.put(rowIndexStreamId, rowGroupIndexes);
    }
    return result.build();
}
Use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.
The class OrcMetadataReader, method readBloomFilterIndexes.
/**
 * Parses a serialized {@code OrcProto.BloomFilterIndex} and converts each
 * bloom filter it contains into a {@link HiveBloomFilter}.
 *
 * @param inputStream stream positioned at the serialized bloom filter index
 * @return an immutable list with one {@code HiveBloomFilter} per entry of the
 *         index, in the order the index lists them
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public List<HiveBloomFilter> readBloomFilterIndexes(InputStream inputStream)
        throws IOException
{
    OrcProto.BloomFilterIndex index = OrcProto.BloomFilterIndex.parseFrom(CodedInputStream.newInstance(inputStream));

    ImmutableList.Builder<HiveBloomFilter> filters = ImmutableList.builder();
    for (OrcProto.BloomFilter protoFilter : index.getBloomFilterList()) {
        // the bit size is derived from the number of bitset entries, 64 bits per entry
        int bitSize = protoFilter.getBitsetCount() * 64;
        filters.add(new HiveBloomFilter(protoFilter.getBitsetList(), bitSize, protoFilter.getNumHashFunctions()));
    }
    return filters.build();
}
Use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.
The class TupleDomainOrcPredicate, method columnOverlaps.
/**
 * Decides whether a column's statistics allow the section they describe to
 * contain values matching the predicate.
 *
 * <p>The domain derived from the statistics is checked first. When it
 * overlaps the predicate and bloom filters are enabled, the column's bloom
 * filter (if any) is consulted for the predicate's discrete values.
 *
 * @param columnReference the column being tested; only its type is used here
 * @param predicateDomain the domain the predicate allows for this column
 * @param numberOfRows the row count passed on to {@code getDomain}
 * @param columnStatistics the statistics (and optional bloom filter) of the column
 * @return {@code false} when the section can be skipped, {@code true} otherwise
 */
private boolean columnOverlaps(ColumnReference<C> columnReference, Domain predicateDomain, long numberOfRows, ColumnStatistics columnStatistics)
{
    Domain stripeDomain = getDomain(columnReference.getType(), numberOfRows, columnStatistics);
    if (!stripeDomain.overlaps(predicateDomain)) {
        // the statistics alone rule out any match for this column
        return false;
    }

    // The bloom filter cannot narrow the overlap when it is disabled, or when
    // both domains allow null (the overlap in null values cannot be eliminated).
    if (!orcBloomFiltersEnabled || (predicateDomain.isNullAllowed() && stripeDomain.isNullAllowed())) {
        return true;
    }

    // a bloom filter can only be probed with discrete predicate values
    Optional<Collection<Object>> candidateValues = extractDiscreteValues(predicateDomain.getValues());
    if (!candidateValues.isPresent()) {
        return true;
    }

    // without a bloom filter there is nothing more to check
    HiveBloomFilter filter = columnStatistics.getBloomFilter();
    if (filter == null) {
        return true;
    }

    // the section overlaps only if at least one discrete value is found in the bloom filter
    return candidateValues.get().stream()
            .anyMatch(value -> checkInBloomFilter(filter, value, stripeDomain.getType()));
}
Use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.
The class DwrfMetadataReader, method readRowIndexes.
/**
 * Parses a serialized {@code DwrfProto.RowIndex} and converts every entry
 * into a {@link RowGroupIndex}.
 *
 * <p>The time spent parsing, as reported by
 * {@code THREAD_MX_BEAN.getCurrentThreadCpuTime()}, is recorded under the
 * {@code DwrfReadRowIndexesTimeNanos} metric.
 *
 * @param hiveWriterVersion writer version forwarded to {@code toRowGroupIndex}
 * @param inputStream stream positioned at the serialized row index
 * @param bloomFilters bloom filters looked up by entry position; may be
 *        {@code null} or empty, in which case every entry gets a {@code null}
 *        bloom filter
 * @return an immutable list with one row group index per row index entry
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public List<RowGroupIndex> readRowIndexes(HiveWriterVersion hiveWriterVersion, InputStream inputStream, List<HiveBloomFilter> bloomFilters)
        throws IOException
{
    long startCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime();
    DwrfProto.RowIndex rowIndex = DwrfProto.RowIndex.parseFrom(CodedInputStream.newInstance(inputStream));
    long elapsedCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime() - startCpuNanos;
    runtimeStats.addMetricValue("DwrfReadRowIndexesTimeNanos", elapsedCpuNanos);

    return IntStream.range(0, rowIndex.getEntryCount())
            .mapToObj(entryIndex -> {
                // NOTE(review): assumes a non-empty bloomFilters has an element for every entry — confirm with callers
                HiveBloomFilter bloomFilter = (bloomFilters == null || bloomFilters.isEmpty()) ? null : bloomFilters.get(entryIndex);
                return toRowGroupIndex(hiveWriterVersion, rowIndex.getEntry(entryIndex), bloomFilter);
            })
            .collect(toImmutableList());
}
Use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.
The class TestOrcBloomFilters, method testOrcHiveBloomFilterSerde.
/**
 * Round-trips a bloom filter through the ORC protobuf bloom filter index:
 * the serialized bytes are first read back through
 * {@code OrcMetadataReader.readBloomFilterIndexes} and then parsed directly
 * with protobuf, and both results are compared against the filter that was
 * written.
 */
@Test
public void testOrcHiveBloomFilterSerde()
        throws Exception
{
    // build the filter to serialize and make sure it contains the test string
    BloomFilter writtenFilter = new BloomFilter(1000L, 0.05);
    writtenFilter.addString(TEST_STRING);
    assertTrue(writtenFilter.testString(TEST_STRING));

    OrcProto.BloomFilter protoFilter = OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(writtenFilter.getBitSet()))
            .setNumHashFunctions(writtenFilter.getNumHashFunctions())
            .build();
    byte[] serializedIndex = serializeBloomFilterToIndex(protoFilter, OrcProto.BloomFilterIndex.getDefaultInstance());

    // Path 1: read through the metadata reader
    List<HiveBloomFilter> decodedFilters = new OrcMetadataReader(new RuntimeStats())
            .readBloomFilterIndexes(new ByteArrayInputStream(serializedIndex));
    assertEquals(decodedFilters.size(), 1);

    HiveBloomFilter decodedFilter = decodedFilters.get(0);
    assertTrue(decodedFilter.testString(TEST_STRING));
    assertFalse(decodedFilter.testString(TEST_STRING_NOT_WRITTEN));
    assertEquals(writtenFilter.getBitSize(), decodedFilter.getBitSize());
    assertEquals(writtenFilter.getNumHashFunctions(), decodedFilter.getNumHashFunctions());
    // the decoded bit set must match the written one
    assertTrue(Arrays.equals(decodedFilter.getBitSet(), writtenFilter.getBitSet()));

    // Path 2: parse the bytes directly, which allows inspecting the raw protobuf bit set
    OrcProto.BloomFilterIndex reparsedIndex = OrcProto.BloomFilterIndex.parseFrom(CodedInputStream.newInstance(serializedIndex));
    List<OrcProto.BloomFilter> reparsedFilters = reparsedIndex.getBloomFilterList();
    assertEquals(reparsedFilters.size(), 1);

    OrcProto.BloomFilter reparsedFilter = reparsedFilters.get(0);
    // contents of the ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(reparsedFilter.getBitsetList()), writtenFilter.getBitSet()));
    // number of hash functions
    assertEquals(writtenFilter.getNumHashFunctions(), reparsedFilter.getNumHashFunctions());
    // number of longs in the bit set
    assertEquals(writtenFilter.getBitSet().length, reparsedFilter.getBitsetCount());
}
Aggregations