use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.
the class StripeReader method readColumnIndexes.
/**
 * Reads the row group indexes for every {@code ROW_INDEX} stream of a stripe and, when bloom
 * filters are available for the stream's column, attaches them to the row group statistics.
 *
 * <p>When the row index cache is enabled the indexes are obtained through the cache, keyed by
 * data source id + last modified time, stripe offset and stream id. If the cache load fails,
 * the indexes are read directly from the stream instead.
 *
 * @param streams all streams of the stripe, keyed by stream id; only {@code ROW_INDEX} streams are processed
 * @param streamsData chunk loaders holding the data of each stream
 * @param bloomFilterIndexes bloom filters per column; a column may be absent or map to an empty list
 * @param stripe the stripe being read; its offset is part of the cache key
 * @return row group indexes for each {@code ROW_INDEX} stream
 * @throws IOException if reading the row indexes fails
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(
        Map<StreamId, Stream> streams,
        Map<StreamId, OrcChunkLoader> streamsData,
        Map<OrcColumnId, List<HashableBloomFilter>> bloomFilterIndexes,
        StripeInformation stripe) throws IOException {
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> result = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        Stream candidate = streamEntry.getValue();
        if (candidate.getStreamKind() != ROW_INDEX) {
            // Only row index streams carry row group indexes.
            continue;
        }

        StreamId streamId = streamEntry.getKey();
        OrcInputStream indexInput = new OrcInputStream(streamsData.get(streamId));
        List<HashableBloomFilter> columnBloomFilters = bloomFilterIndexes.get(streamId.getColumnId());

        List<RowGroupIndex> indexes;
        if (!orcCacheProperties.isRowIndexCacheEnabled()) {
            indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
        } else {
            OrcRowIndexCacheKey cacheKey = new OrcRowIndexCacheKey();
            cacheKey.setOrcDataSourceId(
                    new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
            cacheKey.setStripeOffset(stripe.getOffset());
            cacheKey.setStreamId(streamId);
            try {
                indexes = orcCacheStore.getRowIndexCache()
                        .get(cacheKey, () -> metadataReader.readRowIndexes(hiveWriterVersion, indexInput));
            } catch (UncheckedExecutionException | ExecutionException cacheFailure) {
                handleCacheLoadException(cacheFailure);
                log.debug(cacheFailure.getCause(), "Error while caching row group indexes. Falling back to default flow");
                // NOTE(review): this re-reads the same input stream the failed loader was given;
                // confirm it is still positioned at the start if the loader consumed part of it.
                indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
            }
        }

        if (columnBloomFilters != null && !columnBloomFilters.isEmpty()) {
            // Rebuild each row group index with the bloom filter at the same position.
            ImmutableList.Builder<RowGroupIndex> enriched = ImmutableList.builder();
            for (int group = 0; group < indexes.size(); group++) {
                RowGroupIndex original = indexes.get(group);
                ColumnStatistics statistics = original.getColumnStatistics()
                        .withBloomFilter(columnBloomFilters.get(group));
                enriched.add(new RowGroupIndex(original.getPositions(), statistics));
            }
            indexes = enriched.build();
        }

        result.put(streamId, indexes);
    }
    return result.build();
}
use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.
the class TupleDomainOrcPredicate method columnOverlaps.
/**
 * Determines whether a section described by {@code columnStatistics} may contain rows that
 * satisfy {@code predicateDomain}.
 *
 * <p>The range statistics are checked first. If they overlap, the bloom filter is consulted
 * (when enabled and present) to rule out sections that contain none of the predicate's
 * discrete values.
 *
 * @param predicateDomain the domain the predicate allows for the column
 * @param numberOfRows number of rows covered by the statistics
 * @param columnStatistics statistics of the column for the section being tested
 * @return {@code false} only when the section provably cannot match; {@code true} otherwise
 */
private boolean columnOverlaps(Domain predicateDomain, long numberOfRows, ColumnStatistics columnStatistics) {
    Domain stripeDomain = getDomain(predicateDomain.getType(), numberOfRows, columnStatistics);

    // No overlap between the predicate and the statistics of this column: skip the section.
    if (!stripeDomain.overlaps(predicateDomain)) {
        return false;
    }

    // Without bloom filters the range overlap cannot be narrowed further. Likewise, if both
    // sides allow nulls, a bloom filter cannot eliminate the overlap.
    if (!orcBloomFiltersEnabled || (predicateDomain.isNullAllowed() && stripeDomain.isNullAllowed())) {
        return true;
    }

    // Bloom filters can only be probed with discrete values.
    Optional<Collection<Object>> candidates = extractDiscreteValues(predicateDomain.getValues());
    if (!candidates.isPresent()) {
        return true;
    }

    // No bloom filter recorded for this section, so it cannot be excluded.
    HashableBloomFilter bloomFilter = columnStatistics.getBloomFilter();
    if (bloomFilter == null) {
        return true;
    }

    // The section overlaps only if at least one discrete predicate value is found in the bloom filter.
    return candidates.get().stream()
            .anyMatch(candidate -> checkInBloomFilter(bloomFilter, candidate, stripeDomain.getType()));
}
use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.
the class TestOrcBloomFilters method testBloomFilterPredicateValuesExisting.
/**
 * Adds every key of {@code TEST_VALUES} to a bloom filter, using the add method that matches
 * the key's Java class, and then verifies that {@code checkInBloomFilter} reports each value
 * as present for its associated type.
 */
@Test
public void testBloomFilterPredicateValuesExisting() {
    HashableBloomFilter bloomFilter = new HashableBloomFilter(TEST_VALUES.size() * 10, 0.01);

    // Populate the filter; the branch taken depends on the Java class of the test value.
    for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) {
        Object o = testValue.getKey();
        if (o instanceof Long) {
            if (testValue.getValue() instanceof RealType) {
                // REAL values are carried as a long holding the float's int bits.
                bloomFilter.add(intBitsToFloat(((Number) o).intValue()));
            } else {
                bloomFilter.addLong((Long) o);
            }
        } else if (o instanceof Integer) {
            bloomFilter.addLong((Integer) o);
        } else if (o instanceof String) {
            bloomFilter.add(((String) o).getBytes(UTF_8));
        } else if (o instanceof BigDecimal) {
            bloomFilter.add(o.toString().getBytes(UTF_8));
        } else if (o instanceof Slice) {
            bloomFilter.add(((Slice) o).getBytes());
        } else if (o instanceof Timestamp) {
            bloomFilter.addLong(((Timestamp) o).getTime());
        } else if (o instanceof Double) {
            bloomFilter.add((Double) o);
        } else {
            fail("Unsupported type " + o.getClass());
        }
    }

    // Every value that was added must be found again.
    for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) {
        boolean matched = checkInBloomFilter(bloomFilter, testValue.getKey(), testValue.getValue());
        // Report the class of the tested value; testValue.getClass() would only name the
        // Map.Entry implementation and say nothing about which value failed.
        assertTrue(matched, "type " + testValue.getKey().getClass());
    }
}
use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.
the class TestOrcBloomFilters method testOrcHiveBloomFilterSerde.
/**
 * Round-trips a bloom filter through the ORC protobuf bloom filter index and checks that
 * membership answers, sizing and the raw bit set are identical after deserialization, both
 * via {@code OrcMetadataReader} and by parsing the protobuf message directly.
 *
 * @throws Exception if serializing or parsing the bloom filter index fails
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    HashableBloomFilter written = new HashableBloomFilter(1000L, 0.05);
    written.add(TEST_STRING);
    assertTrue(written.test(TEST_STRING));
    assertTrue(written.test(wrappedBuffer(TEST_STRING)));

    // Serialize the filter as a single-entry ORC bloom filter index.
    OrcProto.BloomFilter orcBloomFilter = OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(written.getBitSet()))
            .setNumHashFunctions(written.getNumHashFunctions())
            .build();
    OrcProto.BloomFilterIndex emptyIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
    byte[] serialized = serializeBloomFilterToIndex(orcBloomFilter, emptyIndex);

    // Read back through the metadata reader.
    InputStream serializedStream = new ByteArrayInputStream(serialized);
    OrcMetadataReader reader = new OrcMetadataReader();
    List<HashableBloomFilter> decodedFilters = reader.readBloomFilterIndexes(serializedStream);
    assertEquals(decodedFilters.size(), 1);

    HashableBloomFilter decoded = decodedFilters.get(0);
    assertTrue(decoded.test(TEST_STRING));
    assertTrue(decoded.test(wrappedBuffer(TEST_STRING)));
    assertFalse(decoded.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(decoded.test(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));
    assertEquals(written.getNumBits(), decoded.getNumBits());
    assertEquals(written.getNumHashFunctions(), decoded.getNumHashFunctions());

    // The decoded bit set must equal the one that was written.
    assertTrue(Arrays.equals(decoded.getBitSet(), written.getBitSet()));

    // Parse the protobuf message directly, which allows inspecting the raw bit sets.
    CodedInputStream codedInput = CodedInputStream.newInstance(serialized);
    OrcProto.BloomFilterIndex parsedIndex = OrcProto.BloomFilterIndex.parseFrom(codedInput);
    List<OrcProto.BloomFilter> parsedFilters = parsedIndex.getBloomFilterList();
    assertEquals(parsedFilters.size(), 1);

    OrcProto.BloomFilter parsed = parsedFilters.get(0);
    // Contents of the ORC bloom filter bit set.
    assertTrue(Arrays.equals(Longs.toArray(parsed.getBitsetList()), written.getBitSet()));
    // Number of hash functions.
    assertEquals(written.getNumHashFunctions(), parsed.getNumHashFunctions());
    // Length of the bit set in longs.
    assertEquals(written.getBitSet().length, parsed.getBitsetCount());
}
use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.
the class TestOrcBloomFilters method testMatches.
/**
 * Simulates a query on two columns where one is used as part of the WHERE clause, with and
 * without a bloom filter, and checks which statistics the predicate accepts.
 */
@Test
public void testMatches() {
    // Assemble a non-matching and a matching bloom filter. The empty one is converted before
    // the predicate value is added (presumably toOrcBloomFilter copies the bits - the
    // non-matching assertion below relies on it).
    HashableBloomFilter bloomFilter = new HashableBloomFilter(1000, 0.01);
    OrcProto.BloomFilter emptyOrcBloomFilter = toOrcBloomFilter(bloomFilter);
    bloomFilter.addLong(1234);
    OrcProto.BloomFilter populatedOrcBloomFilter = toOrcBloomFilter(bloomFilter);

    // All three statistics share the same integer range [10, 2000], which contains 1234;
    // they differ only in the bloom filter attached.
    ColumnMetadata<ColumnStatistics> matchingStatistics = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null, 0, null,
                    new IntegerStatistics(10L, 2000L, null),
                    null, null, null, null, null,
                    toBloomFilter(populatedOrcBloomFilter))));
    ColumnMetadata<ColumnStatistics> nonMatchingStatistics = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null, 0, null,
                    new IntegerStatistics(10L, 2000L, null),
                    null, null, null, null, null,
                    toBloomFilter(emptyOrcBloomFilter))));
    ColumnMetadata<ColumnStatistics> statisticsWithoutBloomFilter = new ColumnMetadata<>(ImmutableList.of(
            new ColumnStatistics(
                    null, 0, null,
                    new IntegerStatistics(10L, 2000L, null),
                    null, null, null, null, null,
                    null)));

    TupleDomainOrcPredicate predicate = TupleDomainOrcPredicate.builder()
            .setBloomFiltersEnabled(true)
            .addColumn(ROOT_COLUMN, Domain.singleValue(BIGINT, 1234L))
            .build();
    TupleDomainOrcPredicate emptyPredicate = TupleDomainOrcPredicate.builder().build();

    assertTrue(predicate.matches(1L, matchingStatistics));
    assertTrue(predicate.matches(1L, statisticsWithoutBloomFilter));
    assertFalse(predicate.matches(1L, nonMatchingStatistics));
    assertTrue(emptyPredicate.matches(1L, matchingStatistics));
}
Aggregations