Search in sources :

Example 1 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.HiveBloomFilter in project presto by prestodb.

the class TupleDomainOrcPredicate method columnOverlaps.

/**
 * Decides whether the section described by {@code columnStatistics} may contain
 * values accepted by {@code predicateDomain} for the given column.
 *
 * <p>The range-based domain built from the statistics is checked first. When bloom
 * filters are enabled, the predicate is a set of discrete values, and a bloom filter
 * is present, the bloom filter is used to try to rule the section out.
 *
 * @param columnReference the column being tested; supplies the type used to build the domain
 * @param predicateDomain the domain the query predicate allows for this column
 * @param numberOfRows row count passed to {@code getDomain} together with the statistics
 * @param columnStatistics statistics (and optional bloom filter) for the section
 * @return {@code false} only when the section provably holds no matching value
 */
private boolean columnOverlaps(ColumnReference<C> columnReference, Domain predicateDomain, long numberOfRows, ColumnStatistics columnStatistics) {
    Domain stripeDomain = getDomain(columnReference.getType(), numberOfRows, columnStatistics);

    // The statistics-derived domain and the predicate are disjoint: nothing can match.
    if (!stripeDomain.overlaps(predicateDomain)) {
        return false;
    }

    // With bloom filters disabled the range overlap cannot be narrowed any further.
    // If there is an overlap in null values, the bloom filter cannot eliminate it either.
    if (!orcBloomFiltersEnabled || (predicateDomain.isNullAllowed() && stripeDomain.isNullAllowed())) {
        return true;
    }

    // A bloom filter can only be probed with individual values, so the predicate
    // must reduce to a discrete set.
    Optional<Collection<Object>> candidates = extractDiscreteValues(predicateDomain.getValues());
    if (!candidates.isPresent()) {
        return true;
    }

    // No bloom filter recorded for this section, so it cannot be excluded.
    HiveBloomFilter filter = columnStatistics.getBloomFilter();
    if (filter == null) {
        return true;
    }

    // The first predicate value found in the bloom filter is enough to keep the section.
    for (Object candidate : candidates.get()) {
        if (checkInBloomFilter(filter, candidate, stripeDomain.getType())) {
            return true;
        }
    }

    // None of the discrete predicate values are in the bloom filter: skip the section.
    return false;
}
Also used : HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) Collection(java.util.Collection) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) Domain(com.facebook.presto.spi.predicate.Domain)

Example 2 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.HiveBloomFilter in project presto by prestodb.

the class TestOrcBloomFilters method testHiveBloomFilterSerde.

/**
 * Populates a Hive {@code BloomFilter}, rebuilds it as a {@code HiveBloomFilter} from
 * its bit set, bit size and hash-function count, and checks that both answer
 * membership probes identically for the written and unwritten test values.
 */
@Test
public void testHiveBloomFilterSerde() throws Exception {
    BloomFilter sourceFilter = new BloomFilter(1_000_000L, 0.05);

    // String value: written one is found, the other is not
    sourceFilter.addString(TEST_STRING);
    assertTrue(sourceFilter.testString(TEST_STRING));
    assertFalse(sourceFilter.testString(TEST_STRING_NOT_WRITTEN));

    // Integer value: written one is found, its successor is not
    sourceFilter.addLong(TEST_INTEGER);
    assertTrue(sourceFilter.testLong(TEST_INTEGER));
    assertFalse(sourceFilter.testLong(TEST_INTEGER + 1));

    // Re-construct from the raw state of the source filter
    ImmutableList<Long> sourceBits = ImmutableList.copyOf(Longs.asList(sourceFilter.getBitSet()));
    HiveBloomFilter rebuiltFilter = new HiveBloomFilter(
            sourceBits,
            sourceFilter.getBitSize(),
            sourceFilter.getNumHashFunctions());

    // String probes against the rebuilt filter
    assertTrue(rebuiltFilter.testString(TEST_STRING));
    assertFalse(rebuiltFilter.testString(TEST_STRING_NOT_WRITTEN));

    // Integer probes against the rebuilt filter
    assertTrue(rebuiltFilter.testLong(TEST_INTEGER));
    assertFalse(rebuiltFilter.testLong(TEST_INTEGER + 1));
}
Also used : HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) BloomFilter(org.apache.hive.common.util.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter) Test(org.testng.annotations.Test)

Example 3 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.HiveBloomFilter in project presto by prestodb.

the class TestOrcBloomFilters method testMatches.

// Simulates a query on two columns where one is used as part of the WHERE clause,
// evaluated with and without a bloom filter.
@Test
public void testMatches() throws Exception {
    // stripe column: bigint_0 = 1234
    Domain equals1234 = Domain.singleValue(BIGINT, 1234L);
    TupleDomain.ColumnDomain<String> column0Domain = new TupleDomain.ColumnDomain<>(COLUMN_0, equals1234);

    // one predicate restricted to column 0, and one that accepts everything
    TupleDomain<String> restrictingTupleDomain = TupleDomain.fromColumnDomains(Optional.of(ImmutableList.of(column0Domain)));
    TupleDomain<String> unrestrictedTupleDomain = TupleDomain.all();

    // predicate column references
    List<ColumnReference<String>> references = ImmutableList.<ColumnReference<String>>builder()
            .add(new ColumnReference<>(COLUMN_0, 0, BIGINT))
            .add(new ColumnReference<>(COLUMN_1, 1, BIGINT))
            .build();
    TupleDomainOrcPredicate<String> restrictingPredicate = new TupleDomainOrcPredicate<>(restrictingTupleDomain, references, true);
    TupleDomainOrcPredicate<String> unrestrictedPredicate = new TupleDomainOrcPredicate<>(unrestrictedTupleDomain, references, true);

    // Assemble a non-matching and a matching bloom filter. The empty snapshot must be
    // taken before the value is added to the shared filter instance.
    HiveBloomFilter sharedFilter = new HiveBloomFilter(new BloomFilter(1000, 0.01));
    OrcProto.BloomFilter protoWithoutValue = toOrcBloomFilter(sharedFilter);
    sharedFilter.addLong(1234);
    OrcProto.BloomFilter protoWithValue = toOrcBloomFilter(sharedFilter);

    // Identical integer range statistics in all three cases; only the bloom filter differs.
    Map<Integer, ColumnStatistics> statisticsWithHit = ImmutableMap.of(
            0,
            new ColumnStatistics(null, null, new IntegerStatistics(10L, 2000L), null, null, null, null, toHiveBloomFilter(protoWithValue)));
    Map<Integer, ColumnStatistics> statisticsWithMiss = ImmutableMap.of(
            0,
            new ColumnStatistics(null, null, new IntegerStatistics(10L, 2000L), null, null, null, null, toHiveBloomFilter(protoWithoutValue)));
    Map<Integer, ColumnStatistics> statisticsWithoutBloomFilter = ImmutableMap.of(
            0,
            new ColumnStatistics(null, null, new IntegerStatistics(10L, 2000L), null, null, null, null, null));

    assertTrue(restrictingPredicate.matches(1L, statisticsWithHit));
    assertTrue(restrictingPredicate.matches(1L, statisticsWithoutBloomFilter));
    assertFalse(restrictingPredicate.matches(1L, statisticsWithMiss));
    assertTrue(unrestrictedPredicate.matches(1L, statisticsWithHit));
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.ColumnStatistics) OrcProto(com.facebook.presto.orc.proto.OrcProto) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) BloomFilter(org.apache.hive.common.util.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) TupleDomain(com.facebook.presto.spi.predicate.TupleDomain) Domain(com.facebook.presto.spi.predicate.Domain) ColumnReference(com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference) IntegerStatistics(com.facebook.presto.orc.metadata.IntegerStatistics) Test(org.testng.annotations.Test)

Example 4 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.HiveBloomFilter in project presto by prestodb.

the class TestOrcBloomFilters method testOrcHiveBloomFilterSerde.

/**
 * Serializes a Hive {@code BloomFilter} into an ORC bloom filter index and verifies it
 * twice: once through {@code OrcMetadataReader.readBloomFilterIndexes}, and once by
 * parsing the protobuf message directly and comparing the raw bit set.
 *
 * <p>All {@code assertEquals} calls pass the value read back first and the value that
 * was written second, matching the (actual, expected) order already used by the size
 * checks in this method.
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05);
    bloomFilterWrite.addString(TEST_STRING);
    assertTrue(bloomFilterWrite.testString(TEST_STRING));

    // Build the ORC protobuf representation of the written filter
    OrcProto.BloomFilter.Builder bloomFilterBuilder = OrcProto.BloomFilter.newBuilder();
    bloomFilterBuilder.addAllBitset(Longs.asList(bloomFilterWrite.getBitSet()));
    bloomFilterBuilder.setNumHashFunctions(bloomFilterWrite.getNumHashFunctions());
    OrcProto.BloomFilter bloomFilter = bloomFilterBuilder.build();
    OrcProto.BloomFilterIndex bloomFilterIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
    byte[] bytes = serializeBloomFilterToIndex(bloomFilter, bloomFilterIndex);

    // Read through method
    InputStream inputStream = new ByteArrayInputStream(bytes);
    OrcMetadataReader metadataReader = new OrcMetadataReader();
    List<HiveBloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);
    assertEquals(bloomFilters.size(), 1);
    HiveBloomFilter bloomFilterFromReader = bloomFilters.get(0);
    assertTrue(bloomFilterFromReader.testString(TEST_STRING));
    assertFalse(bloomFilterFromReader.testString(TEST_STRING_NOT_WRITTEN));
    assertEquals(bloomFilterFromReader.getBitSize(), bloomFilterWrite.getBitSize());
    assertEquals(bloomFilterFromReader.getNumHashFunctions(), bloomFilterWrite.getNumHashFunctions());
    // Validate bit set
    assertTrue(Arrays.equals(bloomFilterFromReader.getBitSet(), bloomFilterWrite.getBitSet()));

    // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
    CodedInputStream input = CodedInputStream.newInstance(bytes);
    OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input);
    List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList();
    assertEquals(bloomFilterList.size(), 1);
    OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0);
    // Validate contents of ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet()));
    // hash functions
    assertEquals(bloomFilterRead.getNumHashFunctions(), bloomFilterWrite.getNumHashFunctions());
    // number of entries in the bit set (array length, not the bit size)
    assertEquals(bloomFilterRead.getBitsetCount(), bloomFilterWrite.getBitSet().length);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) InputStream(java.io.InputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) OrcProto(com.facebook.presto.orc.proto.OrcProto) OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) BloomFilter(org.apache.hive.common.util.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter) ByteArrayInputStream(java.io.ByteArrayInputStream) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) Test(org.testng.annotations.Test)

Example 5 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.HiveBloomFilter in project presto by prestodb.

the class StripeReader method readColumnIndexes.

/**
 * Reads the row group indexes of every {@code ROW_INDEX} stream and, where bloom
 * filters are available for the stream's column, attaches them to the column
 * statistics of the corresponding row groups.
 *
 * @param streams streams keyed by stream id; only {@code ROW_INDEX} streams are used
 * @param streamsData input streams keyed by the same stream ids
 * @param bloomFilterIndexes bloom filters keyed by column; a missing or empty list
 *        leaves that column's row group indexes untouched
 * @return row group indexes keyed by column
 * @throws IOException declared for the underlying metadata read
 */
private Map<Integer, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, Map<Integer, List<HiveBloomFilter>> bloomFilterIndexes) throws IOException {
    ImmutableMap.Builder<Integer, List<RowGroupIndex>> indexesByColumn = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        Stream stream = streamEntry.getValue();
        if (stream.getStreamKind() != ROW_INDEX) {
            continue;
        }

        OrcInputStream inputStream = streamsData.get(streamEntry.getKey());
        List<HiveBloomFilter> bloomFilters = bloomFilterIndexes.get(stream.getColumn());
        List<RowGroupIndex> rowGroupIndexes = metadataReader.readRowIndexes(hiveWriterVersion, inputStream);

        // No bloom filters for this column: keep the indexes exactly as read.
        if (bloomFilters == null || bloomFilters.isEmpty()) {
            indexesByColumn.put(stream.getColumn(), rowGroupIndexes);
            continue;
        }

        // Bloom filters are paired with row groups by position in their lists.
        ImmutableList.Builder<RowGroupIndex> withBloomFilters = ImmutableList.builder();
        int position = 0;
        for (RowGroupIndex rowGroupIndex : rowGroupIndexes) {
            ColumnStatistics statistics = rowGroupIndex.getColumnStatistics().withBloomFilter(bloomFilters.get(position));
            withBloomFilters.add(new RowGroupIndex(rowGroupIndex.getPositions(), statistics));
            position++;
        }
        indexesByColumn.put(stream.getColumn(), withBloomFilters.build());
    }
    return indexesByColumn.build();
}
Also used : ColumnStatistics(com.facebook.presto.orc.metadata.ColumnStatistics) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) ImmutableList(com.google.common.collect.ImmutableList) ImmutableMap(com.google.common.collect.ImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(com.facebook.presto.orc.checkpoint.StreamCheckpoint) HiveBloomFilter(com.facebook.presto.orc.metadata.HiveBloomFilter) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) ValueStream(com.facebook.presto.orc.stream.ValueStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream)

Aggregations

HiveBloomFilter (com.facebook.presto.orc.metadata.HiveBloomFilter)5 TupleDomainOrcPredicate.checkInBloomFilter (com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter)3 BloomFilter (org.apache.hive.common.util.BloomFilter)3 Test (org.testng.annotations.Test)3 ColumnStatistics (com.facebook.presto.orc.metadata.ColumnStatistics)2 OrcProto (com.facebook.presto.orc.proto.OrcProto)2 Domain (com.facebook.presto.spi.predicate.Domain)2 TupleDomain (com.facebook.presto.spi.predicate.TupleDomain)2 InputStream (java.io.InputStream)2 ColumnReference (com.facebook.presto.orc.TupleDomainOrcPredicate.ColumnReference)1 Checkpoints.getDictionaryStreamCheckpoint (com.facebook.presto.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint)1 StreamCheckpoint (com.facebook.presto.orc.checkpoint.StreamCheckpoint)1 IntegerStatistics (com.facebook.presto.orc.metadata.IntegerStatistics)1 OrcMetadataReader (com.facebook.presto.orc.metadata.OrcMetadataReader)1 RowGroupIndex (com.facebook.presto.orc.metadata.RowGroupIndex)1 Stream (com.facebook.presto.orc.metadata.Stream)1 CodedInputStream (com.facebook.presto.orc.protobuf.CodedInputStream)1 OrcInputStream (com.facebook.presto.orc.stream.OrcInputStream)1 ValueStream (com.facebook.presto.orc.stream.ValueStream)1 ImmutableList (com.google.common.collect.ImmutableList)1