Search in sources:

Example 1 with HashableBloomFilter

Use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.

In the class StripeReader, method readColumnIndexes.

/**
 * Reads the row group indexes for every {@code ROW_INDEX} stream and returns them keyed by stream id.
 * Streams of any other kind are ignored.
 *
 * <p>When the row index cache is enabled, the indexes are obtained through the cache using a key
 * built from the data source id, its last-modified time, the stripe offset and the stream id. If the
 * cache load fails, the failure is passed to {@code handleCacheLoadException}, logged at debug level,
 * and the indexes are read directly from the stream instead.
 *
 * <p>If a non-empty list of bloom filters exists for the stream's column, each row group index is
 * rebuilt so its column statistics carry the bloom filter at the same list position.
 *
 * @param streams all streams of the stripe, keyed by stream id
 * @param streamsData chunk loaders for the stream contents, keyed by stream id
 * @param bloomFilterIndexes bloom filters per column; a column may be absent
 * @param stripe the stripe being read; its offset is part of the cache key
 * @return an immutable map from row-index stream id to its row group indexes
 * @throws IOException if reading the row indexes fails
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcChunkLoader> streamsData, Map<OrcColumnId, List<HashableBloomFilter>> bloomFilterIndexes, StripeInformation stripe) throws IOException {
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> indexesByStream = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        // Only row index streams contribute to the result.
        if (streamEntry.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }
        StreamId streamId = streamEntry.getKey();
        OrcInputStream indexInput = new OrcInputStream(streamsData.get(streamId));
        List<HashableBloomFilter> columnBloomFilters = bloomFilterIndexes.get(streamId.getColumnId());

        List<RowGroupIndex> indexes;
        if (!orcCacheProperties.isRowIndexCacheEnabled()) {
            // Cache disabled: decode straight from the stream.
            indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
        } else {
            OrcRowIndexCacheKey cacheKey = new OrcRowIndexCacheKey();
            cacheKey.setOrcDataSourceId(new OrcDataSourceIdWithTimeStamp(orcDataSource.getId(), orcDataSource.getLastModifiedTime()));
            cacheKey.setStripeOffset(stripe.getOffset());
            cacheKey.setStreamId(streamId);
            try {
                indexes = orcCacheStore.getRowIndexCache().get(cacheKey, () -> metadataReader.readRowIndexes(hiveWriterVersion, indexInput));
            } catch (UncheckedExecutionException | ExecutionException cacheFailure) {
                // NOTE(review): handleCacheLoadException is defined elsewhere; presumably it rethrows
                // failures that must not be retried - confirm. Otherwise fall back to a direct read.
                handleCacheLoadException(cacheFailure);
                log.debug(cacheFailure.getCause(), "Error while caching row group indexes. Falling back to default flow");
                indexes = metadataReader.readRowIndexes(hiveWriterVersion, indexInput);
            }
        }

        boolean hasBloomFilters = columnBloomFilters != null && !columnBloomFilters.isEmpty();
        if (hasBloomFilters) {
            // Pair each row group with the bloom filter at the same position.
            ImmutableList.Builder<RowGroupIndex> enriched = ImmutableList.builder();
            int position = 0;
            for (RowGroupIndex index : indexes) {
                ColumnStatistics statisticsWithFilter = index.getColumnStatistics().withBloomFilter(columnBloomFilters.get(position));
                enriched.add(new RowGroupIndex(index.getPositions(), statisticsWithFilter));
                position++;
            }
            indexes = enriched.build();
        }
        indexesByStream.put(streamId, indexes);
    }
    return indexesByStream.build();
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ImmutableList(com.google.common.collect.ImmutableList) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) ImmutableMap(com.google.common.collect.ImmutableMap) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Checkpoints.getDictionaryStreamCheckpoint(io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint) StreamCheckpoint(io.prestosql.orc.checkpoint.StreamCheckpoint) RowGroupIndex(io.prestosql.orc.metadata.RowGroupIndex) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) OrcInputStream(io.prestosql.orc.stream.OrcInputStream) ValueInputStream(io.prestosql.orc.stream.ValueInputStream) Stream(io.prestosql.orc.metadata.Stream) InputStream(java.io.InputStream) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ExecutionException(java.util.concurrent.ExecutionException)

Example 2 with HashableBloomFilter

Use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.

In the class TupleDomainOrcPredicate, method columnOverlaps.

/**
 * Determines whether the predicate domain can overlap the data described by the given column statistics.
 *
 * <p>The range-based domain derived from the statistics is checked first. If it overlaps and bloom
 * filters are enabled, the bloom filter (when one is present and the predicate consists of discrete
 * values) is used to try to rule the section out.
 *
 * @param predicateDomain the domain required by the predicate for this column
 * @param numberOfRows the row count passed to {@code getDomain} together with the statistics
 * @param columnStatistics the statistics of the column, optionally carrying a bloom filter
 * @return {@code false} only when the section can be excluded; {@code true} otherwise
 */
private boolean columnOverlaps(Domain predicateDomain, long numberOfRows, ColumnStatistics columnStatistics) {
    Domain statisticsDomain = getDomain(predicateDomain.getType(), numberOfRows, columnStatistics);

    // No overlap between the predicate and the statistics-derived domain: the section can be skipped.
    if (!statisticsDomain.overlaps(predicateDomain)) {
        return false;
    }

    // Without bloom filters the range overlap cannot be narrowed further. And if there is an
    // overlap in null values, a bloom filter cannot eliminate it.
    if (!orcBloomFiltersEnabled || (predicateDomain.isNullAllowed() && statisticsDomain.isNullAllowed())) {
        return true;
    }

    // A bloom filter can only be probed with discrete predicate values.
    Optional<Collection<Object>> candidateValues = extractDiscreteValues(predicateDomain.getValues());
    if (!candidateValues.isPresent()) {
        return true;
    }

    // No bloom filter available, so the section cannot be excluded.
    HashableBloomFilter filter = columnStatistics.getBloomFilter();
    if (filter == null) {
        return true;
    }

    // The section overlaps only if at least one discrete predicate value may be in the bloom filter.
    return candidateValues.get().stream()
            .anyMatch(candidate -> checkInBloomFilter(filter, candidate, statisticsDomain.getType()));
}
Also used : Collection(java.util.Collection) Domain(io.prestosql.spi.predicate.Domain) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter)

Example 3 with HashableBloomFilter

Use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.

In the class TestOrcBloomFilters, method testBloomFilterPredicateValuesExisting.

/**
 * Adds every key of {@code TEST_VALUES} to a bloom filter, using the add method that matches the
 * key's Java class, and then verifies that {@code checkInBloomFilter} finds each value when it is
 * probed with its declared type.
 */
@Test
public void testBloomFilterPredicateValuesExisting() {
    HashableBloomFilter bloomFilter = new HashableBloomFilter(TEST_VALUES.size() * 10, 0.01);
    for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) {
        Object o = testValue.getKey();
        if (o instanceof Long) {
            if (testValue.getValue() instanceof RealType) {
                // REAL values arrive as the int bit pattern stored in a long; add the decoded float.
                bloomFilter.add(intBitsToFloat(((Number) o).intValue()));
            } else {
                bloomFilter.addLong((Long) o);
            }
        } else if (o instanceof Integer) {
            bloomFilter.addLong((Integer) o);
        } else if (o instanceof String) {
            bloomFilter.add(((String) o).getBytes(UTF_8));
        } else if (o instanceof BigDecimal) {
            bloomFilter.add(o.toString().getBytes(UTF_8));
        } else if (o instanceof Slice) {
            bloomFilter.add(((Slice) o).getBytes());
        } else if (o instanceof Timestamp) {
            bloomFilter.addLong(((Timestamp) o).getTime());
        } else if (o instanceof Double) {
            bloomFilter.add((Double) o);
        } else {
            fail("Unsupported type " + o.getClass());
        }
    }
    for (Map.Entry<Object, Type> testValue : TEST_VALUES.entrySet()) {
        boolean matched = checkInBloomFilter(bloomFilter, testValue.getKey(), testValue.getValue());
        // Report the class of the value under test; the entry's own class (a Map.Entry
        // implementation) would not identify which value failed.
        assertTrue(matched, "type " + testValue.getKey().getClass());
    }
}
Also used : HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) RealType(io.prestosql.spi.type.RealType) Timestamp(java.sql.Timestamp) BigDecimal(java.math.BigDecimal) Type(io.prestosql.spi.type.Type) RealType(io.prestosql.spi.type.RealType) Slice(io.airlift.slice.Slice) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.testng.annotations.Test)

Example 4 with HashableBloomFilter

Use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.

In the class TestOrcBloomFilters, method testOrcHiveBloomFilterSerde.

/**
 * Round-trips a bloom filter through the ORC protobuf bloom filter index and checks that it is
 * preserved, both when read back through {@code OrcMetadataReader} and when the protobuf message
 * is parsed directly.
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    // Build the source filter and make sure it sees the written value in both representations.
    HashableBloomFilter written = new HashableBloomFilter(1000L, 0.05);
    written.add(TEST_STRING);
    assertTrue(written.test(TEST_STRING));
    assertTrue(written.test(wrappedBuffer(TEST_STRING)));

    // Serialize it into a bloom filter index.
    OrcProto.BloomFilter protoFilter = OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(written.getBitSet()))
            .setNumHashFunctions(written.getNumHashFunctions())
            .build();
    OrcProto.BloomFilterIndex emptyIndex = OrcProto.BloomFilterIndex.getDefaultInstance();
    byte[] serialized = serializeBloomFilterToIndex(protoFilter, emptyIndex);

    // Read through method
    InputStream serializedStream = new ByteArrayInputStream(serialized);
    OrcMetadataReader reader = new OrcMetadataReader();
    List<HashableBloomFilter> decodedFilters = reader.readBloomFilterIndexes(serializedStream);
    assertEquals(decodedFilters.size(), 1);

    HashableBloomFilter decoded = decodedFilters.get(0);
    assertTrue(decoded.test(TEST_STRING));
    assertTrue(decoded.test(wrappedBuffer(TEST_STRING)));
    assertFalse(decoded.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(decoded.test(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));
    assertEquals(written.getNumBits(), decoded.getNumBits());
    assertEquals(written.getNumHashFunctions(), decoded.getNumHashFunctions());
    // Validate bit set
    assertTrue(Arrays.equals(decoded.getBitSet(), written.getBitSet()));

    // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
    CodedInputStream codedInput = CodedInputStream.newInstance(serialized);
    OrcProto.BloomFilterIndex parsedIndex = OrcProto.BloomFilterIndex.parseFrom(codedInput);
    List<OrcProto.BloomFilter> parsedFilters = parsedIndex.getBloomFilterList();
    assertEquals(parsedFilters.size(), 1);

    OrcProto.BloomFilter parsed = parsedFilters.get(0);
    // Validate contents of ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(parsed.getBitsetList()), written.getBitSet()));
    // hash functions
    assertEquals(written.getNumHashFunctions(), parsed.getNumHashFunctions());
    // bit size
    assertEquals(written.getBitSet().length, parsed.getBitsetCount());
}
Also used : CodedInputStream(io.prestosql.orc.protobuf.CodedInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) CodedInputStream(io.prestosql.orc.protobuf.CodedInputStream) OrcProto(io.prestosql.orc.proto.OrcProto) OrcMetadataReader(io.prestosql.orc.metadata.OrcMetadataReader) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(io.prestosql.orc.TupleDomainOrcPredicate.checkInBloomFilter) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) ByteArrayInputStream(java.io.ByteArrayInputStream) Test(org.testng.annotations.Test)

Example 5 with HashableBloomFilter

Use of io.prestosql.orc.metadata.statistics.HashableBloomFilter in project hetu-core by openlookeng.

In the class TestOrcBloomFilters, method testMatches.

/**
 * Simulates a query on two columns where one is used in the WHERE clause, and checks predicate
 * matching with a matching bloom filter, a non-matching (empty) bloom filter, and no bloom filter.
 */
@Test
public void testMatches() {
    TupleDomainOrcPredicate bloomPredicate = TupleDomainOrcPredicate.builder()
            .setBloomFiltersEnabled(true)
            .addColumn(ROOT_COLUMN, Domain.singleValue(BIGINT, 1234L))
            .build();
    TupleDomainOrcPredicate noColumnPredicate = TupleDomainOrcPredicate.builder().build();

    // Assemble a matching and a non-matching bloom filter. The empty snapshot must be taken
    // before the value is added to the filter.
    HashableBloomFilter filter = new HashableBloomFilter(1000, 0.01);
    OrcProto.BloomFilter emptyProtoFilter = toOrcBloomFilter(filter);
    filter.addLong(1234);
    OrcProto.BloomFilter populatedProtoFilter = toOrcBloomFilter(filter);

    ColumnStatistics matchingStatistics = new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, toBloomFilter(populatedProtoFilter));
    ColumnMetadata<ColumnStatistics> matching = new ColumnMetadata<>(ImmutableList.of(matchingStatistics));

    ColumnStatistics nonMatchingStatistics = new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, toBloomFilter(emptyProtoFilter));
    ColumnMetadata<ColumnStatistics> nonMatching = new ColumnMetadata<>(ImmutableList.of(nonMatchingStatistics));

    ColumnStatistics statisticsWithoutFilter = new ColumnStatistics(null, 0, null, new IntegerStatistics(10L, 2000L, null), null, null, null, null, null, null);
    ColumnMetadata<ColumnStatistics> withoutFilter = new ColumnMetadata<>(ImmutableList.of(statisticsWithoutFilter));

    assertTrue(bloomPredicate.matches(1L, matching));
    assertTrue(bloomPredicate.matches(1L, withoutFilter));
    assertFalse(bloomPredicate.matches(1L, nonMatching));
    assertTrue(noColumnPredicate.matches(1L, matching));
}
Also used : ColumnStatistics(io.prestosql.orc.metadata.statistics.ColumnStatistics) ColumnMetadata(io.prestosql.orc.metadata.ColumnMetadata) OrcProto(io.prestosql.orc.proto.OrcProto) HashableBloomFilter(io.prestosql.orc.metadata.statistics.HashableBloomFilter) IntegerStatistics(io.prestosql.orc.metadata.statistics.IntegerStatistics) Test(org.testng.annotations.Test)

Aggregations

HashableBloomFilter (io.prestosql.orc.metadata.statistics.HashableBloomFilter): 9; Test (org.testng.annotations.Test): 6; ImmutableMap (com.google.common.collect.ImmutableMap): 3; OrcProto (io.prestosql.orc.proto.OrcProto): 3; ImmutableList (com.google.common.collect.ImmutableList): 2; Slice (io.airlift.slice.Slice): 2; TupleDomainOrcPredicate.checkInBloomFilter (io.prestosql.orc.TupleDomainOrcPredicate.checkInBloomFilter): 2; ColumnStatistics (io.prestosql.orc.metadata.statistics.ColumnStatistics): 2; CodedInputStream (io.prestosql.orc.protobuf.CodedInputStream): 2; RealType (io.prestosql.spi.type.RealType): 2; Type (io.prestosql.spi.type.Type): 2; InputStream (java.io.InputStream): 2; Map (java.util.Map): 2; ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 1; ImmutableMap.toImmutableMap (com.google.common.collect.ImmutableMap.toImmutableMap): 1; UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException): 1; Checkpoints.getDictionaryStreamCheckpoint (io.prestosql.orc.checkpoint.Checkpoints.getDictionaryStreamCheckpoint): 1; StreamCheckpoint (io.prestosql.orc.checkpoint.StreamCheckpoint): 1; ColumnMetadata (io.prestosql.orc.metadata.ColumnMetadata): 1; OrcMetadataReader (io.prestosql.orc.metadata.OrcMetadataReader): 1