Search in sources:

Example 1 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.

In the class StripeReader, the method readColumnIndexes.

/**
 * Collects the row group indexes of every {@code ROW_INDEX} stream in the given stream map.
 *
 * @param streams stream descriptors keyed by stream id
 * @param streamsData input streams keyed by the same stream ids
 * @param stripeId identifier passed through to the stripe metadata source
 * @return an immutable map from each {@code ROW_INDEX} stream id to its row group indexes
 * @throws IOException if reading the bloom filters or the row indexes fails
 */
private Map<StreamId, List<RowGroupIndex>> readColumnIndexes(Map<StreamId, Stream> streams, Map<StreamId, OrcInputStream> streamsData, StripeId stripeId) throws IOException {
    // Bloom filters are loaded up front, keyed by column, so they can be attached to the matching row index below.
    Map<Integer, List<HiveBloomFilter>> bloomFiltersByColumn = readBloomFilterIndexes(streams, streamsData);
    ImmutableMap.Builder<StreamId, List<RowGroupIndex>> result = ImmutableMap.builder();
    for (Entry<StreamId, Stream> streamEntry : streams.entrySet()) {
        // Only row index streams contribute to the result.
        if (streamEntry.getValue().getStreamKind() != ROW_INDEX) {
            continue;
        }
        StreamId rowIndexStreamId = streamEntry.getKey();
        // The bloom filter lookup yields null when the map has no entry for this column.
        result.put(
                rowIndexStreamId,
                stripeMetadataSource.getRowIndexes(
                        metadataReader,
                        hiveWriterVersion,
                        stripeId,
                        rowIndexStreamId,
                        streamsData.get(rowIndexStreamId),
                        bloomFiltersByColumn.get(rowIndexStreamId.getColumn()),
                        runtimeStats));
    }
    return result.build();
}
Also used : OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) ImmutableMap(com.google.common.collect.ImmutableMap) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) RowGroupIndex(com.facebook.presto.orc.metadata.RowGroupIndex) List(java.util.List) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) ValueInputStream(com.facebook.presto.orc.stream.ValueInputStream) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) Stream(com.facebook.presto.orc.metadata.Stream) InputStream(java.io.InputStream)

Example 2 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.

In the class OrcMetadataReader, the method readBloomFilterIndexes.

/**
 * Parses an ORC {@code BloomFilterIndex} protobuf message from the stream and converts
 * each contained bloom filter into a {@link HiveBloomFilter}.
 *
 * @param inputStream stream positioned at the serialized bloom filter index
 * @return an immutable list with one {@link HiveBloomFilter} per bloom filter in the index, in index order
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public List<HiveBloomFilter> readBloomFilterIndexes(InputStream inputStream) throws IOException {
    OrcProto.BloomFilterIndex parsedIndex = OrcProto.BloomFilterIndex.parseFrom(CodedInputStream.newInstance(inputStream));
    ImmutableList.Builder<HiveBloomFilter> result = ImmutableList.builder();
    for (OrcProto.BloomFilter protoFilter : parsedIndex.getBloomFilterList()) {
        // The bit size handed to HiveBloomFilter is the number of bitset entries times 64.
        int bitCount = protoFilter.getBitsetCount() * 64;
        result.add(new HiveBloomFilter(protoFilter.getBitsetList(), bitCount, protoFilter.getNumHashFunctions()));
    }
    return result.build();
}
Also used : HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) OrcProto(com.facebook.presto.orc.proto.OrcProto) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter)

Example 3 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.

In the class TupleDomainOrcPredicate, the method columnOverlaps.

/**
 * Decides whether the section described by {@code columnStatistics} may contain values
 * matching {@code predicateDomain}.
 *
 * <p>The domain built from the statistics is checked first. The bloom filter is consulted
 * only when bloom filters are enabled, the two domains do not both allow nulls, the predicate
 * reduces to discrete values, and the statistics carry a bloom filter.
 *
 * @return {@code false} only when the section can be skipped; {@code true} otherwise
 */
private boolean columnOverlaps(ColumnReference<C> columnReference, Domain predicateDomain, long numberOfRows, ColumnStatistics columnStatistics) {
    Domain stripeDomain = getDomain(columnReference.getType(), numberOfRows, columnStatistics);

    // No overlap between the predicate and the statistics domain: skip the section.
    if (!stripeDomain.overlaps(predicateDomain)) {
        return false;
    }

    // Without bloom filters the range overlap cannot be narrowed further; likewise, if there is
    // an overlap in null values, the bloom filter cannot eliminate it.
    if (!orcBloomFiltersEnabled || (predicateDomain.isNullAllowed() && stripeDomain.isNullAllowed())) {
        return true;
    }

    // The bloom filter can only be probed with discrete predicate values.
    Optional<Collection<Object>> discreteValues = extractDiscreteValues(predicateDomain.getValues());
    if (!discreteValues.isPresent()) {
        return true;
    }

    // No bloom filter recorded for this section, so nothing more can be excluded.
    HiveBloomFilter bloomFilter = columnStatistics.getBloomFilter();
    if (bloomFilter == null) {
        return true;
    }

    // The section overlaps only if at least one discrete predicate value is found in the bloom filter.
    return discreteValues.get().stream()
            .anyMatch(value -> checkInBloomFilter(bloomFilter, value, stripeDomain.getType()));
}
Also used : HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) Collection(java.util.Collection) Domain(com.facebook.presto.common.predicate.Domain) TupleDomain(com.facebook.presto.common.predicate.TupleDomain)

Example 4 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.

In the class DwrfMetadataReader, the method readRowIndexes.

/**
 * Parses a DWRF {@code RowIndex} protobuf message and converts each entry into a
 * {@link RowGroupIndex}, pairing entry {@code i} with bloom filter {@code i} when
 * bloom filters are supplied.
 *
 * <p>The thread CPU time spent parsing the message is recorded under the
 * {@code DwrfReadRowIndexesTimeNanos} metric.
 *
 * @param hiveWriterVersion writer version forwarded to the entry conversion
 * @param inputStream stream positioned at the serialized row index
 * @param bloomFilters per-entry bloom filters; {@code null} or empty means none are attached
 * @return an immutable list with one {@link RowGroupIndex} per row index entry
 * @throws IOException if the protobuf message cannot be read
 */
@Override
public List<RowGroupIndex> readRowIndexes(HiveWriterVersion hiveWriterVersion, InputStream inputStream, List<HiveBloomFilter> bloomFilters) throws IOException {
    long startCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime();
    DwrfProto.RowIndex parsedIndex = DwrfProto.RowIndex.parseFrom(CodedInputStream.newInstance(inputStream));
    long elapsedCpuNanos = THREAD_MX_BEAN.getCurrentThreadCpuTime() - startCpuNanos;
    runtimeStats.addMetricValue("DwrfReadRowIndexesTimeNanos", elapsedCpuNanos);

    return IntStream.range(0, parsedIndex.getEntryCount())
            .mapToObj(entryIndex -> {
                // A missing or empty bloom filter list means every entry is converted without one.
                HiveBloomFilter entryBloomFilter = null;
                if (bloomFilters != null && !bloomFilters.isEmpty()) {
                    entryBloomFilter = bloomFilters.get(entryIndex);
                }
                return toRowGroupIndex(hiveWriterVersion, parsedIndex.getEntry(entryIndex), entryBloomFilter);
            })
            .collect(toImmutableList());
}
Also used : ORIGINAL(com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion.ORIGINAL) ThreadMXBean(com.sun.management.ThreadMXBean) DoubleStatistics(com.facebook.presto.orc.metadata.statistics.DoubleStatistics) OrcTypeKind(com.facebook.presto.orc.metadata.OrcType.OrcTypeKind) BinaryStatistics(com.facebook.presto.orc.metadata.statistics.BinaryStatistics) EncryptionLibrary(com.facebook.presto.orc.EncryptionLibrary) Map(java.util.Map) RuntimeStats(com.facebook.presto.common.RuntimeStats) OrcDataSource(com.facebook.presto.orc.OrcDataSource) StreamKind(com.facebook.presto.orc.metadata.Stream.StreamKind) OrcMetadataReader.byteStringToSlice(com.facebook.presto.orc.metadata.OrcMetadataReader.byteStringToSlice) STATIC_METADATA(com.facebook.presto.orc.metadata.DwrfMetadataWriter.STATIC_METADATA) ImmutableMap(com.google.common.collect.ImmutableMap) NONE(com.facebook.presto.orc.metadata.CompressionKind.NONE) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ColumnStatistics.createColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics.createColumnStatistics) NOOP_ORC_AGGREGATED_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcAggregatedMemoryContext.NOOP_ORC_AGGREGATED_MEMORY_CONTEXT) ColumnStatistics(com.facebook.presto.orc.metadata.statistics.ColumnStatistics) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) BooleanStatistics(com.facebook.presto.orc.metadata.statistics.BooleanStatistics) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) Optional(java.util.Optional) OrcInputStream(com.facebook.presto.orc.stream.OrcInputStream) SortedMap(java.util.SortedMap) IntStream(java.util.stream.IntStream) DwrfDataEncryptor(com.facebook.presto.orc.DwrfDataEncryptor) Iterables(com.google.common.collect.Iterables) Slice(io.airlift.slice.Slice) DwrfKeyProvider(com.facebook.presto.orc.DwrfKeyProvider) HashMap(java.util.HashMap) 
StringStatistics(com.facebook.presto.orc.metadata.statistics.StringStatistics) OptionalInt(java.util.OptionalInt) ArrayList(java.util.ArrayList) OptionalLong(java.util.OptionalLong) ZLIB(com.facebook.presto.orc.metadata.CompressionKind.ZLIB) ImmutableList(com.google.common.collect.ImmutableList) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) HiveWriterVersion(com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion) Objects.requireNonNull(java.util.Objects.requireNonNull) OrcMetadataReader.minStringTruncateToValidRange(com.facebook.presto.orc.metadata.OrcMetadataReader.minStringTruncateToValidRange) ManagementFactory(java.lang.management.ManagementFactory) Math.toIntExact(java.lang.Math.toIntExact) ImmutableSortedMap(com.google.common.collect.ImmutableSortedMap) DwrfEncryptionProvider(com.facebook.presto.orc.DwrfEncryptionProvider) OrcDataSourceId(com.facebook.presto.orc.OrcDataSourceId) OrcDecompressor(com.facebook.presto.orc.OrcDecompressor) NOOP_ORC_LOCAL_MEMORY_CONTEXT(com.facebook.presto.orc.NoopOrcLocalMemoryContext.NOOP_ORC_LOCAL_MEMORY_CONTEXT) ORC_HIVE_8732(com.facebook.presto.orc.metadata.PostScript.HiveWriterVersion.ORC_HIVE_8732) SharedBuffer(com.facebook.presto.orc.stream.SharedBuffer) ColumnEncodingKind(com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind) OrcCorruptionException(com.facebook.presto.orc.OrcCorruptionException) SNAPPY(com.facebook.presto.orc.metadata.CompressionKind.SNAPPY) OrcMetadataReader.maxStringTruncateToValidRange(com.facebook.presto.orc.metadata.OrcMetadataReader.maxStringTruncateToValidRange) IOException(java.io.IOException) BasicSliceInput(io.airlift.slice.BasicSliceInput) IntegerStatistics(com.facebook.presto.orc.metadata.statistics.IntegerStatistics) LZ4(com.facebook.presto.orc.metadata.CompressionKind.LZ4) ByteString(com.facebook.presto.orc.protobuf.ByteString) DwrfProto(com.facebook.presto.orc.proto.DwrfProto) 
VisibleForTesting(com.google.common.annotations.VisibleForTesting) InputStream(java.io.InputStream) ZSTD(com.facebook.presto.orc.metadata.CompressionKind.ZSTD) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) DwrfProto(com.facebook.presto.orc.proto.DwrfProto)

Example 5 with HiveBloomFilter

use of com.facebook.presto.orc.metadata.statistics.HiveBloomFilter in project presto by prestodb.

In the class TestOrcBloomFilters, the method testOrcHiveBloomFilterSerde.

/**
 * Round-trips a bloom filter through the ORC protobuf bloom filter index and checks that both
 * {@link OrcMetadataReader#readBloomFilterIndexes} and a direct protobuf parse reproduce the
 * written bit set and hash function count.
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    // Writer side: a bloom filter containing a single known string
    BloomFilter writtenFilter = new BloomFilter(1000L, 0.05);
    writtenFilter.addString(TEST_STRING);
    assertTrue(writtenFilter.testString(TEST_STRING));

    // Copy the written filter into its protobuf form and serialize it into an index
    OrcProto.BloomFilter protoFilter = OrcProto.BloomFilter.newBuilder()
            .addAllBitset(Longs.asList(writtenFilter.getBitSet()))
            .setNumHashFunctions(writtenFilter.getNumHashFunctions())
            .build();
    byte[] serializedIndex = serializeBloomFilterToIndex(protoFilter, OrcProto.BloomFilterIndex.getDefaultInstance());

    // Path 1: read through the metadata reader
    OrcMetadataReader reader = new OrcMetadataReader(new RuntimeStats());
    List<HiveBloomFilter> filtersFromReader = reader.readBloomFilterIndexes(new ByteArrayInputStream(serializedIndex));
    assertEquals(filtersFromReader.size(), 1);
    HiveBloomFilter readFilter = filtersFromReader.get(0);
    assertTrue(readFilter.testString(TEST_STRING));
    assertFalse(readFilter.testString(TEST_STRING_NOT_WRITTEN));
    assertEquals(writtenFilter.getBitSize(), readFilter.getBitSize());
    assertEquals(writtenFilter.getNumHashFunctions(), readFilter.getNumHashFunctions());
    // The bit set must survive the round trip unchanged
    assertTrue(Arrays.equals(readFilter.getBitSet(), writtenFilter.getBitSet()));

    // Path 2: parse the protobuf directly, which allows closer inspection of the bit sets
    OrcProto.BloomFilterIndex parsedIndex = OrcProto.BloomFilterIndex.parseFrom(CodedInputStream.newInstance(serializedIndex));
    List<OrcProto.BloomFilter> parsedFilters = parsedIndex.getBloomFilterList();
    assertEquals(parsedFilters.size(), 1);
    OrcProto.BloomFilter parsedFilter = parsedFilters.get(0);
    // Contents of the ORC bloom filter bit set
    assertTrue(Arrays.equals(Longs.toArray(parsedFilter.getBitsetList()), writtenFilter.getBitSet()));
    // Number of hash functions
    assertEquals(writtenFilter.getNumHashFunctions(), parsedFilter.getNumHashFunctions());
    // Number of longs in the bit set
    assertEquals(writtenFilter.getBitSet().length, parsedFilter.getBitsetCount());
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) InputStream(java.io.InputStream) RuntimeStats(com.facebook.presto.common.RuntimeStats) CodedInputStream(com.facebook.presto.orc.protobuf.CodedInputStream) OrcProto(com.facebook.presto.orc.proto.OrcProto) OrcMetadataReader(com.facebook.presto.orc.metadata.OrcMetadataReader) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter) BloomFilter(com.facebook.presto.orc.metadata.statistics.BloomFilter) ByteArrayInputStream(java.io.ByteArrayInputStream) HiveBloomFilter(com.facebook.presto.orc.metadata.statistics.HiveBloomFilter) Test(org.testng.annotations.Test)

Aggregations

HiveBloomFilter (com.facebook.presto.orc.metadata.statistics.HiveBloomFilter)8 CodedInputStream (com.facebook.presto.orc.protobuf.CodedInputStream)4 InputStream (java.io.InputStream)4 RuntimeStats (com.facebook.presto.common.RuntimeStats)3 TupleDomainOrcPredicate.checkInBloomFilter (com.facebook.presto.orc.TupleDomainOrcPredicate.checkInBloomFilter)3 BloomFilter (com.facebook.presto.orc.metadata.statistics.BloomFilter)3 OrcProto (com.facebook.presto.orc.proto.OrcProto)3 ImmutableList (com.google.common.collect.ImmutableList)3 Test (org.testng.annotations.Test)3 Domain (com.facebook.presto.common.predicate.Domain)2 TupleDomain (com.facebook.presto.common.predicate.TupleDomain)2 DwrfEncryptionProvider (com.facebook.presto.orc.DwrfEncryptionProvider)2 DwrfKeyProvider (com.facebook.presto.orc.DwrfKeyProvider)2 OrcDataSource (com.facebook.presto.orc.OrcDataSource)2 OrcDataSourceId (com.facebook.presto.orc.OrcDataSourceId)2 OrcDecompressor (com.facebook.presto.orc.OrcDecompressor)2 ColumnEncodingKind (com.facebook.presto.orc.metadata.ColumnEncoding.ColumnEncodingKind)2 LZ4 (com.facebook.presto.orc.metadata.CompressionKind.LZ4)2 NONE (com.facebook.presto.orc.metadata.CompressionKind.NONE)2 SNAPPY (com.facebook.presto.orc.metadata.CompressionKind.SNAPPY)2