Search in sources :

Example 1 with BloomFilter

use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.

the class FloatColumnWriter method getBloomFilters.

/**
 * Serializes the non-null bloom filters found in {@code rowGroupColumnStatistics}
 * into a single {@code BLOOM_FILTER_UTF8} stream for this column.
 *
 * @param metadataWriter writer used to encode the bloom filters into a slice
 * @return a one-element list holding the bloom filter stream, or an empty list
 *         when none of the column statistics carries a bloom filter
 * @throws IOException declared by this method's signature
 */
@Override
public List<StreamDataOutput> getBloomFilters(CompressedMetadataWriter metadataWriter) throws IOException {
    // Statistics without a bloom filter report null and are dropped here.
    List<BloomFilter> presentFilters = rowGroupColumnStatistics.stream()
            .map(ColumnStatistics::getBloomFilter)
            .filter(Objects::nonNull)
            .collect(toImmutableList());
    if (presentFilters.isEmpty()) {
        return ImmutableList.of();
    }
    Slice serialized = metadataWriter.writeBloomFilters(presentFilters);
    Stream bloomFilterStream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, serialized.length(), false);
    return ImmutableList.of(new StreamDataOutput(serialized, bloomFilterStream));
}
Also used : ColumnStatistics(io.trino.orc.metadata.statistics.ColumnStatistics) Slice(io.airlift.slice.Slice) PresentOutputStream(io.trino.orc.stream.PresentOutputStream) Stream(io.trino.orc.metadata.Stream) FloatOutputStream(io.trino.orc.stream.FloatOutputStream) StreamDataOutput(io.trino.orc.stream.StreamDataOutput) BloomFilter(io.trino.orc.metadata.statistics.BloomFilter)

Example 2 with BloomFilter

use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.

the class TestOrcBloomFilters method testHiveBloomFilterSerde.

/**
 * Adds a string and an integer to a bloom filter, then rebuilds a second filter
 * from the first one's bit set and hash-function count and checks that both
 * answer membership queries identically.
 */
@Test
public void testHiveBloomFilterSerde() {
    BloomFilter original = new BloomFilter(1_000_000L, 0.05);

    // String value: present after add, via both the byte[] and the Slice entry points
    original.add(TEST_STRING);
    assertTrue(original.test(TEST_STRING));
    assertTrue(original.testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(original.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(original.testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));

    // Integer value: present after add, neighbouring value absent
    original.addLong(TEST_INTEGER);
    assertTrue(original.testLong(TEST_INTEGER));
    assertFalse(original.testLong(TEST_INTEGER + 1));

    // Rebuild a filter from the raw state of the first one
    BloomFilter rebuilt = new BloomFilter(original.getBitSet(), original.getNumHashFunctions());

    // The rebuilt filter must give the same answers for the string ...
    assertTrue(rebuilt.test(TEST_STRING));
    assertTrue(rebuilt.testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(rebuilt.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(rebuilt.testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));

    // ... and for the integer
    assertTrue(rebuilt.testLong(TEST_INTEGER));
    assertFalse(rebuilt.testLong(TEST_INTEGER + 1));
}
Also used : BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(io.trino.orc.TupleDomainOrcPredicate.checkInBloomFilter) Test(org.testng.annotations.Test)

Example 3 with BloomFilter

use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.

the class TestOrcBloomFilters method testOrcHiveBloomFilterSerde.

/**
 * Writes one bloom filter with {@link CompressedMetadataWriter} and reads it back
 * twice: once through {@link OrcMetadataReader#readBloomFilterIndexes} and once by
 * parsing the raw bytes as an {@code OrcProto.BloomFilterIndex}. Both views must
 * match the filter that was written.
 *
 * @throws Exception if writing or reading the serialized bloom filter fails
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    BloomFilter bloomFilterWrite = new BloomFilter(1000L, 0.05);
    bloomFilterWrite.add(TEST_STRING);
    assertTrue(bloomFilterWrite.test(TEST_STRING));
    assertTrue(bloomFilterWrite.testSlice(wrappedBuffer(TEST_STRING)));
    Slice bloomFilterBytes = new CompressedMetadataWriter(new OrcMetadataWriter(WriterIdentification.TRINO), CompressionKind.NONE, 1024).writeBloomFilters(ImmutableList.of(bloomFilterWrite));
    // Read through method
    InputStream inputStream = bloomFilterBytes.getInput();
    OrcMetadataReader metadataReader = new OrcMetadataReader();
    List<BloomFilter> bloomFilters = metadataReader.readBloomFilterIndexes(inputStream);
    assertEquals(bloomFilters.size(), 1);
    assertTrue(bloomFilters.get(0).test(TEST_STRING));
    assertTrue(bloomFilters.get(0).testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(bloomFilters.get(0).test(TEST_STRING_NOT_WRITTEN));
    assertFalse(bloomFilters.get(0).testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));
    assertEquals(bloomFilterWrite.getNumBits(), bloomFilters.get(0).getNumBits());
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilters.get(0).getNumHashFunctions());
    // Validate bit set; comparing the arrays with assertEquals gives a descriptive
    // failure message instead of a bare "expected true"
    assertEquals(bloomFilters.get(0).getBitSet(), bloomFilterWrite.getBitSet());
    // Read directly: allows better inspection of the bit sets (helped to fix a lot of bugs)
    CodedInputStream input = CodedInputStream.newInstance(bloomFilterBytes.getBytes());
    OrcProto.BloomFilterIndex deserializedBloomFilterIndex = OrcProto.BloomFilterIndex.parseFrom(input);
    List<OrcProto.BloomFilter> bloomFilterList = deserializedBloomFilterIndex.getBloomFilterList();
    assertEquals(bloomFilterList.size(), 1);
    OrcProto.BloomFilter bloomFilterRead = bloomFilterList.get(0);
    // Validate contents of ORC bloom filter bit set
    assertEquals(Longs.toArray(bloomFilterRead.getBitsetList()), bloomFilterWrite.getBitSet());
    // hash functions
    assertEquals(bloomFilterWrite.getNumHashFunctions(), bloomFilterRead.getNumHashFunctions());
    // bit size
    assertEquals(bloomFilterWrite.getBitSet().length, bloomFilterRead.getBitsetCount());
}
Also used : CompressedMetadataWriter(io.trino.orc.metadata.CompressedMetadataWriter) Slice(io.airlift.slice.Slice) CodedInputStream(io.trino.orc.protobuf.CodedInputStream) InputStream(java.io.InputStream) OrcMetadataReader(io.trino.orc.metadata.OrcMetadataReader) OrcProto(io.trino.orc.proto.OrcProto) OrcMetadataWriter(io.trino.orc.metadata.OrcMetadataWriter) BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(io.trino.orc.TupleDomainOrcPredicate.checkInBloomFilter) Test(org.testng.annotations.Test)

Example 4 with BloomFilter

use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.

the class TestOrcBloomFilters method testBloomFilterPredicateValuesNonExisting.

/**
 * Checks that {@code checkInBloomFilter} reports no match for any entry of
 * {@code TEST_VALUES} when the bloom filter has had nothing added to it.
 */
@Test
public void testBloomFilterPredicateValuesNonExisting() {
    // No value is ever added, so every lookup below is expected to miss.
    BloomFilter emptyFilter = new BloomFilter(TEST_VALUES.size() * 10, 0.01);
    TEST_VALUES.forEach((value, type) -> {
        boolean found = checkInBloomFilter(emptyFilter, value, type);
        assertFalse(found, "type " + value.getClass());
    });
}
Also used : Type(io.trino.spi.type.Type) RealType(io.trino.spi.type.RealType) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(io.trino.orc.TupleDomainOrcPredicate.checkInBloomFilter) Test(org.testng.annotations.Test)

Example 5 with BloomFilter

use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.

the class TestOrcBloomFilters method testBloomFilterCompatibility.

/**
 * Runs 200 randomized rounds comparing {@link BloomFilter} with
 * {@code org.apache.orc.util.BloomFilter}. Each round feeds the same random
 * binary, long, double and float values into both filters and requires the
 * resulting bit sets to be equal. Floats go into the ORC filter through its
 * double methods.
 */
@Test
public void testBloomFilterCompatibility() {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    for (int round = 0; round < 200; round++) {
        double fpp = random.nextDouble(0.01, 0.10);
        int size = random.nextInt(100, 10000);
        int entries = random.nextInt(size / 2, size);

        BloomFilter trinoFilter = new BloomFilter(size, fpp);
        org.apache.orc.util.BloomFilter orcFilter = new org.apache.orc.util.BloomFilter(size, fpp);

        // Freshly created filters must not report null as present
        assertFalse(trinoFilter.test(null));
        assertFalse(orcFilter.test(null));

        // Generate the random inputs for this round
        byte[][] binaries = new byte[entries][];
        long[] longs = new long[entries];
        double[] doubles = new double[entries];
        float[] floats = new float[entries];
        for (int index = 0; index < entries; index++) {
            binaries[index] = randomBytes(random.nextInt(100));
            longs[index] = random.nextLong();
            doubles[index] = random.nextDouble();
            floats[index] = random.nextFloat();
        }

        // Before anything is added, no value may be reported as present
        for (int index = 0; index < entries; index++) {
            assertFalse(trinoFilter.test(binaries[index]));
            assertFalse(trinoFilter.testSlice(wrappedBuffer(binaries[index])));
            assertFalse(trinoFilter.testLong(longs[index]));
            assertFalse(trinoFilter.testDouble(doubles[index]));
            assertFalse(trinoFilter.testFloat(floats[index]));
            assertFalse(orcFilter.test(binaries[index]));
            assertFalse(orcFilter.testLong(longs[index]));
            assertFalse(orcFilter.testDouble(doubles[index]));
            assertFalse(orcFilter.testDouble(floats[index]));
        }

        // Insert every value into both filters
        for (int index = 0; index < entries; index++) {
            trinoFilter.add(binaries[index]);
            trinoFilter.addLong(longs[index]);
            trinoFilter.addDouble(doubles[index]);
            trinoFilter.addFloat(floats[index]);
            orcFilter.add(binaries[index]);
            orcFilter.addLong(longs[index]);
            orcFilter.addDouble(doubles[index]);
            orcFilter.addDouble(floats[index]);
        }

        // After insertion, every value must be reported as present
        for (int index = 0; index < entries; index++) {
            assertTrue(trinoFilter.test(binaries[index]));
            assertTrue(trinoFilter.testSlice(wrappedBuffer(binaries[index])));
            assertTrue(trinoFilter.testLong(longs[index]));
            assertTrue(trinoFilter.testDouble(doubles[index]));
            assertTrue(trinoFilter.testFloat(floats[index]));
            assertTrue(orcFilter.test(binaries[index]));
            assertTrue(orcFilter.testLong(longs[index]));
            assertTrue(orcFilter.testDouble(doubles[index]));
            assertTrue(orcFilter.testDouble(floats[index]));
        }

        // Null is added last and must then be reported as present by both filters
        trinoFilter.add((byte[]) null);
        orcFilter.add(null);
        assertTrue(trinoFilter.test(null));
        assertTrue(trinoFilter.testSlice(null));
        assertTrue(orcFilter.test(null));

        // Both implementations must end up with the same bit set
        assertEquals(trinoFilter.getBitSet(), orcFilter.getBitSet());
    }
}
Also used : BloomFilter(io.trino.orc.metadata.statistics.BloomFilter) TupleDomainOrcPredicate.checkInBloomFilter(io.trino.orc.TupleDomainOrcPredicate.checkInBloomFilter) Test(org.testng.annotations.Test)

Aggregations

BloomFilter (io.trino.orc.metadata.statistics.BloomFilter)14 Slice (io.airlift.slice.Slice)8 Stream (io.trino.orc.metadata.Stream)7 ColumnStatistics (io.trino.orc.metadata.statistics.ColumnStatistics)6 PresentOutputStream (io.trino.orc.stream.PresentOutputStream)6 StreamDataOutput (io.trino.orc.stream.StreamDataOutput)6 TupleDomainOrcPredicate.checkInBloomFilter (io.trino.orc.TupleDomainOrcPredicate.checkInBloomFilter)5 Test (org.testng.annotations.Test)5 ImmutableMap (com.google.common.collect.ImmutableMap)3 LongOutputStream (io.trino.orc.stream.LongOutputStream)3 ImmutableList (com.google.common.collect.ImmutableList)2 OrcProto (io.trino.orc.proto.OrcProto)2 CodedInputStream (io.trino.orc.protobuf.CodedInputStream)2 ByteArrayOutputStream (io.trino.orc.stream.ByteArrayOutputStream)2 LongOutputStream.createLengthOutputStream (io.trino.orc.stream.LongOutputStream.createLengthOutputStream)2 RealType (io.trino.spi.type.RealType)2 Type (io.trino.spi.type.Type)2 InputStream (java.io.InputStream)2 Map (java.util.Map)2 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)1