use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.
the class FloatColumnWriter method getBloomFilters.
/**
 * Builds the bloom filter stream for this column.
 *
 * <p>Gathers the non-null bloom filters from the per-row-group column statistics and,
 * when at least one exists, serializes them with the supplied metadata writer into a
 * single {@code BLOOM_FILTER_UTF8} stream.
 *
 * @param metadataWriter writer used to serialize the collected bloom filters
 * @return a single-element list holding the serialized stream, or an empty list when
 *         no row group carries a bloom filter
 * @throws IOException declared by the interface; presumably raised by the metadata writer
 */
@Override
public List<StreamDataOutput> getBloomFilters(CompressedMetadataWriter metadataWriter) throws IOException {
    List<BloomFilter> rowGroupFilters = rowGroupColumnStatistics.stream()
            .map(ColumnStatistics::getBloomFilter)
            .filter(Objects::nonNull)
            .collect(toImmutableList());

    // Nothing to emit when no row group produced a bloom filter
    if (rowGroupFilters.isEmpty()) {
        return ImmutableList.of();
    }

    Slice serialized = metadataWriter.writeBloomFilters(rowGroupFilters);
    Stream bloomFilterStream = new Stream(columnId, StreamKind.BLOOM_FILTER_UTF8, serialized.length(), false);
    return ImmutableList.of(new StreamDataOutput(serialized, bloomFilterStream));
}
use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.
the class TestOrcBloomFilters method testHiveBloomFilterSerde.
/**
 * Checks that a bloom filter reports added values as present and un-added values as
 * absent, and that a second filter constructed from the first one's bit set and
 * hash-function count gives the same answers.
 */
@Test
public void testHiveBloomFilterSerde() {
    BloomFilter original = new BloomFilter(1_000_000L, 0.05);

    // String value: checked both directly and wrapped in a Slice
    original.add(TEST_STRING);
    assertTrue(original.test(TEST_STRING));
    assertTrue(original.testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(original.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(original.testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));

    // Integer value: the neighbouring value must not be reported as present
    original.addLong(TEST_INTEGER);
    assertTrue(original.testLong(TEST_INTEGER));
    assertFalse(original.testLong(TEST_INTEGER + 1));

    // Rebuild a filter from the raw state of the first one
    long[] bits = original.getBitSet();
    BloomFilter rebuilt = new BloomFilter(bits, original.getNumHashFunctions());

    // String value must survive the reconstruction
    assertTrue(rebuilt.test(TEST_STRING));
    assertTrue(rebuilt.testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(rebuilt.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(rebuilt.testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));

    // Integer value must survive the reconstruction
    assertTrue(rebuilt.testLong(TEST_INTEGER));
    assertFalse(rebuilt.testLong(TEST_INTEGER + 1));
}
use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.
the class TestOrcBloomFilters method testOrcHiveBloomFilterSerde.
/**
 * Writes one bloom filter through {@code CompressedMetadataWriter} (no compression) and
 * verifies the serialized bytes twice: once by reading them back with
 * {@code OrcMetadataReader}, and once by parsing the protobuf message directly so the
 * raw bit set can be inspected.
 */
@Test
public void testOrcHiveBloomFilterSerde() throws Exception {
    BloomFilter written = new BloomFilter(1000L, 0.05);
    written.add(TEST_STRING);
    assertTrue(written.test(TEST_STRING));
    assertTrue(written.testSlice(wrappedBuffer(TEST_STRING)));

    // Serialize the single filter without compression
    OrcMetadataWriter orcMetadataWriter = new OrcMetadataWriter(WriterIdentification.TRINO);
    CompressedMetadataWriter compressedWriter = new CompressedMetadataWriter(orcMetadataWriter, CompressionKind.NONE, 1024);
    Slice serialized = compressedWriter.writeBloomFilters(ImmutableList.of(written));

    // Path 1: read back through the metadata reader
    List<BloomFilter> decodedFilters = new OrcMetadataReader().readBloomFilterIndexes(serialized.getInput());
    assertEquals(decodedFilters.size(), 1);
    BloomFilter decoded = decodedFilters.get(0);
    assertTrue(decoded.test(TEST_STRING));
    assertTrue(decoded.testSlice(wrappedBuffer(TEST_STRING)));
    assertFalse(decoded.test(TEST_STRING_NOT_WRITTEN));
    assertFalse(decoded.testSlice(wrappedBuffer(TEST_STRING_NOT_WRITTEN)));
    assertEquals(written.getNumBits(), decoded.getNumBits());
    assertEquals(written.getNumHashFunctions(), decoded.getNumHashFunctions());
    // The decoded bit set must match the written one element by element
    assertTrue(Arrays.equals(decoded.getBitSet(), written.getBitSet()));

    // Path 2: parse the protobuf directly, which allows better inspection of the bit sets
    CodedInputStream protoInput = CodedInputStream.newInstance(serialized.getBytes());
    OrcProto.BloomFilterIndex protoIndex = OrcProto.BloomFilterIndex.parseFrom(protoInput);
    List<OrcProto.BloomFilter> protoFilters = protoIndex.getBloomFilterList();
    assertEquals(protoFilters.size(), 1);
    OrcProto.BloomFilter protoFilter = protoFilters.get(0);

    // Contents of the serialized bit set
    assertTrue(Arrays.equals(Longs.toArray(protoFilter.getBitsetList()), written.getBitSet()));
    // Number of hash functions
    assertEquals(written.getNumHashFunctions(), protoFilter.getNumHashFunctions());
    // Number of words in the bit set
    assertEquals(written.getBitSet().length, protoFilter.getBitsetCount());
}
use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.
the class TestOrcBloomFilters method testBloomFilterPredicateValuesNonExisting.
/**
 * Runs every entry of {@code TEST_VALUES} through {@code checkInBloomFilter} against a
 * bloom filter to which nothing has been added, and expects no entry to match.
 */
@Test
public void testBloomFilterPredicateValuesNonExisting() {
    // Filter is sized from the number of test values but deliberately left unpopulated
    BloomFilter emptyFilter = new BloomFilter(TEST_VALUES.size() * 10, 0.01);
    for (Map.Entry<Object, Type> entry : TEST_VALUES.entrySet()) {
        Object value = entry.getKey();
        Type type = entry.getValue();
        // The failure message names the Java class of the offending value
        assertFalse(checkInBloomFilter(emptyFilter, value, type), "type " + value.getClass());
    }
}
use of io.trino.orc.metadata.statistics.BloomFilter in project trino by trinodb.
the class TestOrcBloomFilters method testBloomFilterCompatibility.
/**
 * Compares this project's {@code BloomFilter} with {@code org.apache.orc.util.BloomFilter}
 * over 200 randomly sized rounds.
 *
 * <p>Each round creates both filters with the same size and false-positive probability,
 * checks that random binary, long, double and float values are absent from both, adds the
 * values to both, checks that they are now present, and finally requires the two bit sets
 * to be equal. Float values go through the double methods of the Apache filter.
 */
@Test
public void testBloomFilterCompatibility() {
    ThreadLocalRandom random = ThreadLocalRandom.current();
    for (int round = 0; round < 200; round++) {
        double falsePositiveProbability = random.nextDouble(0.01, 0.10);
        int expectedSize = random.nextInt(100, 10000);
        int valueCount = random.nextInt(expectedSize / 2, expectedSize);

        BloomFilter actual = new BloomFilter(expectedSize, falsePositiveProbability);
        org.apache.orc.util.BloomFilter expected = new org.apache.orc.util.BloomFilter(expectedSize, falsePositiveProbability);

        // A fresh filter must not report null as present
        assertFalse(actual.test(null));
        assertFalse(expected.test(null));

        // Generate the random values for this round
        byte[][] binaryValues = new byte[valueCount][];
        long[] longValues = new long[valueCount];
        double[] doubleValues = new double[valueCount];
        float[] floatValues = new float[valueCount];
        for (int index = 0; index < valueCount; index++) {
            binaryValues[index] = randomBytes(random.nextInt(100));
            longValues[index] = random.nextLong();
            doubleValues[index] = random.nextDouble();
            floatValues[index] = random.nextFloat();
        }

        // Before insertion: nothing is reported as present by either filter
        for (int index = 0; index < valueCount; index++) {
            assertFalse(actual.test(binaryValues[index]));
            assertFalse(actual.testSlice(wrappedBuffer(binaryValues[index])));
            assertFalse(actual.testLong(longValues[index]));
            assertFalse(actual.testDouble(doubleValues[index]));
            assertFalse(actual.testFloat(floatValues[index]));

            assertFalse(expected.test(binaryValues[index]));
            assertFalse(expected.testLong(longValues[index]));
            assertFalse(expected.testDouble(doubleValues[index]));
            assertFalse(expected.testDouble(floatValues[index]));
        }

        // Insert every value into both filters in the same order
        for (int index = 0; index < valueCount; index++) {
            actual.add(binaryValues[index]);
            actual.addLong(longValues[index]);
            actual.addDouble(doubleValues[index]);
            actual.addFloat(floatValues[index]);

            expected.add(binaryValues[index]);
            expected.addLong(longValues[index]);
            expected.addDouble(doubleValues[index]);
            expected.addDouble(floatValues[index]);
        }

        // After insertion: every value is reported as present by both filters
        for (int index = 0; index < valueCount; index++) {
            assertTrue(actual.test(binaryValues[index]));
            assertTrue(actual.testSlice(wrappedBuffer(binaryValues[index])));
            assertTrue(actual.testLong(longValues[index]));
            assertTrue(actual.testDouble(doubleValues[index]));
            assertTrue(actual.testFloat(floatValues[index]));

            assertTrue(expected.test(binaryValues[index]));
            assertTrue(expected.testLong(longValues[index]));
            assertTrue(expected.testDouble(doubleValues[index]));
            assertTrue(expected.testDouble(floatValues[index]));
        }

        // Null can be added and is then reported as present
        actual.add((byte[]) null);
        expected.add(null);
        assertTrue(actual.test(null));
        assertTrue(actual.testSlice(null));
        assertTrue(expected.test(null));

        // Both implementations must end up with the same bit set
        assertEquals(actual.getBitSet(), expected.getBitSet());
    }
}
Aggregations