Search in sources :

Example 6 with BloomFilter

use of org.apache.parquet.column.values.bloomfilter.BloomFilter in project parquet-mr by apache.

In the class BloomFilterReader, the method readBloomFilter.

/**
 * Returns the Bloom filter for the given column chunk, consulting the cache first.
 *
 * <p>On a cache miss the filter is read through {@code reader}; a non-null result is
 * stored in the cache under the chunk's path before being returned. A {@code null}
 * result is not cached.
 *
 * @param meta metadata of the column chunk whose Bloom filter is requested
 * @return the cached or freshly read Bloom filter, or {@code null} when the reader
 *         returns none or when reading fails with an {@link IOException}
 */
public BloomFilter readBloomFilter(ColumnChunkMetaData meta) {
    if (cache.containsKey(meta.getPath())) {
        return cache.get(meta.getPath());
    }
    try {
        BloomFilter bloomFilter = reader.readBloomFilter(meta);
        if (bloomFilter != null) {
            cache.put(meta.getPath(), bloomFilter);
        }
        // Return the value just read instead of looking it up in the cache again.
        return bloomFilter;
    } catch (IOException e) {
        // Best effort: a read failure is logged and reported to the caller as "no filter".
        logger.error("Failed to read Bloom filter data", e);
        return null;
    }
}
Also used : IOException(java.io.IOException) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter)

Example 7 with BloomFilter

use of org.apache.parquet.column.values.bloomfilter.BloomFilter in project parquet-mr by apache.

In the class BloomFilterImpl, the method visit.

/**
 * Evaluates an equality predicate against the Bloom filter of the predicate's column.
 *
 * @param eq the equality predicate to evaluate
 * @return {@code BLOCK_CANNOT_MATCH} when no column chunk is found for the column or
 *         when the Bloom filter reports that the value's hash is absent;
 *         {@code BLOCK_MIGHT_MATCH} when the value is {@code null}, when no Bloom
 *         filter is available, when the hash is found, or when evaluation throws a
 *         {@link RuntimeException}
 */
@Override
public <T extends Comparable<T>> Boolean visit(Operators.Eq<T> eq) {
    T value = eq.getValue();
    if (value == null) {
        // could check the column stats, but the StatisticsFilter is responsible
        return BLOCK_MIGHT_MATCH;
    }
    Operators.Column<T> filterColumn = eq.getColumn();
    ColumnChunkMetaData meta = getColumnChunk(filterColumn.getColumnPath());
    if (meta == null) {
        // No column chunk was found for this column, and the value is known to be
        // non-null because of the check above, so the block is reported as unable to match.
        return BLOCK_CANNOT_MATCH;
    }
    try {
        BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(meta);
        if (bloomFilter != null && !bloomFilter.findHash(bloomFilter.hash(value))) {
            return BLOCK_CANNOT_MATCH;
        }
    } catch (RuntimeException e) {
        // Best effort: a failure must not drop the block, so fall through to
        // "might match". Pass the exception itself so the stack trace is kept.
        LOG.warn("Failed to evaluate Bloom filter; assuming the block might match", e);
    }
    return BLOCK_MIGHT_MATCH;
}
Also used : Operators(org.apache.parquet.filter2.predicate.Operators) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter)

Example 8 with BloomFilter

use of org.apache.parquet.column.values.bloomfilter.BloomFilter in project parquet-mr by apache.

In the class BloomFilterImpl, the method visit.

/**
 * Evaluates an IN predicate against the Bloom filter of the predicate's column.
 *
 * @param in the IN predicate to evaluate
 * @return {@code BLOCK_MIGHT_MATCH} when the value set contains {@code null}, when no
 *         Bloom filter is available, or when at least one value's hash is found in the
 *         filter; {@code BLOCK_CANNOT_MATCH} when no column chunk is found for the
 *         column or when none of the values' hashes is found
 */
@Override
public <T extends Comparable<T>> Boolean visit(Operators.In<T> in) {
    Set<T> candidates = in.getValues();
    if (candidates.contains(null)) {
        // could check the column stats, but the StatisticsFilter is responsible
        return BLOCK_MIGHT_MATCH;
    }

    ColumnChunkMetaData chunkMeta = getColumnChunk(in.getColumn().getColumnPath());
    if (chunkMeta == null) {
        // No column chunk was found for this column; every candidate is non-null
        // because of the check above, so none of them can match.
        return BLOCK_CANNOT_MATCH;
    }

    BloomFilter filter = bloomFilterReader.readBloomFilter(chunkMeta);
    if (filter == null) {
        // Without a Bloom filter nothing can be ruled out.
        return BLOCK_MIGHT_MATCH;
    }

    // A single hash hit is enough to keep the block.
    for (T candidate : candidates) {
        boolean hashPresent = filter.findHash(filter.hash(candidate));
        if (hashPresent) {
            return BLOCK_MIGHT_MATCH;
        }
    }
    return BLOCK_CANNOT_MATCH;
}
Also used : Operators(org.apache.parquet.filter2.predicate.Operators) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter)

Aggregations

BloomFilter (org.apache.parquet.column.values.bloomfilter.BloomFilter): 8, ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 4, BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 3, MessageType (org.apache.parquet.schema.MessageType): 3, ByteArrayOutputStream (java.io.ByteArrayOutputStream): 2, File (java.io.File): 2, IOException (java.io.IOException): 2, Configuration (org.apache.hadoop.conf.Configuration): 2, Path (org.apache.hadoop.fs.Path): 2, InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2, Operators (org.apache.parquet.filter2.predicate.Operators): 2, BlockCipher (org.apache.parquet.format.BlockCipher): 2, HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 2, Test (org.junit.Test): 2, ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 1, BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 1, BlockSplitBloomFilter (org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter): 1, Group (org.apache.parquet.example.data.Group): 1, GroupFactory (org.apache.parquet.example.data.GroupFactory): 1, SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory): 1