Search in sources :

Example 1 with BlockSplitBloomFilter

Use of org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter in the parquet-mr project by Apache.

From the class TestParquetFileWriter, method testBloomFilterWriteRead.

@Test
public void testBloomFilterWriteRead() throws Exception {
    // Schema with a single required binary column; a Bloom filter will be attached to it.
    MessageType fileSchema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
    File tempFile = temp.newFile();
    tempFile.delete();
    Path outputPath = new Path(tempFile.toURI());

    Configuration conf = new Configuration();
    conf.set("parquet.bloom.filter.column.names", "foo");

    String[] columnPath = { "foo" };
    ColumnDescriptor descriptor = fileSchema.getColumnDescription(columnPath);
    BinaryStatistics statistics = new BinaryStatistics();

    // Write one row group containing a single column with two data pages.
    ParquetFileWriter writer = new ParquetFileWriter(conf, fileSchema, outputPath);
    writer.start();
    writer.startBlock(3);
    writer.startColumn(descriptor, 5, CODEC);
    writer.writeDataPage(2, 4, BytesInput.from(BYTES1), statistics, BIT_PACKED, BIT_PACKED, PLAIN);
    writer.writeDataPage(3, 4, BytesInput.from(BYTES1), statistics, BIT_PACKED, BIT_PACKED, PLAIN);
    writer.endColumn();

    // Attach a Bloom filter holding the hashes of two values to column "foo".
    BloomFilter writtenFilter = new BlockSplitBloomFilter(0);
    writtenFilter.insertHash(writtenFilter.hash(Binary.fromString("hello")));
    writtenFilter.insertHash(writtenFilter.hash(Binary.fromString("world")));
    writer.addBloomFilter("foo", writtenFilter);
    writer.endBlock();
    writer.end(new HashMap<>());

    // Re-open the file and verify both inserted hashes are found by the read-back filter.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, outputPath);
    try (ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), outputPath, Arrays.asList(footer.getBlocks().get(0)), Arrays.asList(fileSchema.getColumnDescription(columnPath)))) {
        BloomFilterReader filterReader = reader.getBloomFilterDataReader(footer.getBlocks().get(0));
        BloomFilter readFilter = filterReader.readBloomFilter(footer.getBlocks().get(0).getColumns().get(0));
        assertTrue(readFilter.findHash(writtenFilter.hash(Binary.fromString("hello"))));
        assertTrue(readFilter.findHash(writtenFilter.hash(Binary.fromString("world"))));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BlockSplitBloomFilter(org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter) Configuration(org.apache.hadoop.conf.Configuration) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter) BlockSplitBloomFilter(org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)

Example 2 with BlockSplitBloomFilter

Use of org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter in the parquet-mr project by Apache.

From the class ParquetFileReader, method readBloomFilter.

/**
 * Reads Bloom filter data for the given column chunk.
 *
 * @param meta a column's ColumnChunkMetaData to read the Bloom filter from
 * @return a BloomFilter object, or {@code null} if the chunk has no readable Bloom filter
 * @throws IOException if there is an error while reading the Bloom filter.
 */
public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException {
    long bloomFilterOffset = meta.getBloomFilterOffset();
    // A negative offset means no Bloom filter was written for this column chunk.
    if (bloomFilterOffset < 0) {
        return null;
    }
    // Prepare to decrypt Bloom filter (for encrypted columns)
    BlockCipher.Decryptor bloomFilterDecryptor = null;
    byte[] bloomFilterHeaderAAD = null;
    byte[] bloomFilterBitsetAAD = null;
    if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
        InternalColumnDecryptionSetup columnDecryptionSetup = fileDecryptor.getColumnSetup(meta.getPath());
        if (columnDecryptionSetup.isEncrypted()) {
            bloomFilterDecryptor = columnDecryptionSetup.getMetaDataDecryptor();
            // Header and bitset are distinct encryption modules, each with its own AAD.
            bloomFilterHeaderAAD = AesCipher.createModuleAAD(fileDecryptor.getFileAAD(), ModuleType.BloomFilterHeader, meta.getRowGroupOrdinal(), columnDecryptionSetup.getOrdinal(), -1);
            bloomFilterBitsetAAD = AesCipher.createModuleAAD(fileDecryptor.getFileAAD(), ModuleType.BloomFilterBitset, meta.getRowGroupOrdinal(), columnDecryptionSetup.getOrdinal(), -1);
        }
    }
    // Read Bloom filter data header.
    f.seek(bloomFilterOffset);
    BloomFilterHeader bloomFilterHeader;
    try {
        bloomFilterHeader = Util.readBloomFilterHeader(f, bloomFilterDecryptor, bloomFilterHeaderAAD);
    } catch (IOException e) {
        // Best-effort behavior is preserved (return null), but the cause must not be
        // swallowed: log it so the corrupt/unreadable header can be diagnosed.
        LOG.warn("Unable to read Bloom filter header, skipping Bloom filter", e);
        return null;
    }
    int numBytes = bloomFilterHeader.getNumBytes();
    // Guard against corrupt headers: the bitset size must be positive and bounded.
    if (numBytes <= 0 || numBytes > BlockSplitBloomFilter.UPPER_BOUND_BYTES) {
        LOG.warn("the read bloom filter size is wrong, size is {}", bloomFilterHeader.getNumBytes());
        return null;
    }
    // Only the XXHASH / BLOCK / UNCOMPRESSED combination is currently supported.
    if (!bloomFilterHeader.getHash().isSetXXHASH() || !bloomFilterHeader.getAlgorithm().isSetBLOCK() || !bloomFilterHeader.getCompression().isSetUNCOMPRESSED()) {
        LOG.warn("the read bloom filter is not supported yet,  algorithm = {}, hash = {}, compression = {}", bloomFilterHeader.getAlgorithm(), bloomFilterHeader.getHash(), bloomFilterHeader.getCompression());
        return null;
    }
    byte[] bitset;
    if (null == bloomFilterDecryptor) {
        // Plaintext column: the bitset follows the header directly.
        bitset = new byte[numBytes];
        f.readFully(bitset);
    } else {
        // Encrypted column: decrypt the bitset module and validate its length
        // against the header to detect tampering or corruption.
        bitset = bloomFilterDecryptor.decrypt(f, bloomFilterBitsetAAD);
        if (bitset.length != numBytes) {
            throw new ParquetCryptoRuntimeException("Wrong length of decrypted bloom filter bitset");
        }
    }
    return new BlockSplitBloomFilter(bitset);
}
Also used : BlockSplitBloomFilter(org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter) BlockCipher(org.apache.parquet.format.BlockCipher) InternalColumnDecryptionSetup(org.apache.parquet.crypto.InternalColumnDecryptionSetup) ParquetCryptoRuntimeException(org.apache.parquet.crypto.ParquetCryptoRuntimeException) BloomFilterHeader(org.apache.parquet.format.BloomFilterHeader) IOException(java.io.IOException)

Aggregations

BlockSplitBloomFilter (org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter)2 File (java.io.File)1 IOException (java.io.IOException)1 Configuration (org.apache.hadoop.conf.Configuration)1 Path (org.apache.hadoop.fs.Path)1 ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor)1 BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics)1 BloomFilter (org.apache.parquet.column.values.bloomfilter.BloomFilter)1 InternalColumnDecryptionSetup (org.apache.parquet.crypto.InternalColumnDecryptionSetup)1 ParquetCryptoRuntimeException (org.apache.parquet.crypto.ParquetCryptoRuntimeException)1 BlockCipher (org.apache.parquet.format.BlockCipher)1 BloomFilterHeader (org.apache.parquet.format.BloomFilterHeader)1 HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile)1 MessageType (org.apache.parquet.schema.MessageType)1 Test (org.junit.Test)1