Usage example of org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter in the Apache parquet-mr project, taken from the testBloomFilterWriteRead method of the TestParquetFileWriter class.
@Test
public void testBloomFilterWriteRead() throws Exception {
  // Schema with a single required binary column that will carry the Bloom filter.
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());

  Configuration conf = new Configuration();
  conf.set("parquet.bloom.filter.column.names", "foo");

  String[] fooPath = { "foo" };
  ColumnDescriptor descriptor = schema.getColumnDescription(fooPath);
  BinaryStatistics statistics = new BinaryStatistics();

  // Write one row group containing a single column chunk with two data pages.
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, path);
  writer.start();
  writer.startBlock(3);
  writer.startColumn(descriptor, 5, CODEC);
  writer.writeDataPage(2, 4, BytesInput.from(BYTES1), statistics, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(BYTES1), statistics, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();

  // Attach a Bloom filter holding two values to the "foo" column chunk.
  BloomFilter writtenFilter = new BlockSplitBloomFilter(0);
  writtenFilter.insertHash(writtenFilter.hash(Binary.fromString("hello")));
  writtenFilter.insertHash(writtenFilter.hash(Binary.fromString("world")));
  writer.addBloomFilter("foo", writtenFilter);
  writer.endBlock();
  writer.end(new HashMap<>());

  // Read the file back and verify both inserted values are found by the stored filter.
  ParquetMetadata footer = ParquetFileReader.readFooter(conf, path);
  try (ParquetFileReader reader = new ParquetFileReader(conf, footer.getFileMetaData(), path,
      Arrays.asList(footer.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(fooPath)))) {
    BloomFilterReader filterReader = reader.getBloomFilterDataReader(footer.getBlocks().get(0));
    BloomFilter readFilter = filterReader.readBloomFilter(footer.getBlocks().get(0).getColumns().get(0));
    assertTrue(readFilter.findHash(writtenFilter.hash(Binary.fromString("hello"))));
    assertTrue(readFilter.findHash(writtenFilter.hash(Binary.fromString("world"))));
  }
}
Usage example of org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter in the Apache parquet-mr project, taken from the readBloomFilter method of the ParquetFileReader class.
/**
 * Reads the Bloom filter data for the given column chunk, decrypting it when the
 * column is encrypted.
 *
 * @param meta the column's ColumnChunkMetaData locating the Bloom filter in the file
 * @return the deserialized BloomFilter, or {@code null} when the chunk has no Bloom
 *         filter, the header cannot be read, or the filter uses an unsupported
 *         size, algorithm, hash, or compression
 * @throws IOException if there is an error while reading the Bloom filter
 */
public BloomFilter readBloomFilter(ColumnChunkMetaData meta) throws IOException {
  long bloomFilterOffset = meta.getBloomFilterOffset();
  // A negative offset means no Bloom filter was written for this column chunk.
  if (bloomFilterOffset < 0) {
    return null;
  }

  // Prepare to decrypt the Bloom filter (for encrypted columns).
  BlockCipher.Decryptor bloomFilterDecryptor = null;
  byte[] bloomFilterHeaderAAD = null;
  byte[] bloomFilterBitsetAAD = null;
  if (null != fileDecryptor && !fileDecryptor.plaintextFile()) {
    InternalColumnDecryptionSetup columnDecryptionSetup = fileDecryptor.getColumnSetup(meta.getPath());
    if (columnDecryptionSetup.isEncrypted()) {
      bloomFilterDecryptor = columnDecryptionSetup.getMetaDataDecryptor();
      // The header and the bitset are separate encrypted modules, each with its own AAD.
      bloomFilterHeaderAAD = AesCipher.createModuleAAD(fileDecryptor.getFileAAD(), ModuleType.BloomFilterHeader,
          meta.getRowGroupOrdinal(), columnDecryptionSetup.getOrdinal(), -1);
      bloomFilterBitsetAAD = AesCipher.createModuleAAD(fileDecryptor.getFileAAD(), ModuleType.BloomFilterBitset,
          meta.getRowGroupOrdinal(), columnDecryptionSetup.getOrdinal(), -1);
    }
  }

  // Read the Bloom filter data header.
  f.seek(bloomFilterOffset);
  BloomFilterHeader bloomFilterHeader;
  try {
    bloomFilterHeader = Util.readBloomFilterHeader(f, bloomFilterDecryptor, bloomFilterHeaderAAD);
  } catch (IOException e) {
    // Fix: keep the failure cause and context in the log instead of silently dropping
    // the exception ("read no bloom filter" gave no hint of what went wrong or where).
    LOG.warn("Unable to read Bloom filter header for column {} at offset {}",
        meta.getPath(), bloomFilterOffset, e);
    return null;
  }

  int numBytes = bloomFilterHeader.getNumBytes();
  if (numBytes <= 0 || numBytes > BlockSplitBloomFilter.UPPER_BOUND_BYTES) {
    LOG.warn("the read bloom filter size is wrong, size is {}", bloomFilterHeader.getNumBytes());
    return null;
  }

  // Only the XXHASH / BLOCK / UNCOMPRESSED combination is supported here.
  if (!bloomFilterHeader.getHash().isSetXXHASH()
      || !bloomFilterHeader.getAlgorithm().isSetBLOCK()
      || !bloomFilterHeader.getCompression().isSetUNCOMPRESSED()) {
    LOG.warn("the read bloom filter is not supported yet, algorithm = {}, hash = {}, compression = {}",
        bloomFilterHeader.getAlgorithm(), bloomFilterHeader.getHash(), bloomFilterHeader.getCompression());
    return null;
  }

  // Read (and, for encrypted columns, decrypt) the bitset that follows the header.
  byte[] bitset;
  if (null == bloomFilterDecryptor) {
    bitset = new byte[numBytes];
    f.readFully(bitset);
  } else {
    bitset = bloomFilterDecryptor.decrypt(f, bloomFilterBitsetAAD);
    if (bitset.length != numBytes) {
      throw new ParquetCryptoRuntimeException("Wrong length of decrypted bloom filter bitset");
    }
  }
  return new BlockSplitBloomFilter(bitset);
}
Aggregations