Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in project drill by apache.
The class ParquetFileWriter, method serializeBloomFilters.
private static void serializeBloomFilters(
    List<Map<String, BloomFilter>> bloomFilters,
    List<BlockMetaData> blocks,
    PositionOutputStream out,
    InternalFileEncryptor fileEncryptor) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    BlockMetaData block = blocks.get(bIndex);
    List<ColumnChunkMetaData> columns = block.getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) {
      continue;
    }
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }
      // Record where this column's bloom filter starts so the footer can point to it.
      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      BlockCipher.Encryptor bloomFilterEncryptor = null;
      byte[] bloomFilterHeaderAAD = null;
      byte[] bloomFilterBitsetAAD = null;
      if (null != fileEncryptor) {
        InternalColumnEncryptionSetup columnEncryptionSetup =
            fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
        if (columnEncryptionSetup.isEncrypted()) {
          // Header and bitset are encrypted as separate modules, each with its own AAD.
          bloomFilterEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
          int columnOrdinal = columnEncryptionSetup.getOrdinal();
          bloomFilterHeaderAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(),
              ModuleType.BloomFilterHeader, block.getOrdinal(), columnOrdinal, -1);
          bloomFilterBitsetAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(),
              ModuleType.BloomFilterBitset, block.getOrdinal(), columnOrdinal, -1);
        }
      }
      // Write the thrift header, then the (possibly encrypted) bitset bytes.
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter),
          out, bloomFilterEncryptor, bloomFilterHeaderAAD);
      ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream();
      bloomFilter.writeTo(tempOutStream);
      byte[] serializedBitset = tempOutStream.toByteArray();
      if (null != bloomFilterEncryptor) {
        serializedBitset = bloomFilterEncryptor.encrypt(serializedBitset, bloomFilterBitsetAAD);
      }
      out.write(serializedBitset);
    }
  }
}
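The header written by Util.writeBloomFilterHeader records the bitset length, hash strategy, and algorithm; the bitset itself is exactly what BloomFilter.writeTo emits. A minimal standalone sketch of that serialization step (the 1024-byte size and the class name BloomFilterBitsetSketch are illustrative choices, not part of the source):

import java.io.ByteArrayOutputStream;
import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
import org.apache.parquet.column.values.bloomfilter.BloomFilter;
import org.apache.parquet.io.api.Binary;

public class BloomFilterBitsetSketch {
  public static void main(String[] args) throws Exception {
    // 1024 bytes is an arbitrary illustrative size; the implementation clamps it to its bounds.
    BloomFilter filter = new BlockSplitBloomFilter(1024);
    filter.insertHash(filter.hash(Binary.fromString("hello")));

    // The same step serializeBloomFilters performs before (optionally) encrypting and writing.
    ByteArrayOutputStream bitset = new ByteArrayOutputStream();
    filter.writeTo(bitset);
    System.out.println("bitset bytes: " + bitset.size()); // matches filter.getBitsetSize()
  }
}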
Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in project parquet-mr by apache.
The class ColumnMasker, method processChunk.
private void processChunk(ColumnDescriptor descriptor, ColumnChunkMetaData chunk,
    ColumnReadStoreImpl crStore, TransParquetFileReader reader, ParquetFileWriter writer,
    MessageType schema, Set<ColumnPath> paths, MaskMode maskMode) throws IOException {
  reader.setStreamPosition(chunk.getStartingPos());
  if (paths.contains(chunk.getPath())) {
    if (maskMode.equals(MaskMode.NULLIFY)) {
      Type.Repetition repetition = descriptor.getPrimitiveType().getRepetition();
      if (repetition.equals(Type.Repetition.REQUIRED)) {
        throw new IOException(
            "Required column [" + descriptor.getPrimitiveType().getName() + "] cannot be nullified");
      }
      nullifyColumn(descriptor, chunk, crStore, writer, schema);
    } else {
      throw new UnsupportedOperationException("Only nullify is supported for now");
    }
  } else {
    // Untouched columns are copied through, preserving the bloom filter and page indexes.
    BloomFilter bloomFilter = reader.readBloomFilter(chunk);
    ColumnIndex columnIndex = reader.readColumnIndex(chunk);
    OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
    writer.appendColumnChunk(descriptor, reader.getStream(), chunk, bloomFilter, columnIndex, offsetIndex);
  }
}
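In ColumnMasker this method runs once per column chunk while the file is rewritten block by block. A hypothetical driver loop showing that shape (maskFile and its parameter list are illustrative, not ColumnMasker's actual API; the real code rebuilds the ColumnReadStoreImpl per row group rather than taking it as a parameter):

// Illustrative only: copy every block, masking the chunks whose paths are in pathsToMask.
void maskFile(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
    MessageType schema, ColumnReadStoreImpl crStore, Set<ColumnPath> pathsToMask) throws IOException {
  for (BlockMetaData block : meta.getBlocks()) {
    writer.startBlock(block.getRowCount());
    for (ColumnChunkMetaData chunk : block.getColumns()) {
      ColumnDescriptor descriptor = schema.getColumnDescription(chunk.getPath().toArray());
      processChunk(descriptor, chunk, crStore, reader, writer, schema, pathsToMask, MaskMode.NULLIFY);
    }
    writer.endBlock();
  }
}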
Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in project parquet-mr by apache.
The class TestParquetFileWriter, method testBloomFilterWriteRead.
@Test
public void testBloomFilterWriteRead() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  File testFile = temp.newFile();
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.set("parquet.bloom.filter.column.names", "foo");
  String[] colPath = { "foo" };
  ColumnDescriptor col = schema.getColumnDescription(colPath);
  BinaryStatistics stats1 = new BinaryStatistics();
  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(col, 5, CODEC);
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  // Register a bloom filter for column "foo" before closing the block.
  BloomFilter blockSplitBloomFilter = new BlockSplitBloomFilter(0);
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("hello")));
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("world")));
  w.addBloomFilter("foo", blockSplitBloomFilter);
  w.endBlock();
  w.end(new HashMap<>());
  // Read the filter back and verify both inserted hashes are found.
  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  try (ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path,
      Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath)))) {
    BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0));
    BloomFilter bloomFilter =
        bloomFilterReader.readBloomFilter(readFooter.getBlocks().get(0).getColumns().get(0));
    assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
    assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("world"))));
  }
}
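A BlockSplitBloomFilter constructed with 0 bytes, as above, is clamped to the implementation's minimum bitset size, which is plenty for two values. For realistic cardinalities the bitset should be sized from the expected distinct-value count and target false-positive rate; a hedged sketch, assuming the static helper BlockSplitBloomFilter.optimalNumOfBits present in recent parquet-mr (the 100,000/0.01 figures are illustrative):

import org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter;
import org.apache.parquet.io.api.Binary;

public class BloomFilterSizingSketch {
  public static void main(String[] args) {
    long expectedDistinct = 100_000L; // illustrative cardinality
    double falsePositiveRate = 0.01;  // illustrative target FPP
    // Assumed helper from recent parquet-mr; returns the bitset size in bits.
    int bits = BlockSplitBloomFilter.optimalNumOfBits(expectedDistinct, falsePositiveRate);
    BlockSplitBloomFilter filter = new BlockSplitBloomFilter(bits / 8);
    filter.insertHash(filter.hash(Binary.fromString("hello")));
    System.out.println("bitset bytes: " + filter.getBitsetSize());
  }
}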
Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in project parquet-mr by apache.
The class TestParquetWriter, method testParquetFileWithBloomFilter.
@Test
public void testParquetFileWithBloomFilter() throws IOException {
  MessageType schema = Types.buildMessage().required(BINARY).as(stringType()).named("name").named("msg");
  String[] testNames = { "hello", "parquet", "bloom", "filter" };
  Configuration conf = new Configuration();
  GroupWriteSupport.setSchema(schema, conf);
  GroupFactory factory = new SimpleGroupFactory(schema);
  File file = temp.newFile();
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  // Dictionary encoding is disabled so a bloom filter is actually written for "name".
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withPageRowCountLimit(10)
      .withConf(conf)
      .withDictionaryEncoding(false)
      .withBloomFilterEnabled("name", true)
      .build()) {
    for (String testName : testNames) {
      writer.write(factory.newGroup().append("name", testName));
    }
  }
  try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
    BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
    BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData)
        .readBloomFilter(blockMetaData.getColumns().get(0));
    // Probe the filter with the same xxHash (seed 0) the writer used.
    for (String name : testNames) {
      assertTrue(bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(name).toByteBuffer())));
    }
  }
}
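The builder can also be told the expected number of distinct values per column so the library sizes the bitset itself. A variant of the writer block above, assuming the withBloomFilterNDV option available in parquet-mr 1.12+ (path, conf, factory, and testNames as in the test):

try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
    .withConf(conf)
    .withDictionaryEncoding(false)
    // Assumed parquet-mr 1.12+ option: declares ~4 distinct values for "name",
    // letting the writer size the bloom filter bitset accordingly.
    .withBloomFilterNDV("name", 4)
    .build()) {
  for (String testName : testNames) {
    writer.write(factory.newGroup().append("name", testName));
  }
}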