
Example 1 with BloomFilter

Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in the Apache Drill project.

From the class ParquetFileWriter, the method serializeBloomFilters:

private static void serializeBloomFilters(List<Map<String, BloomFilter>> bloomFilters, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: bloom filters", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
        BlockMetaData block = blocks.get(bIndex);
        List<ColumnChunkMetaData> columns = block.getColumns();
        Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
        if (blockBloomFilters.isEmpty()) {
            continue;
        }
        for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
            ColumnChunkMetaData column = columns.get(cIndex);
            BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
            if (bloomFilter == null) {
                continue;
            }
            long offset = out.getPos();
            column.setBloomFilterOffset(offset);
            BlockCipher.Encryptor bloomFilterEncryptor = null;
            byte[] bloomFilterHeaderAAD = null;
            byte[] bloomFilterBitsetAAD = null;
            if (null != fileEncryptor) {
                InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
                if (columnEncryptionSetup.isEncrypted()) {
                    bloomFilterEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
                    int columnOrdinal = columnEncryptionSetup.getOrdinal();
                    bloomFilterHeaderAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.BloomFilterHeader, block.getOrdinal(), columnOrdinal, -1);
                    bloomFilterBitsetAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.BloomFilterBitset, block.getOrdinal(), columnOrdinal, -1);
                }
            }
            Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out, bloomFilterEncryptor, bloomFilterHeaderAAD);
            ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream();
            bloomFilter.writeTo(tempOutStream);
            byte[] serializedBitset = tempOutStream.toByteArray();
            if (null != bloomFilterEncryptor) {
                serializedBitset = bloomFilterEncryptor.encrypt(serializedBitset, bloomFilterBitsetAAD);
            }
            out.write(serializedBitset);
        }
    }
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) BlockCipher(org.apache.parquet.format.BlockCipher) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter) InternalColumnEncryptionSetup(org.apache.parquet.crypto.InternalColumnEncryptionSetup)
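
For context, the bloomFilters argument carries one map per row group, aligned index-for-index with the blocks list and keyed by each column's dot-string path. The following is a minimal sketch of building that structure, reusing the imports listed above plus the java.util collections; the 1024-byte filter size and the column path "foo" are assumed values for illustration.

// One Map per row group, indexed in step with the blocks list.
List<Map<String, BloomFilter>> bloomFilters = new ArrayList<>();
Map<String, BloomFilter> rowGroupFilters = new HashMap<>();
// A BlockSplitBloomFilter with an assumed 1024-byte bitset.
BloomFilter filter = new BlockSplitBloomFilter(1024);
filter.insertHash(filter.hash(Binary.fromString("some-value")));
// The key must equal column.getPath().toDotString() for the lookup above to find it.
rowGroupFilters.put("foo", filter);
bloomFilters.add(rowGroupFilters);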

Example 2 with BloomFilter

Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in the Apache parquet-mr project.

From the class ColumnMasker, the method processChunk:

private void processChunk(ColumnDescriptor descriptor, ColumnChunkMetaData chunk, ColumnReadStoreImpl crStore, TransParquetFileReader reader, ParquetFileWriter writer, MessageType schema, Set<ColumnPath> paths, MaskMode maskMode) throws IOException {
    reader.setStreamPosition(chunk.getStartingPos());
    if (paths.contains(chunk.getPath())) {
        if (maskMode.equals(MaskMode.NULLIFY)) {
            Type.Repetition repetition = descriptor.getPrimitiveType().getRepetition();
            if (repetition.equals(Type.Repetition.REQUIRED)) {
                throw new IOException("Required column [" + descriptor.getPrimitiveType().getName() + "] cannot be nullified");
            }
            nullifyColumn(descriptor, chunk, crStore, writer, schema);
        } else {
            throw new UnsupportedOperationException("Only nullify is supported for now");
        }
    } else {
        BloomFilter bloomFilter = reader.readBloomFilter(chunk);
        ColumnIndex columnIndex = reader.readColumnIndex(chunk);
        OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
        writer.appendColumnChunk(descriptor, reader.getStream(), chunk, bloomFilter, columnIndex, offsetIndex);
    }
}
Also used : GroupType(org.apache.parquet.schema.GroupType) MessageType(org.apache.parquet.schema.MessageType) Type(org.apache.parquet.schema.Type) ColumnIndex(org.apache.parquet.internal.column.columnindex.ColumnIndex) IOException(java.io.IOException) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter) OffsetIndex(org.apache.parquet.internal.column.columnindex.OffsetIndex)
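
Copying the bloom filter along with the column index and offset index preserves predicate pushdown for the untouched columns. As a hedged sketch of what that buys a later reader, a chunk can be ruled out before it is scanned; the probe value "needle" is hypothetical:

BloomFilter bloomFilter = reader.readBloomFilter(chunk);
if (bloomFilter != null && !bloomFilter.findHash(bloomFilter.hash(Binary.fromString("needle")))) {
    // Definite miss: "needle" cannot occur in this column chunk, so the chunk can be skipped.
}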

Example 3 with BloomFilter

Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in the Apache parquet-mr project.

From the class TestParquetFileWriter, the method testBloomFilterWriteRead:

@Test
public void testBloomFilterWriteRead() throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
    File testFile = temp.newFile();
    testFile.delete();
    Path path = new Path(testFile.toURI());
    Configuration configuration = new Configuration();
    configuration.set("parquet.bloom.filter.column.names", "foo");
    String[] colPath = { "foo" };
    ColumnDescriptor col = schema.getColumnDescription(colPath);
    BinaryStatistics stats1 = new BinaryStatistics();
    ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
    w.start();
    w.startBlock(3);
    w.startColumn(col, 5, CODEC);
    w.writeDataPage(2, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.writeDataPage(3, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
    w.endColumn();
    BloomFilter blockSplitBloomFilter = new BlockSplitBloomFilter(0);
    blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("hello")));
    blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("world")));
    w.addBloomFilter("foo", blockSplitBloomFilter);
    w.endBlock();
    w.end(new HashMap<>());
    ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
    try (ParquetFileReader r = new ParquetFileReader(configuration, readFooter.getFileMetaData(), path, Arrays.asList(readFooter.getBlocks().get(0)), Arrays.asList(schema.getColumnDescription(colPath)))) {
        BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0));
        BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(readFooter.getBlocks().get(0).getColumns().get(0));
        assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
        assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("world"))));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) BlockSplitBloomFilter(org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter) Configuration(org.apache.hadoop.conf.Configuration) ColumnDescriptor(org.apache.parquet.column.ColumnDescriptor) BinaryStatistics(org.apache.parquet.column.statistics.BinaryStatistics) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter) BlockSplitBloomFilter(org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) Test(org.junit.Test)
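
Note that the test asserts only membership of inserted values. A bloom filter never returns a false negative, but it may return a false positive, so a negative assertion would be flaky. A sketch of the asymmetry, reusing the filters from the test above (the probe value "absent" is hypothetical):

// Inserted values are always found.
assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
// A value that was never inserted may still report true (false positive),
// so this result can only be read as "maybe present", never as proof of presence.
boolean maybePresent = bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("absent")));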

Example 4 with BloomFilter

Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in the Apache parquet-mr project.

From the class TestParquetWriter, the method testParquetFileWithBloomFilter:

@Test
public void testParquetFileWithBloomFilter() throws IOException {
    MessageType schema = Types.buildMessage().required(BINARY).as(stringType()).named("name").named("msg");
    String[] testNames = { "hello", "parquet", "bloom", "filter" };
    Configuration conf = new Configuration();
    GroupWriteSupport.setSchema(schema, conf);
    GroupFactory factory = new SimpleGroupFactory(schema);
    File file = temp.newFile();
    file.delete();
    Path path = new Path(file.getAbsolutePath());
    try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(path).withPageRowCountLimit(10).withConf(conf).withDictionaryEncoding(false).withBloomFilterEnabled("name", true).build()) {
        for (String testName : testNames) {
            writer.write(factory.newGroup().append("name", testName));
        }
    }
    try (ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(path, new Configuration()))) {
        BlockMetaData blockMetaData = reader.getFooter().getBlocks().get(0);
        BloomFilter bloomFilter = reader.getBloomFilterDataReader(blockMetaData).readBloomFilter(blockMetaData.getColumns().get(0));
        for (String name : testNames) {
            assertTrue(bloomFilter.findHash(LongHashFunction.xx(0).hashBytes(Binary.fromString(name).toByteBuffer())));
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Group(org.apache.parquet.example.data.Group) BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) Configuration(org.apache.hadoop.conf.Configuration) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) GroupFactory(org.apache.parquet.example.data.GroupFactory) SimpleGroupFactory(org.apache.parquet.example.data.simple.SimpleGroupFactory) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter) OutputFile(org.apache.parquet.io.OutputFile) HadoopInputFile(org.apache.parquet.hadoop.util.HadoopInputFile) HadoopOutputFile(org.apache.parquet.hadoop.util.HadoopOutputFile) File(java.io.File) MessageType(org.apache.parquet.schema.MessageType) MessageTypeParser.parseMessageType(org.apache.parquet.schema.MessageTypeParser.parseMessageType) Test(org.junit.Test)
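
Beyond simply enabling the filter, the same ParquetWriter builder can size it from an expected number of distinct values via withBloomFilterNDV. A minimal sketch reusing path, conf, and factory from the example above; the 10,000 NDV figure is an assumed value for illustration:

try (ParquetWriter<Group> ndvWriter = ExampleParquetWriter.builder(path)
        .withConf(conf)
        .withDictionaryEncoding(false)
        // withBloomFilterNDV sizes the bitset for the expected distinct-value count.
        .withBloomFilterNDV("name", 10_000L)
        .build()) {
    ndvWriter.write(factory.newGroup().append("name", "hello"));
}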

Example 5 with BloomFilter

Use of org.apache.parquet.column.values.bloomfilter.BloomFilter in the Apache parquet-mr project.

From the class ParquetFileWriter, the method serializeBloomFilters (the upstream parquet-mr counterpart of the Drill copy in Example 1):

private static void serializeBloomFilters(List<Map<String, BloomFilter>> bloomFilters, List<BlockMetaData> blocks, PositionOutputStream out, InternalFileEncryptor fileEncryptor) throws IOException {
    LOG.debug("{}: bloom filters", out.getPos());
    for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
        BlockMetaData block = blocks.get(bIndex);
        List<ColumnChunkMetaData> columns = block.getColumns();
        Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
        if (blockBloomFilters.isEmpty()) {
            continue;
        }
        for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
            ColumnChunkMetaData column = columns.get(cIndex);
            BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
            if (bloomFilter == null) {
                continue;
            }
            long offset = out.getPos();
            column.setBloomFilterOffset(offset);
            BlockCipher.Encryptor bloomFilterEncryptor = null;
            byte[] bloomFilterHeaderAAD = null;
            byte[] bloomFilterBitsetAAD = null;
            if (null != fileEncryptor) {
                InternalColumnEncryptionSetup columnEncryptionSetup = fileEncryptor.getColumnSetup(column.getPath(), false, cIndex);
                if (columnEncryptionSetup.isEncrypted()) {
                    bloomFilterEncryptor = columnEncryptionSetup.getMetaDataEncryptor();
                    int columnOrdinal = columnEncryptionSetup.getOrdinal();
                    bloomFilterHeaderAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.BloomFilterHeader, block.getOrdinal(), columnOrdinal, -1);
                    bloomFilterBitsetAAD = AesCipher.createModuleAAD(fileEncryptor.getFileAAD(), ModuleType.BloomFilterBitset, block.getOrdinal(), columnOrdinal, -1);
                }
            }
            Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out, bloomFilterEncryptor, bloomFilterHeaderAAD);
            ByteArrayOutputStream tempOutStream = new ByteArrayOutputStream();
            bloomFilter.writeTo(tempOutStream);
            byte[] serializedBitset = tempOutStream.toByteArray();
            if (null != bloomFilterEncryptor) {
                serializedBitset = bloomFilterEncryptor.encrypt(serializedBitset, bloomFilterBitsetAAD);
            }
            out.write(serializedBitset);
        }
    }
}
Also used : BlockMetaData(org.apache.parquet.hadoop.metadata.BlockMetaData) ColumnChunkMetaData(org.apache.parquet.hadoop.metadata.ColumnChunkMetaData) BlockCipher(org.apache.parquet.format.BlockCipher) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BloomFilter(org.apache.parquet.column.values.bloomfilter.BloomFilter) InternalColumnEncryptionSetup(org.apache.parquet.crypto.InternalColumnEncryptionSetup)
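
Worth noting in both copies of this method: the filter header and the bitset are encrypted with distinct module AADs (ModuleType.BloomFilterHeader vs. ModuleType.BloomFilterBitset), so the two parts of each bloom filter are authenticated independently when the column is encrypted.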

Aggregations

BloomFilter (org.apache.parquet.column.values.bloomfilter.BloomFilter): 8
ColumnChunkMetaData (org.apache.parquet.hadoop.metadata.ColumnChunkMetaData): 4
BlockMetaData (org.apache.parquet.hadoop.metadata.BlockMetaData): 3
MessageType (org.apache.parquet.schema.MessageType): 3
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 2
File (java.io.File): 2
IOException (java.io.IOException): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
Path (org.apache.hadoop.fs.Path): 2
InternalColumnEncryptionSetup (org.apache.parquet.crypto.InternalColumnEncryptionSetup): 2
Operators (org.apache.parquet.filter2.predicate.Operators): 2
BlockCipher (org.apache.parquet.format.BlockCipher): 2
HadoopInputFile (org.apache.parquet.hadoop.util.HadoopInputFile): 2
Test (org.junit.Test): 2
ColumnDescriptor (org.apache.parquet.column.ColumnDescriptor): 1
BinaryStatistics (org.apache.parquet.column.statistics.BinaryStatistics): 1
BlockSplitBloomFilter (org.apache.parquet.column.values.bloomfilter.BlockSplitBloomFilter): 1
Group (org.apache.parquet.example.data.Group): 1
GroupFactory (org.apache.parquet.example.data.GroupFactory): 1
SimpleGroupFactory (org.apache.parquet.example.data.simple.SimpleGroupFactory): 1