Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
In class TestHoodieOrcReaderWriter, method testWriteReadMetadata:
@Test
public void testWriteReadMetadata() throws Exception {
  Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
  HoodieOrcWriter writer = createOrcWriter(avroSchema);
  for (int i = 0; i < 3; i++) {
    GenericRecord record = new GenericData.Record(avroSchema);
    record.put("_row_key", "key" + i);
    record.put("time", Integer.toString(i));
    record.put("number", i);
    writer.writeAvro("key" + i, record);
  }
  writer.close();

  Configuration conf = new Configuration();
  Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
  assertEquals(4, orcReader.getMetadataKeys().size());
  assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MIN_RECORD_KEY_FOOTER));
  assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MAX_RECORD_KEY_FOOTER));
  assertTrue(orcReader.getMetadataKeys().contains(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY));
  assertTrue(orcReader.getMetadataKeys().contains(AVRO_SCHEMA_METADATA_KEY));
  assertEquals(CompressionKind.ZLIB.name(), orcReader.getCompressionKind().toString());

  HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
  BloomFilter filter = hoodieReader.readBloomFilter();
  for (int i = 0; i < 3; i++) {
    assertTrue(filter.mightContain("key" + i));
  }
  assertFalse(filter.mightContain("non-existent-key"));
  assertEquals(3, hoodieReader.getTotalRecords());
  String[] minMaxRecordKeys = hoodieReader.readMinMaxRecordKeys();
  assertEquals(2, minMaxRecordKeys.length);
  assertEquals("key0", minMaxRecordKeys[0]);
  assertEquals("key2", minMaxRecordKeys[1]);
}
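For context, a minimal self-contained sketch of the bloom-filter lifecycle that the test above exercises through the ORC writer: build a filter with BloomFilterFactory (the parameters are copied from the HiveTestUtil snippet further down), add the record keys, and probe membership. The add(String) call is assumed from the BloomFilter interface and does not appear in the snippet above.

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;

public class BloomFilterRoundTripSketch {
  public static void main(String[] args) {
    // Build a SIMPLE bloom filter sized for 1000 entries with a 0.01% false-positive rate.
    BloomFilter filter = BloomFilterFactory.createBloomFilter(
        1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());

    // Register the record keys that a writer would track per data file.
    for (int i = 0; i < 3; i++) {
      filter.add("key" + i);
    }

    // Membership checks: present keys always return true; absent keys return false
    // except for the configured false-positive rate.
    System.out.println(filter.mightContain("key0"));              // true
    System.out.println(filter.mightContain("non-existent-key"));  // false (almost always)

    // The serialized form is what writers persist into the file footer/metadata.
    String serialized = filter.serializeToString();
    System.out.println("Serialized bloom filter length: " + serialized.length());
  }
}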
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
In class HoodieFileWriterFactory, method newParquetFileWriter:
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
    String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
    TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException {
  Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);
  HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(
      writeSupport, config.getParquetCompressionCodec(), config.getParquetBlockSize(), config.getParquetPageSize(),
      config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(), config.getParquetCompressionRatio(),
      config.parquetDictionaryEnabled());
  return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields);
}
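Here createBloomFilter(config) resolves its sizing parameters from HoodieWriteConfig. A hedged sketch of the same Option wiring, with the config lookup replaced by hard-coded values (illustrative only, taken from the HiveTestUtil snippet below, not Hudi's defaults):

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.util.Option;

public class OptionalBloomFilterSketch {

  // Mirrors the enableBloomFilter branch in newParquetFileWriter: a populated Option
  // when the filter is enabled, Option.empty() otherwise. The filter parameters are
  // hard-coded here purely for illustration.
  static Option<BloomFilter> maybeBloomFilter(boolean enableBloomFilter) {
    return enableBloomFilter
        ? Option.of(BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name()))
        : Option.empty();
  }

  public static void main(String[] args) {
    System.out.println(maybeBloomFilter(true).isPresent());   // true
    System.out.println(maybeBloomFilter(false).isPresent());  // false
  }
}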
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
In class HoodieHFileWriter, method close:
@Override
public void close() throws IOException {
  if (hfileConfig.useBloomFilter()) {
    final BloomFilter bloomFilter = hfileConfig.getBloomFilter();
    if (minRecordKey == null) {
      minRecordKey = "";
    }
    if (maxRecordKey == null) {
      maxRecordKey = "";
    }
    writer.appendFileInfo(HoodieHFileReader.KEY_MIN_RECORD.getBytes(), minRecordKey.getBytes());
    writer.appendFileInfo(HoodieHFileReader.KEY_MAX_RECORD.getBytes(), maxRecordKey.getBytes());
    writer.appendFileInfo(HoodieHFileReader.KEY_BLOOM_FILTER_TYPE_CODE.getBytes(),
        bloomFilter.getBloomFilterTypeCode().toString().getBytes());
    writer.appendMetaBlock(HoodieHFileReader.KEY_BLOOM_FILTER_META_BLOCK, new Writable() {
      @Override
      public void write(DataOutput out) throws IOException {
        out.write(bloomFilter.serializeToString().getBytes());
      }

      @Override
      public void readFields(DataInput in) throws IOException {
      }
    });
  }
  writer.close();
  writer = null;
}
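The meta block written by close() holds the serialized filter string plus its type code from the file info. A round-trip sketch of that serialization, assuming a BloomFilterFactory.fromString(serializedString, typeCode) helper is available for the reader side (it is not shown in the snippet above):

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;

public class BloomFilterSerdeSketch {
  public static void main(String[] args) {
    // Writer side: this serialized string is what close() above pushes into the
    // KEY_BLOOM_FILTER_META_BLOCK, alongside the type code in the file info.
    BloomFilter original = BloomFilterFactory.createBloomFilter(
        1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
    original.add("key0");
    String serialized = original.serializeToString();
    String typeCode = original.getBloomFilterTypeCode().toString();

    // Reader side (assumed API): rebuild the filter from the stored string and type code.
    BloomFilter restored = BloomFilterFactory.fromString(serialized, typeCode);
    System.out.println(restored.mightContain("key0"));  // true
  }
}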
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
In class HoodieKeyLookupHandle, method getBloomFilter:
private BloomFilter getBloomFilter() {
  BloomFilter bloomFilter = null;
  HoodieTimer timer = new HoodieTimer().startTimer();
  try {
    if (config.isMetadataBloomFilterIndexEnabled()) {
      bloomFilter = hoodieTable.getMetadataTable()
          .getBloomFilter(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight())
          .orElseThrow(() -> new HoodieIndexException("BloomFilter missing for " + partitionPathFileIDPair.getRight()));
    } else {
      try (HoodieFileReader reader = createNewFileReader()) {
        bloomFilter = reader.readBloomFilter();
      }
    }
  } catch (IOException e) {
    throw new HoodieIndexException(String.format("Error reading bloom filter from %s", getPartitionPathFileIDPair()), e);
  }
  LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFileIDPair, timer.endTimer()));
  return bloomFilter;
}
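The bloom index uses the filter returned here to prune candidate keys before opening the underlying data file. A small illustrative helper (not Hudi code) showing that pruning step with mightContain:

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.bloom.BloomFilter;

public class BloomFilterPruningSketch {

  // Keep only the keys that might be present in the file backing the given filter;
  // definite non-matches are dropped without any file I/O. False positives are
  // possible, so surviving keys still need to be verified against the file itself.
  static List<String> candidateKeys(BloomFilter fileBloomFilter, List<String> keysToCheck) {
    return keysToCheck.stream()
        .filter(fileBloomFilter::mightContain)
        .collect(Collectors.toList());
  }
}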
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
In class HiveTestUtil, method generateParquetData:
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple) throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, Option.of(filter));
  ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
      ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
  List<IndexedRecord> testRecords = isParquetSchemaSimple
      ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100);
  testRecords.forEach(s -> {
    try {
      writer.write(s);
    } catch (IOException e) {
      fail("IOException while writing test records as parquet: " + e.toString());
    }
  });
  writer.close();
}
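To read back the filter that HoodieAvroWriteSupport embeds at write time, the same HoodieFileReaderFactory path used in the ORC test at the top of this page applies to parquet as well. A sketch assuming the reader classes live under org.apache.hudi.io.storage and using a placeholder file path:

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;

public class ReadBackBloomFilterSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder path: point this at a file produced by generateParquetData above.
    Path filePath = new Path("/tmp/hoodie-test/part-00000.parquet");

    HoodieFileReader<GenericRecord> reader =
        HoodieFileReaderFactory.getFileReader(new Configuration(), filePath);
    // readBloomFilter() returns the filter embedded in the parquet footer at write time.
    BloomFilter filter = reader.readBloomFilter();
    System.out.println(filter.mightContain("some-record-key"));
    System.out.println("total records: " + reader.getTotalRecords());
    reader.close();
  }
}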