
Example 21 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.

From class TestHoodieOrcReaderWriter, method testWriteReadMetadata:

@Test
public void testWriteReadMetadata() throws Exception {
    Schema avroSchema = getSchemaFromResource(TestHoodieOrcReaderWriter.class, "/exampleSchema.avsc");
    HoodieOrcWriter writer = createOrcWriter(avroSchema);
    for (int i = 0; i < 3; i++) {
        GenericRecord record = new GenericData.Record(avroSchema);
        record.put("_row_key", "key" + i);
        record.put("time", Integer.toString(i));
        record.put("number", i);
        writer.writeAvro("key" + i, record);
    }
    writer.close();
    Configuration conf = new Configuration();
    Reader orcReader = OrcFile.createReader(filePath, OrcFile.readerOptions(conf));
    assertEquals(4, orcReader.getMetadataKeys().size());
    assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MIN_RECORD_KEY_FOOTER));
    assertTrue(orcReader.getMetadataKeys().contains(HOODIE_MAX_RECORD_KEY_FOOTER));
    assertTrue(orcReader.getMetadataKeys().contains(HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY));
    assertTrue(orcReader.getMetadataKeys().contains(AVRO_SCHEMA_METADATA_KEY));
    assertEquals(CompressionKind.ZLIB.name(), orcReader.getCompressionKind().toString());
    HoodieFileReader<GenericRecord> hoodieReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
    BloomFilter filter = hoodieReader.readBloomFilter();
    for (int i = 0; i < 3; i++) {
        assertTrue(filter.mightContain("key" + i));
    }
    assertFalse(filter.mightContain("non-existent-key"));
    assertEquals(3, hoodieReader.getTotalRecords());
    String[] minMaxRecordKeys = hoodieReader.readMinMaxRecordKeys();
    assertEquals(2, minMaxRecordKeys.length);
    assertEquals("key0", minMaxRecordKeys[0]);
    assertEquals("key2", minMaxRecordKeys[1]);
}
Also used: Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) Reader(org.apache.orc.Reader) GenericRecord(org.apache.avro.generic.GenericRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) Test(org.junit.jupiter.api.Test)
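
The filter read back above can also be built standalone. A minimal sketch, assuming only the BloomFilterFactory arguments already shown in Example 25 below (1000 entries, 0.0001 false-positive rate, SIMPLE type code); the class name BloomFilterSketch is illustrative only:

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;

public class BloomFilterSketch {
    public static void main(String[] args) {
        // Size for 1000 keys at a 0.01% false-positive rate; -1 disables the
        // dynamic max-entries bound (same arguments as in Example 25).
        BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
        for (int i = 0; i < 3; i++) {
            filter.add("key" + i);
        }
        // A bloom filter can return false positives but never false negatives.
        System.out.println(filter.mightContain("key0"));             // true
        System.out.println(filter.mightContain("non-existent-key")); // false with high probability
    }
}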

Example 22 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.

From class HoodieFileWriterFactory, method newParquetFileWriter:

private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable, TaskContextSupplier taskContextSupplier, boolean populateMetaFields, boolean enableBloomFilter) throws IOException {
    Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(hoodieTable.getHadoopConf()).convert(schema), schema, filter);
    HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(), config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(), hoodieTable.getHadoopConf(), config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());
    return new HoodieParquetWriter<>(instantTime, path, parquetConfig, schema, taskContextSupplier, populateMetaFields);
}
Also used: AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport) BloomFilter(org.apache.hudi.common.bloom.BloomFilter)
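
The Option gating in this factory method is what switches the footer bloom filter on or off. A hedged sketch of the same pattern in isolation, reusing the HoodieAvroWriteSupport constructor shown above; newWriteSupport is a hypothetical helper, and the filter parameters are borrowed from Example 25:

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.avro.HoodieAvroWriteSupport;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
import org.apache.hudi.common.util.Option;
import org.apache.parquet.avro.AvroSchemaConverter;

static HoodieAvroWriteSupport newWriteSupport(Schema schema, Configuration conf, boolean enableBloomFilter) {
    // Option.empty() means the write support adds no bloom-filter footer metadata at all.
    Option<BloomFilter> filter = enableBloomFilter
        ? Option.of(BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name()))
        : Option.empty();
    return new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter);
}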

Example 23 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.

From class HoodieHFileWriter, method close:

@Override
public void close() throws IOException {
    if (hfileConfig.useBloomFilter()) {
        final BloomFilter bloomFilter = hfileConfig.getBloomFilter();
        if (minRecordKey == null) {
            minRecordKey = "";
        }
        if (maxRecordKey == null) {
            maxRecordKey = "";
        }
        writer.appendFileInfo(HoodieHFileReader.KEY_MIN_RECORD.getBytes(), minRecordKey.getBytes());
        writer.appendFileInfo(HoodieHFileReader.KEY_MAX_RECORD.getBytes(), maxRecordKey.getBytes());
        writer.appendFileInfo(HoodieHFileReader.KEY_BLOOM_FILTER_TYPE_CODE.getBytes(), bloomFilter.getBloomFilterTypeCode().toString().getBytes());
        writer.appendMetaBlock(HoodieHFileReader.KEY_BLOOM_FILTER_META_BLOCK, new Writable() {

            @Override
            public void write(DataOutput out) throws IOException {
                out.write(bloomFilter.serializeToString().getBytes());
            }

            @Override
            public void readFields(DataInput in) throws IOException {
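                // No-op: this Writable is only ever used to write the meta block above.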
            }
        });
    }
    writer.close();
    writer = null;
}
Also used: DataInput(java.io.DataInput) DataOutput(java.io.DataOutput) Writable(org.apache.hadoop.io.Writable) IOException(java.io.IOException) BloomFilter(org.apache.hudi.common.bloom.BloomFilter)
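
The meta block appended in close() holds the string form produced by serializeToString(). A hedged round-trip sketch of that encoding, assuming BloomFilterFactory.fromString takes the serialized string plus the type code stored in the file info; bloomFilterRoundTrip is a hypothetical helper:

import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.common.bloom.BloomFilterFactory;
import org.apache.hudi.common.bloom.BloomFilterTypeCode;

static void bloomFilterRoundTrip() {
    BloomFilter original = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
    original.add("key0");
    // serializeToString() is the same call HoodieHFileWriter.close() uses for the meta block.
    String serialized = original.serializeToString();
    // Assumed signature: fromString(serializedFilter, bloomFilterTypeCode).
    BloomFilter restored = BloomFilterFactory.fromString(serialized, original.getBloomFilterTypeCode().name());
    System.out.println(restored.mightContain("key0")); // true
}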

Example 24 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.

From class HoodieKeyLookupHandle, method getBloomFilter:

private BloomFilter getBloomFilter() {
    BloomFilter bloomFilter = null;
    HoodieTimer timer = new HoodieTimer().startTimer();
    try {
        if (config.isMetadataBloomFilterIndexEnabled()) {
            bloomFilter = hoodieTable.getMetadataTable().getBloomFilter(partitionPathFileIDPair.getLeft(), partitionPathFileIDPair.getRight()).orElseThrow(() -> new HoodieIndexException("BloomFilter missing for " + partitionPathFileIDPair.getRight()));
        } else {
            try (HoodieFileReader reader = createNewFileReader()) {
                bloomFilter = reader.readBloomFilter();
            }
        }
    } catch (IOException e) {
        throw new HoodieIndexException(String.format("Error reading bloom filter from %s", getPartitionPathFileIDPair()), e);
    }
    LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFileIDPair, timer.endTimer()));
    return bloomFilter;
}
Also used: HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException) IOException(java.io.IOException) BloomFilter(org.apache.hudi.common.bloom.BloomFilter)
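
A filter obtained this way is typically used to prune candidate record keys before the exact file lookup; false positives survive the pruning and are eliminated by the real key scan. A sketch of that step; pruneCandidates is a hypothetical helper, not part of HoodieKeyLookupHandle:

import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.bloom.BloomFilter;

static List<String> pruneCandidates(BloomFilter filter, List<String> candidateKeys) {
    List<String> maybePresent = new ArrayList<>();
    for (String key : candidateKeys) {
        // Keys rejected here are definitely absent from the file;
        // keys accepted here still need the exact lookup to confirm.
        if (filter.mightContain(key)) {
            maybePresent.add(key);
        }
    }
    return maybePresent;
}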

Example 25 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.

From class HiveTestUtil, method generateParquetData:

@SuppressWarnings({ "unchecked", "deprecation" })
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple) throws IOException, URISyntaxException {
    Schema schema = getTestDataSchema(isParquetSchemaSimple);
    org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
    BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1, BloomFilterTypeCode.SIMPLE.name());
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, Option.of(filter));
    ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
    List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100) : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
    testRecords.forEach(s -> {
        try {
            writer.write(s);
        } catch (IOException e) {
            fail("IOException while writing test records as parquet" + e.toString());
        }
    });
    writer.close();
}
Also used : AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) IndexedRecord(org.apache.avro.generic.IndexedRecord) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) Schema(org.apache.avro.Schema) HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport) IOException(java.io.IOException) BloomFilter(org.apache.hudi.common.bloom.BloomFilter)
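
The filter written into this parquet footer can be read back through the same HoodieFileReaderFactory path used in Example 21. A minimal verification sketch; verifyBloomFilter is a hypothetical helper, and which keys actually hit depends on the records SchemaTestUtil generates:

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.bloom.BloomFilter;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;

static void verifyBloomFilter(Path filePath) throws Exception {
    // Same reader path as Example 21; readBloomFilter() pulls the filter from the footer metadata.
    try (HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.getFileReader(new Configuration(), filePath)) {
        BloomFilter filter = reader.readBloomFilter();
        // Substitute a key from the generated test records.
        System.out.println(filter.mightContain("some-record-key"));
    }
}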

Aggregations

BloomFilter (org.apache.hudi.common.bloom.BloomFilter): 30 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 13 usages
Path (org.apache.hadoop.fs.Path): 11 usages
ArrayList (java.util.ArrayList): 9 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 9 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 9 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 8 usages
IOException (java.io.IOException): 7 usages
Schema (org.apache.avro.Schema): 7 usages
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 7 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 7 usages
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 7 usages
AvroSchemaConverter (org.apache.parquet.avro.AvroSchemaConverter): 6 usages
IndexedRecord (org.apache.avro.generic.IndexedRecord): 5 usages
HoodieAvroWriteSupport (org.apache.hudi.avro.HoodieAvroWriteSupport): 5 usages
ParquetWriter (org.apache.parquet.hadoop.ParquetWriter): 5 usages
MethodSource (org.junit.jupiter.params.provider.MethodSource): 5 usages
HashMap (java.util.HashMap): 4 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 4 usages
Test (org.junit.jupiter.api.Test): 4 usages