Examples with BloomFilter - org.apache.hudi.common.bloom.BloomFilter

Example 26 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in the Apache Hudi project.

From the class TestHoodieBloomIndex, method testCheckUUIDsAgainstOneFile.

/**
 * Checks that {@code HoodieIndexUtils.filterKeysFromFile} returns only the candidate keys that are
 * really stored in a base file, even when the bloom filter also answers "might contain" for a key
 * that was never written.
 *
 * <p>Setup: record1 and record2 are written to the base file, record3 is added to the bloom filter
 * only, and record4 is added to neither.
 *
 * @throws Exception if creating the test table or reading the base file fails
 */
@Test
public void testCheckUUIDsAgainstOneFile() throws Exception {
    final String partition = "2016/01/31";
    // Create some records to use
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    // We write record1, record2 to a parquet file, but the bloom filter contains (record1,
    // record2, record3): record3 is added by hand here, before the filter is handed to the test table.
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record3.getRecordKey());
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, SCHEMA, filter, metadataWriter);
    final Map<String, List<Pair<String, Integer>>> partitionToFilesNameLengthMap = new HashMap<>();
    final String commitTime = "0000001";
    final String fileId = UUID.randomUUID().toString();
    Path baseFilePath = testTable.forCommit(commitTime).withInserts(partition, fileId, Arrays.asList(record1, record2));
    long baseFileLength = fs.getFileStatus(baseFilePath).getLen();
    partitionToFilesNameLengthMap.computeIfAbsent(partition, k -> new ArrayList<>()).add(Pair.of(fileId, Integer.valueOf((int) baseFileLength)));
    testTable.doWriteOperation(commitTime, WriteOperationType.UPSERT, Collections.singletonList(partition), partitionToFilesNameLengthMap, false, false);
    final String filename = testTable.getBaseFileNameById(fileId);
    // The bloom filter contains 3 records
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    assertTrue(filter.mightContain(record3.getRecordKey()));
    assertFalse(filter.mightContain(record4.getRecordKey()));
    // Compare with file
    List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
    // NOTE(review): `table` is never read below; presumably kept only for table initialization —
    // confirm before removing.
    HoodieSparkTable table = HoodieSparkTable.create(config, context, metaClient);
    List<String> results = HoodieIndexUtils.filterKeysFromFile(new Path(Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
    // JUnit 5 takes (expected, actual); only the two keys written to the file should come back.
    assertEquals(2, results.size());
    assertTrue(results.contains("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
    assertTrue(results.contains("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
    // TODO(vc): Need more coverage on actual filenames
}

Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) 
HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) TestHoodieMetadataBase(org.apache.hudi.client.functional.TestHoodieMetadataBase) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) List(java.util.List) ArrayList(java.util.ArrayList) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 27 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in the Apache Hudi project.

From the class TestFlinkHoodieBloomIndex, method testBloomFilterFalseError.

/**
 * Checks that tagging with {@code HoodieBloomIndex} does not assign a file location to a record
 * whose key is only a bloom-filter false positive.
 *
 * <p>record1 is written to a base file; record2 is added to the bloom filter only. After tagging,
 * record1 must point at the written file id and record2 must have no known location.
 *
 * @param rangePruning forwarded to {@code makeConfig}
 * @param treeFiltering forwarded to {@code makeConfig}
 * @param bucketizedChecking forwarded to {@code makeConfig}
 * @throws Exception if building the test table or tagging fails
 */
@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testBloomFilterFalseError(boolean rangePruning, boolean treeFiltering, boolean bucketizedChecking) throws Exception {
    // We have two hoodie records
    String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    // We write record1 to a base file, using a bloom filter having both records
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
    BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
    filter.add(record2.getRecordKey());
    HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter);
    String fileId = testTable.addCommit("000").getFileIdWithInserts("2016/01/31", record1);
    assertTrue(filter.mightContain(record1.getRecordKey()));
    assertTrue(filter.mightContain(record2.getRecordKey()));
    // We do the tag
    List<HoodieRecord> records = asList(record1, record2);
    HoodieWriteConfig config = makeConfig(rangePruning, treeFiltering, bucketizedChecking);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieFlinkTable.create(config, context, metaClient);
    HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, ListBasedHoodieBloomIndexHelper.getInstance());
    List<HoodieRecord> taggedRecords = tagLocation(bloomIndex, records, table);
    // Check results
    for (HoodieRecord record : taggedRecords) {
        // Compare the record-key string: getKey() yields a HoodieKey, which never equals a String,
        // so comparing it here would silently skip the file-id assertion.
        if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
            // JUnit 5 takes (expected, actual)
            assertEquals(fileId, record.getCurrentLocation().getFileId());
        } else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
            assertFalse(record.isCurrentLocationKnown());
        }
    }
}

Also used : RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFlinkWriteableTestTable(org.apache.hudi.testutils.HoodieFlinkWriteableTestTable) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 28 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in the Apache Hudi project.

From the class TestJavaCopyOnWriteActionExecutor, method testUpdateRecords.

/**
 * Inserts three records into one partition, then upserts an update to the first record together
 * with one new record, and verifies after each commit that the base file's bloom filter and record
 * contents match what was written.
 *
 * @throws Exception if writing or reading the base files fails
 */
@Test
public void testUpdateRecords() throws Exception {
    // Prepare the AvroParquetIO
    HoodieWriteConfig config = makeHoodieClientConfig();
    int startInstant = 1;
    String firstCommitTime = makeNewCommitTime(startInstant++);
    HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
    writeClient.startCommitWithTime(firstCommitTime);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
    String partitionPath = "2016/01/31";
    // Get some records belong to the same partition (2016/01/31)
    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
    List<HoodieRecord> records = new ArrayList<>();
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
    // Insert new records
    writeClient.insert(records, firstCommitTime);
    FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
    assertEquals(1, allFiles.length);
    // Read out the bloom filter and make sure filter can answer record exist or not
    Path filePath = allFiles[0].getPath();
    BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
    for (HoodieRecord record : records) {
        assertTrue(filter.mightContain(record.getRecordKey()));
    }
    // Read the base file, check the record content
    List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
    int index = 0;
    for (GenericRecord record : fileRecords) {
        assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
        index++;
    }
    // We update the 1st record & add a new record
    String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1);
    HoodieRecord updatedRecord1 = new HoodieAvroRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord insertedRecord1 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
    String newCommitTime = makeNewCommitTime(startInstant++);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    writeClient.startCommitWithTime(newCommitTime);
    List<WriteStatus> statuses = writeClient.upsert(updatedRecords, newCommitTime);
    allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
    assertEquals(1, allFiles.length);
    // verify new incremental file group is same as the previous one
    assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
    // Check whether the record has been updated
    Path updatedFilePath = allFiles[0].getPath();
    BloomFilter updatedFilter = fileUtils.readBloomFilterFromMetadata(hadoopConf, updatedFilePath);
    for (HoodieRecord record : records) {
        // No change to the _row_key
        assertTrue(updatedFilter.mightContain(record.getRecordKey()));
    }
    assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
    // add this so it can further check below
    records.add(insertedRecord1);
    // try-with-resources so the reader is closed even when an assertion inside the loop fails
    try (ParquetReader<GenericRecord> updatedReader = ParquetReader.builder(new AvroReadSupport<GenericRecord>(), updatedFilePath).build()) {
        index = 0;
        GenericRecord newRecord;
        while ((newRecord = updatedReader.read()) != null) {
            // JUnit 5 takes (expected, actual)
            assertEquals(records.get(index).getRecordKey(), newRecord.get("_row_key").toString());
            if (index == 0) {
                // The first record is the one that was updated to number = 15
                assertEquals("15", newRecord.get("number").toString());
            }
            index++;
        }
    }
    // Also check the numRecordsWritten; assert the size first so a wrong count fails with the
    // descriptive message rather than an index error from get(0).
    assertEquals(1, statuses.size(), "Should be only one file generated");
    WriteStatus writeStatus = statuses.get(0);
    // 3 rewritten records + 1 new record
    assertEquals(4, writeStatus.getStat().getNumWrites());
}

Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParquetReader(org.apache.parquet.hadoop.ParquetReader) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) HoodieKey(org.apache.hudi.common.model.HoodieKey) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) GenericRecord(org.apache.avro.generic.GenericRecord) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test)

Example 29 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in the Apache Hudi project.

From the class TestHoodieAvroWriteSupport, method testAddKey.

/**
 * Writes 1000 random record keys to a Parquet file through {@code HoodieAvroWriteSupport},
 * adding each key to the write support as well. The test has no explicit assertions; it passes
 * when writing and closing the file complete without throwing.
 *
 * @param tempDir temporary directory in which {@code test.parquet} is created
 * @throws IOException if writing or closing the Parquet file fails
 */
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
    List<String> rowKeys = new ArrayList<>();
    for (int i = 0; i < 1000; i++) {
        rowKeys.add(UUID.randomUUID().toString());
    }
    String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
    Schema schema = HoodieAvroUtils.getRecordKeySchema();
    BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, 10000, BloomFilterTypeCode.SIMPLE.name());
    HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
    // try-with-resources so the writer is closed even if a write fails part-way through
    try (ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE)) {
        for (String rowKey : rowKeys) {
            GenericRecord rec = new GenericData.Record(schema);
            rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
            writer.write(rec);
            writeSupport.add(rowKey);
        }
    }
}

Also used : Path(org.apache.hadoop.fs.Path) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) GenericRecord(org.apache.avro.generic.GenericRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) Test(org.junit.jupiter.api.Test)

Example 30 with BloomFilter

Use of org.apache.hudi.common.bloom.BloomFilter in the Apache Hudi project.

From the class BaseTableMetadata, method getBloomFilters.

/**
 * Looks up the bloom filters for the given (partition name, file name) pairs in the
 * bloom-filters metadata partition.
 *
 * <p>Each pair is turned into a bloom filter index key; the keys are looked up in sorted order
 * and every usable result is deserialized into a {@link BloomFilter}.
 *
 * @param partitionNameFileNameList (partition name, file name) pairs to look up
 * @return map from each requested pair to its bloom filter. Pairs are left out when no record was
 *         found, when the record carries no bloom filter metadata (logged as an error), or when the
 *         bloom filter is marked deleted. An empty map is returned when the bloom filter index is
 *         disabled or the input list is empty.
 * @throws HoodieMetadataException declared by the signature; presumably raised by the underlying
 *         key lookup — confirm against {@code getRecordsByKeys}
 */
@Override
public Map<Pair<String, String>, BloomFilter> getBloomFilters(final List<Pair<String, String>> partitionNameFileNameList) throws HoodieMetadataException {
    if (!isBloomFilterIndexEnabled) {
        LOG.error("Metadata bloom filter index is disabled!");
        return Collections.emptyMap();
    }
    if (partitionNameFileNameList.isEmpty()) {
        return Collections.emptyMap();
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    // TreeSet keeps the index keys sorted and de-duplicated; the map remembers which
    // (partition, file) pair each key came from so results can be mapped back.
    Set<String> partitionIDFileIDSortedStrings = new TreeSet<>();
    Map<String, Pair<String, String>> fileToKeyMap = new HashMap<>();
    for (final Pair<String, String> partitionNameFileNamePair : partitionNameFileNameList) {
        final String bloomFilterIndexKey = HoodieMetadataPayload.getBloomFilterIndexKey(new PartitionIndexID(partitionNameFileNamePair.getLeft()), new FileIndexID(partitionNameFileNamePair.getRight()));
        partitionIDFileIDSortedStrings.add(bloomFilterIndexKey);
        fileToKeyMap.put(bloomFilterIndexKey, partitionNameFileNamePair);
    }
    List<String> partitionIDFileIDStrings = new ArrayList<>(partitionIDFileIDSortedStrings);
    List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> hoodieRecordList = getRecordsByKeys(partitionIDFileIDStrings, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath());
    // Report elapsed time divided by the number of keys; the key list is non-empty here because
    // the input list was checked above.
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_BLOOM_FILTERS_METADATA_STR, (timer.endTimer() / partitionIDFileIDStrings.size())));
    Map<Pair<String, String>, BloomFilter> partitionFileToBloomFilterMap = new HashMap<>();
    for (final Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : hoodieRecordList) {
        if (!entry.getRight().isPresent()) {
            continue;
        }
        final Option<HoodieMetadataBloomFilter> bloomFilterMetadata = entry.getRight().get().getData().getBloomFilterMetadata();
        if (!bloomFilterMetadata.isPresent()) {
            LOG.error("Meta index bloom filter missing for: " + fileToKeyMap.get(entry.getLeft()));
            continue;
        }
        if (bloomFilterMetadata.get().getIsDeleted()) {
            continue;
        }
        ValidationUtils.checkState(fileToKeyMap.containsKey(entry.getLeft()));
        // Decode from a duplicate: Charset.decode advances the buffer's position, and the buffer is
        // owned by the metadata record, so consuming it directly would leave it empty for any
        // later reader of the same record.
        final ByteBuffer bloomFilterByteBuffer = bloomFilterMetadata.get().getBloomFilter().duplicate();
        final String bloomFilterType = bloomFilterMetadata.get().getType();
        final BloomFilter bloomFilter = BloomFilterFactory.fromString(StandardCharsets.UTF_8.decode(bloomFilterByteBuffer).toString(), bloomFilterType);
        partitionFileToBloomFilterMap.put(fileToKeyMap.get(entry.getLeft()), bloomFilter);
    }
    return partitionFileToBloomFilterMap;
}

Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ByteBuffer(java.nio.ByteBuffer) HoodieMetadataBloomFilter(org.apache.hudi.avro.model.HoodieMetadataBloomFilter) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) PartitionIndexID(org.apache.hudi.common.util.hash.PartitionIndexID) HoodieMetadataBloomFilter(org.apache.hudi.avro.model.HoodieMetadataBloomFilter) TreeSet(java.util.TreeSet) Option(org.apache.hudi.common.util.Option) FileIndexID(org.apache.hudi.common.util.hash.FileIndexID) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

BloomFilter (org.apache.hudi.common.bloom.BloomFilter)30 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)13 Path (org.apache.hadoop.fs.Path)11 ArrayList (java.util.ArrayList)9 GenericRecord (org.apache.avro.generic.GenericRecord)9 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)9 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)8 IOException (java.io.IOException)7 Schema (org.apache.avro.Schema)7 HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)7 HoodieKey (org.apache.hudi.common.model.HoodieKey)7 RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload)7 AvroSchemaConverter (org.apache.parquet.avro.AvroSchemaConverter)6 IndexedRecord (org.apache.avro.generic.IndexedRecord)5 HoodieAvroWriteSupport (org.apache.hudi.avro.HoodieAvroWriteSupport)5 ParquetWriter (org.apache.parquet.hadoop.ParquetWriter)5 MethodSource (org.junit.jupiter.params.provider.MethodSource)5 HashMap (java.util.HashMap)4 FileStatus (org.apache.hadoop.fs.FileStatus)4 Test (org.junit.jupiter.api.Test)4