Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
From the class TestHoodieConcatHandle, method testInsert.
@Test
public void testInsert() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder().withMergeAllowDuplicateOnInserts(true).build();
  HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
  // Get some records belonging to the same partition (2021/09/11)
  String insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":1}";
  String insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":2}";
  List<HoodieRecord> records1 = new ArrayList<>();
  RawTripTestPayload insertRow1 = new RawTripTestPayload(insertRecordStr1);
  RawTripTestPayload insertRow2 = new RawTripTestPayload(insertRecordStr2);
  records1.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
  records1.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
  int startInstant = 1;
  String firstCommitTime = makeNewCommitTime(startInstant++);
  // First insert
  writeClient.startCommitWithTime(firstCommitTime);
  writeClient.insert(records1, firstCommitTime);
  String partitionPath = "2021/09/11";
  FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
  assertEquals(1, allFiles.length);
  // Read out the bloom filter and make sure it can answer whether a record exists
  Path filePath = allFiles[0].getPath();
  BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
  for (HoodieRecord record : records1) {
    assertTrue(filter.mightContain(record.getRecordKey()));
  }
  insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":3}";
  insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":4}";
  List<HoodieRecord> records2 = new ArrayList<>();
  insertRow1 = new RawTripTestPayload(insertRecordStr1);
  insertRow2 = new RawTripTestPayload(insertRecordStr2);
  // The record keys of records2 match those of records1, but the other field values differ
  records2.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
  records2.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
  String newCommitTime = makeNewCommitTime(startInstant++);
  writeClient.startCommitWithTime(newCommitTime);
  // The second insert reuses the _row_key values of the first one, to test allowDuplicateInserts
  writeClient.insert(records2, newCommitTime);
  allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
  assertEquals(1, allFiles.length);
  // Verify the new incremental file group is the same as the previous one
  assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
  filePath = allFiles[0].getPath();
  // The final result should be the union of records1 and records2
  records1.addAll(records2);
  // Read the base file and check the record content
  List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
  int index = 0;
  for (GenericRecord record : fileRecords) {
    assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString());
    assertEquals(index + 1, record.get("number"));
    index++;
  }
}
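For reference, the filter returned by readBloomFilterFromMetadata behaves like one built directly through BloomFilterFactory. A minimal standalone sketch of that API, with sizing parameters chosen purely for illustration rather than taken from the test:

// Sketch only: sizing parameters below are illustrative assumptions
BloomFilter filter = BloomFilterFactory.createBloomFilter(
    1000,                                // expected number of entries (illustrative)
    0.0000001,                           // target false-positive rate (illustrative)
    -1,                                  // no dynamic resizing bound
    BloomFilterTypeCode.SIMPLE.name());  // simple (non-dynamic) filter implementation
filter.add("1");
filter.add("2");
// mightContain never misses an added key; it may occasionally report true for an absent one
assertTrue(filter.mightContain("1"));
assertTrue(filter.mightContain("2"));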
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
From the class TestHoodieConcatHandle, method testInsertWithDataGenerator.
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testInsertWithDataGenerator(boolean mergeAllowDuplicateOnInsertsEnable) throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfigBuilder(TRIP_EXAMPLE_SCHEMA).withMergeAllowDuplicateOnInserts(mergeAllowDuplicateOnInsertsEnable).build();
  HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
  metaClient = HoodieTableMetaClient.reload(metaClient);
  BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
  String partitionPath = "2021/09/11";
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { partitionPath });
  int startInstant = 1;
  String firstCommitTime = makeNewCommitTime(startInstant++);
  List<HoodieRecord> records1 = dataGenerator.generateInserts(firstCommitTime, 100);
  // First insert
  writeClient.startCommitWithTime(firstCommitTime);
  writeClient.insert(records1, firstCommitTime);
  FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
  assertEquals(1, allFiles.length);
  // Read out the bloom filter and make sure it can answer whether a record exists
  Path filePath = allFiles[0].getPath();
  BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
  for (HoodieRecord record : records1) {
    assertTrue(filter.mightContain(record.getRecordKey()));
  }
  String newCommitTime = makeNewCommitTime(startInstant++);
  List<HoodieRecord> records2 = dataGenerator.generateUpdates(newCommitTime, 100);
  writeClient.startCommitWithTime(newCommitTime);
  // The second insert reuses the record keys of the first one, to test allowDuplicateInserts
  writeClient.insert(records2, newCommitTime);
  allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
  assertEquals(1, allFiles.length);
  // Verify the new incremental file group is the same as the previous one
  assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
  filePath = allFiles[0].getPath();
  // If mergeAllowDuplicateOnInsertsEnable is true, the final result should be the union of records1 and records2
  records1.addAll(records2);
  // Read the base file and check the record content
  List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
  assertEquals(fileRecords.size(), mergeAllowDuplicateOnInsertsEnable ? records1.size() : records2.size());
  int index = 0;
  for (GenericRecord record : fileRecords) {
    assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString());
    index++;
  }
}
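The only thing this parameterized test varies is the merge-allow-duplicate-on-inserts flag. As a rough sketch of how that flag plugs into a write config, combining builder calls that appear elsewhere on this page (basePath is a placeholder, and the test's own makeHoodieClientConfigBuilder helper may set additional options):

// Sketch only: assumes the builder methods shown in the snippets on this page
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath(basePath)                        // placeholder table path
    .withMergeAllowDuplicateOnInserts(true)    // true: keep both copies of a duplicate key on insert
    .build();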
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
From the class TestFlinkHoodieBloomIndex, method testCheckUUIDsAgainstOneFile.
@Test
public void testCheckUUIDsAgainstOneFile() throws Exception {
  final String partition = "2016/01/31";
  // Create some records to use
  String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
  String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
  String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
  String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
  RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
  HoodieRecord record1 = new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
  RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
  HoodieRecord record2 = new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
  RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
  HoodieRecord record3 = new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
  RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
  HoodieRecord record4 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
  // We write record1 and record2 to a base file, but the bloom filter contains
  // (record1, record2, record3).
  BloomFilter filter = BloomFilterFactory.createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
  filter.add(record3.getRecordKey());
  HoodieFlinkWriteableTestTable testTable = HoodieFlinkWriteableTestTable.of(metaClient, SCHEMA, filter);
  String fileId = testTable.addCommit("000").getFileIdWithInserts(partition, record1, record2);
  String filename = testTable.getBaseFileNameById(fileId);
  // The bloom filter contains 3 records
  assertTrue(filter.mightContain(record1.getRecordKey()));
  assertTrue(filter.mightContain(record2.getRecordKey()));
  assertTrue(filter.mightContain(record3.getRecordKey()));
  assertFalse(filter.mightContain(record4.getRecordKey()));
  // Compare with the file
  List<String> uuids = asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
  HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
  HoodieFlinkTable table = HoodieFlinkTable.create(config, context, metaClient);
  List<String> results = HoodieIndexUtils.filterKeysFromFile(new Path(java.nio.file.Paths.get(basePath, partition, filename).toString()), uuids, hadoopConf);
  assertEquals(results.size(), 2);
  assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
  assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
  // TODO(vc): Need more coverage on actual filenames
  // assertTrue(results.get(0)._2().equals(filename));
  // assertTrue(results.get(1)._2().equals(filename));
}
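The assertions above capture the two-step lookup the bloom index relies on: the filter first prunes the candidate keys, and only the survivors are checked against the keys actually stored in the base file. A schematic sketch of that pruning step, reusing the uuids list and filter from the test (this is not the actual HoodieIndexUtils.filterKeysFromFile implementation):

// Sketch of the candidate-pruning step only
List<String> candidates = new ArrayList<>();
for (String key : uuids) {
  if (filter.mightContain(key)) {  // record3's key also survives here, since it was added to the filter
    candidates.add(key);
  }
}
// The exact check against the file's stored keys then drops record3, leaving record1 and record2,
// which is why filterKeysFromFile returns exactly two keys above.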
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
From the class TestParquetUtils, method testHoodieWriteSupport.
@ParameterizedTest
@MethodSource("bloomFilterTypeCodes")
public void testHoodieWriteSupport(String typeCode) throws Exception {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = Paths.get(basePath, "test.parquet").toUri().toString();
  writeParquetFile(typeCode, filePath, rowKeys);
  // Read and verify
  List<String> rowKeysInFile = new ArrayList<>(parquetUtils.readRowKeys(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
  Collections.sort(rowKeysInFile);
  Collections.sort(rowKeys);
  assertEquals(rowKeys, rowKeysInFile, "Did not read back the expected list of keys");
  BloomFilter filterInFile = parquetUtils.readBloomFilterFromMetadata(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath));
  for (String rowKey : rowKeys) {
    assertTrue(filterInFile.mightContain(rowKey), "key should be found in bloom filter");
  }
}
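The @MethodSource above supplies the bloom filter type codes. A plausible provider, assuming it simply enumerates the BloomFilterTypeCode enum and that java.util.Arrays and java.util.stream.Stream are imported (the real provider in TestParquetUtils is not shown on this page):

// Hypothetical provider for @MethodSource("bloomFilterTypeCodes"); yields each type-code name (e.g. SIMPLE)
private static Stream<String> bloomFilterTypeCodes() {
  return Arrays.stream(BloomFilterTypeCode.values()).map(Enum::name);
}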
Use of org.apache.hudi.common.bloom.BloomFilter in project hudi by apache.
From the class TestParquetUtils, method writeParquetFile.
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema, boolean addPartitionPathField, String partitionPathValue, boolean useMetaFields, String recordFieldName, String partitionFieldName) throws Exception {
  // Write out a parquet file
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(useMetaFields ? HoodieRecord.RECORD_KEY_METADATA_FIELD : recordFieldName, rowKey);
    if (addPartitionPathField) {
      rec.put(useMetaFields ? HoodieRecord.PARTITION_PATH_METADATA_FIELD : partitionFieldName, partitionPathValue);
    }
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
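The bloom filter that writeSupport.add(...) accumulates is serialized into the parquet footer metadata when the writer closes, which is what readBloomFilterFromMetadata in testHoodieWriteSupport picks back up. A short sketch of that read-back, reusing calls already shown above (parquetUtils, filePath, and rowKeys come from the surrounding test class):

// Sketch: read back the filter accumulated via writeSupport.add(...) from the parquet footer
BloomFilter writtenFilter = parquetUtils.readBloomFilterFromMetadata(
    HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath));
for (String rowKey : rowKeys) {
  // every key passed to writeSupport.add must be reported as possibly present
  assertTrue(writtenFilter.mightContain(rowKey));
}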