
Example 6 with HoodieJavaWriteClient

Use of org.apache.hudi.client.HoodieJavaWriteClient in project hudi by apache.

From the class TestJavaCopyOnWriteActionExecutor, method testUpdateRecords:

@Test
public void testUpdateRecords() throws Exception {
    // Prepare the write config and the write client
    HoodieWriteConfig config = makeHoodieClientConfig();
    int startInstant = 1;
    String firstCommitTime = makeNewCommitTime(startInstant++);
    HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
    writeClient.startCommitWithTime(firstCommitTime);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
    String partitionPath = "2016/01/31";
    // Create some records belonging to the same partition (2016/01/31)
    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
    List<HoodieRecord> records = new ArrayList<>();
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
    // Insert new records
    writeClient.insert(records, firstCommitTime);
    FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
    assertEquals(1, allFiles.length);
    // Read the bloom filter from the base file metadata and make sure it can answer whether each record exists
    Path filePath = allFiles[0].getPath();
    BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
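    // A bloom filter has no false negatives: every written key must return
    // true from mightContain (unrelated keys may also return true).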
    for (HoodieRecord record : records) {
        assertTrue(filter.mightContain(record.getRecordKey()));
    }
    // Read the base file, check the record content
    List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
    GenericRecord newRecord;
    int index = 0;
    for (GenericRecord record : fileRecords) {
        assertEquals(records.get(index).getRecordKey(), record.get("_row_key").toString());
        index++;
    }
    // Update the first record and add a new record
    String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    RawTripTestPayload updateRowChanges1 = new RawTripTestPayload(updateRecordStr1);
    HoodieRecord updatedRecord1 = new HoodieAvroRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
    RawTripTestPayload rowChange4 = new RawTripTestPayload(recordStr4);
    HoodieRecord insertedRecord1 = new HoodieAvroRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
    List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
    String newCommitTime = makeNewCommitTime(startInstant++);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    writeClient.startCommitWithTime(newCommitTime);
    List<WriteStatus> statuses = writeClient.upsert(updatedRecords, newCommitTime);
    allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
    assertEquals(1, allFiles.length);
    // Verify the updated file belongs to the same file group as the previous one
    assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
    // Check whether the record has been updated
    Path updatedFilePath = allFiles[0].getPath();
    BloomFilter updatedFilter = fileUtils.readBloomFilterFromMetadata(hadoopConf, updatedFilePath);
    for (HoodieRecord record : records) {
        // No change to the _row_key
        assertTrue(updatedFilter.mightContain(record.getRecordKey()));
    }
    assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
    // Track the new record so the content checks below cover it too
    records.add(insertedRecord1);
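    // Re-read the rewritten base file as Avro records to verify both the
    // record keys and the updated "number" value.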
    ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedFilePath).build();
    index = 0;
    while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
        assertEquals(newRecord.get("_row_key").toString(), records.get(index).getRecordKey());
        if (index == 0) {
            assertEquals("15", newRecord.get("number").toString());
        }
        index++;
    }
    updatedReader.close();
    // Also check that only one file was generated and count the records written
    assertEquals(1, statuses.size(), "Should be only one file generated");
    WriteStatus writeStatus = statuses.get(0);
    // 3 rewritten records + 1 new record
    assertEquals(4, writeStatus.getStat().getNumWrites());
}
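
The helper methods makeHoodieClientConfig() and getHoodieWriteClient() are not shown in this example. As a rough, minimal sketch of how a HoodieJavaWriteClient can be constructed directly (the base path, the table name test_table, and the Avro schema string are hypothetical placeholders, not values from the test):

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.client.HoodieJavaWriteClient;
import org.apache.hudi.client.common.HoodieJavaEngineContext;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.config.HoodieWriteConfig;

public class WriteClientSketch {

    // Builds a Java write client for the table at basePath; schemaStr is the
    // Avro schema of the records and "test_table" is a placeholder name.
    public static HoodieJavaWriteClient<HoodieAvroPayload> createWriteClient(String basePath, String schemaStr) {
        Configuration hadoopConf = new Configuration();
        HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
                .withPath(basePath)
                .withSchema(schemaStr)
                .forTable("test_table")
                .build();
        // HoodieJavaEngineContext wraps the Hadoop configuration for the
        // engine-agnostic client internals.
        return new HoodieJavaWriteClient<>(new HoodieJavaEngineContext(hadoopConf), cfg);
    }
}

A client built this way supports the same startCommitWithTime, insert, and upsert calls used in the test above.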
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParquetReader(org.apache.parquet.hadoop.ParquetReader) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) HoodieKey(org.apache.hudi.common.model.HoodieKey) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) GenericRecord(org.apache.avro.generic.GenericRecord) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test)

Aggregations

HoodieJavaWriteClient (org.apache.hudi.client.HoodieJavaWriteClient): 6 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 5 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 5 usages
ArrayList (java.util.ArrayList): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
GenericRecord (org.apache.avro.generic.GenericRecord): 3 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 3 usages
BloomFilter (org.apache.hudi.common.bloom.BloomFilter): 3 usages
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 3 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 3 usages
BaseFileUtils (org.apache.hudi.common.util.BaseFileUtils): 3 usages
List (java.util.List): 2 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 2 usages
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 2 usages
MetadataMergeWriteStatus (org.apache.hudi.testutils.MetadataMergeWriteStatus): 2 usages
Test (org.junit.jupiter.api.Test): 2 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 2 usages
Collectors (java.util.stream.Collectors): 1 usage
Configuration (org.apache.hadoop.conf.Configuration): 1 usage
FileSystem (org.apache.hadoop.fs.FileSystem): 1 usage