Search in sources :

Example 1 with HoodieJavaWriteClient

use of org.apache.hudi.client.HoodieJavaWriteClient in project hudi by apache.

the class HoodieJavaWriteClientExample method main.

public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: HoodieJavaWriteClientExample <tablePath> <tableName>");
        System.exit(1);
    }
    String tablePath = args[0];
    String tableName = args[1];
    // Generator of some records to be loaded in.
    HoodieExampleDataGenerator<HoodieAvroPayload> dataGen = new HoodieExampleDataGenerator<>();
    Configuration hadoopConf = new Configuration();
    // initialize the table, if not done already
    Path path = new Path(tablePath);
    FileSystem fs = FSUtils.getFs(tablePath, hadoopConf);
    if (!fs.exists(path)) {
        // tableType is a field of the enclosing example class
        // (HoodieTableType.COPY_ON_WRITE.name() in the upstream example)
        HoodieTableMetaClient.withPropertyBuilder()
            .setTableType(tableType)
            .setTableName(tableName)
            .setPayloadClassName(HoodieAvroPayload.class.getName())
            .initTable(hadoopConf, tablePath);
    }
    // Create the write client to write some records in
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(tablePath)
        .withSchema(HoodieExampleDataGenerator.TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .forTable(tableName)
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.INMEMORY).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(20, 30).build())
        .build();
    HoodieJavaWriteClient<HoodieAvroPayload> client = new HoodieJavaWriteClient<>(new HoodieJavaEngineContext(hadoopConf), cfg);
    // inserts
    String newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    List<HoodieRecord<HoodieAvroPayload>> records = dataGen.generateInserts(newCommitTime, 10);
    List<HoodieRecord<HoodieAvroPayload>> recordsSoFar = new ArrayList<>(records);
    List<HoodieRecord<HoodieAvroPayload>> writeRecords = recordsSoFar.stream().map(r -> new HoodieAvroRecord<HoodieAvroPayload>(r)).collect(Collectors.toList());
    client.insert(writeRecords, newCommitTime);
    // updates
    newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    List<HoodieRecord<HoodieAvroPayload>> toBeUpdated = dataGen.generateUpdates(newCommitTime, 2);
    records.addAll(toBeUpdated);
    recordsSoFar.addAll(toBeUpdated);
    writeRecords = recordsSoFar.stream().map(r -> new HoodieAvroRecord<HoodieAvroPayload>(r)).collect(Collectors.toList());
    client.upsert(writeRecords, newCommitTime);
    // Delete
    newCommitTime = client.startCommit();
    LOG.info("Starting commit " + newCommitTime);
    // just delete half of the records
    int numToDelete = recordsSoFar.size() / 2;
    List<HoodieKey> toBeDeleted = recordsSoFar.stream().map(HoodieRecord::getKey).limit(numToDelete).collect(Collectors.toList());
    client.delete(toBeDeleted, newCommitTime);
    client.close();
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieExampleDataGenerator(org.apache.hudi.examples.common.HoodieExampleDataGenerator) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieJavaEngineContext(org.apache.hudi.client.common.HoodieJavaEngineContext) HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieIndex(org.apache.hudi.index.HoodieIndex) ArrayList(java.util.ArrayList) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) List(java.util.List) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)
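
The insert, upsert, and delete calls above return per-operation results that this example discards. As a minimal sketch, reusing the variables from the example and assuming the WriteStatus API (org.apache.hudi.client.WriteStatus, with hasErrors() and getFileId()) referenced elsewhere on this page, the result of the upsert step could be checked before closing the client:

// Sketch only, not part of the upstream example: the Java write client returns a
// List<WriteStatus> for each write operation.
List<WriteStatus> upsertStatuses = client.upsert(writeRecords, newCommitTime);
for (WriteStatus status : upsertStatuses) {
    if (status.hasErrors()) {
        // hasErrors()/getFileId() are assumed here; verify against your Hudi version
        LOG.warn("Write errors in file group " + status.getFileId());
    }
}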

Example 2 with HoodieJavaWriteClient

use of org.apache.hudi.client.HoodieJavaWriteClient in project hudi by apache.

the class TestHoodieConcatHandle method testInsert.

@Test
public void testInsert() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withMergeAllowDuplicateOnInserts(true).build();
    HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
    // Get some records belonging to the same partition (2021/09/11)
    String insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":1}";
    String insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:16:41.415Z\",\"number\":2}";
    List<HoodieRecord> records1 = new ArrayList<>();
    RawTripTestPayload insertRow1 = new RawTripTestPayload(insertRecordStr1);
    RawTripTestPayload insertRow2 = new RawTripTestPayload(insertRecordStr2);
    records1.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
    records1.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
    int startInstant = 1;
    String firstCommitTime = makeNewCommitTime(startInstant++);
    // First insert
    writeClient.startCommitWithTime(firstCommitTime);
    writeClient.insert(records1, firstCommitTime);
    String partitionPath = "2021/09/11";
    FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
    assertEquals(1, allFiles.length);
    // Read back the bloom filter and make sure it can answer whether a record exists
    Path filePath = allFiles[0].getPath();
    BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
    for (HoodieRecord record : records1) {
        assertTrue(filter.mightContain(record.getRecordKey()));
    }
    insertRecordStr1 = "{\"_row_key\":\"1\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":3}";
    insertRecordStr2 = "{\"_row_key\":\"2\"," + "\"time\":\"2021-09-11T16:39:41.415Z\",\"number\":4}";
    List<HoodieRecord> records2 = new ArrayList<>();
    insertRow1 = new RawTripTestPayload(insertRecordStr1);
    insertRow2 = new RawTripTestPayload(insertRecordStr2);
    // The record keys of records2 and records1 are the same, but the other field values differ
    records2.add(new HoodieAvroRecord(new HoodieKey(insertRow1.getRowKey(), insertRow1.getPartitionPath()), insertRow1));
    records2.add(new HoodieAvroRecord(new HoodieKey(insertRow2.getRowKey(), insertRow2.getPartitionPath()), insertRow2));
    String newCommitTime = makeNewCommitTime(startInstant++);
    writeClient.startCommitWithTime(newCommitTime);
    // The second insert reuses the _row_key values of the first one; this exercises allowDuplicateInserts
    writeClient.insert(records2, newCommitTime);
    allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
    assertEquals(1, allFiles.length);
    // verify the new incremental files belong to the same file group as before
    assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
    filePath = allFiles[0].getPath();
    // The final result should be a collection of records1 and records2
    records1.addAll(records2);
    // Read the base file, check the record content
    List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
    int index = 0;
    for (GenericRecord record : fileRecords) {
        assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString());
        assertEquals(index + 1, record.get("number"));
        index++;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) HoodieKey(org.apache.hudi.common.model.HoodieKey) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) GenericRecord(org.apache.avro.generic.GenericRecord) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
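
Because withMergeAllowDuplicateOnInserts(true) concatenates the second batch into the existing file group instead of merging by key, the base file should contain both batches. A small hedged addition to the read-back check above, using the variables from the test (Example 3 below makes the equivalent assertion):

// Sketch only: after records1.addAll(records2), records1 holds both batches, so the
// base file read back above is expected to contain all four rows, duplicates included.
assertEquals(records1.size(), fileRecords.size());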

Example 3 with HoodieJavaWriteClient

use of org.apache.hudi.client.HoodieJavaWriteClient in project hudi by apache.

the class TestHoodieConcatHandle method testInsertWithDataGenerator.

@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testInsertWithDataGenerator(boolean mergeAllowDuplicateOnInsertsEnable) throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder(TRIP_EXAMPLE_SCHEMA).withMergeAllowDuplicateOnInserts(mergeAllowDuplicateOnInsertsEnable).build();
    HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    BaseFileUtils fileUtils = BaseFileUtils.getInstance(metaClient);
    String partitionPath = "2021/09/11";
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { partitionPath });
    int startInstant = 1;
    String firstCommitTime = makeNewCommitTime(startInstant++);
    List<HoodieRecord> records1 = dataGenerator.generateInserts(firstCommitTime, 100);
    // First insert
    writeClient.startCommitWithTime(firstCommitTime);
    writeClient.insert(records1, firstCommitTime);
    FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
    assertEquals(1, allFiles.length);
    // Read back the bloom filter and make sure it can answer whether a record exists
    Path filePath = allFiles[0].getPath();
    BloomFilter filter = fileUtils.readBloomFilterFromMetadata(hadoopConf, filePath);
    for (HoodieRecord record : records1) {
        assertTrue(filter.mightContain(record.getRecordKey()));
    }
    String newCommitTime = makeNewCommitTime(startInstant++);
    List<HoodieRecord> records2 = dataGenerator.generateUpdates(newCommitTime, 100);
    writeClient.startCommitWithTime(newCommitTime);
    // The second insert reuses the _row_key values of the first one; this exercises allowDuplicateInserts
    writeClient.insert(records2, newCommitTime);
    allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
    assertEquals(1, allFiles.length);
    // verify the new incremental files belong to the same file group as before
    assertEquals(FSUtils.getFileId(filePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
    filePath = allFiles[0].getPath();
    // If mergeAllowDuplicateOnInsertsEnable is true, the final result should be a collection of records1 and records2
    records1.addAll(records2);
    // Read the base file, check the record content
    List<GenericRecord> fileRecords = fileUtils.readAvroRecords(hadoopConf, filePath);
    assertEquals(fileRecords.size(), mergeAllowDuplicateOnInsertsEnable ? records1.size() : records2.size());
    int index = 0;
    for (GenericRecord record : fileRecords) {
        assertEquals(records1.get(index).getRecordKey(), record.get("_row_key").toString());
        index++;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
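
For contrast with the insert path that the flag controls, the same keys could instead be written with upsert, which merges by record key and keeps a single row per key regardless of mergeAllowDuplicateOnInsertsEnable. A minimal sketch reusing the names from the test above (not part of the upstream test):

// Sketch only: upsert merges by record key instead of concatenating new rows.
String upsertTime = makeNewCommitTime(startInstant++);
writeClient.startCommitWithTime(upsertTime);
writeClient.upsert(dataGenerator.generateUpdates(upsertTime, 100), upsertTime);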

Example 4 with HoodieJavaWriteClient

use of org.apache.hudi.client.HoodieJavaWriteClient in project hudi by apache.

the class TestJavaCopyOnWriteActionExecutor method testBulkInsertRecords.

public void testBulkInsertRecords(String bulkInsertMode) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withBulkInsertParallelism(2).withBulkInsertSortMode(bulkInsertMode).build();
    String instantTime = makeNewCommitTime();
    HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
    writeClient.startCommitWithTime(instantTime);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
    // Insert new records
    final List<HoodieRecord> inputRecords = generateTestRecordsForBulkInsert();
    JavaBulkInsertCommitActionExecutor bulkInsertExecutor = new JavaBulkInsertCommitActionExecutor(context, config, table, instantTime, inputRecords, Option.empty());
    List<WriteStatus> returnedStatuses = (List<WriteStatus>) bulkInsertExecutor.execute().getWriteStatuses();
    verifyStatusResult(returnedStatuses, generateExpectedPartitionNumRecords(inputRecords));
}
Also used : HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) List(java.util.List) ArrayList(java.util.ArrayList) HoodieJavaCopyOnWriteTable(org.apache.hudi.table.HoodieJavaCopyOnWriteTable) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus)
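
The bulk-insert executor returns a list of WriteStatus objects, so the totals can also be checked directly. A sketch under the assumption that each WriteStatus exposes its HoodieWriteStat via getStat() and the per-file write count via getNumWrites() (verify against the Hudi version in use):

// Sketch only: sum the per-file write counts and compare with the input size.
long totalWritten = returnedStatuses.stream()
    .mapToLong(status -> status.getStat().getNumWrites())
    .sum();
assertEquals(inputRecords.size(), totalWritten);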

Example 5 with HoodieJavaWriteClient

use of org.apache.hudi.client.HoodieJavaWriteClient in project hudi by apache.

the class HoodieJavaClientTestHarness method getHoodieWriteClient.

public HoodieJavaWriteClient getHoodieWriteClient(HoodieWriteConfig cfg) {
    if (null != writeClient) {
        writeClient.close();
        writeClient = null;
    }
    writeClient = new HoodieJavaWriteClient(context, cfg);
    return writeClient;
}
Also used : HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient)
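
A sketch of how this helper is typically used from a test (names taken from the tests above; an illustration, not the harness's documented contract): each call closes the previously created client before handing out a new one, so a test can simply request one client per configuration.

HoodieWriteConfig firstCfg = makeHoodieClientConfigBuilder().build();
HoodieJavaWriteClient firstClient = getHoodieWriteClient(firstCfg);
// ... write with firstClient ...
HoodieWriteConfig secondCfg = makeHoodieClientConfigBuilder().withMergeAllowDuplicateOnInserts(true).build();
// the harness closes firstClient before constructing the new client
HoodieJavaWriteClient secondClient = getHoodieWriteClient(secondCfg);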

Aggregations

HoodieJavaWriteClient (org.apache.hudi.client.HoodieJavaWriteClient) 6
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 5
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 5
ArrayList (java.util.ArrayList) 4
Path (org.apache.hadoop.fs.Path) 4
GenericRecord (org.apache.avro.generic.GenericRecord) 3
FileStatus (org.apache.hadoop.fs.FileStatus) 3
BloomFilter (org.apache.hudi.common.bloom.BloomFilter) 3
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord) 3
HoodieKey (org.apache.hudi.common.model.HoodieKey) 3
BaseFileUtils (org.apache.hudi.common.util.BaseFileUtils) 3
List (java.util.List) 2
WriteStatus (org.apache.hudi.client.WriteStatus) 2
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload) 2
MetadataMergeWriteStatus (org.apache.hudi.testutils.MetadataMergeWriteStatus) 2
Test (org.junit.jupiter.api.Test) 2
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 2
Collectors (java.util.stream.Collectors) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
FileSystem (org.apache.hadoop.fs.FileSystem) 1