Example 1 with HoodieJavaCopyOnWriteTable

Use of org.apache.hudi.table.HoodieJavaCopyOnWriteTable in project hudi by apache.

From the class TestJavaCopyOnWriteActionExecutor, method testFileSizeUpsertRecords.

@Test
public void testFileSizeUpsertRecords() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
        .withStorageConfig(HoodieStorageConfig.newBuilder()
            .parquetMaxFileSize(64 * 1024)
            .hfileMaxFileSize(64 * 1024)
            .parquetBlockSize(64 * 1024)
            .parquetPageSize(64 * 1024)
            .build())
        .build();
    String instantTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
    List<HoodieRecord> records = new ArrayList<>();
    // Approx 1150 records are written for block size of 64KB
    for (int i = 0; i < 2000; i++) {
        String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
        RawTripTestPayload rowChange = new RawTripTestPayload(recordStr);
        records.add(new HoodieAvroRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
    }
    // Insert new records
    BaseJavaCommitActionExecutor actionExecutor = new JavaUpsertCommitActionExecutor(context, config, table, instantTime, records);
    // Run a single handleInsert over all records and materialize the resulting write statuses
    Arrays.asList(1).stream()
        .map(i -> actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator()))
        .map(Transformations::flatten)
        .collect(Collectors.toList());
    // Check the updated file
    int counts = 0;
    for (File file : Paths.get(basePath, "2016/01/31").toFile().listFiles()) {
        if (file.getName().endsWith(table.getBaseFileExtension()) && FSUtils.getCommitTime(file.getName()).equals(instantTime)) {
            LOG.info(file.getName() + "-" + file.length());
            counts++;
        }
    }
    assertEquals(3, counts, "If the number of records is more than 1150, then there should be a new file");
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) File(java.io.File) HoodieJavaCopyOnWriteTable(org.apache.hudi.table.HoodieJavaCopyOnWriteTable) Test(org.junit.jupiter.api.Test)
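
For context, makeHoodieClientConfigBuilder() is a helper defined elsewhere in the test class. A minimal sketch of what it might look like (the body here is an assumption; at minimum it must set the base path and the test schema for the builder chain above to work):

private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() {
    // Assumption: the helper wires in the base path and the trip test schema;
    // the real method may configure further defaults.
    return HoodieWriteConfig.newBuilder()
        .withEngineType(EngineType.JAVA)
        .withPath(basePath)
        .withSchema(TRIP_EXAMPLE_SCHEMA);
}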

Example 2 with HoodieJavaCopyOnWriteTable

Use of org.apache.hudi.table.HoodieJavaCopyOnWriteTable in project hudi by apache.

From the class TestJavaCopyOnWriteActionExecutor, method testBulkInsertRecords.

public void testBulkInsertRecords(String bulkInsertMode) throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withBulkInsertParallelism(2)
        .withBulkInsertSortMode(bulkInsertMode)
        .build();
    String instantTime = makeNewCommitTime();
    HoodieJavaWriteClient writeClient = getHoodieWriteClient(config);
    writeClient.startCommitWithTime(instantTime);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
    // Insert new records
    final List<HoodieRecord> inputRecords = generateTestRecordsForBulkInsert();
    JavaBulkInsertCommitActionExecutor bulkInsertExecutor = new JavaBulkInsertCommitActionExecutor(context, config, table, instantTime, inputRecords, Option.empty());
    List<WriteStatus> returnedStatuses = (List<WriteStatus>) bulkInsertExecutor.execute().getWriteStatuses();
    verifyStatusResult(returnedStatuses, generateExpectedPartitionNumRecords(inputRecords));
}
Also used : HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) List(java.util.List) ArrayList(java.util.ArrayList) HoodieJavaCopyOnWriteTable(org.apache.hudi.table.HoodieJavaCopyOnWriteTable) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus)
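
Note that testBulkInsertRecords carries no @Test annotation, so it is a helper invoked with different sort modes by actual test methods. A hedged sketch of such a driver, assuming JUnit 5 parameterized tests (org.junit.jupiter.params) and the standard Hudi BulkInsertSortMode names; the driver method name is illustrative:

@ParameterizedTest
@ValueSource(strings = {"GLOBAL_SORT", "PARTITION_SORT", "NONE"})
public void testBulkInsertRecordsWithSortMode(String bulkInsertMode) throws Exception {
    // Delegates to the helper above, once per sort mode
    testBulkInsertRecords(bulkInsertMode);
}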

Example 3 with HoodieJavaCopyOnWriteTable

Use of org.apache.hudi.table.HoodieJavaCopyOnWriteTable in project hudi by apache.

From the class TestJavaCopyOnWriteActionExecutor, method testInsertRecords.

@Test
public void testInsertRecords() throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfig();
    String instantTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
    // Case 1:
    // 10 records for partition 1, 1 record for partition 2.
    List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
    records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
    // Insert new records
    final List<HoodieRecord> recs2 = records;
    BaseJavaCommitActionExecutor actionExecutor = new JavaInsertPreppedCommitActionExecutor(context, config, table, instantTime, recs2);
    final List<WriteStatus> returnedStatuses = new ArrayList<>();
    actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs2.iterator()).forEachRemaining(x -> returnedStatuses.addAll((List<WriteStatus>) x));
    assertEquals(2, returnedStatuses.size());
    Map<String, Long> expectedPartitionNumRecords = new HashMap<>();
    expectedPartitionNumRecords.put("2016/01/31", 10L);
    expectedPartitionNumRecords.put("2016/02/01", 1L);
    verifyStatusResult(returnedStatuses, expectedPartitionNumRecords);
    // Case 2:
    // 1 record for partition 1, 5 records for partition 2, 1 record for partition 3.
    records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z");
    records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z"));
    records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
    // Insert new records
    final List<HoodieRecord> recs3 = records;
    BaseJavaCommitActionExecutor newActionExecutor = new JavaUpsertPreppedCommitActionExecutor(context, config, table, instantTime, recs3);
    final List<WriteStatus> returnedStatuses1 = new ArrayList<>();
    newActionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), recs3.iterator()).forEachRemaining(x -> returnedStatuses1.addAll((List<WriteStatus>) x));
    assertEquals(3, returnedStatuses1.size());
    expectedPartitionNumRecords.clear();
    expectedPartitionNumRecords.put("2016/01/31", 1L);
    expectedPartitionNumRecords.put("2016/02/01", 5L);
    expectedPartitionNumRecords.put("2016/02/02", 1L);
    verifyStatusResult(returnedStatuses1, expectedPartitionNumRecords);
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) List(java.util.List) ArrayList(java.util.ArrayList) HoodieJavaCopyOnWriteTable(org.apache.hudi.table.HoodieJavaCopyOnWriteTable) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test)
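
newHoodieRecords(int, String) is another helper from the test class. A hypothetical sketch, assuming it builds RawTripTestPayload records the same way Example 1 does (the time field drives the partition path, e.g. "2016-01-31T..." maps to "2016/01/31"):

private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
    List<HoodieRecord> records = new ArrayList<>();
    for (int i = 0; i < n; i++) {
        // Each record gets a random row key and the supplied timestamp
        String recordStr = String.format(
            "{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
            UUID.randomUUID().toString(), time, i);
        RawTripTestPayload payload = new RawTripTestPayload(recordStr);
        records.add(new HoodieAvroRecord(
            new HoodieKey(payload.getRowKey(), payload.getPartitionPath()), payload));
    }
    return records;
}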

Example 4 with HoodieJavaCopyOnWriteTable

Use of org.apache.hudi.table.HoodieJavaCopyOnWriteTable in project hudi by apache.

From the class TestJavaCopyOnWriteActionExecutor, method testInsertUpsertWithHoodieAvroPayload.

@Test
public void testInsertUpsertWithHoodieAvroPayload() throws Exception {
    Schema schema = getSchemaFromResource(TestJavaCopyOnWriteActionExecutor.class, "/testDataGeneratorSchema.txt");
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withEngineType(EngineType.JAVA)
        .withPath(basePath)
        .withSchema(schema.toString())
        .withStorageConfig(HoodieStorageConfig.newBuilder()
            .parquetMaxFileSize(1000 * 1024)
            .hfileMaxFileSize(1000 * 1024)
            .build())
        .build();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    final HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
    String instantTime = "000";
    // Perform inserts of 100 records to test CreateHandle and BufferedExecutor
    final List<HoodieRecord> inserts = dataGen.generateInsertsWithHoodieAvroPayload(instantTime, 100);
    BaseJavaCommitActionExecutor actionExecutor = new JavaInsertCommitActionExecutor(context, config, table, instantTime, inserts);
    final List<List<WriteStatus>> ws = new ArrayList<>();
    actionExecutor.handleInsert(UUID.randomUUID().toString(), inserts.iterator()).forEachRemaining(x -> ws.add((List<WriteStatus>) x));
    WriteStatus writeStatus = ws.get(0).get(0);
    String fileId = writeStatus.getFileId();
    // Manually create a 000.commit file so the reloaded timeline treats the inserts as committed
    metaClient.getFs().create(new Path(Paths.get(basePath, ".hoodie", "000.commit").toString())).close();
    // TODO : Find race condition that causes the timeline sometime to reflect 000.commit and sometimes not
    final HoodieJavaCopyOnWriteTable reloadedTable = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, HoodieTableMetaClient.reload(metaClient));
    final List<HoodieRecord> updates = dataGen.generateUpdatesWithHoodieAvroPayload(instantTime, inserts);
    String partitionPath = writeStatus.getPartitionPath();
    long numRecordsInPartition = updates.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
    BaseJavaCommitActionExecutor newActionExecutor = new JavaUpsertCommitActionExecutor(context, config, reloadedTable, instantTime, updates);
    taskContextSupplier.reset();
    final List<List<WriteStatus>> updateStatus = new ArrayList<>();
    newActionExecutor.handleUpdate(partitionPath, fileId, updates.iterator()).forEachRemaining(x -> updateStatus.add((List<WriteStatus>) x));
    assertEquals(updates.size() - numRecordsInPartition, updateStatus.get(0).get(0).getTotalErrorRecords());
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieJavaWriteClient(org.apache.hudi.client.HoodieJavaWriteClient) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) HoodieHiveUtils(org.apache.hudi.hadoop.utils.HoodieHiveUtils) HoodieJavaClientTestBase(org.apache.hudi.testutils.HoodieJavaClientTestBase) Schema(org.apache.avro.Schema) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieJavaTable(org.apache.hudi.table.HoodieJavaTable) ParquetReader(org.apache.parquet.hadoop.ParquetReader) HoodieCreateHandle(org.apache.hudi.io.HoodieCreateHandle) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) List(java.util.List) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) EngineType(org.apache.hudi.common.engine.EngineType) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) Mockito.mock(org.mockito.Mockito.mock) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieJavaCopyOnWriteTable(org.apache.hudi.table.HoodieJavaCopyOnWriteTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) SchemaTestUtil.getSchemaFromResource(org.apache.hudi.common.testutils.SchemaTestUtil.getSchemaFromResource) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Mockito.when(org.mockito.Mockito.when) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) File(java.io.File) JobConf(org.apache.hadoop.mapred.JobConf) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieTestTable.makeNewCommitTime(org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime) Transformations(org.apache.hudi.common.testutils.Transformations) Paths(java.nio.file.Paths) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
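
The final assertion works because handleUpdate is scoped to a single (partitionPath, fileId) pair: updates whose partition differs from the targeted file cannot be merged there and surface as error records. An illustrative restatement of the expected split (not part of the original test):

    // Partition the updates by whether they land in the targeted partition;
    // only the matching ones can succeed, the rest are expected errors.
    Map<Boolean, List<HoodieRecord>> split = updates.stream()
        .collect(Collectors.partitioningBy(u -> u.getPartitionPath().equals(partitionPath)));
    long expectedErrors = split.get(false).size();
    assertEquals(expectedErrors, updateStatus.get(0).get(0).getTotalErrorRecords());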

Example 5 with HoodieJavaCopyOnWriteTable

Use of org.apache.hudi.table.HoodieJavaCopyOnWriteTable in project hudi by apache.

From the class TestJavaCopyOnWriteActionExecutor, method testMetadataAggregateFromWriteStatus.

// Check if record level metadata is aggregated properly at the end of write.
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
    // Prepare the AvroParquetIO
    HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
    String firstCommitTime = makeNewCommitTime();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieJavaCopyOnWriteTable table = (HoodieJavaCopyOnWriteTable) HoodieJavaTable.create(config, context, metaClient);
    // Get some records that belong to the same partition (2016/01/31)
    String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
    String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
    String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\"," + "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
    List<HoodieRecord> records = new ArrayList<>();
    RawTripTestPayload rowChange1 = new RawTripTestPayload(recordStr1);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
    RawTripTestPayload rowChange2 = new RawTripTestPayload(recordStr2);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
    RawTripTestPayload rowChange3 = new RawTripTestPayload(recordStr3);
    records.add(new HoodieAvroRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
    // Insert new records
    BaseJavaCommitActionExecutor actionExecutor = new JavaInsertCommitActionExecutor(context, config, table, firstCommitTime, records);
    List<WriteStatus> writeStatuses = new ArrayList<>();
    actionExecutor.handleInsert(FSUtils.createNewFileIdPfx(), records.iterator()).forEachRemaining(x -> writeStatuses.addAll((List<WriteStatus>) x));
    Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(writeStatuses);
    assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
    // For metadata key InputRecordCount_1506582000, the value is 2 for each record,
    // so the merged sum over 3 records should be 2 * 3 = 6
    assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) List(java.util.List) ArrayList(java.util.ArrayList) HoodieJavaCopyOnWriteTable(org.apache.hudi.table.HoodieJavaCopyOnWriteTable) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test)
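
The assertion depends on MetadataMergeWriteStatus summing record-level metadata across write statuses. A hedged sketch of the shape of that aggregation (the real implementation lives in org.apache.hudi.testutils.MetadataMergeWriteStatus; the accessor name below is an assumption):

public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> statuses) {
    Map<String, String> merged = new HashMap<>();
    for (WriteStatus status : statuses) {
        // Assumed accessor: each status exposes the metadata it accumulated per record
        Map<String, String> statusMetadata = ((MetadataMergeWriteStatus) status).getMergedMetadataMap();
        for (Map.Entry<String, String> entry : statusMetadata.entrySet()) {
            // Numeric values under the same key are summed, e.g. 2 + 2 + 2 = 6
            merged.merge(entry.getKey(), entry.getValue(),
                (a, b) -> String.valueOf(Long.parseLong(a) + Long.parseLong(b)));
        }
    }
    return merged;
}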

Aggregations

ArrayList (java.util.ArrayList): 5
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 5
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 5
HoodieJavaCopyOnWriteTable (org.apache.hudi.table.HoodieJavaCopyOnWriteTable): 5
List (java.util.List): 4
WriteStatus (org.apache.hudi.client.WriteStatus): 4
MetadataMergeWriteStatus (org.apache.hudi.testutils.MetadataMergeWriteStatus): 4
Test (org.junit.jupiter.api.Test): 4
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 3
HoodieKey (org.apache.hudi.common.model.HoodieKey): 3
RawTripTestPayload (org.apache.hudi.common.testutils.RawTripTestPayload): 3
File (java.io.File): 2
HashMap (java.util.HashMap): 2
HoodieJavaWriteClient (org.apache.hudi.client.HoodieJavaWriteClient): 2
Paths (java.nio.file.Paths): 1
Arrays (java.util.Arrays): 1
Map (java.util.Map): 1
UUID (java.util.UUID): 1
Collectors (java.util.stream.Collectors): 1
Schema (org.apache.avro.Schema): 1