Search in sources:

Example 1 with HoodieFailedWritesCleaningPolicy

Use of org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testRollbackFailedCommitsToggleCleaningPolicy.

@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testRollbackFailedCommitsToggleCleaningPolicy(boolean populateMetaFields) throws Exception {
    HoodieTestUtils.init(hadoopConf, basePath);
    HoodieFailedWritesCleaningPolicy cleaningPolicy = EAGER;
    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    // Perform 1 successful write to the table
    writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
    // Perform 1 failed write to the table
    writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
    client.close();
    // Toggle cleaning policy to LAZY
    cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
    // Perform 2 failed writes to the table
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
    client.close();
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
    client.close();
    // Wait until the heartbeats of the 2 failed commits have expired
    boolean conditionMet = false;
    while (!conditionMet) {
        conditionMet = client.getHeartbeatClient().isHeartbeatExpired("400");
        Thread.sleep(2000);
    }
    client.clean();
    HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 3);
    // Perform 2 more failed writes to the table
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    writeBatch(client, "500", "400", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
    client.close();
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    writeBatch(client, "600", "500", Option.of(Arrays.asList("400")), "400", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
    client.close();
    // Toggle cleaning policy to EAGER
    cleaningPolicy = EAGER;
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
    client.startCommit();
    timeline = metaClient.getActiveTimeline().reload();
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 5);
    assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 1);
}
Also used: SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
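
Both tests build their write configuration through the getParallelWritingWriteConfig helper, whose body is not shown on this page. Below is a minimal sketch of what such a helper could look like, using the Hudi builder APIs HoodieWriteConfig.newBuilder, HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy, withAutoClean, withHeartbeatIntervalInMs, and withPopulateMetaFields; the exact values and structure are assumptions for illustration, not the test's actual implementation.

private HoodieWriteConfig getParallelWritingWriteConfig(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) {
    // Sketch only: the real helper in TestHoodieClientOnCopyOnWriteStorage may differ.
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        // EAGER rolls failed writes back at the start of the next commit;
        // LAZY leaves them for the cleaner once their heartbeats expire.
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withFailedWritesCleaningPolicy(cleaningPolicy)
            .withAutoClean(false)
            .build())
        // Short heartbeat interval (assumed value) so the expiry wait loops finish quickly.
        .withHeartbeatIntervalInMs(3 * 1000)
        .withPopulateMetaFields(populateMetaFields)
        .build();
}

Disabling auto-clean matters here: with LAZY, failed writes must survive until the explicit client.clean() call so the test can observe the rollbacks it triggers.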

Example 2 with HoodieFailedWritesCleaningPolicy

Use of org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy in project hudi by apache.

From the class TestHoodieClientOnCopyOnWriteStorage, method testParallelInsertAndCleanPreviousFailedCommits.

@Test
public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception {
    HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
    ExecutorService service = Executors.newFixedThreadPool(2);
    HoodieTestUtils.init(hadoopConf, basePath);
    SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
    // Perform 1 successful write to the table
    writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true);
    // Perform 2 failed writes to the table
    writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false);
    client.close();
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
    writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false);
    client.close();
    // Refresh the data generator to drop records generated by the failed commits
    dataGen = new HoodieTestDataGenerator();
    // Create a successful commit
    Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), "400", "300", Option.of(Arrays.asList("400")), "300", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
    commit3.get();
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
    assertTrue(metaClient.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0);
    assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
    assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 2);
    client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
    // Wait until the heartbeats of the first 2 failed commits have expired
    boolean conditionMet = false;
    while (!conditionMet) {
        conditionMet = client.getHeartbeatClient().isHeartbeatExpired("300");
        Thread.sleep(2000);
    }
    Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)), "500", "400", Option.of(Arrays.asList("500")), "500", 100, dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
    Future<HoodieCleanMetadata> clean1 = service.submit(() -> new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)).clean());
    commit4.get();
    clean1.get();
    HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 2);
    // Since rollbacks are written instead of cleans, there should be no clean action on the timeline
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0);
    assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3);
}
Also used: HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) ExecutorService(java.util.concurrent.ExecutorService) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
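
Both tests busy-wait on isHeartbeatExpired and would spin forever if a heartbeat never expired. A hedged alternative with a deadline is sketched below; the helper name, timeout handling, and values are illustrative additions, not part of the Hudi test.

private static void awaitHeartbeatExpiry(SparkRDDWriteClient client, String instantTime, long timeoutMs) throws Exception {
    long deadline = System.currentTimeMillis() + timeoutMs;
    // Poll the heartbeat client until the instant's heartbeat expires, but fail fast on timeout.
    while (!client.getHeartbeatClient().isHeartbeatExpired(instantTime)) {
        if (System.currentTimeMillis() > deadline) {
            throw new IllegalStateException("Heartbeat for instant " + instantTime + " did not expire within " + timeoutMs + " ms");
        }
        Thread.sleep(2000);
    }
}

For instance, awaitHeartbeatExpiry(client, "300", 60_000L) could replace the while loop in Example 2.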

Aggregations

SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) 2
HoodieFailedWritesCleaningPolicy (org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) 2
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline) 2
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 2
ExecutorService (java.util.concurrent.ExecutorService) 1
HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata) 1
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 1
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator) 1
HoodieJavaRDD (org.apache.hudi.data.HoodieJavaRDD) 1
JavaRDD (org.apache.spark.api.java.JavaRDD) 1
Test (org.junit.jupiter.api.Test) 1
MethodSource (org.junit.jupiter.params.provider.MethodSource) 1