use of org.apache.hudi.common.model.WriteOperationType.INSERT in project hudi by apache.
the class TestHoodieBackedMetadata method testReattemptOfFailedClusteringCommit.
/**
* Lets say clustering commit succeeded in metadata table, but failed before committing to datatable.
* Next time, when clustering kicks in, hudi will rollback pending clustering (in data table) and re-attempt the clustering with same
* instant time. So, this test ensures the 2nd attempt succeeds with metadata enabled.
* This is applicable to any table service where instant time is fixed. So, how many ever times the operation fails, re attempt will
* be made with same commit time.
* Tests uses clustering to test out the scenario.
*/
@Test
public void testReattemptOfFailedClusteringCommit() throws Exception {
tableType = HoodieTableType.COPY_ON_WRITE;
init(tableType);
context = new HoodieSparkEngineContext(jsc);
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false);
SparkRDDWriteClient client = getHoodieWriteClient(config);
// Write 1 (Bulk insert)
String newCommitTime = "0000001";
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
client.startCommitWithTime(newCommitTime);
List<WriteStatus> writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
assertNoWriteErrors(writeStatuses);
validateMetadata(client);
// Write 2 (inserts)
newCommitTime = "0000002";
client.startCommitWithTime(newCommitTime);
records = dataGen.generateInserts(newCommitTime, 20);
writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
assertNoWriteErrors(writeStatuses);
validateMetadata(client);
// setup clustering config.
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10).withClusteringSortColumns("_row_key").withInlineClustering(true).withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
HoodieWriteConfig newWriteConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER).withAutoCommit(false).withClusteringConfig(clusteringConfig).build();
// trigger clustering
SparkRDDWriteClient newClient = getHoodieWriteClient(newWriteConfig);
String clusteringCommitTime = newClient.scheduleClustering(Option.empty()).get().toString();
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = newClient.cluster(clusteringCommitTime, true);
// collect replaceFileIds for validation later.
Set<HoodieFileGroupId> replacedFileIds = new HashSet<>();
clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> partitionFiles.getValue().stream().forEach(file -> replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
// trigger new write to mimic other writes succeeding before re-attempt.
newCommitTime = "0000003";
client.startCommitWithTime(newCommitTime);
records = dataGen.generateInserts(newCommitTime, 20);
writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
assertNoWriteErrors(writeStatuses);
validateMetadata(client);
// manually remove clustering completed instant from .hoodie folder and to mimic succeeded clustering in metadata table, but failed in data table.
FileCreateUtils.deleteReplaceCommit(basePath, clusteringCommitTime);
HoodieWriteMetadata<JavaRDD<WriteStatus>> updatedClusterMetadata = newClient.cluster(clusteringCommitTime, true);
metaClient.reloadActiveTimeline();
Set<HoodieFileGroupId> updatedReplacedFileIds = new HashSet<>();
updatedClusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> partitionFiles.getValue().stream().forEach(file -> updatedReplacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
assertEquals(replacedFileIds, updatedReplacedFileIds);
validateMetadata(client);
}
Aggregations