use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA in project hudi by apache.
the class TestHoodieBackedMetadata method testReattemptOfFailedClusteringCommit.
/**
* Lets say clustering commit succeeded in metadata table, but failed before committing to datatable.
* Next time, when clustering kicks in, hudi will rollback pending clustering (in data table) and re-attempt the clustering with same
* instant time. So, this test ensures the 2nd attempt succeeds with metadata enabled.
* This is applicable to any table service where instant time is fixed. So, how many ever times the operation fails, re attempt will
* be made with same commit time.
* Tests uses clustering to test out the scenario.
*/
@Test
public void testReattemptOfFailedClusteringCommit() throws Exception {
tableType = HoodieTableType.COPY_ON_WRITE;
init(tableType);
context = new HoodieSparkEngineContext(jsc);
HoodieWriteConfig config = getSmallInsertWriteConfig(2000, TRIP_EXAMPLE_SCHEMA, 10, false);
SparkRDDWriteClient client = getHoodieWriteClient(config);
// Write 1 (Bulk insert)
String newCommitTime = "0000001";
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
client.startCommitWithTime(newCommitTime);
List<WriteStatus> writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
assertNoWriteErrors(writeStatuses);
validateMetadata(client);
// Write 2 (inserts)
newCommitTime = "0000002";
client.startCommitWithTime(newCommitTime);
records = dataGen.generateInserts(newCommitTime, 20);
writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
assertNoWriteErrors(writeStatuses);
validateMetadata(client);
// setup clustering config.
HoodieClusteringConfig clusteringConfig = HoodieClusteringConfig.newBuilder().withClusteringMaxNumGroups(10).withClusteringSortColumns("_row_key").withInlineClustering(true).withClusteringTargetPartitions(0).withInlineClusteringNumCommits(1).build();
HoodieWriteConfig newWriteConfig = getConfigBuilder(TRIP_EXAMPLE_SCHEMA, HoodieIndex.IndexType.BLOOM, HoodieFailedWritesCleaningPolicy.EAGER).withAutoCommit(false).withClusteringConfig(clusteringConfig).build();
// trigger clustering
SparkRDDWriteClient newClient = getHoodieWriteClient(newWriteConfig);
String clusteringCommitTime = newClient.scheduleClustering(Option.empty()).get().toString();
HoodieWriteMetadata<JavaRDD<WriteStatus>> clusterMetadata = newClient.cluster(clusteringCommitTime, true);
// collect replaceFileIds for validation later.
Set<HoodieFileGroupId> replacedFileIds = new HashSet<>();
clusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> partitionFiles.getValue().stream().forEach(file -> replacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
// trigger new write to mimic other writes succeeding before re-attempt.
newCommitTime = "0000003";
client.startCommitWithTime(newCommitTime);
records = dataGen.generateInserts(newCommitTime, 20);
writeStatuses = client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
assertNoWriteErrors(writeStatuses);
validateMetadata(client);
// manually remove clustering completed instant from .hoodie folder and to mimic succeeded clustering in metadata table, but failed in data table.
FileCreateUtils.deleteReplaceCommit(basePath, clusteringCommitTime);
HoodieWriteMetadata<JavaRDD<WriteStatus>> updatedClusterMetadata = newClient.cluster(clusteringCommitTime, true);
metaClient.reloadActiveTimeline();
Set<HoodieFileGroupId> updatedReplacedFileIds = new HashSet<>();
updatedClusterMetadata.getPartitionToReplaceFileIds().entrySet().forEach(partitionFiles -> partitionFiles.getValue().stream().forEach(file -> updatedReplacedFileIds.add(new HoodieFileGroupId(partitionFiles.getKey(), file))));
assertEquals(replacedFileIds, updatedReplacedFileIds);
validateMetadata(client);
}
use of org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA in project hudi by apache.
the class TestHoodieClientOnCopyOnWriteStorage method testUpdateRejectForClustering.
@Test
public void testUpdateRejectForClustering() throws IOException {
final String testPartitionPath = "2016/09/26";
dataGen = new HoodieTestDataGenerator(new String[] { testPartitionPath });
Properties props = new Properties();
props.setProperty(ASYNC_CLUSTERING_ENABLE.key(), "true");
HoodieWriteConfig config = getSmallInsertWriteConfig(100, TRIP_EXAMPLE_SCHEMA, dataGen.getEstimatedFileSizeInBytes(150), true, props);
SparkRDDWriteClient client = getHoodieWriteClient(config);
HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
// 1. insert to generate 2 file group
String commitTime1 = "001";
Pair<List<WriteStatus>, List<HoodieRecord>> upsertResult = insertBatchRecords(client, commitTime1, 600, 2);
List<HoodieRecord> inserts1 = upsertResult.getValue();
List<String> fileGroupIds1 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
assertEquals(2, fileGroupIds1.size());
// 2. generate clustering plan for fileGroupIds1 file groups
String commitTime2 = "002";
List<List<FileSlice>> firstInsertFileSlicesList = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getAllFileSlices().collect(Collectors.toList())).collect(Collectors.toList());
List<FileSlice>[] fileSlices = (List<FileSlice>[]) firstInsertFileSlicesList.toArray(new List[firstInsertFileSlicesList.size()]);
createRequestedReplaceInstant(this.metaClient, commitTime2, fileSlices);
// 3. insert one record with no updating reject exception, and not merge the small file, just generate a new file group
String commitTime3 = "003";
insertBatchRecords(client, commitTime3, 1, 1).getKey();
List<String> fileGroupIds2 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
assertEquals(3, fileGroupIds2.size());
// 4. update one record for the clustering two file groups, throw reject update exception
String commitTime4 = "004";
client.startCommitWithTime(commitTime4);
List<HoodieRecord> insertsAndUpdates3 = new ArrayList<>();
insertsAndUpdates3.addAll(dataGen.generateUpdates(commitTime4, inserts1));
String assertMsg = String.format("Not allowed to update the clustering files in partition: %s " + "For pending clustering operations, we are not going to support update for now.", testPartitionPath);
assertThrows(HoodieUpsertException.class, () -> {
writeClient.upsert(jsc.parallelize(insertsAndUpdates3, 1), commitTime3).collect();
}, assertMsg);
// 5. insert one record with no updating reject exception, will merge the small file
String commitTime5 = "005";
List<WriteStatus> statuses = insertBatchRecords(client, commitTime5, 1, 1).getKey();
fileGroupIds2.removeAll(fileGroupIds1);
assertEquals(fileGroupIds2.get(0), statuses.get(0).getFileId());
List<String> firstInsertFileGroupIds4 = table.getFileSystemView().getAllFileGroups(testPartitionPath).map(fileGroup -> fileGroup.getFileGroupId().getFileId()).collect(Collectors.toList());
assertEquals(3, firstInsertFileGroupIds4.size());
}
Aggregations