Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestAsyncCompaction, method testRollbackForInflightCompaction.
@Test
public void testRollbackForInflightCompaction() throws Exception {
  // Rollback an inflight compaction
  HoodieWriteConfig cfg = getConfig(false);
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
    String firstInstantTime = "001";
    String secondInstantTime = "004";
    String compactionInstantTime = "005";
    int numRecs = 2000;
    List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
    runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>());
    // Schedule the compaction but do not run it
    scheduleCompaction(compactionInstantTime, client, cfg);
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    HoodieInstant pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
    assertEquals(compactionInstantTime, pendingCompactionInstant.getTimestamp(), "Pending Compaction instant has expected instant time");
    assertEquals(State.REQUESTED, pendingCompactionInstant.getState(), "Pending Compaction instant has expected state");
    moveCompactionFromRequestedToInflight(compactionInstantTime, cfg);
    // Reload and rollback the inflight compaction
    metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context, metaClient);
    hoodieTable.rollbackInflightCompaction(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionInstantTime));
    metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    pendingCompactionInstant = metaClient.getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().getInstants().findFirst().get();
    assertEquals("compaction", pendingCompactionInstant.getAction());
    assertEquals(State.REQUESTED, pendingCompactionInstant.getState());
    assertEquals(compactionInstantTime, pendingCompactionInstant.getTimestamp());
    // We indirectly test for the race condition where an inflight instant was first deleted and then re-created. Every
    // time this happens, the pending compaction instant file in the Hoodie meta path becomes an empty file (note: Hoodie
    // reads the compaction plan from the aux path, which is untouched). To test for a regression, we simply get the file
    // status and look at the file size.
    FileStatus fstatus = metaClient.getFs().getFileStatus(new Path(metaClient.getMetaPath(), pendingCompactionInstant.getFileName()));
    assertTrue(fstatus.getLen() > 0);
  }
}
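The requested-to-inflight move above is done by the test helper moveCompactionFromRequestedToInflight. A minimal sketch of that step, assuming HoodieActiveTimeline exposes transitionCompactionRequestedToInflight as in recent Hudi releases, could look like this (not the test's exact code):

// Hypothetical sketch of moveCompactionFromRequestedToInflight: transition the REQUESTED
// compaction instant to INFLIGHT directly on the active timeline.
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
HoodieInstant requested = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionInstantTime);
metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(requested);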
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestAsyncCompaction, method testInflightCompaction.
@Test
public void testInflightCompaction() throws Exception {
  // There is an inflight compaction. The subsequent compaction run must work correctly
  HoodieWriteConfig cfg = getConfig(true);
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
    String firstInstantTime = "001";
    String secondInstantTime = "004";
    String compactionInstantTime = "005";
    String thirdInstantTime = "006";
    String fourthInstantTime = "007";
    int numRecs = 2000;
    List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
    records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>());
    // Schedule the compaction and mark its instant as inflight
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
    scheduleCompaction(compactionInstantTime, client, cfg);
    moveCompactionFromRequestedToInflight(compactionInstantTime, cfg);
    // Complete the ingestions
    runNextDeltaCommits(client, readClient, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false, Arrays.asList(compactionInstantTime));
    // Execute the inflight compaction
    executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
  }
}
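scheduleCompaction and executeCompaction are helpers from the test base. Outside the tests, the same schedule-then-run flow can be driven through the write client itself; a hedged sketch, assuming the scheduleCompactionAtInstant and compact methods of recent SparkRDDWriteClient versions:

// Sketch only: schedule a compaction plan at a fixed instant time, then run it.
// Both calls are assumptions about the client API version in use.
boolean scheduled = client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
if (scheduled) {
  client.compact(compactionInstantTime); // executes the plan registered at that instant
}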
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestAsyncCompaction, method testCompactionOnReplacedFiles.
@Test
public void testCompactionOnReplacedFiles() throws Exception {
  // Schedule a compaction, replace those file groups, and ensure the compaction completes successfully.
  HoodieWriteConfig cfg = getConfig(true);
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
    String firstInstantTime = "001";
    String secondInstantTime = "004";
    String compactionInstantTime = "005";
    String replaceInstantTime = "006";
    String fourthInstantTime = "007";
    int numRecs = 2000;
    List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
    runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>());
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
    scheduleCompaction(compactionInstantTime, client, cfg);
    metaClient.reloadActiveTimeline();
    HoodieInstant pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
    assertEquals(compactionInstantTime, pendingCompactionInstant.getTimestamp(), "Pending Compaction instant has expected instant time");
    Set<HoodieFileGroupId> fileGroupsBeforeReplace = getAllFileGroups(hoodieTable, dataGen.getPartitionPaths());
    // Replace the file groups by using insertOverwrite
    JavaRDD<HoodieRecord> replaceRecords = jsc.parallelize(dataGen.generateInserts(replaceInstantTime, numRecs), 1);
    client.startCommitWithTime(replaceInstantTime, HoodieTimeline.REPLACE_COMMIT_ACTION);
    client.insertOverwrite(replaceRecords, replaceInstantTime);
    metaClient.reloadActiveTimeline();
    hoodieTable = getHoodieTable(metaClient, cfg);
    Set<HoodieFileGroupId> newFileGroups = getAllFileGroups(hoodieTable, dataGen.getPartitionPaths());
    // Make sure the earlier file groups are no longer visible
    assertEquals(0, newFileGroups.stream().filter(fg -> fileGroupsBeforeReplace.contains(fg)).count());
    // The compaction should still run even though its associated file groups have been replaced
    executeCompactionWithReplacedFiles(compactionInstantTime, client, hoodieTable, cfg, dataGen.getPartitionPaths(), fileGroupsBeforeReplace);
  }
}
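getAllFileGroups is another test helper. A plausible equivalent, built only from the file-system view calls used elsewhere in these tests, collects the HoodieFileGroupId of every latest file slice per partition:

// Illustrative helper (not the test's exact code): gather the file group ids visible
// in the latest file slices of each partition.
Set<HoodieFileGroupId> fileGroupIds = Arrays.stream(dataGen.getPartitionPaths())
    .flatMap(partition -> hoodieTable.getSliceView().getLatestFileSlices(partition))
    .map(FileSlice::getFileGroupId)
    .collect(Collectors.toSet());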
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestAsyncCompaction, method testScheduleCompactionAfterPendingIngestion.
@Test
public void testScheduleCompactionAfterPendingIngestion() throws Exception {
  // Failure case: the earliest inflight ingestion instant time must be later than the compaction instant time
  HoodieWriteConfig cfg = getConfig(false);
  SparkRDDWriteClient client = getHoodieWriteClient(cfg);
  HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
  String firstInstantTime = "001";
  String secondInstantTime = "004";
  String inflightInstantTime = "005";
  String compactionInstantTime = "006";
  int numRecs = 2000;
  List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
  records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>());
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
  createNextDeltaCommit(inflightInstantTime, records, client, metaClient, cfg, true);
  metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
  HoodieInstant inflightInstant = metaClient.getActiveTimeline().filterPendingExcludingCompaction().firstInstant().get();
  assertEquals(inflightInstantTime, inflightInstant.getTimestamp(), "inflight instant has expected instant time");
  assertThrows(IllegalArgumentException.class, () -> {
    // Schedule the compaction but do not run it
    scheduleCompaction(compactionInstantTime, client, cfg);
  }, "Earliest ingestion inflight instant time must be later than compaction time");
}
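The IllegalArgumentException comes from Hudi refusing to schedule a compaction at an instant that is not later than every pending ingestion instant. An illustrative version of that guard (an assumption about the validation, not Hudi's exact code), built on the same timeline filter used in the assertion above:

// Sketch of the scheduling guard: the new compaction instant must be strictly greater
// than the earliest pending (non-compaction) instant on the timeline.
Option<HoodieInstant> earliestPending =
    metaClient.getActiveTimeline().filterPendingExcludingCompaction().firstInstant();
earliestPending.ifPresent(instant -> {
  if (HoodieTimeline.compareTimestamps(compactionInstantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, instant.getTimestamp())) {
    throw new IllegalArgumentException("Earliest ingestion inflight instant time must be later than compaction time");
  }
});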
Use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.
From the class TestHoodieCompactor, method testWriteStatusContentsAfterCompaction.
@Test
public void testWriteStatusContentsAfterCompaction() throws Exception {
  // Insert 100 records
  HoodieWriteConfig config = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build()).build();
  try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    String newCommitTime = "100";
    writeClient.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
    JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
    writeClient.insert(recordsRDD, newCommitTime).collect();
    // Update all 100 records
    HoodieTable table = HoodieSparkTable.create(config, context);
    newCommitTime = "101";
    List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
    JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
    HoodieIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
    JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = tagLocation(index, updatedRecordsRDD, table);
    writeClient.startCommitWithTime(newCommitTime);
    writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
    metaClient.reloadActiveTimeline();
    // Verify that each data file has exactly one log file
    table = HoodieSparkTable.create(config, context);
    for (String partitionPath : dataGen.getPartitionPaths()) {
      List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
      for (FileSlice fileSlice : groupedLogFiles) {
        assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for every data file");
      }
    }
    // Do a compaction
    table = HoodieSparkTable.create(config, context);
    String compactionInstantTime = "102";
    table.scheduleCompaction(context, compactionInstantTime, Option.empty());
    table.getMetaClient().reloadActiveTimeline();
    HoodieData<WriteStatus> result = (HoodieData<WriteStatus>) table.compact(context, compactionInstantTime).getWriteStatuses();
    // Verify that all partition paths are present in the WriteStatus result
    for (String partitionPath : dataGen.getPartitionPaths()) {
      List<WriteStatus> writeStatuses = result.collectAsList();
      assertTrue(writeStatuses.stream().filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath)).count() > 0);
    }
  }
}
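The verification loop above calls collectAsList once per partition; a slightly cheaper sketch of the same check, using only the calls already shown in this test plus java.util.Map, collects the write statuses once and groups them by partition path:

// Equivalent verification (sketch): one collect, then a per-partition count lookup.
List<WriteStatus> statuses = result.collectAsList();
Map<String, Long> statusesByPartition = statuses.stream()
    .collect(Collectors.groupingBy(ws -> ws.getStat().getPartitionPath(), Collectors.counting()));
for (String partitionPath : dataGen.getPartitionPaths()) {
  assertTrue(statusesByPartition.getOrDefault(partitionPath, 0L) > 0);
}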