Use of org.apache.hudi.common.util.Option in project hudi by apache.
Class TestCleaner, method testPendingCompactions.
/**
* Common test method for validating pending compactions.
*
 * @param config Hoodie write config
 * @param expNumFilesDeleted Expected total number of files deleted by the cleaner
 * @param expNumFilesUnderCompactionDeleted Expected number of deleted files that belong to file groups with a pending compaction
 * @param retryFailure Whether to simulate a failed clean attempt and retry it
 */
private void testPendingCompactions(HoodieWriteConfig config, int expNumFilesDeleted, int expNumFilesUnderCompactionDeleted, boolean retryFailure) throws Exception {
HoodieTableMetaClient metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
final String partition = "2016/03/15";
Map<String, String> expFileIdToPendingCompaction = new HashMap<String, String>() {
{
put("fileId2", "004");
put("fileId3", "006");
put("fileId4", "008");
put("fileId5", "010");
}
};
Map<String, String> fileIdToLatestInstantBeforeCompaction = new HashMap<String, String>() {
{
put("fileId1", "000");
put("fileId2", "000");
put("fileId3", "001");
put("fileId4", "003");
put("fileId5", "005");
put("fileId6", "009");
put("fileId7", "011");
}
};
// Generate 7 file groups. The first one has only a single slice and no pending compaction. File groups 2-5 have
// multiple versions and are under pending compaction. File groups 6-7 have multiple file slices but are not
// under compaction.
HoodieTestTable.of(metaClient)
    .addCommit("000")
    .withBaseFilesInPartition(partition, "fileId1", "fileId2", "fileId3", "fileId4", "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId1", 1, 2)
    .withLogFile(partition, "fileId2", 1, 2)
    .withLogFile(partition, "fileId3", 1, 2)
    .withLogFile(partition, "fileId4", 1, 2)
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addCommit("001")
    .withBaseFilesInPartition(partition, "fileId3", "fileId4", "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId3", 1, 2)
    .withLogFile(partition, "fileId4", 1, 2)
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addCommit("003")
    .withBaseFilesInPartition(partition, "fileId4", "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId4", 1, 2)
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("004", new FileSlice(partition, "000", "fileId2"))
    .withLogFile(partition, "fileId2", 1, 2)
    .addCommit("005")
    .withBaseFilesInPartition(partition, "fileId5", "fileId6", "fileId7")
    .withLogFile(partition, "fileId5", 1, 2)
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("006", new FileSlice(partition, "001", "fileId3"))
    .withLogFile(partition, "fileId3", 1, 2)
    .addCommit("007")
    .withBaseFilesInPartition(partition, "fileId6", "fileId7")
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("008", new FileSlice(partition, "003", "fileId4"))
    .withLogFile(partition, "fileId4", 1, 2)
    .addCommit("009")
    .withBaseFilesInPartition(partition, "fileId6", "fileId7")
    .withLogFile(partition, "fileId6", 1, 2)
    .withLogFile(partition, "fileId7", 1, 2)
    .addRequestedCompaction("010", new FileSlice(partition, "005", "fileId5"))
    .withLogFile(partition, "fileId5", 1, 2)
    .addCommit("011")
    .withBaseFilesInPartition(partition, "fileId7")
    .withLogFile(partition, "fileId7", 1, 2)
    .addCommit("013");
// Clean now
metaClient = HoodieTableMetaClient.reload(metaClient);
List<HoodieCleanStat> hoodieCleanStats = runCleaner(config, retryFailure);
// Test for safety
final HoodieTableMetaClient newMetaClient = HoodieTableMetaClient.reload(metaClient);
final HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
expFileIdToPendingCompaction.forEach((fileId, value) -> {
String baseInstantForCompaction = fileIdToLatestInstantBeforeCompaction.get(fileId);
Option<FileSlice> fileSliceForCompaction = Option.fromJavaOptional(hoodieTable.getSliceView()
    .getLatestFileSlicesBeforeOrOn(partition, baseInstantForCompaction, true)
    .filter(fs -> fs.getFileId().equals(fileId))
    .findFirst());
assertTrue(fileSliceForCompaction.isPresent(), "Base Instant for Compaction must be preserved");
assertTrue(fileSliceForCompaction.get().getBaseFile().isPresent(), "FileSlice has data-file");
assertEquals(2, fileSliceForCompaction.get().getLogFiles().count(), "FileSlice has log-files");
});
// Test for progress (Did we clean some files ?)
long numFilesUnderCompactionDeleted = hoodieCleanStats.stream()
    .flatMap(cleanStat -> convertPathToFileIdWithCommitTime(newMetaClient, cleanStat.getDeletePathPatterns())
        .map(fileIdWithCommitTime -> {
          if (expFileIdToPendingCompaction.containsKey(fileIdWithCommitTime.getKey())) {
            assertTrue(HoodieTimeline.compareTimestamps(fileIdToLatestInstantBeforeCompaction.get(fileIdWithCommitTime.getKey()),
                HoodieTimeline.GREATER_THAN, fileIdWithCommitTime.getValue()), "Deleted instant time must be less than pending compaction");
            return true;
          }
          return false;
        }))
    .filter(x -> x)
    .count();
long numDeleted = hoodieCleanStats.stream().mapToLong(cleanStat -> cleanStat.getDeletePathPatterns().size()).sum();
// Tighter check for regression
assertEquals(expNumFilesDeleted, numDeleted, "Correct number of files deleted");
assertEquals(expNumFilesUnderCompactionDeleted, numFilesUnderCompactionDeleted, "Correct number of files under compaction deleted");
}
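Beyond the clean-specific assertions, the test above only exercises two entry points of org.apache.hudi.common.util.Option: the bridge from java.util.Optional and the presence checks. A minimal sketch of those calls, with illustrative values not taken from the test (assumes the usual JUnit 5 and java.util.stream imports):

// Option.fromJavaOptional wraps the result of a stream lookup, as in the verification loop above.
Option<String> present = Option.fromJavaOptional(
    Stream.of("fileId2", "fileId3").filter(id -> id.equals("fileId2")).findFirst());
assertTrue(present.isPresent());
assertEquals("fileId2", present.get());
// Option.ofNullable yields an empty Option for null, mirroring java.util.Optional.ofNullable.
Option<String> absent = Option.ofNullable(null);
assertFalse(absent.isPresent());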
Use of org.apache.hudi.common.util.Option in project hudi by apache.
Class TestHoodieMergeOnReadTable, method testUpsertPartitioner.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testUpsertPartitioner(boolean populateMetaFields) throws Exception {
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig cfg = cfgBuilder.build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
/**
* Write 1 (only inserts, written as base file)
*/
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
Map<String, Long> fileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
dataFilesToRead = roView.getLatestBaseFiles();
List<HoodieBaseFile> dataFilesList = dataFilesToRead.collect(Collectors.toList());
assertTrue(dataFilesList.size() > 0, "Should list the base files we wrote in the delta commit");
/**
* Write 2 (only updates + inserts, written to .log file + correction of existing base file size)
*/
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> newRecords = dataGen.generateUpdates(newCommitTime, records);
newRecords.addAll(dataGen.generateInserts(newCommitTime, 20));
statuses = client.upsert(jsc().parallelize(newRecords), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
metaClient = HoodieTableMetaClient.reload(metaClient);
deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("002", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 002");
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
allFiles = listAllBaseFilesInPath(hoodieTable);
roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
dataFilesToRead = roView.getLatestBaseFiles();
List<HoodieBaseFile> newDataFilesList = dataFilesToRead.collect(Collectors.toList());
Map<String, Long> fileIdToNewSize = newDataFilesList.stream().collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> fileIdToSize.get(entry.getKey()) < entry.getValue()));
List<String> inputPaths = roView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, false);
// 20 records were inserted in commit 001 and 20 more in commit 002 (besides updates to the first batch),
// so 40 unique records must be read back
assertEquals(40, recordsRead.size(), "Must contain 40 records");
}
}
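One idiom in the test above is worth calling out: timeline lookups such as firstInstant() and lastInstant() return Option<HoodieInstant>, so presence itself asserts that a commit of that type exists. A condensed restatement of the checks after the second write (same metaClient as in the test):

Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
assertTrue(deltaCommit.isPresent(), "Latest delta commit should be present");
assertEquals("002", deltaCommit.get().getTimestamp(), "Latest delta commit should be 002");
// No compaction has run, so the commit timeline stays empty on this MOR table.
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent(), "No compaction commit should exist yet");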
Use of org.apache.hudi.common.util.Option in project hudi by apache.
Class HoodieClientTestHarness, method buildProfile.
public static Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(JavaRDD<HoodieRecord> inputRecordsRDD) {
HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
WorkloadStat globalStat = new WorkloadStat();
// Group the records by their (partitionPath, currentLocation) combination and count the number of
// records for each key
Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecordsRDD
    .mapToPair(record -> new Tuple2<>(new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
    .countByKey();
// Count the inserts and updates per partition and accumulate the counts into the workload stats
for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
String partitionPath = e.getKey()._1();
Long count = e.getValue();
Option<HoodieRecordLocation> locOption = e.getKey()._2();
if (!partitionPathStatMap.containsKey(partitionPath)) {
partitionPathStatMap.put(partitionPath, new WorkloadStat());
}
if (locOption.isPresent()) {
// update
partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
globalStat.addUpdates(locOption.get(), count);
} else {
// insert
partitionPathStatMap.get(partitionPath).addInserts(count);
globalStat.addInserts(count);
}
}
return Pair.of(partitionPathStatMap, globalStat);
}
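A hedged usage sketch of buildProfile: a record whose currentLocation is unset maps to an empty Option and is counted as an insert, while a tagged record maps to a present Option and is counted as an update. The manual tagging and the WorkloadStat getters (getNumInserts/getNumUpdates) are assumptions made for illustration; in the tests on this page the records come straight from HoodieTestDataGenerator, tagging is done by the index, and jsc() is the JavaSparkContext accessor used by the other tests above:

HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
List<HoodieRecord> records = dataGen.generateInserts("001", 10);
// Simulate index tagging on a single record so it is classified as an update.
records.get(0).unseal();
records.get(0).setCurrentLocation(new HoodieRecordLocation("000", "file-1"));
records.get(0).seal();
Pair<HashMap<String, WorkloadStat>, WorkloadStat> profile = buildProfile(jsc().parallelize(records, 1));
WorkloadStat globalStat = profile.getRight();
assertEquals(9, globalStat.getNumInserts());
assertEquals(1, globalStat.getNumUpdates());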
Use of org.apache.hudi.common.util.Option in project hudi by apache.
Class TestHoodieSparkMergeOnReadTableInsertUpdateDelete, method testSimpleInsertUpdateAndDelete.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testSimpleInsertUpdateAndDelete(boolean populateMetaFields) throws Exception {
Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen();
properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
HoodieWriteConfig cfg = cfgBuilder.build();
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
/*
* Write 1 (only inserts, written as base file)
*/
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
assertFalse(dataFilesToRead.findAny().isPresent());
tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
dataFilesToRead = tableView.getLatestBaseFiles();
assertTrue(dataFilesToRead.findAny().isPresent(), "should list the base files we wrote in the delta commit");
/*
* Write 2 (only updates, written to .log file)
*/
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
records = dataGen.generateUpdates(newCommitTime, records);
writeRecords = jsc().parallelize(records, 1);
statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
/*
 * Write 3 (only deletes, written to .log file)
 */
newCommitTime = "004";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records);
statuses = client.upsert(jsc().parallelize(fewRecordsForDelete, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
metaClient = HoodieTableMetaClient.reload(metaClient);
deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("004", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 004");
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
allFiles = listAllBaseFilesInPath(hoodieTable);
tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
dataFilesToRead = tableView.getLatestBaseFiles();
assertTrue(dataFilesToRead.findAny().isPresent());
List<String> inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, populateMetaFields);
// Wrote 20 records and then deleted all 20, so 0 records should remain
assertEquals(0, recordsRead.size(), "Must contain 0 records");
}
}
Use of org.apache.hudi.common.util.Option in project hudi by apache.
Class TestHoodieSparkMergeOnReadTableRollback, method testCOWToMORConvertedTableRollback.
@ParameterizedTest
@ValueSource(booleans = { true, false })
void testCOWToMORConvertedTableRollback(boolean rollbackUsingMarkers) throws Exception {
// Set TableType to COW
HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.COPY_ON_WRITE);
HoodieWriteConfig cfg = getConfig(false, rollbackUsingMarkers);
try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
/*
* Write 1 (only inserts)
*/
String newCommitTime = "001";
client.startCommitWithTime(newCommitTime);
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
// verify there are no errors
assertNoWriteErrors(statuses);
client.commit(newCommitTime, jsc().parallelize(statuses));
metaClient = HoodieTableMetaClient.reload(metaClient);
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertTrue(commit.isPresent());
assertEquals("001", commit.get().getTimestamp(), "commit should be 001");
/*
* Write 2 (updates)
*/
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
records = dataGen.generateUpdates(newCommitTime, records);
statuses = client.upsert(jsc().parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
// Set TableType to MOR
metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ);
// rollback a COW commit when TableType is MOR
client.rollback(newCommitTime);
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
final String absentCommit = newCommitTime;
assertAll(tableView.getLatestBaseFiles().map(file -> () -> assertNotEquals(absentCommit, file.getCommitTime())));
}
}
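The final assertion above uses a JUnit 5 idiom: each latest base file is mapped to an Executable and assertAll runs them all, so every file still carrying the rolled-back commit time is reported rather than only the first failure. A standalone sketch of that idiom with illustrative commit times (the rolled-back instant is "002", as in the test):

String absentCommit = "002";
// e.g. commit times of the base files that survive the rollback
Stream<String> survivingCommitTimes = Stream.of("001", "001", "001");
assertAll(survivingCommitTimes.map(commitTime -> () -> assertNotEquals(absentCommit, commitTime)));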