use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
the class TestHoodieTableFileSystemView method testPendingClusteringOperations.
@Test
public void testPendingClusteringOperations() throws IOException {
String partitionPath1 = "2020/06/27";
new File(basePath + "/" + partitionPath1).mkdirs();
// create 3 fileIds in partition1 - fileId1 and fileId2 are later scheduled for clustering.
String fileId1 = UUID.randomUUID().toString();
String fileId2 = UUID.randomUUID().toString();
String fileId3 = UUID.randomUUID().toString();
assertFalse(roView.getLatestBaseFiles(partitionPath1).anyMatch(dfile -> dfile.getFileId().equals(fileId1) || dfile.getFileId().equals(fileId2) || dfile.getFileId().equals(fileId3)), "No commit, should not find any data file");
// Only one commit
String commitTime1 = "1";
String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1);
String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2);
String fileName3 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId3);
new File(basePath + "/" + partitionPath1 + "/" + fileName1).createNewFile();
new File(basePath + "/" + partitionPath1 + "/" + fileName2).createNewFile();
new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile();
HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1);
saveAsComplete(commitTimeline, instant1, Option.empty());
refreshFsView();
assertEquals(1, roView.getLatestBaseFiles(partitionPath1).filter(dfile -> dfile.getFileId().equals(fileId1)).count());
assertEquals(1, roView.getLatestBaseFiles(partitionPath1).filter(dfile -> dfile.getFileId().equals(fileId2)).count());
assertEquals(1, roView.getLatestBaseFiles(partitionPath1).filter(dfile -> dfile.getFileId().equals(fileId3)).count());
List<FileSlice>[] fileSliceGroups = new List[] { Collections.singletonList(fsView.getLatestFileSlice(partitionPath1, fileId1).get()), Collections.singletonList(fsView.getLatestFileSlice(partitionPath1, fileId2).get()) };
// create pending clustering operation - fileId1, fileId2 are being clustered in different groups
HoodieClusteringPlan plan = ClusteringUtils.createClusteringPlan("strategy", new HashMap<>(), fileSliceGroups, Collections.emptyMap());
String clusterTime = "2";
HoodieInstant instant2 = new HoodieInstant(State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, clusterTime);
HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder().setClusteringPlan(plan).setOperationType(WriteOperationType.CLUSTER.name()).build();
metaClient.getActiveTimeline().saveToPendingReplaceCommit(instant2, TimelineMetadataUtils.serializeRequestedReplaceMetadata(requestedReplaceMetadata));
// make sure the pending clustering view reports fileId1 and fileId2, but not fileId3
refreshFsView();
Set<String> fileIds = fsView.getFileGroupsInPendingClustering().map(e -> e.getLeft().getFileId()).collect(Collectors.toSet());
assertTrue(fileIds.contains(fileId1));
assertTrue(fileIds.contains(fileId2));
assertFalse(fileIds.contains(fileId3));
}
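Note: the pending-clustering view exercised above is what readers typically use to avoid file groups that are being rewritten. As an illustration only (not part of the test class), a minimal sketch that reuses the fsView, roView and partitionPath1 variables from the test to list base files whose file groups are not under pending clustering:
// Collect file ids that belong to any pending clustering plan.
Set<String> pendingClusteringFileIds = fsView.getFileGroupsInPendingClustering()
    .map(pair -> pair.getLeft().getFileId())
    .collect(Collectors.toSet());
// Latest base files whose file groups are not being clustered.
List<HoodieBaseFile> clusterFreeBaseFiles = roView.getLatestBaseFiles(partitionPath1)
    .filter(baseFile -> !pendingClusteringFileIds.contains(baseFile.getFileId()))
    .collect(Collectors.toList());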
use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
the class TestHoodieTableFileSystemView method testViewForFileSlicesWithNoBaseFile.
protected void testViewForFileSlicesWithNoBaseFile(int expNumTotalFileSlices, int expNumTotalDataFiles, String partitionPath) throws Exception {
Paths.get(basePath, partitionPath).toFile().mkdirs();
String fileId = UUID.randomUUID().toString();
String instantTime1 = "1";
String deltaInstantTime1 = "2";
String deltaInstantTime2 = "3";
String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN);
String fileName2 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 1, TEST_WRITE_TOKEN);
Paths.get(basePath, partitionPath, fileName1).toFile().createNewFile();
Paths.get(basePath, partitionPath, fileName2).toFile().createNewFile();
HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1);
HoodieInstant deltaInstant2 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime1);
HoodieInstant deltaInstant3 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime2);
saveAsComplete(commitTimeline, instant1, Option.empty());
saveAsComplete(commitTimeline, deltaInstant2, Option.empty());
saveAsComplete(commitTimeline, deltaInstant3, Option.empty());
refreshFsView();
List<HoodieBaseFile> dataFiles = roView.getLatestBaseFiles().collect(Collectors.toList());
assertTrue(dataFiles.isEmpty(), "No data file expected");
List<FileSlice> fileSliceList = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
assertEquals(1, fileSliceList.size());
FileSlice fileSlice = fileSliceList.get(0);
assertEquals(fileId, fileSlice.getFileId(), "File-Id must be set correctly");
assertFalse(fileSlice.getBaseFile().isPresent(), "Data file for base instant must not be present");
assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant for file-group set correctly");
List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
assertEquals(2, logFiles.size(), "Correct number of log-files shows up in file-slice");
assertEquals(fileName2, logFiles.get(0).getFileName(), "Log File Order check");
assertEquals(fileName1, logFiles.get(1).getFileName(), "Log File Order check");
// Check Merged File Slices API
fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime2).collect(Collectors.toList());
assertEquals(1, fileSliceList.size());
fileSlice = fileSliceList.get(0);
assertEquals(fileId, fileSlice.getFileId(), "File-Id must be set correctly");
assertFalse(fileSlice.getBaseFile().isPresent(), "Data file for base instant must not be present");
assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant for file-group set correctly");
logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
assertEquals(2, logFiles.size(), "Correct number of log-files shows up in file-slice");
assertEquals(fileName2, logFiles.get(0).getFileName(), "Log File Order check");
assertEquals(fileName1, logFiles.get(1).getFileName(), "Log File Order check");
// Check UnCompacted File Slices API
fileSliceList = rtView.getLatestUnCompactedFileSlices(partitionPath).collect(Collectors.toList());
assertEquals(1, fileSliceList.size());
fileSlice = fileSliceList.get(0);
assertEquals(fileId, fileSlice.getFileId(), "File-Id must be set correctly");
assertFalse(fileSlice.getBaseFile().isPresent(), "Data file for base instant must not be present");
assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant for file-group set correctly");
logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
assertEquals(2, logFiles.size(), "Correct number of log-files shows up in file-slice");
assertEquals(fileName2, logFiles.get(0).getFileName(), "Log File Order check");
assertEquals(fileName1, logFiles.get(1).getFileName(), "Log File Order check");
assertEquals(expNumTotalFileSlices, rtView.getAllFileSlices(partitionPath).count(), "Total number of file-slices in view matches expected");
assertEquals(expNumTotalDataFiles, roView.getAllBaseFiles(partitionPath).count(), "Total number of data-files in view matches expected");
assertEquals(1, fsView.getAllFileGroups(partitionPath).count(), "Total number of file-groups in view matches expected");
}
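The saveAsComplete helper used throughout these tests is not shown on this page. A hedged sketch of what it plausibly does, assuming the standard HoodieActiveTimeline transitions (requested, then inflight, then completed):
private static void saveAsComplete(HoodieActiveTimeline timeline, HoodieInstant inflight, Option<byte[]> data) {
  // Create the requested instant, move it to inflight, then mark it complete with the given metadata.
  HoodieInstant requested = new HoodieInstant(State.REQUESTED, inflight.getAction(), inflight.getTimestamp());
  timeline.createNewInstant(requested);
  timeline.transitionRequestedToInflight(requested, Option.empty());
  timeline.saveAsComplete(inflight, data);
}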
use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
the class TestHoodieTableFileSystemView method testHoodieTableFileSystemViewWithPendingClustering.
/**
* create hoodie table like
* .
* ├── .hoodie
* │   ├── .aux
* │   │   └── .bootstrap
* │   │       ├── .fileids
* │   │       └── .partitions
* │   ├── .temp
* │   ├── 1.commit
* │   ├── 1.commit.requested
* │   ├── 1.inflight
* │   ├── 2.replacecommit
* │   ├── 2.replacecommit.inflight
* │   ├── 2.replacecommit.requested
* │   ├── 3.commit
* │   ├── 3.commit.requested
* │   ├── 3.inflight
* │   ├── archived
* │   └── hoodie.properties
* └── 2020
*     └── 06
*         └── 27
*             ├── 5fe477d2-0150-46d4-833c-1e9cc8da9948_1-0-1_3.parquet
*             ├── 7e3208c8-fdec-4254-9682-8fff1e51ee8d_1-0-1_2.parquet
*             ├── e04b0e2d-1467-46b2-8ea6-f4fe950965a5_1-0-1_1.parquet
*             └── f3936b66-b3db-4fc8-a6d0-b1a7559016e6_1-0-1_1.parquet
*
* First test fsView API with finished clustering:
* 1. getLatestBaseFilesBeforeOrOn
* 2. getBaseFileOn
* 3. getLatestBaseFilesInRange
* 4. getAllBaseFiles
* 5. getLatestBaseFiles
*
* Then remove 2.replacecommit, 1.commit, 1.commit.requested, 1.inflight to simulate
* pending clustering at the earliest position in the active timeline and test these APIs again.
*
* @throws IOException
*/
@Test
public void testHoodieTableFileSystemViewWithPendingClustering() throws IOException {
List<String> latestBaseFilesBeforeOrOn;
Option<HoodieBaseFile> baseFileOn;
List<String> latestBaseFilesInRange;
List<String> allBaseFiles;
List<String> latestBaseFiles;
List<String> latestBaseFilesPerPartition;
String partitionPath = "2020/06/27";
new File(basePath + "/" + partitionPath).mkdirs();
HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
// will create 5 fileIds in the partition.
// fileId1 and fileId2 will be replaced by fileId3.
// fileId4 and fileId5 will be committed after the clustering finishes.
String fileId1 = UUID.randomUUID().toString();
String fileId2 = UUID.randomUUID().toString();
String fileId3 = UUID.randomUUID().toString();
String fileId4 = UUID.randomUUID().toString();
String fileId5 = UUID.randomUUID().toString();
assertFalse(roView.getLatestBaseFiles(partitionPath).anyMatch(dfile -> dfile.getFileId().equals(fileId1) || dfile.getFileId().equals(fileId2) || dfile.getFileId().equals(fileId3) || dfile.getFileId().equals(fileId4) || dfile.getFileId().equals(fileId5)), "No commit, should not find any data file");
// first insert commit
String commitTime1 = "1";
String fileName1 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1);
String fileName2 = FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId2);
new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile();
new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile();
HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime1);
// build writeStats
HashMap<String, List<String>> partitionToFile1 = new HashMap<>();
ArrayList<String> files1 = new ArrayList<>();
files1.add(fileId1);
files1.add(fileId2);
partitionToFile1.put(partitionPath, files1);
List<HoodieWriteStat> writeStats1 = buildWriteStats(partitionToFile1, commitTime1);
HoodieCommitMetadata commitMetadata1 = CommitUtils.buildMetadata(writeStats1, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION);
saveAsComplete(commitTimeline, instant1, Option.of(commitMetadata1.toJsonString().getBytes(StandardCharsets.UTF_8)));
commitTimeline.reload();
// replace commit
String commitTime2 = "2";
String fileName3 = FSUtils.makeDataFileName(commitTime2, TEST_WRITE_TOKEN, fileId3);
new File(basePath + "/" + partitionPath + "/" + fileName3).createNewFile();
HoodieInstant instant2 = new HoodieInstant(true, HoodieTimeline.REPLACE_COMMIT_ACTION, commitTime2);
Map<String, List<String>> partitionToReplaceFileIds = new HashMap<>();
List<String> replacedFileIds = new ArrayList<>();
replacedFileIds.add(fileId1);
replacedFileIds.add(fileId2);
partitionToReplaceFileIds.put(partitionPath, replacedFileIds);
HashMap<String, List<String>> partitionToFile2 = new HashMap<>();
ArrayList<String> files2 = new ArrayList<>();
files2.add(fileId3);
partitionToFile2.put(partitionPath, files2);
List<HoodieWriteStat> writeStats2 = buildWriteStats(partitionToFile2, commitTime2);
HoodieCommitMetadata commitMetadata2 = CommitUtils.buildMetadata(writeStats2, partitionToReplaceFileIds, Option.empty(), WriteOperationType.INSERT_OVERWRITE, "", HoodieTimeline.REPLACE_COMMIT_ACTION);
saveAsComplete(commitTimeline, instant2, Option.of(commitMetadata2.toJsonString().getBytes(StandardCharsets.UTF_8)));
// another insert commit
String commitTime3 = "3";
String fileName4 = FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId4);
new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile();
HoodieInstant instant3 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, commitTime3);
// build writeStats
HashMap<String, List<String>> partitionToFile3 = new HashMap<>();
ArrayList<String> files3 = new ArrayList<>();
files3.add(fileId4);
partitionToFile3.put(partitionPath, files3);
List<HoodieWriteStat> writeStats3 = buildWriteStats(partitionToFile3, commitTime3);
HoodieCommitMetadata commitMetadata3 = CommitUtils.buildMetadata(writeStats3, new HashMap<>(), Option.empty(), WriteOperationType.INSERT, "", HoodieTimeline.COMMIT_ACTION);
saveAsComplete(commitTimeline, instant3, Option.of(commitMetadata3.toJsonString().getBytes(StandardCharsets.UTF_8)));
metaClient.reloadActiveTimeline();
refreshFsView();
ArrayList<String> commits = new ArrayList<>();
commits.add(commitTime1);
commits.add(commitTime2);
commits.add(commitTime3);
// do check
latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(2, latestBaseFilesBeforeOrOn.size());
assertTrue(latestBaseFilesBeforeOrOn.contains(fileId3));
assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4));
// fileId3 is visible because the clustering replace commit is complete.
baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3);
assertTrue(baseFileOn.isPresent());
assertEquals(baseFileOn.get().getFileId(), fileId3);
latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(2, latestBaseFilesInRange.size());
assertTrue(latestBaseFilesInRange.contains(fileId3));
assertTrue(latestBaseFilesInRange.contains(fileId4));
allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(2, allBaseFiles.size());
assertTrue(allBaseFiles.contains(fileId3));
assertTrue(allBaseFiles.contains(fileId4));
// fileId3 is visible because the clustering replace commit is complete.
latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(2, latestBaseFiles.size());
assertTrue(latestBaseFiles.contains(fileId3));
assertTrue(latestBaseFiles.contains(fileId4));
// fileId3 is visible because the clustering replace commit is complete.
latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(2, latestBaseFilesPerPartition.size());
assertTrue(latestBaseFilesPerPartition.contains(fileId3));
assertTrue(latestBaseFilesPerPartition.contains(fileId4));
HoodieWrapperFileSystem fs = metaClient.getFs();
fs.delete(new Path(basePath + "/.hoodie", "1.commit"), false);
fs.delete(new Path(basePath + "/.hoodie", "1.inflight"), false);
fs.delete(new Path(basePath + "/.hoodie", "1.commit.requested"), false);
fs.delete(new Path(basePath + "/.hoodie", "2.replacecommit"), false);
metaClient.reloadActiveTimeline();
refreshFsView();
// repeat the checks after deleting the commit files
latestBaseFilesBeforeOrOn = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, commitTime3).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(3, latestBaseFilesBeforeOrOn.size());
assertTrue(latestBaseFilesBeforeOrOn.contains(fileId1));
assertTrue(latestBaseFilesBeforeOrOn.contains(fileId2));
assertTrue(latestBaseFilesBeforeOrOn.contains(fileId4));
// fileId3 is not visible because the clustering replace commit is no longer complete.
baseFileOn = fsView.getBaseFileOn(partitionPath, commitTime2, fileId3);
assertFalse(baseFileOn.isPresent());
latestBaseFilesInRange = fsView.getLatestBaseFilesInRange(commits).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(3, latestBaseFilesInRange.size());
assertTrue(latestBaseFilesInRange.contains(fileId1));
assertTrue(latestBaseFilesInRange.contains(fileId2));
assertTrue(latestBaseFilesInRange.contains(fileId4));
allBaseFiles = fsView.getAllBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(3, allBaseFiles.size());
assertTrue(allBaseFiles.contains(fileId1));
assertTrue(allBaseFiles.contains(fileId2));
assertTrue(allBaseFiles.contains(fileId4));
// fileId3 is not visible because the clustering replace commit is no longer complete.
latestBaseFiles = fsView.getLatestBaseFiles().map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(3, latestBaseFiles.size());
assertTrue(latestBaseFiles.contains(fileId1));
assertTrue(latestBaseFiles.contains(fileId2));
assertTrue(latestBaseFiles.contains(fileId4));
// fileId3 is not visible because the clustering replace commit is no longer complete.
latestBaseFilesPerPartition = fsView.getLatestBaseFiles(partitionPath).map(HoodieBaseFile::getFileId).collect(Collectors.toList());
assertEquals(3, latestBaseFilesPerPartition.size());
assertTrue(latestBaseFilesPerPartition.contains(fileId1));
assertTrue(latestBaseFilesPerPartition.contains(fileId2));
assertTrue(latestBaseFilesPerPartition.contains(fileId4));
}
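The buildWriteStats helper referenced above is not part of this listing. A hedged sketch, assuming one HoodieWriteStat per fileId with the path derived the same way the test names its data files:
private List<HoodieWriteStat> buildWriteStats(Map<String, List<String>> partitionToFileIds, String commitTime) {
  List<HoodieWriteStat> writeStats = new ArrayList<>();
  for (Map.Entry<String, List<String>> entry : partitionToFileIds.entrySet()) {
    for (String fileId : entry.getValue()) {
      // One write stat per new data file, keyed by partition path and file id.
      HoodieWriteStat writeStat = new HoodieWriteStat();
      writeStat.setFileId(fileId);
      writeStat.setPartitionPath(entry.getKey());
      writeStat.setPath(entry.getKey() + "/" + FSUtils.makeDataFileName(commitTime, TEST_WRITE_TOKEN, fileId));
      writeStats.add(writeStat);
    }
  }
  return writeStats;
}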
use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
the class TestIncrementalFSViewSync method addReplaceInstant.
private List<String> addReplaceInstant(HoodieTableMetaClient metaClient, String instant, List<Pair<String, HoodieWriteStat>> writeStats, Map<String, List<String>> partitionToReplaceFileIds) throws IOException {
// create the requested replace instant
HoodieInstant newRequestedInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.REPLACE_COMMIT_ACTION, instant);
HoodieRequestedReplaceMetadata requestedReplaceMetadata = HoodieRequestedReplaceMetadata.newBuilder().setOperationType(WriteOperationType.UNKNOWN.name()).build();
metaClient.getActiveTimeline().saveToPendingReplaceCommit(newRequestedInstant, TimelineMetadataUtils.serializeRequestedReplaceMetadata(requestedReplaceMetadata));
metaClient.reloadActiveTimeline();
// transition to inflight
HoodieInstant inflightInstant = metaClient.getActiveTimeline().transitionReplaceRequestedToInflight(newRequestedInstant, Option.empty());
// transition to replacecommit
HoodieReplaceCommitMetadata replaceCommitMetadata = new HoodieReplaceCommitMetadata();
writeStats.forEach(e -> replaceCommitMetadata.addWriteStat(e.getKey(), e.getValue()));
replaceCommitMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
metaClient.getActiveTimeline().saveAsComplete(inflightInstant, Option.of(replaceCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
return writeStats.stream().map(e -> e.getValue().getPath()).collect(Collectors.toList());
}
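A hypothetical call to addReplaceInstant, where replacedFileId1, replacedFileId2 and writeStat20 are placeholders for real file ids and a write stat:
// Replace two file groups of partition "2020/06/27" at instant "20" and collect the new file paths.
Map<String, List<String>> replaced = new HashMap<>();
replaced.put("2020/06/27", Arrays.asList(replacedFileId1, replacedFileId2));
List<String> newFilePaths = addReplaceInstant(metaClient, "20",
    Collections.singletonList(Pair.of("2020/06/27", writeStat20)), replaced);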
use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.
the class TestIncrementalFSViewSync method testAsyncCompaction.
@Test
public void testAsyncCompaction() throws IOException {
SyncableFileSystemView view = getFileSystemView(metaClient);
view.sync();
// Run 3 ingestions on the MOR table (3 delta commits)
Map<String, List<String>> instantsToFiles = testMultipleWriteSteps(view, Arrays.asList("11", "12", "13"), true, "11");
// Schedule Compaction
scheduleCompaction(view, "14");
// Restore pending compaction
unscheduleCompaction(view, "14", "13", "11");
// Add one more delta instant
instantsToFiles.putAll(testMultipleWriteSteps(view, Collections.singletonList("15"), true, "11"));
// Schedule Compaction again
scheduleCompaction(view, "16");
// Run Compaction - This will be the second file-slice
testMultipleWriteSteps(view, Collections.singletonList("16"), false, "16", 2);
// Run 2 more ingestions
instantsToFiles.putAll(testMultipleWriteSteps(view, Arrays.asList("17", "18"), true, "16", 2));
// Schedule Compaction again
scheduleCompaction(view, "19");
// Run one more ingestion after the pending compaction. This will be the 3rd slice
instantsToFiles.putAll(testMultipleWriteSteps(view, Collections.singletonList("20"), true, "19", 3));
// Clean first slice
testCleans(view, Collections.singletonList("21"), new HashMap<String, List<String>>() {
{
put("11", Arrays.asList("12", "13", "15"));
}
}, instantsToFiles, Collections.singletonList("11"), 0, 0);
// Add one more ingestion instant. This should be 2nd slice now
instantsToFiles.putAll(testMultipleWriteSteps(view, Collections.singletonList("22"), true, "19", 2));
// Restore last ingestion
testRestore(view, Collections.singletonList("23"), new HashMap<>(), Collections.singletonList(getHoodieCommitInstant("22", true)), "24", false);
// Run one more ingestion. This is still the 2nd slice
instantsToFiles.putAll(testMultipleWriteSteps(view, Collections.singletonList("24"), true, "19", 2));
// Finish Compaction
instantsToFiles.putAll(testMultipleWriteSteps(view, Collections.singletonList("19"), false, "19", 2, Collections.singletonList(new HoodieInstant(State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "24"))));
}
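The scheduleCompaction helper used above is not part of this listing. A minimal sketch, assuming a single hard-coded partition and the CompactionUtils/TimelineMetadataUtils APIs used elsewhere in Hudi tests:
private void scheduleCompaction(SyncableFileSystemView view, String instantTime) throws IOException {
  // Build a compaction plan over the latest file slices of one (assumed) partition.
  List<Pair<String, FileSlice>> fileSlices = view.getLatestFileSlices("2020/06/27")
      .map(slice -> Pair.of("2020/06/27", slice))
      .collect(Collectors.toList());
  HoodieCompactionPlan plan = CompactionUtils.buildFromFileSlices(fileSlices, Option.empty(), Option.empty());
  // Write the plan as a requested compaction instant and let the view pick it up on sync.
  HoodieInstant requested = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime);
  metaClient.getActiveTimeline().saveToCompactionRequested(requested, TimelineMetadataUtils.serializeCompactionPlan(plan));
  view.sync();
}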