Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache:
class TestHoodieTableFileSystemView, method testStreamLatestVersionInPartition.
/**
 * Validates the realtime (rtView) and read-optimized (roView) views over a partition
 * containing four file groups written across four commits: slice counts per file group,
 * the latest base files as of commitTime4, the latest log files, and both again after
 * capping the max commit time at commitTime3.
 *
 * <p>NOTE(review): commitTime2 is accepted for signature compatibility with callers but
 * is never asserted against in this method.
 */
private void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly, String fullPartitionPath, String commitTime1, String commitTime2, String commitTime3, String commitTime4, String fileId1, String fileId2, String fileId3, String fileId4) throws IOException {
  // Now we list the entire partition
  FileStatus[] statuses = metaClient.getFs().listStatus(new Path(fullPartitionPath));
  assertEquals(11, statuses.length);

  refreshFsView();
  // Check files as of latest commit.
  List<FileSlice> allSlices = rtView.getAllFileSlices("2016/05/01").collect(Collectors.toList());
  assertEquals(isLatestFileSliceOnly ? 4 : 8, allSlices.size());
  // Count slices per file group: fileId1..3 have history, fileId4 has exactly one slice.
  Map<String, Long> fileSliceMap = allSlices.stream().collect(Collectors.groupingBy(FileSlice::getFileId, Collectors.counting()));
  assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId1).longValue());
  assertEquals(isLatestFileSliceOnly ? 1 : 3, fileSliceMap.get(fileId2).longValue());
  assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId3).longValue());
  assertEquals(1, fileSliceMap.get(fileId4).longValue());

  // Latest base files as of commitTime4: fileId4 has no base file (log-only), so 3 files.
  List<HoodieBaseFile> dataFileList = roView.getLatestBaseFilesBeforeOrOn("2016/05/01", commitTime4).collect(Collectors.toList());
  assertEquals(3, dataFileList.size());
  Set<String> filenames = new HashSet<>();
  for (HoodieBaseFile status : dataFileList) {
    filenames.add(status.getFileName());
  }
  assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)));
  assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
  assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)));

  filenames = new HashSet<>();
  List<HoodieLogFile> logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4, true).map(FileSlice::getLogFiles).flatMap(logFileList -> logFileList).collect(Collectors.toList());
  // Fixed argument order: JUnit assertEquals takes (expected, actual).
  assertEquals(4, logFilesList.size());
  for (HoodieLogFile logFile : logFilesList) {
    filenames.add(logFile.getFileName());
  }
  assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)));
  assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1, TEST_WRITE_TOKEN)));
  assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)));
  assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)));

  // Reset the max commit time
  List<HoodieBaseFile> dataFiles = roView.getLatestBaseFilesBeforeOrOn("2016/05/01", commitTime3).collect(Collectors.toList());
  filenames = new HashSet<>();
  for (HoodieBaseFile status : dataFiles) {
    filenames.add(status.getFileName());
  }
  if (!isLatestFileSliceOnly) {
    assertEquals(3, dataFiles.size());
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)));
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)));
  } else {
    // In latest-slice-only mode, only fileId2's base file exists at or before commitTime3.
    assertEquals(1, dataFiles.size());
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
  }

  logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3, true).map(FileSlice::getLogFiles).flatMap(logFileList -> logFileList).collect(Collectors.toList());
  // Fixed argument order: JUnit assertEquals takes (expected, actual).
  assertEquals(1, logFilesList.size());
  assertEquals(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN), logFilesList.get(0).getFileName());
}
Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache:
class TestIncrementalFSViewSync, method areViewsConsistent.
/**
 * Check for equality of views: timelines, file groups, every raw file slice
 * (base file and log files), and pending compaction operations must match.
 *
 * @param view1 View1
 * @param view2 View2
 * @param expectedTotalFileSlices expected total number of raw file slices summed over all file groups
 */
private void areViewsConsistent(SyncableFileSystemView view1, SyncableFileSystemView view2, long expectedTotalFileSlices) {
// Timeline check: both views must have synced up to the same last instant
assertEquals(view1.getLastInstant(), view2.getLastInstant());
// View Checks: gather all file groups from every partition, keyed by file-group id
Map<HoodieFileGroupId, HoodieFileGroup> fileGroupsMap1 = partitions.stream().flatMap(view1::getAllFileGroups).collect(Collectors.toMap(HoodieFileGroup::getFileGroupId, fg -> fg));
Map<HoodieFileGroupId, HoodieFileGroup> fileGroupsMap2 = partitions.stream().flatMap(view2::getAllFileGroups).collect(Collectors.toMap(HoodieFileGroup::getFileGroupId, fg -> fg));
assertEquals(fileGroupsMap1.keySet(), fileGroupsMap2.keySet());
// Compare group-by-group; the pipeline asserts equality as a side effect and
// sums the per-group slice counts into gotSlicesCount
long gotSlicesCount = fileGroupsMap1.keySet().stream().map(k -> Pair.of(fileGroupsMap1.get(k), fileGroupsMap2.get(k))).mapToLong(e -> {
HoodieFileGroup fg1 = e.getKey();
HoodieFileGroup fg2 = e.getValue();
assertEquals(fg1.getFileGroupId(), fg2.getFileGroupId());
List<FileSlice> slices1 = fg1.getAllRawFileSlices().collect(Collectors.toList());
List<FileSlice> slices2 = fg2.getAllRawFileSlices().collect(Collectors.toList());
assertEquals(slices1.size(), slices2.size());
// Pairwise comparison by index — relies on both views emitting slices in the same order
IntStream.range(0, slices1.size()).mapToObj(idx -> Pair.of(slices1.get(idx), slices2.get(idx))).forEach(e2 -> {
FileSlice slice1 = e2.getKey();
FileSlice slice2 = e2.getValue();
assertEquals(slice1.getBaseInstantTime(), slice2.getBaseInstantTime());
assertEquals(slice1.getFileId(), slice2.getFileId());
assertEquals(slice1.getBaseFile().isPresent(), slice2.getBaseFile().isPresent());
if (slice1.getBaseFile().isPresent()) {
HoodieBaseFile df1 = slice1.getBaseFile().get();
HoodieBaseFile df2 = slice2.getBaseFile().get();
assertEquals(df1.getCommitTime(), df2.getCommitTime());
assertEquals(df1.getFileId(), df2.getFileId());
assertEquals(df1.getFileName(), df2.getFileName());
// Strip scheme/authority so path comparison ignores file-system URI differences
assertEquals(Path.getPathWithoutSchemeAndAuthority(new Path(df1.getPath())), Path.getPathWithoutSchemeAndAuthority(new Path(df2.getPath())));
}
// Log files compared as ordered lists of scheme-less paths
List<Path> logPaths1 = slice1.getLogFiles().map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList());
List<Path> logPaths2 = slice2.getLogFiles().map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList());
assertEquals(logPaths1, logPaths2);
});
return slices1.size();
}).sum();
assertEquals(expectedTotalFileSlices, gotSlicesCount);
// Pending Compaction Operations Check: compared as unordered sets
Set<Pair<String, CompactionOperation>> ops1 = view1.getPendingCompactionOperations().collect(Collectors.toSet());
Set<Pair<String, CompactionOperation>> ops2 = view2.getPendingCompactionOperations().collect(Collectors.toSet());
assertEquals(ops1, ops2);
}
Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache:
class HoodieBackedTableMetadata, method getBaseFileReader.
/**
 * Opens a reader for the slice's base file, if one exists.
 *
 * @param slice file slice whose base file should be opened
 * @param timer a started timer; it is always stopped by this method
 * @return a pair of the opened reader (null when the slice has no base file)
 *         and the time in ms spent opening it (0 when there is no base file)
 * @throws IOException if the base file cannot be opened
 */
private Pair<HoodieFileReader, Long> getBaseFileReader(FileSlice slice, HoodieTimer timer) throws IOException {
  Option<HoodieBaseFile> baseFile = slice.getBaseFile();
  if (!baseFile.isPresent()) {
    // No base file in this slice: stop the timer (reading discarded) and report zero open time.
    timer.endTimer();
    return Pair.of(null, 0L);
  }
  String baseFilePath = baseFile.get().getPath();
  HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(baseFilePath));
  Long openTimeMs = timer.endTimer();
  LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", baseFilePath, baseFile.get().getCommitTime(), openTimeMs));
  return Pair.of(reader, openTimeMs);
}
Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache:
class TestPriorityBasedFileSystemView, method testGetLatestBaseFile.
// Verifies getLatestBaseFile delegation: primary first, failover to secondary on
// error, and propagation of the secondary's failure.
@Test
public void testGetLatestBaseFile() {
Option<HoodieBaseFile> actual;
Option<HoodieBaseFile> expected = Option.of(new HoodieBaseFile("test.file"));
String partitionPath = "/table2";
String fileID = "file.123";
// Case 1: primary succeeds — its result is returned directly.
when(primary.getLatestBaseFile(partitionPath, fileID)).thenReturn(expected);
actual = fsView.getLatestBaseFile(partitionPath, fileID);
assertEquals(expected, actual);
resetMocks();
// Case 2: primary throws — the call falls back to the secondary view.
when(primary.getLatestBaseFile(partitionPath, fileID)).thenThrow(new RuntimeException());
when(secondary.getLatestBaseFile(partitionPath, fileID)).thenReturn(expected);
actual = fsView.getLatestBaseFile(partitionPath, fileID);
assertEquals(expected, actual);
resetMocks();
// Case 3: only the secondary is stubbed — NOTE(review): this appears to assume
// fsView keeps using the secondary after the earlier failover; confirm against
// the priority-based view's failover semantics.
when(secondary.getLatestBaseFile(partitionPath, fileID)).thenReturn(expected);
actual = fsView.getLatestBaseFile(partitionPath, fileID);
assertEquals(expected, actual);
resetMocks();
// Case 4: secondary also throws — the exception propagates to the caller.
when(secondary.getLatestBaseFile(partitionPath, fileID)).thenThrow(new RuntimeException());
assertThrows(RuntimeException.class, () -> {
fsView.getLatestBaseFile(partitionPath, fileID);
});
}
Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache:
class HoodieInputFormatUtils, method filterIncrementalFileStatus.
/**
 * Filter a list of FileStatus based on commitsToCheck for incremental view.
 *
 * @param job MR job whose configuration is used to refresh file statuses
 * @param tableMetaClient meta client of the Hudi table
 * @param timeline timeline backing the file-system view
 * @param fileStatuses raw file statuses to filter
 * @param commitsToCheck instants whose latest base files should be returned
 * @return refreshed file statuses of the latest base files written within the given commits
 * @throws IOException if refreshing a file status fails
 */
public static List<FileStatus> filterIncrementalFileStatus(Job job, HoodieTableMetaClient tableMetaClient, HoodieTimeline timeline, FileStatus[] fileStatuses, List<HoodieInstant> commitsToCheck) throws IOException {
  TableFileSystemView.BaseFileOnlyView roView = new HoodieTableFileSystemView(tableMetaClient, timeline, fileStatuses);
  List<String> commitsList = commitsToCheck.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
  List<HoodieBaseFile> filteredFiles = roView.getLatestBaseFilesInRange(commitsList).collect(Collectors.toList());
  List<FileStatus> returns = new ArrayList<>();
  for (HoodieBaseFile baseFile : filteredFiles) {
    LOG.debug("Processing incremental hoodie file - " + baseFile.getPath());
    // Refresh to pick up the current on-disk status before converting.
    HoodieBaseFile refreshed = refreshFileStatus(job.getConfiguration(), baseFile);
    returns.add(getFileStatus(refreshed));
  }
  LOG.info("Total paths to process after hoodie incremental filter " + filteredFiles.size());
  return returns;
}
Aggregations