
Example 66 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestHoodieTableFileSystemView, method testStreamLatestVersionInPartition:

private void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly, String fullPartitionPath, String commitTime1, String commitTime2, String commitTime3, String commitTime4, String fileId1, String fileId2, String fileId3, String fileId4) throws IOException {
    // Now we list the entire partition
    FileStatus[] statuses = metaClient.getFs().listStatus(new Path(fullPartitionPath));
    assertEquals(11, statuses.length);
    refreshFsView();
    // Check files as of latest commit.
    List<FileSlice> allSlices = rtView.getAllFileSlices("2016/05/01").collect(Collectors.toList());
    assertEquals(isLatestFileSliceOnly ? 4 : 8, allSlices.size());
    Map<String, Long> fileSliceMap = allSlices.stream().collect(Collectors.groupingBy(FileSlice::getFileId, Collectors.counting()));
    assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId1).longValue());
    assertEquals(isLatestFileSliceOnly ? 1 : 3, fileSliceMap.get(fileId2).longValue());
    assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId3).longValue());
    assertEquals(1, fileSliceMap.get(fileId4).longValue());
    List<HoodieBaseFile> dataFileList = roView.getLatestBaseFilesBeforeOrOn("2016/05/01", commitTime4).collect(Collectors.toList());
    assertEquals(3, dataFileList.size());
    Set<String> filenames = new HashSet<>();
    for (HoodieBaseFile status : dataFileList) {
        filenames.add(status.getFileName());
    }
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)));
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)));
    filenames = new HashSet<>();
    List<HoodieLogFile> logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4, true).map(FileSlice::getLogFiles).flatMap(logFileList -> logFileList).collect(Collectors.toList());
    assertEquals(4, logFilesList.size());
    for (HoodieLogFile logFile : logFilesList) {
        filenames.add(logFile.getFileName());
    }
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)));
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1, TEST_WRITE_TOKEN)));
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)));
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)));
    // Reset the max commit time
    List<HoodieBaseFile> dataFiles = roView.getLatestBaseFilesBeforeOrOn("2016/05/01", commitTime3).collect(Collectors.toList());
    filenames = new HashSet<>();
    for (HoodieBaseFile status : dataFiles) {
        filenames.add(status.getFileName());
    }
    if (!isLatestFileSliceOnly) {
        assertEquals(3, dataFiles.size());
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)));
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)));
    } else {
        assertEquals(1, dataFiles.size());
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
    }
    logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3, true).map(FileSlice::getLogFiles).flatMap(logFileList -> logFileList).collect(Collectors.toList());
    assertEquals(1, logFilesList.size());
    assertEquals(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN), logFilesList.get(0).getFileName());
}
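
For reference, here is a hedged sketch of the FSUtils naming helpers the assertions above depend on. The commented results are assumptions based on the common Hudi file-name formats, not guaranteed output; verify against the FSUtils source of the version in use.

// Illustrative only: composing on-disk file names with the same helpers the test calls.
String baseFileName = FSUtils.makeDataFileName("20160501010101", "1-0-1", "fileId1");
// expected to look like "fileId1_1-0-1_20160501010101.parquet" (format is version-dependent)
String logFileName = FSUtils.makeLogFileName("fileId1", HoodieLogFile.DELTA_EXTENSION, "20160501010101", 1, "1-0-1");
// expected to look like ".fileId1_20160501010101.log.1_1-0-1" (format is version-dependent)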

Example 67 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestIncrementalFSViewSync, method areViewsConsistent:

/**
 * Check that two views are consistent: same last instant, same file groups,
 * matching file slices within each group, and the same pending compaction operations.
 *
 * @param view1 first view to compare
 * @param view2 second view to compare
 * @param expectedTotalFileSlices expected total number of file slices across all partitions
 */
private void areViewsConsistent(SyncableFileSystemView view1, SyncableFileSystemView view2, long expectedTotalFileSlices) {
    // Timeline check
    assertEquals(view1.getLastInstant(), view2.getLastInstant());
    // View Checks
    Map<HoodieFileGroupId, HoodieFileGroup> fileGroupsMap1 = partitions.stream().flatMap(view1::getAllFileGroups).collect(Collectors.toMap(HoodieFileGroup::getFileGroupId, fg -> fg));
    Map<HoodieFileGroupId, HoodieFileGroup> fileGroupsMap2 = partitions.stream().flatMap(view2::getAllFileGroups).collect(Collectors.toMap(HoodieFileGroup::getFileGroupId, fg -> fg));
    assertEquals(fileGroupsMap1.keySet(), fileGroupsMap2.keySet());
    long gotSlicesCount = fileGroupsMap1.keySet().stream().map(k -> Pair.of(fileGroupsMap1.get(k), fileGroupsMap2.get(k))).mapToLong(e -> {
        HoodieFileGroup fg1 = e.getKey();
        HoodieFileGroup fg2 = e.getValue();
        assertEquals(fg1.getFileGroupId(), fg2.getFileGroupId());
        List<FileSlice> slices1 = fg1.getAllRawFileSlices().collect(Collectors.toList());
        List<FileSlice> slices2 = fg2.getAllRawFileSlices().collect(Collectors.toList());
        assertEquals(slices1.size(), slices2.size());
        IntStream.range(0, slices1.size()).mapToObj(idx -> Pair.of(slices1.get(idx), slices2.get(idx))).forEach(e2 -> {
            FileSlice slice1 = e2.getKey();
            FileSlice slice2 = e2.getValue();
            assertEquals(slice1.getBaseInstantTime(), slice2.getBaseInstantTime());
            assertEquals(slice1.getFileId(), slice2.getFileId());
            assertEquals(slice1.getBaseFile().isPresent(), slice2.getBaseFile().isPresent());
            if (slice1.getBaseFile().isPresent()) {
                HoodieBaseFile df1 = slice1.getBaseFile().get();
                HoodieBaseFile df2 = slice2.getBaseFile().get();
                assertEquals(df1.getCommitTime(), df2.getCommitTime());
                assertEquals(df1.getFileId(), df2.getFileId());
                assertEquals(df1.getFileName(), df2.getFileName());
                assertEquals(Path.getPathWithoutSchemeAndAuthority(new Path(df1.getPath())), Path.getPathWithoutSchemeAndAuthority(new Path(df2.getPath())));
            }
            List<Path> logPaths1 = slice1.getLogFiles().map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList());
            List<Path> logPaths2 = slice2.getLogFiles().map(lf -> Path.getPathWithoutSchemeAndAuthority(lf.getPath())).collect(Collectors.toList());
            assertEquals(logPaths1, logPaths2);
        });
        return slices1.size();
    }).sum();
    assertEquals(expectedTotalFileSlices, gotSlicesCount);
    // Pending Compaction Operations Check
    Set<Pair<String, CompactionOperation>> ops1 = view1.getPendingCompactionOperations().collect(Collectors.toSet());
    Set<Pair<String, CompactionOperation>> ops2 = view2.getPendingCompactionOperations().collect(Collectors.toSet());
    assertEquals(ops1, ops2);
}
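
The base-file and log-file comparisons strip the scheme and authority before comparing paths, so views built over different filesystem URIs still compare equal. A small self-contained illustration (the concrete URIs are made up for the example):

import org.apache.hadoop.fs.Path;

public class PathNormalizationExample {
    public static void main(String[] args) {
        // Two paths that name the same file through different filesystems...
        Path p1 = new Path("hdfs://namenode:8020/tbl/2016/05/01/f1.parquet");
        Path p2 = new Path("file:/tbl/2016/05/01/f1.parquet");
        // ...become equal once scheme and authority are dropped.
        System.out.println(Path.getPathWithoutSchemeAndAuthority(p1)
            .equals(Path.getPathWithoutSchemeAndAuthority(p2))); // prints true
    }
}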

Example 68 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class HoodieBackedTableMetadata, method getBaseFileReader:

private Pair<HoodieFileReader, Long> getBaseFileReader(FileSlice slice, HoodieTimer timer) throws IOException {
    HoodieFileReader baseFileReader = null;
    Long baseFileOpenMs;
    // If the base file is present then create a reader
    Option<HoodieBaseFile> basefile = slice.getBaseFile();
    if (basefile.isPresent()) {
        String basefilePath = basefile.get().getPath();
        baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(basefilePath));
        baseFileOpenMs = timer.endTimer();
        LOG.info(String.format("Opened metadata base file from %s at instant %s in %d ms", basefilePath, basefile.get().getCommitTime(), baseFileOpenMs));
    } else {
        baseFileOpenMs = 0L;
        timer.endTimer();
    }
    return Pair.of(baseFileReader, baseFileOpenMs);
}
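
A hedged usage sketch for this helper: the caller is expected to start the timer before invoking it (the method only ends the timer), and the returned reader is null when the slice has no base file. The surrounding setup is assumed, not shown in the source.

// Hypothetical call site; `slice` is a FileSlice already in scope.
HoodieTimer timer = new HoodieTimer().startTimer();
Pair<HoodieFileReader, Long> readerAndOpenMs = getBaseFileReader(slice, timer);
HoodieFileReader reader = readerAndOpenMs.getKey();
if (reader != null) {
    try {
        // ... read records from the metadata base file ...
    } finally {
        reader.close();
    }
}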

Example 69 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestPriorityBasedFileSystemView, method testGetLatestBaseFile:

@Test
public void testGetLatestBaseFile() {
    Option<HoodieBaseFile> actual;
    Option<HoodieBaseFile> expected = Option.of(new HoodieBaseFile("test.file"));
    String partitionPath = "/table2";
    String fileID = "file.123";
    when(primary.getLatestBaseFile(partitionPath, fileID)).thenReturn(expected);
    actual = fsView.getLatestBaseFile(partitionPath, fileID);
    assertEquals(expected, actual);
    resetMocks();
    when(primary.getLatestBaseFile(partitionPath, fileID)).thenThrow(new RuntimeException());
    when(secondary.getLatestBaseFile(partitionPath, fileID)).thenReturn(expected);
    actual = fsView.getLatestBaseFile(partitionPath, fileID);
    assertEquals(expected, actual);
    resetMocks();
    when(secondary.getLatestBaseFile(partitionPath, fileID)).thenReturn(expected);
    actual = fsView.getLatestBaseFile(partitionPath, fileID);
    assertEquals(expected, actual);
    resetMocks();
    when(secondary.getLatestBaseFile(partitionPath, fileID)).thenThrow(new RuntimeException());
    assertThrows(RuntimeException.class, () -> {
        fsView.getLatestBaseFile(partitionPath, fileID);
    });
}
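
The behavior under test is a primary-then-secondary fallback: the view answers from the primary, falls back to the secondary when the primary throws, remembers the failure (note the third scenario above stubs only the secondary), and lets a secondary failure propagate. A minimal sketch of that pattern, not the actual PriorityBasedFileSystemView implementation:

import java.util.function.Supplier;

class FallbackSketch {
    private boolean primaryFailed = false;

    <R> R runWithFallback(Supplier<R> primary, Supplier<R> secondary) {
        if (!primaryFailed) {
            try {
                return primary.get();
            } catch (RuntimeException e) {
                // remember the failure so later calls skip the primary
                primaryFailed = true;
            }
        }
        // an exception here propagates to the caller, as the last assertion expects
        return secondary.get();
    }
}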

Example 70 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class HoodieInputFormatUtils, method filterIncrementalFileStatus:

/**
 * Filter a list of FileStatus based on commitsToCheck for incremental view.
 *
 * @param job MapReduce job whose configuration is used to refresh file statuses
 * @param tableMetaClient meta client for the Hudi table
 * @param timeline timeline against which the file-system view is built
 * @param fileStatuses file statuses listed from the table's partitions
 * @param commitsToCheck instants whose latest base files should be returned
 * @return file statuses of the latest base files written within the given commits
 */
public static List<FileStatus> filterIncrementalFileStatus(Job job, HoodieTableMetaClient tableMetaClient, HoodieTimeline timeline, FileStatus[] fileStatuses, List<HoodieInstant> commitsToCheck) throws IOException {
    TableFileSystemView.BaseFileOnlyView roView = new HoodieTableFileSystemView(tableMetaClient, timeline, fileStatuses);
    List<String> commitsList = commitsToCheck.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    List<HoodieBaseFile> filteredFiles = roView.getLatestBaseFilesInRange(commitsList).collect(Collectors.toList());
    List<FileStatus> returns = new ArrayList<>();
    for (HoodieBaseFile filteredFile : filteredFiles) {
        LOG.debug("Processing incremental hoodie file - " + filteredFile.getPath());
        filteredFile = refreshFileStatus(job.getConfiguration(), filteredFile);
        returns.add(getFileStatus(filteredFile));
    }
    LOG.info("Total paths to process after hoodie incremental filter " + filteredFiles.size());
    return returns;
}
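
A hedged example of a call site, assuming the Job, meta client, and listed file statuses are already in scope; the timeline accessors below are the ones commonly available on HoodieTableMetaClient in this Hudi version, so treat them as assumptions.

// Hypothetical wiring; narrows the incremental window to completed commits.
HoodieTimeline timeline = tableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
List<HoodieInstant> commitsToCheck = timeline.getInstants().collect(Collectors.toList());
List<FileStatus> incrementalFiles =
    HoodieInputFormatUtils.filterIncrementalFileStatus(job, tableMetaClient, timeline, fileStatuses, commitsToCheck);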

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 71
Path (org.apache.hadoop.fs.Path): 40
ArrayList (java.util.ArrayList): 33
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 31
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 31
FileSlice (org.apache.hudi.common.model.FileSlice): 29
List (java.util.List): 27
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 27
IOException (java.io.IOException): 26
FileStatus (org.apache.hadoop.fs.FileStatus): 25
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 24
Pair (org.apache.hudi.common.util.collection.Pair): 24
Option (org.apache.hudi.common.util.Option): 23
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 23
Collectors (java.util.stream.Collectors): 21
Test (org.junit.jupiter.api.Test): 21
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 21
Map (java.util.Map): 20
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20
HoodieTable (org.apache.hudi.table.HoodieTable): 20