Search in sources :

Example 1 with FileStatusWithBootstrapBaseFile

use of org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile in project hudi by apache.

In class HoodieMergeOnReadTableInputFormat, method createRealtimeFileStatusUnchecked.

/**
 * Builds a {@link RealtimeFileStatus} for a file-slice that has a base file.
 *
 * <p>The log files are ordered with {@link HoodieLogFile#getLogFileComparator()} before being
 * passed to the {@link RealtimeFileStatus}. When a latest completed instant is supplied, its
 * timestamp is recorded as the max commit time. When the base file's status is one of the
 * bootstrap-aware status types, it is also registered as the bootstrap file status.
 *
 * @param baseFile                  base file of the file-slice
 * @param logFiles                  log files of the file-slice, in any order
 * @param basePath                  base path handed to the {@link RealtimeFileStatus}
 * @param latestCompletedInstantOpt optional instant; when present it is checked to be completed
 * @param virtualKeyInfoOpt         optional virtual key info handed to the {@link RealtimeFileStatus}
 * @return the populated {@link RealtimeFileStatus}
 * @throws HoodieIOException if an {@link IOException} is raised while building the status
 */
private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieBaseFile baseFile, Stream<HoodieLogFile> logFiles, String basePath, Option<HoodieInstant> latestCompletedInstantOpt, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
    final FileStatus baseStatus = getFileStatusUnchecked(baseFile);
    final List<HoodieLogFile> orderedLogs = logFiles
        .sorted(HoodieLogFile.getLogFileComparator())
        .collect(Collectors.toList());
    // Pure type check; evaluated up front so the try body reads as a straight sequence of steps.
    final boolean carriesBootstrapFile = baseStatus instanceof FileStatusWithBootstrapBaseFile
        || baseStatus instanceof LocatedFileStatusWithBootstrapBaseFile;

    try {
        final RealtimeFileStatus realtimeStatus =
            new RealtimeFileStatus(baseStatus, basePath, orderedLogs, false, virtualKeyInfoOpt);

        if (latestCompletedInstantOpt.isPresent()) {
            final HoodieInstant completedInstant = latestCompletedInstantOpt.get();
            // Only a completed instant may be used as the max commit time.
            checkState(completedInstant.isCompleted());
            realtimeStatus.setMaxCommitTime(completedInstant.getTimestamp());
        }

        if (carriesBootstrapFile) {
            realtimeStatus.setBootStrapFileStatus(baseStatus);
        }

        return realtimeStatus;
    } catch (IOException ioe) {
        final String message = String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName());
        throw new HoodieIOException(message, ioe);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)

Example 2 with FileStatusWithBootstrapBaseFile

use of org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile in project hudi by apache.

In class HoodieMergeOnReadTableInputFormat, method collectAllIncrementalFiles.

/**
 * Builds the list of {@link RealtimeFileStatus} entries for the given file groups.
 *
 * <p>For each file group:
 * <ul>
 *   <li>if any file slice has a base file, one entry is created from the first such slice's base
 *       file together with the log files of the latest file slice;</li>
 *   <li>otherwise, if a latest file slice exists and it has log files, one entry is created from
 *       those log files alone, using the first log file's status as the entry's status.</li>
 * </ul>
 *
 * @param fileGroups          file groups to inspect
 * @param maxCommitTime       max commit time set on every produced entry
 * @param basePath            base path handed to each {@link RealtimeFileStatus}
 * @param candidateFileStatus file statuses keyed by path URI string; must contain every base file used
 * @param virtualKeyInfoOpt   optional virtual key info handed to each {@link RealtimeFileStatus}
 * @return the produced entries, in file-group order
 * @throws HoodieException if a base file has no entry in {@code candidateFileStatus}, or an
 *                         {@link IOException} is raised while processing a file group
 */
private static List<FileStatus> collectAllIncrementalFiles(List<HoodieFileGroup> fileGroups, String maxCommitTime, String basePath, Map<String, FileStatus> candidateFileStatus, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
    final List<FileStatus> collected = new ArrayList<>();
    for (HoodieFileGroup group : fileGroups) {
        try {
            final List<FileSlice> slicesWithBaseFile = group.getAllFileSlices()
                .filter(candidate -> candidate.getBaseFile().isPresent())
                .collect(Collectors.toList());

            if (!slicesWithBaseFile.isEmpty()) {
                final FileSlice firstSliceWithBase = slicesWithBaseFile.get(0);
                final FileStatus baseStatus = HoodieInputFormatUtils.getFileStatus(firstSliceWithBase.getBaseFile().get());
                final String baseFileKey = baseStatus.getPath().toUri().toString();
                if (!candidateFileStatus.containsKey(baseFileKey)) {
                    throw new HoodieException("Error obtaining fileStatus for file: " + baseFileKey);
                }

                final FileSlice latestSlice = group.getLatestFileSlice().get();
                final List<HoodieLogFile> sliceLogs = latestSlice.getLogFiles().collect(Collectors.toList());

                // The status looked up in candidateFileStatus is used instead of baseStatus:
                // per the original note, baseStatus is missing file size information.
                final FileStatus candidateStatus = candidateFileStatus.get(baseFileKey);
                final RealtimeFileStatus realtimeStatus =
                    new RealtimeFileStatus(candidateStatus, basePath, sliceLogs, true, virtualKeyInfoOpt);
                realtimeStatus.setMaxCommitTime(maxCommitTime);

                final boolean carriesBootstrapFile = baseStatus instanceof FileStatusWithBootstrapBaseFile
                    || baseStatus instanceof LocatedFileStatusWithBootstrapBaseFile;
                if (carriesBootstrapFile) {
                    realtimeStatus.setBootStrapFileStatus(baseStatus);
                }
                collected.add(realtimeStatus);
            }

            // File groups that only have log files (no slice with a base file).
            if (group.getLatestFileSlice().isPresent() && slicesWithBaseFile.isEmpty()) {
                final List<FileStatus> logStatuses = group.getLatestFileSlice().get()
                    .getLogFiles()
                    .map(log -> log.getFileStatus())
                    .collect(Collectors.toList());
                if (!logStatuses.isEmpty()) {
                    final List<HoodieLogFile> rebuiltLogs = logStatuses.stream()
                        .map(status -> new HoodieLogFile(status.getPath(), status.getLen()))
                        .collect(Collectors.toList());
                    final RealtimeFileStatus realtimeStatus =
                        new RealtimeFileStatus(logStatuses.get(0), basePath, rebuiltLogs, true, virtualKeyInfoOpt);
                    realtimeStatus.setMaxCommitTime(maxCommitTime);
                    collected.add(realtimeStatus);
                }
            }
        } catch (IOException ioe) {
            throw new HoodieException("Error obtaining data file/log file grouping ", ioe);
        }
    }
    return collected;
}
Also used : HoodieInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieInputFormatUtils) Arrays(java.util.Arrays) FileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile) FileSystem(org.apache.hadoop.fs.FileSystem) HiveHoodieTableFileIndex(org.apache.hudi.hadoop.HiveHoodieTableFileIndex) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) ValidationUtils.checkState(org.apache.hudi.common.util.ValidationUtils.checkState) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HoodieCopyOnWriteTableInputFormat(org.apache.hudi.hadoop.HoodieCopyOnWriteTableInputFormat) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Configurable(org.apache.hadoop.conf.Configurable) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) BootstrapBaseFileSplit(org.apache.hudi.hadoop.BootstrapBaseFileSplit) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) JobConf(org.apache.hadoop.mapred.JobConf) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) 
HoodieRealtimeInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapred.InputSplit) HoodieIOException(org.apache.hudi.exception.HoodieIOException) FileStatus(org.apache.hadoop.fs.FileStatus) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) FileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)

Aggregations

IOException (java.io.IOException)2 FileStatus (org.apache.hadoop.fs.FileStatus)2 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)2 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)2 HoodieIOException (org.apache.hudi.exception.HoodieIOException)2 FileStatusWithBootstrapBaseFile (org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile)2 LocatedFileStatusWithBootstrapBaseFile (org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile)2 RealtimeFileStatus (org.apache.hudi.hadoop.RealtimeFileStatus)2 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Collectors (java.util.stream.Collectors)1 Stream (java.util.stream.Stream)1 Configurable (org.apache.hadoop.conf.Configurable)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 FileInputFormat (org.apache.hadoop.mapred.FileInputFormat)1 FileSplit (org.apache.hadoop.mapred.FileSplit)1