Search in sources :

Example 16 with HoodieTableFileSystemView

use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

the class HoodieMergeOnReadTableInputFormat method listStatusForIncrementalMode.

/**
 * Lists the file statuses to read for an incremental query on a merge-on-read table.
 * Keeps the logic of mor_incr_view the same as the spark datasource.
 *
 * Step1: Get the list of commits to be fetched based on start commit and max commits (for snapshot, max commits is -1).
 * Step2: Get the list of affected file statuses for these commits.
 * Step3: Construct a HoodieTableFileSystemView based on those affected file statuses.
 *        a. Filter affected partitions based on inputPaths.
 *        b. Get the list of fileGroups for the affected partitions via fsView.getAllFileGroups.
 * Step4: Set input paths based on the filtered affected partition paths. Among the original input paths passed to
 *        this method, some partitions may not have commits in the trimmed-down list of commits, hence this step.
 * Step5: Find candidate fileStatus. A baseFileStatus obtained from HoodieTableFileSystemView
 *        is missing file size information, so the candidate fileStatus is used to update
 *        the size information of the baseFileStatus.
 * Step6: For every file group from step3(b),
 *        get the 1st available base file from all file slices, use the candidate file status to update the baseFileStatus,
 *        then construct a RealTimeFileStatus and add it to the result along with log files.
 *        If a file group only has log files, construct a RealTimeFileStatus and add it to the result.
 * TODO: unify the incremental view code between hive/spark-sql and spark datasource
 *
 * @param job                  job configuration; its input paths are overwritten in step4
 * @param tableMetaClient      meta client of the table being queried
 * @param inputPaths           input paths used to filter the affected partitions
 * @param incrementalTableName table name used to resolve the incremental query timeline
 * @return the file statuses to read; empty when there is no filtered commits timeline or no affected partition
 * @throws IOException declared by the overridden method
 * @throws HoodieException if the commit metadata of an instant cannot be read (the IOException is kept as cause)
 */
@Override
protected List<FileStatus> listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths, String incrementalTableName) throws IOException {
    List<FileStatus> result = new ArrayList<>();
    Job jobContext = Job.getInstance(job);
    // step1
    Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
    if (!timeline.isPresent()) {
        return result;
    }
    HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, incrementalTableName, timeline.get());
    // The collected list is never null, so no Option wrapper / presence check is needed here.
    List<HoodieInstant> commitsToCheck = commitsTimelineToReturn.getInstants().collect(Collectors.toList());
    // step2
    commitsToCheck.sort(HoodieInstant::compareTo);
    List<HoodieCommitMetadata> metadataList = commitsToCheck.stream().map(instant -> {
        try {
            return HoodieInputFormatUtils.getCommitMetadata(instant, commitsTimelineToReturn);
        } catch (IOException e) {
            // Keep the original exception as the cause so the root failure is not lost.
            throw new HoodieException(String.format("cannot get metadata for instant: %s", instant), e);
        }
    }).collect(Collectors.toList());
    Path basePath = new Path(tableMetaClient.getBasePath());
    FileStatus[] affectedFileStatus = HoodieInputFormatUtils.listAffectedFilesForCommits(job, basePath, metadataList);
    // step3
    // build fileGroup from fsView
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetaClient, commitsTimelineToReturn, affectedFileStatus);
    // filter affectedPartition by inputPaths; an empty partition path stands for the base path itself
    List<String> affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream().filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k))).collect(Collectors.toList());
    if (affectedPartition.isEmpty()) {
        return result;
    }
    List<HoodieFileGroup> fileGroups = affectedPartition.stream().flatMap(partitionPath -> fsView.getAllFileGroups(partitionPath)).collect(Collectors.toList());
    // step4
    setInputPaths(job, affectedPartition.stream().map(p -> p.isEmpty() ? basePath.toString() : new Path(basePath, p).toString()).collect(Collectors.joining(",")));
    // step5
    // find all file status in partitionPaths, keyed by their full path string.
    FileStatus[] fileStatuses = doListStatus(job);
    Map<String, FileStatus> candidateFileStatus = new HashMap<>();
    for (FileStatus fileStatus : fileStatuses) {
        candidateFileStatus.put(fileStatus.getPath().toString(), fileStatus);
    }
    Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient);
    String maxCommitTime = fsView.getLastInstant().get().getTimestamp();
    // step6
    result.addAll(collectAllIncrementalFiles(fileGroups, maxCommitTime, basePath.toString(), candidateFileStatus, virtualKeyInfoOpt));
    return result;
}
Also used : HoodieInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieInputFormatUtils) Arrays(java.util.Arrays) FileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile) FileSystem(org.apache.hadoop.fs.FileSystem) HiveHoodieTableFileIndex(org.apache.hudi.hadoop.HiveHoodieTableFileIndex) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) ValidationUtils.checkState(org.apache.hudi.common.util.ValidationUtils.checkState) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HoodieCopyOnWriteTableInputFormat(org.apache.hudi.hadoop.HoodieCopyOnWriteTableInputFormat) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Configurable(org.apache.hadoop.conf.Configurable) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) BootstrapBaseFileSplit(org.apache.hudi.hadoop.BootstrapBaseFileSplit) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) JobConf(org.apache.hadoop.mapred.JobConf) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) 
HoodieRealtimeInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapred.InputSplit) HoodieIOException(org.apache.hudi.exception.HoodieIOException) FileStatus(org.apache.hadoop.fs.FileStatus) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) HashMap(java.util.HashMap) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieException(org.apache.hudi.exception.HoodieException) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) ArrayList(java.util.ArrayList) List(java.util.List) Job(org.apache.hadoop.mapreduce.Job) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)

Example 17 with HoodieTableFileSystemView

use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

the class FileSystemViewCommand method buildFileSystemView.

/**
 * Builds a file system view over the files matched by a glob under the table base path,
 * using a timeline selected and filtered according to the given flags.
 *
 * @param globRegex         path pattern, expanded as {@code <basePath>/<globRegex>/*}
 * @param maxInstant        instant time used to filter the timeline; no filtering is applied when empty
 * @param basefileOnly      when true, the commit timeline is used
 * @param includeMaxInstant selects the GREATER_THAN_OR_EQUALS predicate instead of GREATER_THAN
 *                          when comparing {@code maxInstant} against each instant's timestamp
 * @param includeInflight   when false, only completed instants are kept
 * @param excludeCompaction when true (and {@code basefileOnly} is false), the commits timeline is used
 *                          instead of the write timeline
 * @return a file system view built from the filtered timeline and the matched file statuses
 * @throws IOException declared by this method; presumably raised while listing the glob path
 */
private HoodieTableFileSystemView buildFileSystemView(String globRegex, String maxInstant, boolean basefileOnly, boolean includeMaxInstant, boolean includeInflight, boolean excludeCompaction) throws IOException {
    HoodieTableMetaClient cliClient = HoodieCLI.getTableMetaClient();
    // A separate meta client is built with the active timeline loaded on creation.
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(cliClient.getHadoopConf())
        .setBasePath(cliClient.getBasePath())
        .setLoadActiveTimelineOnLoad(true)
        .build();
    FileSystem fileSystem = HoodieCLI.fs;
    Path globbedPath = new Path(String.format("%s/%s/*", cliClient.getBasePath(), globRegex));
    List<FileStatus> matchedFiles = FSUtils.getGlobStatusExcludingMetaFolder(fileSystem, globbedPath);

    // Pick the base timeline from the flags; basefileOnly takes precedence over excludeCompaction.
    HoodieTimeline selectedTimeline;
    if (basefileOnly) {
        selectedTimeline = metaClient.getActiveTimeline().getCommitTimeline();
    } else if (excludeCompaction) {
        selectedTimeline = metaClient.getActiveTimeline().getCommitsTimeline();
    } else {
        selectedTimeline = metaClient.getActiveTimeline().getWriteTimeline();
    }
    if (!includeInflight) {
        selectedTimeline = selectedTimeline.filterCompletedInstants();
    }

    Stream<HoodieInstant> instants = selectedTimeline.getInstants();
    if (!maxInstant.isEmpty()) {
        final BiPredicate<String, String> bound = includeMaxInstant
            ? HoodieTimeline.GREATER_THAN_OR_EQUALS
            : HoodieTimeline.GREATER_THAN;
        instants = instants.filter(instant -> bound.test(maxInstant, instant.getTimestamp()));
    }

    HoodieTimeline filteredTimeline = new HoodieDefaultTimeline(instants, (Function<HoodieInstant, Option<byte[]>> & Serializable) metaClient.getActiveTimeline()::getInstantDetails);
    return new HoodieTableFileSystemView(metaClient, filteredTimeline, matchedFiles.toArray(new FileStatus[0]));
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) Function(java.util.function.Function) ArrayList(java.util.ArrayList) BiPredicate(java.util.function.BiPredicate) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) CommandMarker(org.springframework.shell.core.CommandMarker) CliCommand(org.springframework.shell.core.annotation.CliCommand) TableHeader(org.apache.hudi.cli.TableHeader) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) List(java.util.List) Stream(java.util.stream.Stream) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) FSUtils(org.apache.hudi.common.fs.FSUtils) NumericUtils(org.apache.hudi.common.util.NumericUtils) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) FileSystem(org.apache.hadoop.fs.FileSystem) Option(org.apache.hudi.common.util.Option) 
CliOption(org.springframework.shell.core.annotation.CliOption) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)

Example 18 with HoodieTableFileSystemView

use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

the class FileSystemViewCommand method showLatestFileSlices.

/**
 * CLI command that prints the latest file slices of a partition as a table.
 *
 * Each row holds the partition, file id, base instant, base file path and base file size; unless
 * {@code baseFileOnly} is set, it additionally holds log-file counts, sizes, log-to-base size ratios and
 * the log file lists, split by whether a log file's base commit time equals the slice's base instant time.
 * A base file size of -1 (and a ratio of -1) is reported when the slice has no base file.
 *
 * @return the table rendered by {@code HoodiePrintHelper.print}
 * @throws IOException declared by this method; {@code buildFileSystemView} declares it
 */
@CliCommand(value = "show fsview latest", help = "Show latest file-system view")
public String showLatestFileSlices(
    @CliOption(key = { "partitionPath" }, help = "A valid partition path", mandatory = true) String partition,
    @CliOption(key = { "baseFileOnly" }, help = "Only display base file view", unspecifiedDefaultValue = "false") boolean baseFileOnly,
    @CliOption(key = { "maxInstant" }, help = "File-Slices upto this instant are displayed", unspecifiedDefaultValue = "") String maxInstant,
    @CliOption(key = { "merge" }, help = "Merge File Slices due to pending compaction", unspecifiedDefaultValue = "true") final boolean merge,
    @CliOption(key = { "includeMax" }, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant,
    @CliOption(key = { "includeInflight" }, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") boolean includeInflight,
    @CliOption(key = { "excludeCompaction" }, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") boolean excludeCompaction,
    @CliOption(key = { "limit" }, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
    @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
    @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
    HoodieTableFileSystemView view = buildFileSystemView(partition, maxInstant, baseFileOnly, includeMaxInstant, includeInflight, excludeCompaction);

    final Stream<FileSlice> slices;
    if (merge) {
        // Without an explicit max instant, merge up to the last completed-or-compaction instant.
        String mergeUpTo = maxInstant;
        if (mergeUpTo.isEmpty()) {
            mergeUpTo = HoodieCLI.getTableMetaClient().getActiveTimeline().filterCompletedAndCompactionInstants().lastInstant().get().getTimestamp();
        }
        slices = view.getLatestMergedFileSlicesBeforeOrOn(partition, mergeUpTo);
    } else {
        slices = view.getLatestFileSlices(partition);
    }

    final int columnCount = baseFileOnly ? 5 : 13;
    List<Comparable[]> rows = slices.map(slice -> {
        Comparable[] row = new Comparable[columnCount];
        row[0] = partition;
        row[1] = slice.getFileId();
        row[2] = slice.getBaseInstantTime();
        row[3] = slice.getBaseFile().isPresent() ? slice.getBaseFile().get().getPath() : "";
        final long baseFileSize;
        if (slice.getBaseFile().isPresent()) {
            baseFileSize = slice.getBaseFile().get().getFileSize();
        } else {
            baseFileSize = -1;
        }
        row[4] = baseFileSize;
        if (!baseFileOnly) {
            // Split log files by whether their base commit time matches the slice's base instant time.
            List<HoodieLogFile> scheduledLogs = slice.getLogFiles()
                .filter(logFile -> logFile.getBaseCommitTime().equals(slice.getBaseInstantTime()))
                .collect(Collectors.toList());
            List<HoodieLogFile> unscheduledLogs = slice.getLogFiles()
                .filter(logFile -> !logFile.getBaseCommitTime().equals(slice.getBaseInstantTime()))
                .collect(Collectors.toList());
            long scheduledBytes = scheduledLogs.stream().mapToLong(HoodieLogFile::getFileSize).sum();
            long unscheduledBytes = unscheduledLogs.stream().mapToLong(HoodieLogFile::getFileSize).sum();
            double scheduledToBaseRatio = baseFileSize > 0 ? scheduledBytes / (baseFileSize * 1.0) : -1;
            double unscheduledToBaseRatio = baseFileSize > 0 ? unscheduledBytes / (baseFileSize * 1.0) : -1;

            row[5] = slice.getLogFiles().count();
            row[6] = slice.getLogFiles().mapToLong(HoodieLogFile::getFileSize).sum();
            row[7] = scheduledBytes;
            row[8] = unscheduledBytes;
            row[9] = scheduledToBaseRatio;
            row[10] = unscheduledToBaseRatio;
            row[11] = scheduledLogs.toString();
            row[12] = unscheduledLogs.toString();
        }
        return row;
    }).collect(Collectors.toCollection(ArrayList::new));

    // Size columns are rendered through the human-readable byte count converter.
    Function<Object, String> byteCountConverter = entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())));
    Map<String, Function<Object, String>> converters = new HashMap<>();
    converters.put(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE, byteCountConverter);

    TableHeader header = new TableHeader()
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_INSTANT)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE);

    if (!baseFileOnly) {
        converters.put(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_SIZE, byteCountConverter);
        converters.put(HoodieTableHeaderFields.HEADER_DELTA_SIZE_SCHEDULED, byteCountConverter);
        converters.put(HoodieTableHeaderFields.HEADER_DELTA_SIZE_UNSCHEDULED, byteCountConverter);
        header = header
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELTA_FILES)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_SIZE)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_SIZE_SCHEDULED)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_SIZE_UNSCHEDULED)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_BASE_SCHEDULED)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_BASE_UNSCHEDULED)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_SCHEDULED)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_UNSCHEDULED);
    }
    return HoodiePrintHelper.print(header, converters, sortByField, descending, limit, headerOnly, rows);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) Function(java.util.function.Function) ArrayList(java.util.ArrayList) BiPredicate(java.util.function.BiPredicate) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) CommandMarker(org.springframework.shell.core.CommandMarker) CliCommand(org.springframework.shell.core.annotation.CliCommand) TableHeader(org.apache.hudi.cli.TableHeader) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) List(java.util.List) Stream(java.util.stream.Stream) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) FSUtils(org.apache.hudi.common.fs.FSUtils) NumericUtils(org.apache.hudi.common.util.NumericUtils) TableHeader(org.apache.hudi.cli.TableHeader) HashMap(java.util.HashMap) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) Function(java.util.function.Function) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) CliCommand(org.springframework.shell.core.annotation.CliCommand)

Example 19 with HoodieTableFileSystemView

use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

the class HoodieTableMetadataUtil method getFileSystemView.

/**
 * Creates a file system view for the metadata table.
 *
 * When the active timeline is empty, the view is built on a timeline holding a single newly
 * created delta-commit instant instead, so that the view can still return file slices that
 * may already have been initialized.
 *
 * @param metaClient - Metadata table meta client
 * @return Filesystem view for the metadata table
 */
public static HoodieTableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient) {
    HoodieTimeline activeTimeline = metaClient.getActiveTimeline();
    if (!activeTimeline.empty()) {
        return new HoodieTableFileSystemView(metaClient, activeTimeline);
    }
    // With no commits on the metadata table, the table's default FileSystemView
    // would not return any file slices even though they may have been initialized,
    // so substitute a timeline containing one placeholder instant.
    final HoodieInstant placeholderInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.createNewInstantTime());
    HoodieTimeline placeholderTimeline = new HoodieDefaultTimeline(Stream.of(placeholderInstant), metaClient.getActiveTimeline()::getInstantDetails);
    return new HoodieTableFileSystemView(metaClient, placeholderTimeline);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)

Example 20 with HoodieTableFileSystemView

use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

the class HoodieTableMetadataUtil method getPartitionFileSlices.

/**
 * Get the latest file slices for a given partition, sorted by file id.
 *
 * @param metaClient      - Instance of {@link HoodieTableMetaClient}.
 * @param fileSystemView  - Optional file system view to use; when absent, a view is created
 *                        via {@code getFileSystemView(metaClient)}.
 * @param partition       - The name of the partition whose file groups are to be loaded.
 * @param mergeFileSlices - When enabled, will merge the latest file slices with the last known
 *                        completed instant. This is useful for readers when there are pending
 *                        compactions. MergeFileSlices when disabled, will return the latest file
 *                        slices without any merging, and this is needed for the writers.
 * @return List of latest file slices for all file groups in a given partition; empty when merging
 *         is requested but the active timeline has no completed instant.
 */
private static List<FileSlice> getPartitionFileSlices(HoodieTableMetaClient metaClient, Option<HoodieTableFileSystemView> fileSystemView, String partition, boolean mergeFileSlices) {
    // Build the fallback view lazily: orElse(...) would construct it even when a view was supplied.
    HoodieTableFileSystemView fsView = fileSystemView.orElseGet(() -> getFileSystemView(metaClient));
    Stream<FileSlice> fileSliceStream;
    if (mergeFileSlices) {
        Option<HoodieInstant> lastCompletedInstant = metaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
        if (lastCompletedInstant.isPresent()) {
            fileSliceStream = fsView.getLatestMergedFileSlicesBeforeOrOn(partition, lastCompletedInstant.get().getTimestamp());
        } else {
            // No completed instant to merge against: report no file slices instead of failing on an empty Option.
            fileSliceStream = Stream.empty();
        }
    } else {
        fileSliceStream = fsView.getLatestFileSlices(partition);
    }
    return fileSliceStream.sorted(Comparator.comparing(FileSlice::getFileId)).collect(Collectors.toList());
}
Also used : FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)

Aggregations

HoodieTableFileSystemView (org.apache.hudi.common.table.view.HoodieTableFileSystemView)42 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)29 FileStatus (org.apache.hadoop.fs.FileStatus)25 Path (org.apache.hadoop.fs.Path)24 IOException (java.io.IOException)22 ArrayList (java.util.ArrayList)22 FileSlice (org.apache.hudi.common.model.FileSlice)22 List (java.util.List)21 Collectors (java.util.stream.Collectors)20 Option (org.apache.hudi.common.util.Option)20 Map (java.util.Map)19 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)19 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)18 HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)17 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)16 HoodieIOException (org.apache.hudi.exception.HoodieIOException)16 HoodieException (org.apache.hudi.exception.HoodieException)15 Stream (java.util.stream.Stream)14 Test (org.junit.jupiter.api.Test)13 HashMap (java.util.HashMap)12