
Example 1 with InstantRange

Use of org.apache.hudi.common.table.log.InstantRange in project hudi by apache.

From the class IncrementalInputSplits, method inputSplits:

/**
 * Returns the incremental input splits.
 *
 * @param metaClient    The meta client
 * @param hadoopConf    The hadoop configuration
 * @param issuedInstant The last issued instant, only valid in streaming read
 * @return The list of incremental input splits or empty if there are no new instants
 */
public Result inputSplits(HoodieTableMetaClient metaClient, org.apache.hadoop.conf.Configuration hadoopConf, String issuedInstant) {
    metaClient.reloadActiveTimeline();
    HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants();
    if (commitTimeline.empty()) {
        LOG.warn("No splits found for the table under path " + path);
        return Result.EMPTY;
    }
    List<HoodieInstant> instants = filterInstantsWithRange(commitTimeline, issuedInstant);
    // get the latest instant that satisfies the condition
    final HoodieInstant instantToIssue = instants.isEmpty() ? null : instants.get(instants.size() - 1);
    final InstantRange instantRange;
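    // Three cases decide the range:
    //   1. streaming read with a recorded issued instant -> (issuedInstant, latest]
    //   2. first read with READ_START_COMMIT set -> [startCommit, latest],
    //      or a null range when starting from 'earliest' (handled by the full-scan branch below)
    //   3. first read without a start commit -> [latest, latest]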
    if (instantToIssue != null) {
        if (issuedInstant != null) {
            // the streaming reader may record the last issued instant; if it is present,
            // the instant range should be (issued instant, latest instant].
            instantRange = InstantRange.getInstance(issuedInstant, instantToIssue.getTimestamp(), InstantRange.RangeType.OPEN_CLOSE);
        } else if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent()) {
            // first-time consumption with a start commit configured
            final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT);
            instantRange = startCommit.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST)
                ? null
                : InstantRange.getInstance(startCommit, instantToIssue.getTimestamp(), InstantRange.RangeType.CLOSE_CLOSE);
        } else {
            // first-time consumption without a start commit: consume the latest incremental data set.
            instantRange = InstantRange.getInstance(instantToIssue.getTimestamp(), instantToIssue.getTimestamp(), InstantRange.RangeType.CLOSE_CLOSE);
        }
    } else {
        LOG.info("No new instant found for the table under path " + path + ", skip reading");
        return Result.EMPTY;
    }
    String tableName = conf.getString(FlinkOptions.TABLE_NAME);
    Set<String> writePartitions;
    final FileStatus[] fileStatuses;
    if (instantRange == null) {
        // reading from the earliest commit: scan the partitions and files directly.
        FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(path.toUri()), conf);
        if (this.requiredPartitions != null) {
            // apply partition push down
            fileIndex.setPartitionPaths(this.requiredPartitions);
        }
        writePartitions = new HashSet<>(fileIndex.getOrBuildPartitionPaths());
        if (writePartitions.isEmpty()) {
            LOG.warn("No partitions found for reading in user provided path.");
            return Result.EMPTY;
        }
        fileStatuses = fileIndex.getFilesInPartitions();
    } else {
        List<HoodieCommitMetadata> activeMetadataList = instants.stream()
            .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline))
            .collect(Collectors.toList());
        List<HoodieCommitMetadata> archivedMetadataList = getArchivedMetadata(metaClient, instantRange, commitTimeline, tableName);
        if (!archivedMetadataList.isEmpty()) {
            LOG.warn("\n"
                + "--------------------------------------------------------------------------------\n"
                + "---------- caution: the reader has fallen too far behind the writer,\n"
                + "---------- tweak the 'read.tasks' option to increase the read parallelism.\n"
                + "--------------------------------------------------------------------------------");
        }
        // IMPORTANT: the merged metadata list must be in ascending order by instant time
        List<HoodieCommitMetadata> metadataList = archivedMetadataList.isEmpty()
            ? activeMetadataList
            : mergeList(archivedMetadataList, activeMetadataList);
        writePartitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList);
        // apply partition push down
        if (this.requiredPartitions != null) {
            writePartitions = writePartitions.stream().filter(this.requiredPartitions::contains).collect(Collectors.toSet());
        }
        if (writePartitions.isEmpty()) {
            LOG.warn("No partitions found for reading in user provided path.");
            return Result.EMPTY;
        }
        fileStatuses = WriteProfiles.getWritePathsOfInstants(path, hadoopConf, metadataList, metaClient.getTableType());
    }
    if (fileStatuses.length == 0) {
        LOG.warn("No files found for reading in user provided path.");
        return Result.EMPTY;
    }
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses);
    final String endInstant = instantToIssue.getTimestamp();
    final AtomicInteger cnt = new AtomicInteger(0);
    final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
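    // For each write partition, take the latest merged file slices on or before endInstant
    // and turn each slice into a MergeOnReadInputSplit that carries its base file, its
    // log files sorted by the log-file comparator, and the instant range for record filtering.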
    List<MergeOnReadInputSplit> inputSplits = writePartitions.stream()
        .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant)
            .map(fileSlice -> {
                Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                    .sorted(HoodieLogFile.getLogFileComparator())
                    .map(logFile -> logFile.getPath().toString())
                    .collect(Collectors.toList()));
                String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
                return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, endInstant,
                    metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange);
            })
            .collect(Collectors.toList()))
        .flatMap(Collection::stream)
        .collect(Collectors.toList());
    return Result.instance(inputSplits, endInstant);
}
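
For context, a minimal usage sketch (not from the Hudi source) of how a streaming reader might drive this method, carrying the issued instant across polls. The builder-style construction, the Result accessors (getInputSplits, getEndInstant), and the emit/running/pollIntervalMs identifiers are assumptions, inferred from the fields the method reads and from Result.instance(inputSplits, endInstant) above; exception handling is elided.

// Illustrative polling loop; identifiers flagged below are assumptions, not the Hudi API.
IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder()
    .conf(conf)                                   // Flink Configuration with FlinkOptions set
    .path(path)                                   // table base path
    .maxCompactionMemoryInBytes(100 * 1024 * 1024L)
    .build();                                     // builder-style construction is an assumption

String issuedInstant = null;                      // null on the first poll
while (running) {
    Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf, issuedInstant);
    if (!result.getInputSplits().isEmpty()) {     // accessor name is an assumption
        for (MergeOnReadInputSplit split : result.getInputSplits()) {
            emit(split);                          // hand the split to the downstream read task (hypothetical)
        }
        // remember the end instant so the next poll reads the range (issuedInstant, latest]
        issuedInstant = result.getEndInstant();   // accessor name is an assumption
    }
    Thread.sleep(pollIntervalMs);                 // InterruptedException handling elided
}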

Example 2 with InstantRange

Use of org.apache.hudi.common.table.log.InstantRange in project hudi by apache.

From the class IncrementalInputSplits, method getArchivedMetadata:

/**
 * Returns the archived metadata in case the reader lags behind the active timeline
 * or wants to read from the earliest commit.
 *
 * <p>Note: this should be improved with the metadata table once it is stable enough.
 *
 * @param metaClient     The meta client
 * @param instantRange   The instant range to filter the timeline instants
 * @param commitTimeline The commit timeline
 * @param tableName      The table name
 * @return the list of archived metadata, or empty if there is no need to read the archived timeline
 */
private List<HoodieCommitMetadata> getArchivedMetadata(HoodieTableMetaClient metaClient, InstantRange instantRange, HoodieTimeline commitTimeline, String tableName) {
    if (commitTimeline.isBeforeTimelineStarts(instantRange.getStartInstant())) {
        // read the archived metadata if the start instant is archived.
        HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(instantRange.getStartInstant());
        HoodieTimeline archivedCompleteTimeline = archivedTimeline.getCommitsTimeline().filterCompletedInstants();
        if (!archivedCompleteTimeline.empty()) {
            Stream<HoodieInstant> instantStream = archivedCompleteTimeline.getInstants();
            return maySkipCompaction(instantStream)
                .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, archivedTimeline))
                .collect(Collectors.toList());
        }
    }
    return Collections.emptyList();
}
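
The maySkipCompaction helper referenced above is not shown in this example. A plausible sketch, under the assumption that it drops compaction-produced instants when the reader is configured to skip compaction; the skipCompaction field name and the filtering rule are assumptions, not the confirmed Hudi implementation:

private Stream<HoodieInstant> maySkipCompaction(Stream<HoodieInstant> instants) {
    // Compaction on a merge-on-read table produces 'commit' instants; when the reader
    // is configured to skip compaction, drop those so only delta commits are replayed.
    // The skipCompaction flag is an assumption for this sketch.
    return this.skipCompaction
        ? instants.filter(instant -> !instant.getAction().equals(HoodieTimeline.COMMIT_ACTION))
        : instants;
}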

Aggregations

ArrayList (java.util.ArrayList) ×2
Collection (java.util.Collection) ×2
Collections (java.util.Collections) ×2
HashSet (java.util.HashSet) ×2
List (java.util.List) ×2
Objects (java.util.Objects) ×2
Set (java.util.Set) ×2
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) ×2
Collectors (java.util.stream.Collectors) ×2
Stream (java.util.stream.Stream) ×2
Nullable (javax.annotation.Nullable) ×2
Configuration (org.apache.flink.configuration.Configuration) ×2
Path (org.apache.flink.core.fs.Path) ×2
FileStatus (org.apache.hadoop.fs.FileStatus) ×2
BaseFile (org.apache.hudi.common.model.BaseFile) ×2
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) ×2
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) ×2
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) ×2
InstantRange (org.apache.hudi.common.table.log.InstantRange) ×2
HoodieArchivedTimeline (org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) ×2