Search in sources :

Example 21 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class DeltaWriteProfile method smallFilesProfile.

/**
 * Builds the list of small files for the given partition, using the latest file slices
 * on or before the last completed commit.
 *
 * <p>A slice that has a base file is located through the base file name; a slice without
 * a base file is located through its first log file, if it has one.
 *
 * @param partitionPath partition whose file slices are inspected
 * @return the small files found; empty when there is no completed commit
 */
@Override
protected List<SmallFile> smallFilesProfile(String partitionPath) {
    // Only small files belonging to partitionPath end up in this list
    List<SmallFile> result = new ArrayList<>();
    // Resolved here since this class (and member variables) might not have been initialized
    HoodieTimeline completedCommits = metaClient.getCommitsTimeline().filterCompletedInstants();
    if (completedCommits.empty()) {
        return result;
    }
    HoodieInstant lastCommit = completedCommits.lastInstant().get();
    // If we can index log files, we can add more inserts to log files for fileIds including those under
    // pending compaction.
    List<FileSlice> latestSlices = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, lastCommit.getTimestamp(), false).collect(Collectors.toList());
    // Keep only the slices that qualify as small files
    List<FileSlice> smallSlices = latestSlices.stream()
        .filter(slice -> isSmallFile(slice))
        .collect(Collectors.toList());
    // Turn every eligible slice into a SmallFile entry
    for (FileSlice slice : smallSlices) {
        SmallFile smallFile = new SmallFile();
        if (!slice.getBaseFile().isPresent()) {
            // No base file: locate the slice through its first log file. If something went wrong
            // and the slice has no log file either, nothing is added for it.
            slice.getLogFiles().findFirst().ifPresent(firstLog -> {
                smallFile.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(firstLog.getPath()), FSUtils.getFileIdFromLogPath(firstLog.getPath()));
                smallFile.sizeBytes = getTotalFileSize(slice);
                result.add(smallFile);
            });
            continue;
        }
        // TODO : Move logic of file name, file id, base commit time handling inside file slice
        String baseFileName = slice.getBaseFile().get().getFileName();
        smallFile.location = new HoodieRecordLocation(FSUtils.getCommitTime(baseFileName), FSUtils.getFileId(baseFileName));
        smallFile.sizeBytes = getTotalFileSize(slice);
        result.add(smallFile);
    }
    return result;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Example 22 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class WriteProfile method smallFilesProfile.

/**
 * Lists the small base files of the given partition, taken from the latest base files
 * on or before the last completed commit.
 *
 * @param partitionPath partition whose base files are inspected
 * @return the small files found; empty when there is no completed commit
 */
protected List<SmallFile> smallFilesProfile(String partitionPath) {
    // Only small files belonging to partitionPath end up in this list
    List<SmallFile> result = new ArrayList<>();
    HoodieTimeline completedCommits = metaClient.getCommitsTimeline().filterCompletedInstants();
    if (completedCommits.empty()) {
        // no commits yet, so there is nothing to profile
        return result;
    }
    HoodieInstant lastCommit = completedCommits.lastInstant().get();
    List<HoodieBaseFile> baseFiles = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, lastCommit.getTimestamp()).collect(Collectors.toList());
    for (HoodieBaseFile baseFile : baseFiles) {
        // Skip files at or above the small-file limit, and zero-sized files
        // (the latter are filtered out as corrupted).
        if (baseFile.getFileSize() >= config.getParquetSmallFileLimit() || baseFile.getFileSize() <= 0) {
            continue;
        }
        String baseFileName = baseFile.getFileName();
        SmallFile smallFile = new SmallFile();
        smallFile.location = new HoodieRecordLocation(FSUtils.getCommitTime(baseFileName), FSUtils.getFileId(baseFileName));
        smallFile.sizeBytes = baseFile.getFileSize();
        result.add(smallFile);
    }
    return result;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Example 23 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class RequestHandler method isLocalViewBehind.

/**
 * Decides whether the local view of the table's timeline lags behind the client's view.
 *
 * @param ctx request context; the base path, the client's last known instant and the client's
 *            timeline hash are read from its query parameters
 * @return {@code true} when the local view is considered behind the client's view
 */
private boolean isLocalViewBehind(Context ctx) {
    String basePath = ctx.queryParam(RemoteHoodieTableFileSystemView.BASEPATH_PARAM);
    String clientLastInstant = ctx.queryParam(RemoteHoodieTableFileSystemView.LAST_INSTANT_TS, HoodieTimeline.INVALID_INSTANT_TS);
    String clientTimelineHash = ctx.queryParam(RemoteHoodieTableFileSystemView.TIMELINE_HASH, "");
    HoodieTimeline localTimeline = viewManager.getFileSystemView(basePath).getTimeline().filterCompletedAndCompactionInstants();

    // Fall back to the invalid-instant marker when the local timeline has no last instant
    String localLastInstant;
    if (localTimeline.lastInstant().isPresent()) {
        localLastInstant = localTimeline.lastInstant().get().getTimestamp();
    } else {
        localLastInstant = HoodieTimeline.INVALID_INSTANT_TS;
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("Client [ LastTs=" + clientLastInstant + ", TimelineHash=" + clientTimelineHash + "], localTimeline=" + localTimeline.getInstants().collect(Collectors.toList()));
    }

    // Neither side has seen any instant: nothing to catch up on
    boolean localTimelineIsEmpty = !localTimeline.getInstants().findAny().isPresent();
    if (localTimelineIsEmpty && HoodieTimeline.INVALID_INSTANT_TS.equals(clientLastInstant)) {
        return false;
    }

    String localTimelineHash = localTimeline.getTimelineHash();
    // refresh if timeline hash mismatches and if local's last known instant < client's last known instant (if config is enabled)
    if (!localTimelineHash.equals(clientTimelineHash)) {
        if (!timelineServiceConfig.refreshTimelineBasedOnLatestCommit || HoodieTimeline.compareTimestamps(localLastInstant, HoodieTimeline.LESSER_THAN, clientLastInstant)) {
            return true;
        }
    }

    // As a safety check, even if hash is same, ensure instant is present
    return !localTimeline.containsOrBeforeTimelineStarts(clientLastInstant);
}
Also used : HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)

Example 24 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class SparkRDDWriteClient method cluster.

/**
 * Runs clustering for the given instant and optionally completes it.
 *
 * <p>Steps, in order: create the table, call {@code preWrite} for a CLUSTER operation, roll back
 * a matching inflight replace-commit instant if the pending replace timeline contains one,
 * start the clustering timer, execute clustering on the table, and finally call
 * {@code completeTableService} when requested and commit metadata is present.
 *
 * @param clusteringInstant instant time of the clustering to execute
 * @param shouldComplete    when {@code true} and commit metadata is present, the clustering is
 *                          passed to {@code completeTableService}
 * @return the clustering write metadata, with write statuses exposed as a {@code JavaRDD}
 */
@Override
public HoodieWriteMetadata<JavaRDD<WriteStatus>> cluster(String clusteringInstant, boolean shouldComplete) {
    HoodieSparkTable<T> table = HoodieSparkTable.create(config, context, config.isMetadataTableEnabled());
    preWrite(clusteringInstant, WriteOperationType.CLUSTER, table.getMetaClient());
    HoodieTimeline pendingClusteringTimeline = table.getActiveTimeline().filterPendingReplaceTimeline();
    HoodieInstant inflightInstant = HoodieTimeline.getReplaceCommitInflightInstant(clusteringInstant);
    // An inflight instant for this clustering time is still pending (presumably a previous
    // attempt that did not finish — TODO confirm): roll it back and reload the active timeline
    // before executing.
    if (pendingClusteringTimeline.containsInstant(inflightInstant)) {
        rollbackInflightClustering(inflightInstant, table);
        table.getMetaClient().reloadActiveTimeline();
    }
    // Timer context is stored in a field; it is not stopped within this method.
    clusteringTimer = metrics.getClusteringCtx();
    LOG.info("Starting clustering at " + clusteringInstant);
    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = table.cluster(context, clusteringInstant);
    // Re-wrap the result so that the write statuses are exposed as a JavaRDD
    HoodieWriteMetadata<JavaRDD<WriteStatus>> clusteringMetadata = writeMetadata.clone(HoodieJavaRDD.getJavaRDD(writeMetadata.getWriteStatuses()));
    // TODO : Where is shouldComplete used ?
    // NOTE(review): within this method, shouldComplete only gates the completeTableService call below.
    if (shouldComplete && clusteringMetadata.getCommitMetadata().isPresent()) {
        completeTableService(TableServiceType.CLUSTER, clusteringMetadata.getCommitMetadata().get(), table, clusteringInstant);
    }
    return clusteringMetadata;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD)

Example 25 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class SparkRDDWriteClient method compact.

/**
 * Runs compaction for the given instant and optionally completes it.
 *
 * <p>Steps, in order: create the table, call {@code preWrite} for a COMPACT operation, roll back
 * a matching inflight compaction instant if the pending compaction timeline contains one,
 * start the compaction timer, execute compaction on the table, and finally call
 * {@code completeTableService} when requested and commit metadata is present.
 *
 * @param compactionInstantTime instant time of the compaction to execute
 * @param shouldComplete        when {@code true} and commit metadata is present, the compaction
 *                              is passed to {@code completeTableService}
 * @return the compaction write metadata, with write statuses exposed as a {@code JavaRDD}
 */
@Override
protected HoodieWriteMetadata<JavaRDD<WriteStatus>> compact(String compactionInstantTime, boolean shouldComplete) {
    // NOTE(review): the third argument is hard-coded to true here; its meaning is defined by
    // HoodieSparkTable.create — confirm this is intentional.
    HoodieSparkTable<T> table = HoodieSparkTable.create(config, context, true);
    preWrite(compactionInstantTime, WriteOperationType.COMPACT, table.getMetaClient());
    HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
    HoodieInstant inflightInstant = HoodieTimeline.getCompactionInflightInstant(compactionInstantTime);
    // An inflight instant for this compaction time is still pending (presumably a previous
    // attempt that did not finish — TODO confirm): roll it back and reload the active timeline
    // before executing.
    if (pendingCompactionTimeline.containsInstant(inflightInstant)) {
        table.rollbackInflightCompaction(inflightInstant, commitToRollback -> getPendingRollbackInfo(table.getMetaClient(), commitToRollback, false));
        table.getMetaClient().reloadActiveTimeline();
    }
    // Timer context is stored in a field; it is not stopped within this method.
    compactionTimer = metrics.getCompactionCtx();
    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = table.compact(context, compactionInstantTime);
    // Re-wrap the result so that the write statuses are exposed as a JavaRDD
    HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = writeMetadata.clone(HoodieJavaRDD.getJavaRDD(writeMetadata.getWriteStatuses()));
    // Complete only when the caller asked for it and the run produced commit metadata
    if (shouldComplete && compactionMetadata.getCommitMetadata().isPresent()) {
        completeTableService(TableServiceType.COMPACT, compactionMetadata.getCommitMetadata().get(), table, compactionInstantTime);
    }
    return compactionMetadata;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD)

Aggregations

HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 118
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 74
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 39
List (java.util.List): 36
IOException (java.io.IOException): 34
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 34
ArrayList (java.util.ArrayList): 32
Option (org.apache.hudi.common.util.Option): 30
Collectors (java.util.stream.Collectors): 29
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 29
HoodieException (org.apache.hudi.exception.HoodieException): 26
Map (java.util.Map): 25
FileStatus (org.apache.hadoop.fs.FileStatus): 24
Path (org.apache.hadoop.fs.Path): 24
Set (java.util.Set): 22
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 22
FileSlice (org.apache.hudi.common.model.FileSlice): 21
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 21
Pair (org.apache.hudi.common.util.collection.Pair): 21
FSUtils (org.apache.hudi.common.fs.FSUtils): 20