Example 71 with HoodieInstant

use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

the class BucketStreamWriteFunction method bootstrapIndex.

/**
 * Get the partition_bucket -> fileID mapping from the existing Hudi table.
 * This is required on each restart to avoid duplicate file IDs for a single bucket.
 */
private void bootstrapIndex() throws IOException {
    Option<HoodieInstant> latestCommitTime = table.getFileSystemView().getTimeline().filterCompletedInstants().lastInstant();
    if (!latestCommitTime.isPresent()) {
        return;
    }
    // bootstrap bucket info from existing file system
    // a bucket belongs to this task when bucketNum % parallelism == taskID
    HashSet<Integer> bucketToLoad = new HashSet<>();
    for (int i = 0; i < bucketNum; i++) {
        int partitionOfBucket = BucketIdentifier.mod(i, parallelism);
        if (partitionOfBucket == taskID) {
            LOG.info(String.format("Bootstrapping index. Adding bucket %s , " + "Current parallelism: %s , Max parallelism: %s , Current task id: %s", i, parallelism, maxParallelism, taskID));
            bucketToLoad.add(i);
        }
    }
    bucketToLoad.forEach(bucket -> LOG.info(String.format("bucketToLoad contains %s", bucket)));
    LOG.info(String.format("Loading Hoodie Table %s, with path %s", table.getMetaClient().getTableConfig().getTableName(), table.getMetaClient().getBasePath()));
    // Iterate through all existing partitions to load the existing file IDs that belong to this task
    List<String> partitions = table.getMetadata().getAllPartitionPaths();
    for (String partitionPath : partitions) {
        List<FileSlice> latestFileSlices = table.getSliceView().getLatestFileSlices(partitionPath).collect(toList());
        for (FileSlice fileSlice : latestFileSlices) {
            String fileID = fileSlice.getFileId();
            int bucketNumber = BucketIdentifier.bucketIdFromFileId(fileID);
            if (bucketToLoad.contains(bucketNumber)) {
                String partitionBucketId = BucketIdentifier.partitionBucketIdStr(partitionPath, bucketNumber);
                LOG.info(String.format("Should load this partition bucket %s with fileID %s", partitionBucketId, fileID));
                if (bucketToFileIDMap.containsKey(partitionBucketId)) {
                    throw new RuntimeException(String.format("Duplicate fileID %s from partitionBucket %s found " + "during the BucketStreamWriteFunction index bootstrap.", fileID, partitionBucketId));
                } else {
                    LOG.info(String.format("Adding fileID %s to the partition bucket %s.", fileID, partitionBucketId));
                    bucketToFileIDMap.put(partitionBucketId, fileID);
                }
            }
        }
    }
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), FileSlice (org.apache.hudi.common.model.FileSlice), HashSet (java.util.HashSet)
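
To make the assignment rule above concrete, here is a minimal, self-contained sketch of the bucket-to-subtask mapping: a bucket belongs to a subtask exactly when bucketId % parallelism == taskID. The class and method names below are illustrative, not Hudi API; the plain modulo check stands in for BucketIdentifier.mod.

import java.util.HashSet;
import java.util.Set;

public class BucketAssignmentSketch {

    // Returns the bucket ids owned by the given subtask under modulo assignment.
    static Set<Integer> bucketsForTask(int bucketNum, int parallelism, int taskId) {
        Set<Integer> owned = new HashSet<>();
        for (int bucketId = 0; bucketId < bucketNum; bucketId++) {
            // same check as BucketIdentifier.mod(i, parallelism) == taskID above
            if (bucketId % parallelism == taskId) {
                owned.add(bucketId);
            }
        }
        return owned;
    }

    public static void main(String[] args) {
        // with 8 buckets spread across 4 subtasks, subtask 1 owns buckets 1 and 5
        System.out.println(bucketsForTask(8, 4, 1));
    }
}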

Example 72 with HoodieInstant

use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

the class DeltaWriteProfile method smallFilesProfile.

@Override
protected List<SmallFile> smallFilesProfile(String partitionPath) {
    // collect small files within this partition path only
    List<SmallFile> smallFileLocations = new ArrayList<>();
    // Init here since this class (and member variables) might not have been initialized
    HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
    // Find out all eligible small file slices
    if (!commitTimeline.empty()) {
        HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
        // gather the small file slices in this partition; new inserts can be appended to them
        List<FileSlice> allSmallFileSlices = new ArrayList<>();
        // If we can index log files, we can add more inserts to log files for fileIds including those under
        // pending compaction.
        List<FileSlice> allFileSlices = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false).collect(Collectors.toList());
        for (FileSlice fileSlice : allFileSlices) {
            if (isSmallFile(fileSlice)) {
                allSmallFileSlices.add(fileSlice);
            }
        }
        // Create SmallFiles from the eligible file slices
        for (FileSlice smallFileSlice : allSmallFileSlices) {
            SmallFile sf = new SmallFile();
            if (smallFileSlice.getBaseFile().isPresent()) {
                // TODO : Move logic of file name, file id, base commit time handling inside file slice
                String filename = smallFileSlice.getBaseFile().get().getFileName();
                sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
                sf.sizeBytes = getTotalFileSize(smallFileSlice);
                smallFileLocations.add(sf);
            } else {
                smallFileSlice.getLogFiles().findFirst().ifPresent(logFile -> {
                    // ifPresent guards against an error case where the file slice has no log file
                    sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()), FSUtils.getFileIdFromLogPath(logFile.getPath()));
                    sf.sizeBytes = getTotalFileSize(smallFileSlice);
                    smallFileLocations.add(sf);
                });
            }
        }
    }
    return smallFileLocations;
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), FileSlice (org.apache.hudi.common.model.FileSlice), SmallFile (org.apache.hudi.table.action.commit.SmallFile), ArrayList (java.util.ArrayList), HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation)
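
The snippet relies on isSmallFile and getTotalFileSize, which are not shown above. A hedged sketch of a plausible reading, assuming a slice counts as "small" when the combined size of its base file and log files falls below a configured limit; SliceSizes and the limit value are illustrative stand-ins, not Hudi types.

import java.util.List;

public class SliceSizeSketch {

    // Illustrative stand-in for a file slice: one base file (0 if absent) plus log files.
    record SliceSizes(long baseFileBytes, List<Long> logFileBytes) {}

    // Assumed definition: a slice is "small" when its total size is under the limit.
    static boolean isSmallSlice(SliceSizes slice, long smallFileLimitBytes) {
        long total = slice.baseFileBytes();
        for (long logBytes : slice.logFileBytes()) {
            total += logBytes; // log files contribute to the slice size on MOR tables
        }
        return total < smallFileLimitBytes;
    }

    public static void main(String[] args) {
        SliceSizes slice = new SliceSizes(40L * 1024 * 1024, List.of(5L * 1024 * 1024));
        System.out.println(isSmallSlice(slice, 100L * 1024 * 1024)); // true
    }
}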

Example 73 with HoodieInstant

use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

the class WriteProfile method smallFilesProfile.

/**
 * Returns a list of small files in the given partition path from the latest filesystem view.
 */
protected List<SmallFile> smallFilesProfile(String partitionPath) {
    // collect small files within this partition path only
    List<SmallFile> smallFileLocations = new ArrayList<>();
    HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
    if (!commitTimeline.empty()) {
        // if we have some commits
        HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
        List<HoodieBaseFile> allFiles = fsView.getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
        for (HoodieBaseFile file : allFiles) {
            // filter out corrupted (zero-length) files and keep those under the small-file limit
            if (file.getFileSize() < config.getParquetSmallFileLimit() && file.getFileSize() > 0) {
                String filename = file.getFileName();
                SmallFile sf = new SmallFile();
                sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
                sf.sizeBytes = file.getFileSize();
                smallFileLocations.add(sf);
            }
        }
    }
    return smallFileLocations;
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), SmallFile (org.apache.hudi.table.action.commit.SmallFile), ArrayList (java.util.ArrayList), HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation)
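
Here the small-file condition is fully visible, so it can be restated as a plain stream filter. BaseFile and the 100 MB limit below are illustrative stand-ins for HoodieBaseFile and config.getParquetSmallFileLimit(), not Hudi API.

import java.util.List;
import java.util.stream.Collectors;

public class BaseFileFilterSketch {

    // Illustrative stand-in for HoodieBaseFile: just a name and a size.
    record BaseFile(String fileName, long fileSize) {}

    // Keep files that are non-empty (zero-length files are treated as corrupted)
    // and below the small-file limit, mirroring the filter in the snippet above.
    static List<BaseFile> smallFiles(List<BaseFile> allFiles, long parquetSmallFileLimit) {
        return allFiles.stream()
                .filter(f -> f.fileSize() > 0 && f.fileSize() < parquetSmallFileLimit)
                .collect(Collectors.toList());
    }

    public static void main(String[] args) {
        List<BaseFile> files = List.of(
                new BaseFile("a.parquet", 0),                 // corrupted, dropped
                new BaseFile("b.parquet", 10L * 1024 * 1024), // small, kept
                new BaseFile("c.parquet", 300L * 1024 * 1024) // large, dropped
        );
        System.out.println(smallFiles(files, 100L * 1024 * 1024));
    }
}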

Example 74 with HoodieInstant

use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

the class HoodieHiveClient method updateLastCommitTimeSynced.

@Override
public void updateLastCommitTimeSynced(String tableName) {
    // Store the last synced commit time in the table's TBLPROPERTIES
    Option<String> lastCommitSynced = activeTimeline.lastInstant().map(HoodieInstant::getTimestamp);
    if (lastCommitSynced.isPresent()) {
        try {
            Table table = client.getTable(syncConfig.databaseName, tableName);
            table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced.get());
            client.alter_table(syncConfig.databaseName, tableName, table);
        } catch (Exception e) {
            throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e);
        }
    }
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), Table (org.apache.hadoop.hive.metastore.api.Table), TException (org.apache.thrift.TException), HoodieSyncException (org.apache.hudi.sync.common.HoodieSyncException), NoSuchObjectException (org.apache.hadoop.hive.metastore.api.NoSuchObjectException)
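
Stripped of the metastore plumbing, the bookkeeping above reduces to writing the latest completed instant's timestamp under a well-known table property. A hedged sketch, with a plain map standing in for the Hive table's TBLPROPERTIES and an assumed key name in place of HOODIE_LAST_COMMIT_TIME_SYNC:

import java.util.HashMap;
import java.util.Map;
import java.util.Optional;

public class LastCommitSyncSketch {

    // Key name mirrors HOODIE_LAST_COMMIT_TIME_SYNC; the exact constant value is an assumption.
    static final String LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";

    // The map stands in for the Hive table's TBLPROPERTIES.
    static void updateLastCommitTimeSynced(Map<String, String> tblProperties,
                                           Optional<String> lastInstantTimestamp) {
        // no-op when the timeline has no completed instant, matching the guard above
        lastInstantTimestamp.ifPresent(ts -> tblProperties.put(LAST_COMMIT_TIME_SYNC, ts));
    }

    public static void main(String[] args) {
        Map<String, String> props = new HashMap<>();
        updateLastCommitTimeSynced(props, Optional.of("20220101123000"));
        System.out.println(props); // {last_commit_time_sync=20220101123000}
    }
}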

Example 75 with HoodieInstant

use of org.apache.hudi.common.table.timeline.HoodieInstant in project hudi by apache.

the class HoodieBackedTableMetadataWriter method update.

/**
 * Update from {@code HoodieRollbackMetadata}.
 *
 * @param rollbackMetadata {@code HoodieRollbackMetadata}
 * @param instantTime Timestamp at which the rollback was performed
 */
@Override
public void update(HoodieRollbackMetadata rollbackMetadata, String instantTime) {
    if (enabled && metadata != null) {
        // Is this a rollback of an instant that has been synced to the metadata table?
        String rollbackInstant = rollbackMetadata.getCommitsRollback().get(0);
        boolean wasSynced = metadataMetaClient.getActiveTimeline().containsInstant(new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, rollbackInstant));
        if (!wasSynced) {
            // A compaction may have taken place on the metadata table that would have included the instant being rolled back.
            // Revisit this logic to relax the compaction fencing : https://issues.apache.org/jira/browse/HUDI-2458
            Option<String> latestCompaction = metadata.getLatestCompactionTime();
            if (latestCompaction.isPresent()) {
                wasSynced = HoodieTimeline.compareTimestamps(rollbackInstant, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCompaction.get());
            }
        }
        Map<MetadataPartitionType, HoodieData<HoodieRecord>> records = HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, metadataMetaClient.getActiveTimeline(), rollbackMetadata, getRecordsGenerationParams(), instantTime, metadata.getSyncedInstantTime(), wasSynced);
        commit(instantTime, records, false);
    }
}
Also used: HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieData (org.apache.hudi.common.data.HoodieData)
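
The wasSynced decision above combines a timeline membership check with a timestamp comparison. A minimal sketch of that logic, assuming (as Hudi does) that instant timestamps are lexicographically ordered strings, so String.compareTo can stand in for HoodieTimeline.compareTimestamps with LESSER_THAN_OR_EQUALS; the class and parameter names are illustrative.

import java.util.Optional;
import java.util.Set;

public class RollbackSyncCheckSketch {

    static boolean wasSynced(Set<String> metadataTimelineInstants,
                             String rollbackInstant,
                             Optional<String> latestCompactionTime) {
        if (metadataTimelineInstants.contains(rollbackInstant)) {
            return true; // the instant itself is on the metadata timeline
        }
        // otherwise, a compaction at or after the instant implies it was already folded in
        return latestCompactionTime
                .map(compaction -> rollbackInstant.compareTo(compaction) <= 0)
                .orElse(false);
    }

    public static void main(String[] args) {
        System.out.println(wasSynced(Set.of("001", "002"), "003", Optional.of("004"))); // true
    }
}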

Aggregations

HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 323
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 129
ArrayList (java.util.ArrayList): 118
List (java.util.List): 116
IOException (java.io.IOException): 112
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 104
Test (org.junit.jupiter.api.Test): 97
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 96
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 89
Map (java.util.Map): 84
Option (org.apache.hudi.common.util.Option): 84
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 84
Collectors (java.util.stream.Collectors): 83
HashMap (java.util.HashMap): 81
Path (org.apache.hadoop.fs.Path): 78
Pair (org.apache.hudi.common.util.collection.Pair): 71
Logger (org.apache.log4j.Logger): 67
LogManager (org.apache.log4j.LogManager): 66
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 65
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 61