Search in sources :

Example 31 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class SparkUpsertDeltaCommitPartitioner method getSmallFiles.

/**
 * Builds the list of small files in {@code partitionPath} from the file slices
 * returned by {@code getSmallFileCandidates}.
 *
 * <p>Each candidate slice yields one {@link SmallFile}: its location is derived from
 * the base file name when a base file is present, otherwise from the path of the
 * slice's first log file. Its size is whatever {@code getTotalFileSize} reports for
 * the slice.
 *
 * @param partitionPath partition to inspect
 * @return one entry per candidate slice; an empty list when the table has no completed commits
 */
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
    // Resolve the timeline on each call: this class (and its member variables)
    // might not have been initialized when this method runs.
    HoodieTimeline completedCommits = table.getCompletedCommitsTimeline();
    if (completedCommits.empty()) {
        return Collections.emptyList();
    }
    HoodieInstant lastCommit = completedCommits.lastInstant().get();
    // Eligible small file slices, i.e. the files in the partition we may append to.
    List<FileSlice> candidateSlices = getSmallFileCandidates(partitionPath, lastCommit);
    List<SmallFile> result = new ArrayList<>();
    for (FileSlice slice : candidateSlices) {
        HoodieRecordLocation location;
        if (slice.getBaseFile().isPresent()) {
            // TODO : Move logic of file name, file id, base commit time handling inside file slice
            String baseFileName = slice.getBaseFile().get().getFileName();
            location = new HoodieRecordLocation(FSUtils.getCommitTime(baseFileName), FSUtils.getFileId(baseFileName));
        } else {
            // No base file: identify the slice through its first log file.
            HoodieLogFile firstLogFile = slice.getLogFiles().findFirst().get();
            location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(firstLogFile.getPath()), FSUtils.getFileIdFromLogPath(firstLogFile.getPath()));
        }
        SmallFile smallFile = new SmallFile();
        smallFile.location = location;
        smallFile.sizeBytes = getTotalFileSize(slice);
        result.add(smallFile);
    }
    return result;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)

Example 32 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class UpsertPartitioner method getSmallFiles.

/**
 * Returns a list of small files in the given partition path.
 *
 * <p>Considers the latest base files at or before the last completed commit and keeps
 * those whose size is below {@code config.getParquetSmallFileLimit()}.
 *
 * @param partitionPath partition to inspect
 * @return the small files found; an empty, mutable list when there are no completed commits
 */
protected List<SmallFile> getSmallFiles(String partitionPath) {
    List<SmallFile> smallFiles = new ArrayList<>();
    HoodieTimeline completedCommits = table.getMetaClient().getCommitsTimeline().filterCompletedInstants();
    if (completedCommits.empty()) {
        // Nothing has been committed yet, so there is nothing to pack into.
        return smallFiles;
    }
    HoodieInstant lastCommit = completedCommits.lastInstant().get();
    List<HoodieBaseFile> latestBaseFiles = table.getBaseFileOnlyView()
        .getLatestBaseFilesBeforeOrOn(partitionPath, lastCommit.getTimestamp())
        .collect(Collectors.toList());
    for (HoodieBaseFile baseFile : latestBaseFiles) {
        if (baseFile.getFileSize() >= config.getParquetSmallFileLimit()) {
            // At or above the limit: not a small file.
            continue;
        }
        String baseFileName = baseFile.getFileName();
        SmallFile smallFile = new SmallFile();
        smallFile.location = new HoodieRecordLocation(FSUtils.getCommitTime(baseFileName), FSUtils.getFileId(baseFileName));
        smallFile.sizeBytes = baseFile.getFileSize();
        smallFiles.add(smallFile);
    }
    return smallFiles;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Example 33 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class UpsertPartitioner method getPartition.

/**
 * Maps a {@code (HoodieKey, Option<HoodieRecordLocation>)} tuple to a bucket number.
 *
 * <p>When the location is present, the bucket is looked up in
 * {@code updateLocationToBucket} by the location's file id. Otherwise an insert bucket
 * of the key's partition path is chosen: the MD5-based hash of the record key is
 * reduced to a fraction {@code r} in {@code [0, 1)} and binary-searched in the
 * partition's {@code InsertBucketCumulativeWeightPair} list.
 *
 * @param key a {@code Tuple2<HoodieKey, Option<HoodieRecordLocation>>}
 * @return the bucket number for the record
 */
@Override
public int getPartition(Object key) {
    Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyAndLocation = (Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
    Option<HoodieRecordLocation> knownLocation = keyAndLocation._2();
    if (knownLocation.isPresent()) {
        // Location present: the bucket is keyed by file id.
        return updateLocationToBucket.get(knownLocation.get().getFileId());
    }

    HoodieKey hoodieKey = keyAndLocation._1();
    String partitionPath = hoodieKey.getPartitionPath();
    List<InsertBucketCumulativeWeightPair> buckets = partitionPathToInsertBucketInfos.get(partitionPath);
    // pick the target bucket to use based on the weights.
    // totalInserts is clamped to at least 1 so the modulo/division below is well defined.
    final long totalInserts = Math.max(1, profile.getWorkloadStat(partitionPath).getNumInserts());
    final long keyHash = NumericUtils.getMessageDigestHash("MD5", hoodieKey.getRecordKey());
    final double r = 1.0 * Math.floorMod(keyHash, totalInserts) / totalInserts;
    int searchResult = Collections.binarySearch(buckets, new InsertBucketCumulativeWeightPair(new InsertBucket(), r));

    int chosen;
    if (searchResult >= 0) {
        // Exact match.
        chosen = searchResult;
    } else {
        // A negative result encodes the insertion point as -(insertionPoint) - 1.
        int insertionPoint = -searchResult - 1;
        // return first one, by default, when the insertion point is past the end
        chosen = insertionPoint < buckets.size() ? insertionPoint : 0;
    }
    return buckets.get(chosen).getKey().bucketNumber;
}
Also used : Tuple2(scala.Tuple2) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Option(org.apache.hudi.common.util.Option)

Example 34 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class JavaUpsertPartitioner method assignUpdates.

/**
 * Registers an update bucket for every update location found in the profile's input
 * statistics, so that each update location gets a partition.
 *
 * <p>When {@code profile.hasOutputWorkLoadStats()} is true, the same updates are also
 * recorded into the partition's output {@link WorkloadStat}, which is then written
 * back to the profile.
 *
 * @param profile workload profile providing input stats and receiving output stats
 */
private void assignUpdates(WorkloadProfile profile) {
    for (Map.Entry<String, WorkloadStat> inputEntry : profile.getInputPartitionPathStatMap().entrySet()) {
        String partitionPath = inputEntry.getKey();
        // Start from the existing output stat of this partition, or a fresh one if absent.
        WorkloadStat outputStat = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat());
        for (Map.Entry<String, Pair<String, Long>> updateEntry : inputEntry.getValue().getUpdateLocationToCount().entrySet()) {
            // The map key is passed as the second HoodieRecordLocation argument (the file id position).
            String locationKey = updateEntry.getKey();
            addUpdateBucket(partitionPath, locationKey);
            if (!profile.hasOutputWorkLoadStats()) {
                continue;
            }
            Pair<String, Long> locationInfo = updateEntry.getValue();
            HoodieRecordLocation recordLocation = new HoodieRecordLocation(locationInfo.getKey(), locationKey);
            outputStat.addUpdates(recordLocation, locationInfo.getValue());
        }
        if (profile.hasOutputWorkLoadStats()) {
            profile.updateOutputPartitionPathStatMap(partitionPath, outputStat);
        }
    }
}
Also used : WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HashMap(java.util.HashMap) Map(java.util.Map) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Example 35 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class JavaUpsertPartitioner method getPartition.

/**
 * Maps a {@code (HoodieKey, Option<HoodieRecordLocation>)} pair to a bucket number.
 *
 * <p>A present location resolves through {@code updateLocationToBucket} using the
 * location's file id. Otherwise the record key's MD5-based hash is turned into a
 * fraction {@code r} in {@code [0, 1)}, which is binary-searched in the insert
 * buckets registered for the key's partition path.
 *
 * @param key a {@code Pair<HoodieKey, Option<HoodieRecordLocation>>}
 * @return the bucket number for the record
 */
@Override
public int getPartition(Object key) {
    Pair<HoodieKey, Option<HoodieRecordLocation>> keyAndLocation = (Pair<HoodieKey, Option<HoodieRecordLocation>>) key;
    Option<HoodieRecordLocation> maybeLocation = keyAndLocation.getRight();
    if (maybeLocation.isPresent()) {
        // Location present: look the bucket up by file id.
        return updateLocationToBucket.get(maybeLocation.get().getFileId());
    }

    HoodieKey recordKey = keyAndLocation.getLeft();
    String partitionPath = recordKey.getPartitionPath();
    List<InsertBucketCumulativeWeightPair> candidates = partitionPathToInsertBucketInfos.get(partitionPath);
    // pick the target bucket to use based on the weights.
    // Clamp to at least 1 insert so floorMod and the division never see a zero divisor.
    final long totalInserts = Math.max(1, workloadProfile.getWorkloadStat(partitionPath).getNumInserts());
    final long digest = NumericUtils.getMessageDigestHash("MD5", recordKey.getRecordKey());
    final double r = 1.0 * Math.floorMod(digest, totalInserts) / totalInserts;
    int position = Collections.binarySearch(candidates, new InsertBucketCumulativeWeightPair(new InsertBucket(), r));

    if (position < 0) {
        // Not found: binarySearch returned -(insertionPoint) - 1.
        int insertionPoint = -position - 1;
        // return first one, by default, if the insertion point falls past the last bucket
        position = insertionPoint < candidates.size() ? insertionPoint : 0;
    }
    return candidates.get(position).getKey().bucketNumber;
}
Also used : HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Option(org.apache.hudi.common.util.Option) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation)43 ArrayList (java.util.ArrayList)18 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)17 HashMap (java.util.HashMap)16 List (java.util.List)16 HoodieKey (org.apache.hudi.common.model.HoodieKey)16 Map (java.util.Map)13 Pair (org.apache.hudi.common.util.collection.Pair)12 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)9 Option (org.apache.hudi.common.util.Option)9 IOException (java.io.IOException)8 WorkloadStat (org.apache.hudi.table.WorkloadStat)8 SmallFile (org.apache.hudi.table.action.commit.SmallFile)8 Tuple2 (scala.Tuple2)8 HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload)7 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)7 HoodieTable (org.apache.hudi.table.HoodieTable)7 LogManager (org.apache.log4j.LogManager)7 Logger (org.apache.log4j.Logger)7 Collectors (java.util.stream.Collectors)6