Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class SparkUpsertDeltaCommitPartitioner, method getSmallFiles.
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
  // Init here since this class (and member variables) might not have been initialized
  HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
  if (commitTimeline.empty()) {
    return Collections.emptyList();
  }
  HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
  // Find all eligible small file slices, looking for the smallest file in the partition to append to
  List<FileSlice> smallFileSlicesCandidates = getSmallFileCandidates(partitionPath, latestCommitTime);
  List<SmallFile> smallFileLocations = new ArrayList<>();
  // Create SmallFiles from the eligible file slices
  for (FileSlice smallFileSlice : smallFileSlicesCandidates) {
    SmallFile sf = new SmallFile();
    if (smallFileSlice.getBaseFile().isPresent()) {
      // TODO: Move logic of file name, file id, base commit time handling inside file slice
      String filename = smallFileSlice.getBaseFile().get().getFileName();
      sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
    } else {
      HoodieLogFile logFile = smallFileSlice.getLogFiles().findFirst().get();
      sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()),
          FSUtils.getFileIdFromLogPath(logFile.getPath()));
    }
    sf.sizeBytes = getTotalFileSize(smallFileSlice);
    smallFileLocations.add(sf);
  }
  return smallFileLocations;
}
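For MOR file slices, the size accounting is delegated to a getTotalFileSize(FileSlice) helper that is not shown on this page. Below is a minimal sketch in plain Java, assuming the total is the base file's bytes (when present) plus the bytes of all log files; Hudi's actual helper may additionally scale log bytes to an expected base-file size, so treat this as an illustration only, with all names hypothetical.

import java.util.List;
import java.util.Optional;

final class FileSliceSizeSketch {

  // Total slice size = base file bytes, if any, plus all log file bytes
  static long totalFileSize(Optional<Long> baseFileBytes, List<Long> logFileBytes) {
    long logBytes = logFileBytes.stream().mapToLong(Long::longValue).sum();
    return baseFileBytes.orElse(0L) + logBytes;
  }

  public static void main(String[] args) {
    // e.g. a 40MB base file with two 5MB log files appended
    System.out.println(totalFileSize(Optional.of(40L << 20), List.of(5L << 20, 5L << 20)));
  }
}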
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class UpsertPartitioner, method getSmallFiles.
/**
 * Returns a list of small files in the given partition path.
 */
protected List<SmallFile> getSmallFiles(String partitionPath) {
  // smallFiles only for partitionPath
  List<SmallFile> smallFileLocations = new ArrayList<>();
  HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants();
  if (!commitTimeline.empty()) {
    // if we have some commits
    HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
    List<HoodieBaseFile> allFiles = table.getBaseFileOnlyView()
        .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp())
        .collect(Collectors.toList());
    for (HoodieBaseFile file : allFiles) {
      if (file.getFileSize() < config.getParquetSmallFileLimit()) {
        String filename = file.getFileName();
        SmallFile sf = new SmallFile();
        sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
        sf.sizeBytes = file.getFileSize();
        smallFileLocations.add(sf);
      }
    }
  }
  return smallFileLocations;
}
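The only gate here is the small-file threshold from the write config. A standalone sketch of that filter, assuming a 100MB limit (in Hudi the value comes from config.getParquetSmallFileLimit(); the class and data below are hypothetical):

import java.util.List;
import java.util.stream.Collectors;

final class SmallFileFilterSketch {

  public static void main(String[] args) {
    long smallFileLimit = 100L << 20; // assumed limit, stands in for config.getParquetSmallFileLimit()
    List<Long> baseFileBytes = List.of(30L << 20, 120L << 20, 90L << 20);
    // A base file qualifies as a bin-packing target when it is below the limit
    List<Long> candidates = baseFileBytes.stream()
        .filter(bytes -> bytes < smallFileLimit)
        .collect(Collectors.toList());
    System.out.println(candidates.size() + " small-file candidate(s)"); // prints 2
  }
}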
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class UpsertPartitioner, method getPartition.
@Override
public int getPartition(Object key) {
  Tuple2<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Tuple2<HoodieKey, Option<HoodieRecordLocation>>) key;
  if (keyLocation._2().isPresent()) {
    // Update: route to the bucket already assigned to this record's file id
    HoodieRecordLocation location = keyLocation._2().get();
    return updateLocationToBucket.get(location.getFileId());
  } else {
    String partitionPath = keyLocation._1().getPartitionPath();
    List<InsertBucketCumulativeWeightPair> targetBuckets = partitionPathToInsertBucketInfos.get(partitionPath);
    // Insert: pick the target bucket based on the cumulative bucket weights
    final long totalInserts = Math.max(1, profile.getWorkloadStat(partitionPath).getNumInserts());
    final long hashOfKey = NumericUtils.getMessageDigestHash("MD5", keyLocation._1().getRecordKey());
    final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
    int index = Collections.binarySearch(targetBuckets, new InsertBucketCumulativeWeightPair(new InsertBucket(), r));
    if (index >= 0) {
      return targetBuckets.get(index).getKey().bucketNumber;
    }
    if ((-1 * index - 1) < targetBuckets.size()) {
      return targetBuckets.get(-1 * index - 1).getKey().bucketNumber;
    }
    // Fall back to the first bucket by default
    return targetBuckets.get(0).getKey().bucketNumber;
  }
}
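The insert branch is a deterministic weighted assignment: the record key is hashed, mapped into [0, 1), and matched against the buckets' cumulative weights, so a bucket with twice the weight receives roughly twice the inserts. A standalone sketch of that selection, using a linear scan in place of the binary search above (all names are illustrative, not Hudi's):

import java.util.Arrays;
import java.util.List;

final class WeightedBucketPickSketch {

  static int pickBucket(List<Double> cumulativeWeights, long hashOfKey, long totalInserts) {
    // Map the hash into [0, 1), exactly as the partitioner does
    double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
    for (int i = 0; i < cumulativeWeights.size(); i++) {
      if (r <= cumulativeWeights.get(i)) {
        return i; // first bucket whose cumulative weight covers r
      }
    }
    return 0; // fall back to the first bucket, as the partitioner does
  }

  public static void main(String[] args) {
    // Three buckets weighted 25% / 50% / 25%
    List<Double> cumulative = Arrays.asList(0.25, 0.75, 1.0);
    System.out.println(pickBucket(cumulative, 1234567L, 1000)); // 567/1000 = 0.567 -> bucket 1
  }
}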
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class JavaUpsertPartitioner, method assignUpdates.
private void assignUpdates(WorkloadProfile profile) {
  // each update location gets a partition
  Set<Map.Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet();
  for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
    WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat());
    for (Map.Entry<String, Pair<String, Long>> updateLocEntry : partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
      addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
      if (profile.hasOutputWorkLoadStats()) {
        HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey());
        outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue());
      }
    }
    if (profile.hasOutputWorkLoadStats()) {
      profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats);
    }
  }
}
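addUpdateBucket is not shown on this page. A minimal sketch, assuming it simply hands out the next bucket number per updated file id and records the mapping that getPartition later consults; the body below is an illustration, not Hudi's code (the real method also takes the partition path, as the call above shows):

import java.util.HashMap;
import java.util.Map;

final class UpdateBucketSketch {

  private final Map<String, Integer> updateLocationToBucket = new HashMap<>();
  private int totalBuckets = 0;

  int addUpdateBucket(String fileIdHint) {
    // One bucket per updated file id; remember it for routing in getPartition
    int bucket = totalBuckets++;
    updateLocationToBucket.put(fileIdHint, bucket);
    return bucket;
  }

  public static void main(String[] args) {
    UpdateBucketSketch sketch = new UpdateBucketSketch();
    System.out.println(sketch.addUpdateBucket("file-1")); // 0
    System.out.println(sketch.addUpdateBucket("file-2")); // 1
  }
}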
Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.
The class JavaUpsertPartitioner, method getPartition.
@Override
public int getPartition(Object key) {
  Pair<HoodieKey, Option<HoodieRecordLocation>> keyLocation = (Pair<HoodieKey, Option<HoodieRecordLocation>>) key;
  if (keyLocation.getRight().isPresent()) {
    // Update: route to the bucket already assigned to this record's file id
    HoodieRecordLocation location = keyLocation.getRight().get();
    return updateLocationToBucket.get(location.getFileId());
  } else {
    String partitionPath = keyLocation.getLeft().getPartitionPath();
    List<InsertBucketCumulativeWeightPair> targetBuckets = partitionPathToInsertBucketInfos.get(partitionPath);
    // Insert: pick the target bucket based on the cumulative bucket weights
    final long totalInserts = Math.max(1, workloadProfile.getWorkloadStat(partitionPath).getNumInserts());
    final long hashOfKey = NumericUtils.getMessageDigestHash("MD5", keyLocation.getLeft().getRecordKey());
    final double r = 1.0 * Math.floorMod(hashOfKey, totalInserts) / totalInserts;
    int index = Collections.binarySearch(targetBuckets, new InsertBucketCumulativeWeightPair(new InsertBucket(), r));
    if (index >= 0) {
      return targetBuckets.get(index).getKey().bucketNumber;
    }
    if ((-1 * index - 1) < targetBuckets.size()) {
      return targetBuckets.get(-1 * index - 1).getKey().bucketNumber;
    }
    // Fall back to the first bucket by default
    return targetBuckets.get(0).getKey().bucketNumber;
  }
}
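Both getPartition implementations lean on the java.util.Collections.binarySearch contract: when the probe is not found, the return value encodes the insertion point as -(insertionPoint) - 1, so -1 * index - 1 recovers the index of the first bucket whose cumulative weight exceeds r. A tiny worked example:

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

final class BinarySearchContractSketch {

  public static void main(String[] args) {
    List<Double> cumulativeWeights = Arrays.asList(0.25, 0.75, 1.0);
    int index = Collections.binarySearch(cumulativeWeights, 0.5); // not found: returns -2
    int insertionPoint = -1 * index - 1; // 1, i.e. the 0.75 bucket handles r = 0.5
    System.out.println(cumulativeWeights.get(insertionPoint)); // prints 0.75
  }
}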