Example 6 with WorkloadStat

Use of org.apache.hudi.table.WorkloadStat in project hudi by apache.

From the class HoodieDeleteHelper, method execute.

@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute(
        String instantTime,
        HoodieData<HoodieKey> keys,
        HoodieEngineContext context,
        HoodieWriteConfig config,
        HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
        BaseCommitActionExecutor<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>, R> deleteExecutor) {
    try {
        HoodieData<HoodieKey> dedupedKeys = keys;
        final int parallelism = config.getDeleteShuffleParallelism();
        if (config.shouldCombineBeforeDelete()) {
            // De-dupe/merge if needed
            dedupedKeys = deduplicateKeys(keys, table, parallelism);
        } else if (!keys.isEmpty()) {
            dedupedKeys = keys.repartition(parallelism);
        }
        HoodieData<HoodieRecord<T>> dedupedRecords = dedupedKeys.map(key -> new HoodieAvroRecord(key, new EmptyHoodieRecordPayload()));
        Instant beginTag = Instant.now();
        // perform index lookup to get the existing location of the records
        HoodieData<HoodieRecord<T>> taggedRecords = table.getIndex().tagLocation(dedupedRecords, context, table);
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
        // filter out non-existent keys/records
        HoodieData<HoodieRecord<T>> taggedValidRecords = taggedRecords.filter(HoodieRecord::isCurrentLocationKnown);
        HoodieWriteMetadata<HoodieData<WriteStatus>> result;
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // if the entire set of keys is non-existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
            result = new HoodieWriteMetadata<>();
            result.setWriteStatuses(context.emptyHoodieData());
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Instant(java.time.Instant) Duration(java.time.Duration) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload)
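
When every key turns out to be non-existent, the executor still records an empty workload profile on the inflight instant before committing. A minimal sketch of that pattern, assuming a deleteExecutor and instantTime are in scope as in the example above:

import java.util.HashMap;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.table.WorkloadProfile;
import org.apache.hudi.table.WorkloadStat;

// An "empty" profile: no per-partition stats and a fresh global WorkloadStat.
WorkloadProfile emptyProfile = new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat()));
// Recording it on the inflight instant keeps the timeline consistent even though nothing is written.
deleteExecutor.saveWorkloadProfileMetadataToInflight(emptyProfile, instantTime);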

Example 7 with WorkloadStat

Use of org.apache.hudi.table.WorkloadStat in project hudi by apache.

From the class SparkDeletePartitionCommitActionExecutor, method execute.

@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() {
    HoodieTimer timer = new HoodieTimer().startTimer();
    context.setJobStatus(this.getClass().getSimpleName(), "Gather all file ids from all deleting partitions.");
    Map<String, List<String>> partitionToReplaceFileIds = HoodieJavaPairRDD.getJavaPairRDD(context.parallelize(partitions).distinct().mapToPair(partitionPath -> Pair.of(partitionPath, getAllExistingFileIds(partitionPath)))).collectAsMap();
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = new HoodieWriteMetadata<>();
    result.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
    result.setIndexUpdateDuration(Duration.ofMillis(timer.endTimer()));
    result.setWriteStatuses(context.emptyHoodieData());
    this.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
    this.commitOnAutoCommit(result);
    return result;
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) List(java.util.List) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata)
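
The metadata returned by this delete-partition path carries only the replaced file ids per partition and an empty set of write statuses. A short sketch of assembling that shape with made-up partition and file id values (context is assumed to be the HoodieEngineContext from the example):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.table.action.HoodieWriteMetadata;

// Hypothetical partition path and file ids, for illustration only.
Map<String, List<String>> partitionToReplaceFileIds = new HashMap<>();
partitionToReplaceFileIds.put("2022/01/01", Arrays.asList("fileId-1", "fileId-2"));

HoodieWriteMetadata<HoodieData<WriteStatus>> result = new HoodieWriteMetadata<>();
result.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
// Delete-partition is a replace action: no records are rewritten, so the write statuses stay empty.
result.setWriteStatuses(context.emptyHoodieData());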

Example 8 with WorkloadStat

Use of org.apache.hudi.table.WorkloadStat in project hudi by apache.

From the class UpsertPartitioner, method assignUpdates.

private void assignUpdates(WorkloadProfile profile) {
    // each update location gets a partition
    Set<Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet();
    for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
        WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat());
        for (Map.Entry<String, Pair<String, Long>> updateLocEntry : partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
            addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
            if (profile.hasOutputWorkLoadStats()) {
                HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey());
                outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue());
            }
        }
        if (profile.hasOutputWorkLoadStats()) {
            profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats);
        }
    }
}
Also used : Entry(java.util.Map.Entry) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HashMap(java.util.HashMap) Map(java.util.Map) Pair(org.apache.hudi.common.util.collection.Pair)
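
The partitioners touch WorkloadStat through the same few calls: addUpdates and addInserts keyed by a HoodieRecordLocation, plus the getUpdateLocationToCount and getNumInserts getters seen in these examples. A minimal sketch of that accumulation pattern, with a made-up instant time and file ids:

import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.model.HoodieWriteStat;
import org.apache.hudi.table.WorkloadStat;

WorkloadStat stat = new WorkloadStat();
// 100 updates routed to an existing file group (instant time and file id are illustrative).
stat.addUpdates(new HoodieRecordLocation("20220101000000", "fileId-0"), 100);
// 500 inserts routed to a file group that does not exist yet, hence the NULL_COMMIT placeholder.
stat.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, "fileIdPfx-0"), 500);
// Downstream code reads the stats back, e.g. per-file update counts and the total insert count.
stat.getUpdateLocationToCount().forEach((fileId, instantAndCount) ->
        System.out.println(fileId + " -> " + instantAndCount));
System.out.println("numInserts = " + stat.getNumInserts());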

Example 9 with WorkloadStat

Use of org.apache.hudi.table.WorkloadStat in project hudi by apache.

From the class UpsertPartitioner, method assignInserts.

private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) {
    // for new inserts, compute buckets depending on how many records we have for each partition
    Set<String> partitionPaths = profile.getPartitionPaths();
    long averageRecordSize = averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(), config);
    LOG.info("AvgRecordSize => " + averageRecordSize);
    Map<String, List<SmallFile>> partitionSmallFilesMap = getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), context);
    Map<String, Set<String>> partitionPathToPendingClusteringFileGroupsId = getPartitionPathToPendingClusteringFileGroupsId();
    for (String partitionPath : partitionPaths) {
        WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
        WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat());
        if (pStat.getNumInserts() > 0) {
            List<SmallFile> smallFiles = filterSmallFilesInClustering(partitionPathToPendingClusteringFileGroupsId.getOrDefault(partitionPath, Collections.emptySet()), partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>()));
            this.smallFiles.addAll(smallFiles);
            LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
            long totalUnassignedInserts = pStat.getNumInserts();
            List<Integer> bucketNumbers = new ArrayList<>();
            List<Long> recordsPerBucket = new ArrayList<>();
            // first try packing this into one of the smallFiles
            for (SmallFile smallFile : smallFiles) {
                long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, totalUnassignedInserts);
                if (recordsToAppend > 0) {
                    // create a new bucket or re-use an existing bucket
                    int bucket;
                    if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
                        bucket = updateLocationToBucket.get(smallFile.location.getFileId());
                        LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
                    } else {
                        bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
                        LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
                    }
                    if (profile.hasOutputWorkLoadStats()) {
                        outputWorkloadStats.addInserts(smallFile.location, recordsToAppend);
                    }
                    bucketNumbers.add(bucket);
                    recordsPerBucket.add(recordsToAppend);
                    totalUnassignedInserts -= recordsToAppend;
                    if (totalUnassignedInserts <= 0) {
                        // stop the loop when all the inserts are assigned
                        break;
                    }
                }
            }
            // if we have anything more, create new insert buckets, like normal
            if (totalUnassignedInserts > 0) {
                long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
                if (config.shouldAutoTuneInsertSplits()) {
                    insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
                }
                int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
                LOG.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket);
                for (int b = 0; b < insertBuckets; b++) {
                    bucketNumbers.add(totalBuckets);
                    if (b < insertBuckets - 1) {
                        recordsPerBucket.add(insertRecordsPerBucket);
                    } else {
                        recordsPerBucket.add(totalUnassignedInserts - (insertBuckets - 1) * insertRecordsPerBucket);
                    }
                    BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath);
                    bucketInfoMap.put(totalBuckets, bucketInfo);
                    if (profile.hasOutputWorkLoadStats()) {
                        outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1));
                    }
                    totalBuckets++;
                }
            }
            // Go over all such buckets and assign weights in proportion to the number of incoming inserts.
            List<InsertBucketCumulativeWeightPair> insertBuckets = new ArrayList<>();
            double currentCumulativeWeight = 0;
            for (int i = 0; i < bucketNumbers.size(); i++) {
                InsertBucket bkt = new InsertBucket();
                bkt.bucketNumber = bucketNumbers.get(i);
                bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
                currentCumulativeWeight += bkt.weight;
                insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, currentCumulativeWeight));
            }
            LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
            partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets);
        }
        if (profile.hasOutputWorkLoadStats()) {
            profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats);
        }
    }
}
Also used : Set(java.util.Set) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) WorkloadStat(org.apache.hudi.table.WorkloadStat) ArrayList(java.util.ArrayList) List(java.util.List)
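
The small-file packing and bucket-count arithmetic above is easy to trace with concrete numbers. Below is a self-contained sketch using illustrative sizes (a 120 MB target file size, one 40 MB small file, a 1 KB average record, 500,000 incoming inserts; none of these values come from a real workload):

public class BucketSizingSketch {
    public static void main(String[] args) {
        long parquetMaxFileSize = 120L * 1024 * 1024;   // stands in for config.getParquetMaxFileSize()
        long smallFileSizeBytes = 40L * 1024 * 1024;    // stands in for smallFile.sizeBytes
        long averageRecordSize = 1024L;                 // stands in for averageBytesPerRecord(...)
        long totalUnassignedInserts = 500_000L;         // stands in for pStat.getNumInserts()

        // Step 1: pack as many inserts as fit into the existing small file.
        long recordsToAppend = Math.min(
                (parquetMaxFileSize - smallFileSizeBytes) / averageRecordSize, totalUnassignedInserts);
        totalUnassignedInserts -= recordsToAppend;      // 500,000 - 81,920 = 418,080 remain

        // Step 2: split the remainder into new insert buckets of roughly one full file each
        // (this mirrors the auto-tuned insertRecordsPerBucket branch).
        long insertRecordsPerBucket = parquetMaxFileSize / averageRecordSize;   // 122,880
        int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);

        System.out.println("appended=" + recordsToAppend
                + ", remaining=" + totalUnassignedInserts
                + ", bucketSize=" + insertRecordsPerBucket
                + ", newBuckets=" + insertBuckets);
    }
}

Running it prints appended=81920, remaining=418080, bucketSize=122880, newBuckets=4; the last bucket ends up smaller than the others, which is exactly what the else branch in the loop above does by assigning the remainder to the final bucket.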

Example 10 with WorkloadStat

Use of org.apache.hudi.table.WorkloadStat in project hudi by apache.

From the class JavaUpsertPartitioner, method assignUpdates.

private void assignUpdates(WorkloadProfile profile) {
    // each update location gets a partition
    Set<Map.Entry<String, WorkloadStat>> partitionStatEntries = profile.getInputPartitionPathStatMap().entrySet();
    for (Map.Entry<String, WorkloadStat> partitionStat : partitionStatEntries) {
        WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionStat.getKey(), new WorkloadStat());
        for (Map.Entry<String, Pair<String, Long>> updateLocEntry : partitionStat.getValue().getUpdateLocationToCount().entrySet()) {
            addUpdateBucket(partitionStat.getKey(), updateLocEntry.getKey());
            if (profile.hasOutputWorkLoadStats()) {
                HoodieRecordLocation hoodieRecordLocation = new HoodieRecordLocation(updateLocEntry.getValue().getKey(), updateLocEntry.getKey());
                outputWorkloadStats.addUpdates(hoodieRecordLocation, updateLocEntry.getValue().getValue());
            }
        }
        if (profile.hasOutputWorkLoadStats()) {
            profile.updateOutputPartitionPathStatMap(partitionStat.getKey(), outputWorkloadStats);
        }
    }
}
Also used : WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HashMap(java.util.HashMap) Map(java.util.Map) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations (types used together with WorkloadStat, with occurrence counts)

WorkloadStat (org.apache.hudi.table.WorkloadStat): 12
List (java.util.List): 8
Pair (org.apache.hudi.common.util.collection.Pair): 8
HashMap (java.util.HashMap): 7
HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation): 7
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 6
Duration (java.time.Duration): 5
Instant (java.time.Instant): 5
Collections (java.util.Collections): 5
Map (java.util.Map): 5
Collectors (java.util.stream.Collectors): 5
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 5
HoodieKey (org.apache.hudi.common.model.HoodieKey): 5
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 5
WorkloadProfile (org.apache.hudi.table.WorkloadProfile): 5
IOException (java.io.IOException): 4
WriteStatus (org.apache.hudi.client.WriteStatus): 4
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 4
HoodieUpsertException (org.apache.hudi.exception.HoodieUpsertException): 4
HoodieTable (org.apache.hudi.table.HoodieTable): 4