Example 41 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class BucketAssignFunction, method processRecord.

@SuppressWarnings("unchecked")
private void processRecord(HoodieRecord<?> record, Collector<O> out) throws Exception {
    // 1. put the record into the BucketAssigner;
    // 2. look up the state for location, if the record has a location, just send it out;
    // 3. if it is an INSERT, decide the location using the BucketAssigner then send it out.
    final HoodieKey hoodieKey = record.getKey();
    final String recordKey = hoodieKey.getRecordKey();
    final String partitionPath = hoodieKey.getPartitionPath();
    final HoodieRecordLocation location;
    // Only changing records need to look up the index for the location;
    // append-only records are always recognized as INSERT.
    HoodieRecordGlobalLocation oldLoc = indexState.value();
    if (isChangingRecords && oldLoc != null) {
        // Set up the instant time as "U" to mark the bucket as an update bucket.
        if (!Objects.equals(oldLoc.getPartitionPath(), partitionPath)) {
            if (globalIndex) {
                // if partition path changes, emit a delete record for old partition path,
                // then update the index state using location with new partition path.
                HoodieRecord<?> deleteRecord = new HoodieAvroRecord<>(new HoodieKey(recordKey, oldLoc.getPartitionPath()), payloadCreation.createDeletePayload((BaseAvroPayload) record.getData()));
                deleteRecord.setCurrentLocation(oldLoc.toLocal("U"));
                deleteRecord.seal();
                out.collect((O) deleteRecord);
            }
            location = getNewRecordLocation(partitionPath);
            updateIndexState(partitionPath, location);
        } else {
            location = oldLoc.toLocal("U");
            this.bucketAssigner.addUpdate(partitionPath, location.getFileId());
        }
    } else {
        location = getNewRecordLocation(partitionPath);
    }
    // always refresh the index
    if (isChangingRecords) {
        updateIndexState(partitionPath, location);
    }
    record.setCurrentLocation(location);
    out.collect((O) record);
}
Also used: BaseAvroPayload(org.apache.hudi.common.model.BaseAvroPayload) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecordGlobalLocation(org.apache.hudi.common.model.HoodieRecordGlobalLocation) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
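
Worth noting in this example: with a global index, a record whose partition path changed produces two outputs, a delete aimed at the old partition followed by the record itself routed to a new location. Below is a minimal, hedged sketch of just the partition-change check, in plain Java with no Flink runtime; the key and partition values are invented for illustration.

import java.util.Objects;

import org.apache.hudi.common.model.HoodieKey;

public class PartitionChangeSketch {

    public static void main(String[] args) {
        // What the index state remembered for this record key (made-up values).
        HoodieKey indexedKey = new HoodieKey("uuid-1", "2024/01/01");
        // What the incoming record now carries.
        HoodieKey incomingKey = new HoodieKey("uuid-1", "2024/01/02");
        // Same record key, different partition path: this is the condition under
        // which processRecord (with a global index) emits a delete for the old
        // partition before assigning the record a fresh location in the new one.
        boolean partitionChanged = !Objects.equals(indexedKey.getPartitionPath(), incomingKey.getPartitionPath());
        System.out.println(partitionChanged); // prints: true
    }
}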

Example 42 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class BucketAssignFunction, method getNewRecordLocation.

private HoodieRecordLocation getNewRecordLocation(String partitionPath) {
    final BucketInfo bucketInfo = this.bucketAssigner.addInsert(partitionPath);
    final HoodieRecordLocation location;
    switch(bucketInfo.getBucketType()) {
        case INSERT:
            // This is an insert bucket; use "I" as the HoodieRecordLocation instant time.
            // Downstream operators can then check the instant time to know whether
            // a record belongs to an insert bucket.
            location = new HoodieRecordLocation("I", bucketInfo.getFileIdPrefix());
            break;
        case UPDATE:
            location = new HoodieRecordLocation("U", bucketInfo.getFileIdPrefix());
            break;
        default:
            throw new AssertionError();
    }
    return location;
}
Also used: HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) BucketInfo(org.apache.hudi.table.action.commit.BucketInfo)
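
The "I"/"U" strings above are sentinel instant times, not real commit times: downstream operators only need to read the marker back to know the bucket type. A small sketch of that read side, assuming nothing beyond the HoodieRecordLocation constructor and getters used in this class; the helper isInsertBucket is hypothetical.

import org.apache.hudi.common.model.HoodieRecordLocation;

public class BucketMarkerSketch {

    // Hypothetical helper: recover the bucket type from the sentinel instant
    // time that getNewRecordLocation stamped onto the location.
    static boolean isInsertBucket(HoodieRecordLocation location) {
        return "I".equals(location.getInstantTime());
    }

    public static void main(String[] args) {
        HoodieRecordLocation insertLoc = new HoodieRecordLocation("I", "fileId-0001");
        HoodieRecordLocation updateLoc = new HoodieRecordLocation("U", "fileId-0001");
        System.out.println(isInsertBucket(insertLoc)); // prints: true
        System.out.println(isInsertBucket(updateLoc)); // prints: false
    }
}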

Example 43 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class BaseSparkCommitActionExecutor, method buildProfile.

private Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(HoodieData<HoodieRecord<T>> inputRecords) {
    HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
    WorkloadStat globalStat = new WorkloadStat();
    // group the records by partitionPath + currentLocation combination, count the number of
    // records in each partition
    Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecords
        .mapToPair(record -> Pair.of(
            new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
        .countByKey();
    // count the inserts and updates in each partition and record them in the per-partition and global WorkloadStats
    for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
        String partitionPath = e.getKey()._1();
        Long count = e.getValue();
        Option<HoodieRecordLocation> locOption = e.getKey()._2();
        if (!partitionPathStatMap.containsKey(partitionPath)) {
            partitionPathStatMap.put(partitionPath, new WorkloadStat());
        }
        if (locOption.isPresent()) {
            // update
            partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
            globalStat.addUpdates(locOption.get(), count);
        } else {
            // insert
            partitionPathStatMap.get(partitionPath).addInserts(count);
            globalStat.addInserts(count);
        }
    }
    return Pair.of(partitionPathStatMap, globalStat);
}
Also used: HoodieTable(org.apache.hudi.table.HoodieTable) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) Partitioner(org.apache.spark.Partitioner) StorageLevel(org.apache.spark.storage.StorageLevel) Duration(java.time.Duration) Map(java.util.Map) HoodieSortedMergeHandle(org.apache.hudi.io.HoodieSortedMergeHandle) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieConcatHandle(org.apache.hudi.io.HoodieConcatHandle) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) Set(java.util.Set) Instant(java.time.Instant) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) Serializable(java.io.Serializable) List(java.util.List) WRITE_STATUS_STORAGE_LEVEL_VALUE(org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) ClusteringUtils.getAllFileGroupsInPendingClusteringPlans(org.apache.hudi.common.util.ClusteringUtils.getAllFileGroupsInPendingClusteringPlans) UpdateStrategy(org.apache.hudi.table.action.cluster.strategy.UpdateStrategy) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) CommitUtils(org.apache.hudi.common.util.CommitUtils) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkValidatorUtils(org.apache.hudi.client.utils.SparkValidatorUtils) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) SparkLazyInsertIterable(org.apache.hudi.execution.SparkLazyInsertIterable) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair)
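
The whole profile reduces to one rule: a record with no current location is an insert, a record with one is an update against that location's file group. A hedged sketch of the same bookkeeping on a single WorkloadStat, with invented counts and location values; getNumInserts/getNumUpdates are assumed to be the matching read accessors.

import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.table.WorkloadStat;

public class WorkloadStatSketch {

    public static void main(String[] args) {
        WorkloadStat stat = new WorkloadStat();
        // 100 records arrived without a current location: counted as inserts.
        stat.addInserts(100L);
        // 25 records carried a location: counted as updates against that
        // file group (instant time and file id are made up).
        stat.addUpdates(new HoodieRecordLocation("20220101000000", "fileId-0001"), 25L);
        System.out.println(stat.getNumInserts()); // prints: 100
        System.out.println(stat.getNumUpdates()); // prints: 25
    }
}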

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation): 43
ArrayList (java.util.ArrayList): 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 17
HashMap (java.util.HashMap): 16
List (java.util.List): 16
HoodieKey (org.apache.hudi.common.model.HoodieKey): 16
Map (java.util.Map): 13
Pair (org.apache.hudi.common.util.collection.Pair): 12
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 9
Option (org.apache.hudi.common.util.Option): 9
IOException (java.io.IOException): 8
WorkloadStat (org.apache.hudi.table.WorkloadStat): 8
SmallFile (org.apache.hudi.table.action.commit.SmallFile): 8
Tuple2 (scala.Tuple2): 8
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 7
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 7
HoodieTable (org.apache.hudi.table.HoodieTable): 7
LogManager (org.apache.log4j.LogManager): 7
Logger (org.apache.log4j.Logger): 7
Collectors (java.util.stream.Collectors): 6