Example 31 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class SparkSingleFileSortExecutionStrategy, method performClusteringWithRecordsRDD:

@Override
public HoodieData<WriteStatus> performClusteringWithRecordsRDD(HoodieData<HoodieRecord<T>> inputRecords,
        int numOutputGroups, String instantTime, Map<String, String> strategyParams, Schema schema,
        List<HoodieFileGroupId> fileGroupIdList, boolean preserveHoodieMetadata) {
    if (numOutputGroups != 1 || fileGroupIdList.size() != 1) {
        throw new HoodieClusteringException("Expect only one file group for strategy: " + getClass().getName());
    }
    LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime);
    Properties props = getWriteConfig().getProps();
    props.put(HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE.key(), String.valueOf(numOutputGroups));
    // We are calling another action executor - disable auto commit. Strategy is only expected to write data in new files.
    props.put(HoodieWriteConfig.AUTO_COMMIT_ENABLE.key(), Boolean.FALSE.toString());
    // Since clustering will write to single file group using HoodieUnboundedCreateHandle, set max file size to a large value.
    props.put(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(), String.valueOf(Long.MAX_VALUE));
    HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder().withProps(props).build();
    return (HoodieData<WriteStatus>) SparkBulkInsertHelper.newInstance().bulkInsert(
            inputRecords, instantTime, getHoodieTable(), newConfig, false,
            getPartitioner(strategyParams, schema), true, numOutputGroups,
            new SingleFileHandleCreateFactory(fileGroupIdList.get(0).getFileId(), preserveHoodieMetadata));
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) HoodieClusteringException(org.apache.hudi.exception.HoodieClusteringException) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) SingleFileHandleCreateFactory(org.apache.hudi.io.SingleFileHandleCreateFactory)

Example 32 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class SparkSortAndSizeExecutionStrategy, method performClusteringWithRecordsRDD:

@Override
public HoodieData<WriteStatus> performClusteringWithRecordsRDD(final HoodieData<HoodieRecord<T>> inputRecords,
        final int numOutputGroups, final String instantTime, final Map<String, String> strategyParams,
        final Schema schema, final List<HoodieFileGroupId> fileGroupIdList,
        final boolean preserveHoodieMetadata) {
    LOG.info("Starting clustering for a group, parallelism:" + numOutputGroups + " commit:" + instantTime);
    Properties props = getWriteConfig().getProps();
    props.put(HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE.key(), String.valueOf(numOutputGroups));
    // We are calling another action executor - disable auto commit. Strategy is only expected to write data in new files.
    props.put(HoodieWriteConfig.AUTO_COMMIT_ENABLE.key(), Boolean.FALSE.toString());
    props.put(HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key(), String.valueOf(getWriteConfig().getClusteringTargetFileMaxBytes()));
    HoodieWriteConfig newConfig = HoodieWriteConfig.newBuilder().withProps(props).build();
    return (HoodieData<WriteStatus>) SparkBulkInsertHelper.newInstance().bulkInsert(
            inputRecords, instantTime, getHoodieTable(), newConfig, false,
            getPartitioner(strategyParams, schema), true, numOutputGroups,
            new CreateHandleFactory(preserveHoodieMetadata));
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) Properties(java.util.Properties)
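
Apart from Example 31's single-file-group guard, the two clustering strategies above differ only in the parquet max-file-size value and the write-handle factory they pass to SparkBulkInsertHelper. Below is a minimal sketch of the shared configuration-override pattern, not the actual Hudi code; it reuses the HoodieWriteConfig keys shown in the snippets, and the parquet size key is spelled out as a literal string standing in for the HoodieStorageConfig.PARQUET_MAX_FILE_SIZE constant referenced above.

// Sketch only: the override pattern shared by Examples 31 and 32, under the assumptions noted above.
import java.util.Properties;

import org.apache.hudi.config.HoodieWriteConfig;

public class ClusteringConfigOverrideSketch {

    static HoodieWriteConfig overrideForClustering(HoodieWriteConfig writeConfig,
                                                   int numOutputGroups,
                                                   long maxFileSizeBytes) {
        Properties props = writeConfig.getProps();
        // Match bulk-insert parallelism to the number of output file groups.
        props.put(HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE.key(), String.valueOf(numOutputGroups));
        // The strategy only writes new base files; the clustering action commits, so auto commit is disabled.
        props.put(HoodieWriteConfig.AUTO_COMMIT_ENABLE.key(), Boolean.FALSE.toString());
        // Long.MAX_VALUE for the single-file strategy, the clustering target file size for sort-and-size.
        // Literal key used here in place of HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.key().
        props.put("hoodie.parquet.max.file.size", String.valueOf(maxFileSizeBytes));
        return HoodieWriteConfig.newBuilder().withProps(props).build();
    }
}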

Example 33 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class SparkHoodieHBaseIndex, method updateLocation:

@Override
public HoodieData<WriteStatus> updateLocation(HoodieData<WriteStatus> writeStatus, HoodieEngineContext context, HoodieTable hoodieTable) {
    JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(writeStatus);
    final Option<Float> desiredQPSFraction = calculateQPSFraction(writeStatusRDD);
    final Map<String, Integer> fileIdPartitionMap = mapFileWithInsertsToUniquePartition(writeStatusRDD);
    JavaRDD<WriteStatus> partitionedRDD = this.numWriteStatusWithInserts == 0
            ? writeStatusRDD
            : writeStatusRDD.mapToPair(w -> new Tuple2<>(w.getFileId(), w))
                    .partitionBy(new WriteStatusPartitioner(fileIdPartitionMap, this.numWriteStatusWithInserts))
                    .map(w -> w._2());
    JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
    acquireQPSResourcesAndSetBatchSize(desiredQPSFraction, jsc);
    JavaRDD<WriteStatus> writeStatusJavaRDD = partitionedRDD.mapPartitionsWithIndex(updateLocationFunction(), true);
    // caching the index updated status RDD
    writeStatusJavaRDD = writeStatusJavaRDD.persist(SparkMemoryUtils.getWriteStatusStorageLevel(config.getProps()));
    // force trigger update location(hbase puts)
    writeStatusJavaRDD.count();
    this.hBaseIndexQPSResourceAllocator.releaseQPSResources();
    return HoodieJavaRDD.of(writeStatusJavaRDD);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) Mutation(org.apache.hadoop.hbase.client.Mutation) Function2(org.apache.spark.api.java.function.Function2) Result(org.apache.hadoop.hbase.client.Result) Date(java.util.Date) RateLimiter(org.apache.hudi.common.util.RateLimiter) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Logger(org.apache.log4j.Logger) Delete(org.apache.hadoop.hbase.client.Delete) Partitioner(org.apache.spark.Partitioner) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieDependentSystemUnavailableException(org.apache.hudi.exception.HoodieDependentSystemUnavailableException) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) BufferedMutator(org.apache.hadoop.hbase.client.BufferedMutator) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieIndexException(org.apache.hudi.exception.HoodieIndexException) Get(org.apache.hadoop.hbase.client.Get) Tuple2(scala.Tuple2) HoodieIndex(org.apache.hudi.index.HoodieIndex) Serializable(java.io.Serializable) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) RegionLocator(org.apache.hadoop.hbase.client.RegionLocator) HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration) ResultScanner(org.apache.hadoop.hbase.client.ResultScanner) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) SparkMemoryUtils(org.apache.hudi.client.utils.SparkMemoryUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) HTable(org.apache.hadoop.hbase.client.HTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) Bytes(org.apache.hadoop.hbase.util.Bytes) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) TableName(org.apache.hadoop.hbase.TableName) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Put(org.apache.hadoop.hbase.client.Put) SparkConf(org.apache.spark.SparkConf) DateTime(org.joda.time.DateTime) HoodieHBaseIndexConfig(org.apache.hudi.config.HoodieHBaseIndexConfig) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) ConnectionFactory(org.apache.hadoop.hbase.client.ConnectionFactory) Scan(org.apache.hadoop.hbase.client.Scan) TimeUnit(java.util.concurrent.TimeUnit) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HRegionLocation(org.apache.hadoop.hbase.HRegionLocation) Connection(org.apache.hadoop.hbase.client.Connection) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) WriteStatus(org.apache.hudi.client.WriteStatus)
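
The partitionBy call above relies on WriteStatusPartitioner to route every WriteStatus to the Spark partition pre-assigned to its file id, so all statuses carrying inserts for one file group are handled together before the HBase puts. The following is a hypothetical sketch of such a partitioner, reconstructed only from the constructor arguments visible at the call site; the class name, fields, and fallback partition are assumptions, not the actual Hudi implementation.

// Hypothetical stand-in for WriteStatusPartitioner; only the constructor shape is taken from the call site.
import java.util.Map;

import org.apache.spark.Partitioner;

public class FileIdBasedPartitioner extends Partitioner {

    private final Map<String, Integer> fileIdToPartition;
    private final int totalPartitions;

    public FileIdBasedPartitioner(Map<String, Integer> fileIdToPartition, int totalPartitions) {
        this.fileIdToPartition = fileIdToPartition;
        this.totalPartitions = totalPartitions;
    }

    @Override
    public int numPartitions() {
        return totalPartitions;
    }

    @Override
    public int getPartition(Object key) {
        // key is the file id produced by mapToPair(w -> new Tuple2<>(w.getFileId(), w)).
        // Unknown file ids fall back to partition 0 in this sketch.
        return fileIdToPartition.getOrDefault((String) key, 0);
    }
}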

Example 34 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class BaseSparkCommitActionExecutor, method buildProfile:

private Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(HoodieData<HoodieRecord<T>> inputRecords) {
    HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
    WorkloadStat globalStat = new WorkloadStat();
    // group the records by partitionPath + currentLocation combination, count the number of
    // records in each partition
    Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecords
            .mapToPair(record -> Pair.of(
                    new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())),
                    record))
            .countByKey();
    // count the number of both inserts and updates in each partition, update the counts to workLoadStats
    for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
        String partitionPath = e.getKey()._1();
        Long count = e.getValue();
        Option<HoodieRecordLocation> locOption = e.getKey()._2();
        if (!partitionPathStatMap.containsKey(partitionPath)) {
            partitionPathStatMap.put(partitionPath, new WorkloadStat());
        }
        if (locOption.isPresent()) {
            // update
            partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
            globalStat.addUpdates(locOption.get(), count);
        } else {
            // insert
            partitionPathStatMap.get(partitionPath).addInserts(count);
            globalStat.addInserts(count);
        }
    }
    return Pair.of(partitionPathStatMap, globalStat);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) Partitioner(org.apache.spark.Partitioner) StorageLevel(org.apache.spark.storage.StorageLevel) Duration(java.time.Duration) Map(java.util.Map) HoodieSortedMergeHandle(org.apache.hudi.io.HoodieSortedMergeHandle) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieConcatHandle(org.apache.hudi.io.HoodieConcatHandle) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) Set(java.util.Set) Instant(java.time.Instant) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) Serializable(java.io.Serializable) List(java.util.List) WRITE_STATUS_STORAGE_LEVEL_VALUE(org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) ClusteringUtils.getAllFileGroupsInPendingClusteringPlans(org.apache.hudi.common.util.ClusteringUtils.getAllFileGroupsInPendingClusteringPlans) UpdateStrategy(org.apache.hudi.table.action.cluster.strategy.UpdateStrategy) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) CommitUtils(org.apache.hudi.common.util.CommitUtils) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkValidatorUtils(org.apache.hudi.client.utils.SparkValidatorUtils) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) SparkLazyInsertIterable(org.apache.hudi.execution.SparkLazyInsertIterable) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) HoodieJavaPairRDD(org.apache.hudi.data.HoodieJavaPairRDD) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) WorkloadStat(org.apache.hudi.table.WorkloadStat) Tuple2(scala.Tuple2) Map(java.util.Map) HashMap(java.util.HashMap)
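
The insert/update split in buildProfile hinges on the (partitionPath, currentLocation) key: a present location means the record updates an existing file group, an absent one means a fresh insert. Here is a framework-free illustration of that counting logic using plain java.util types in place of HoodieData, Option, and Tuple2; it is only a sketch, not Hudi code.

// Illustrative sketch of the counting logic in buildProfile, with java.util stand-ins for Hudi types.
import java.util.AbstractMap.SimpleImmutableEntry;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;

public class WorkloadCountSketch {

    public static void main(String[] args) {
        // (partitionPath, currentLocation) pairs standing in for tagged HoodieRecords.
        List<SimpleImmutableEntry<String, Optional<String>>> records = List.of(
                new SimpleImmutableEntry<>("2022/01/01", Optional.<String>empty()),  // insert
                new SimpleImmutableEntry<>("2022/01/01", Optional.of("fg-1")),       // update to file group fg-1
                new SimpleImmutableEntry<>("2022/01/02", Optional.of("fg-2")));      // update to file group fg-2

        // Equivalent of mapToPair(...).countByKey(): count records per (partitionPath, location) key.
        Map<SimpleImmutableEntry<String, Optional<String>>, Long> counts = new HashMap<>();
        for (SimpleImmutableEntry<String, Optional<String>> r : records) {
            counts.merge(r, 1L, Long::sum);
        }

        // Split per-key counts into inserts (no location) and updates (existing location),
        // mirroring how buildProfile feeds WorkloadStat per partition and globally.
        long inserts = 0;
        long updates = 0;
        for (Map.Entry<SimpleImmutableEntry<String, Optional<String>>, Long> e : counts.entrySet()) {
            if (e.getKey().getValue().isPresent()) {
                updates += e.getValue();
            } else {
                inserts += e.getValue();
            }
        }
        System.out.println("inserts=" + inserts + ", updates=" + updates); // inserts=1, updates=2
    }
}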

Example 35 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

From class BaseSparkCommitActionExecutor, method execute:

@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute(HoodieData<HoodieRecord<T>> inputRecords) {
    // Cache the tagged records, so we don't end up computing both
    // TODO: Consistent contract in HoodieWriteClient regarding preppedRecord storage level handling
    JavaRDD<HoodieRecord<T>> inputRDD = HoodieJavaRDD.getJavaRDD(inputRecords);
    if (inputRDD.getStorageLevel() == StorageLevel.NONE()) {
        inputRDD.persist(StorageLevel.MEMORY_AND_DISK_SER());
    } else {
        LOG.info("RDD PreppedRecords was persisted at: " + inputRDD.getStorageLevel());
    }
    WorkloadProfile workloadProfile = null;
    if (isWorkloadProfileNeeded()) {
        context.setJobStatus(this.getClass().getSimpleName(), "Building workload profile");
        workloadProfile = new WorkloadProfile(buildProfile(inputRecords), operationType, table.getIndex().canIndexLogFiles());
        LOG.info("Input workload profile :" + workloadProfile);
    }
    // partition using the insert partitioner
    final Partitioner partitioner = getPartitioner(workloadProfile);
    if (isWorkloadProfileNeeded()) {
        saveWorkloadProfileMetadataToInflight(workloadProfile, instantTime);
    }
    // handle records update with clustering
    HoodieData<HoodieRecord<T>> inputRecordsWithClusteringUpdate = clusteringHandleUpdate(inputRecords);
    context.setJobStatus(this.getClass().getSimpleName(), "Doing partition and writing data");
    HoodieData<WriteStatus> writeStatuses = mapPartitionsAsRDD(inputRecordsWithClusteringUpdate, partitioner);
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = new HoodieWriteMetadata<>();
    updateIndexAndCommitIfNeeded(writeStatuses, result);
    return result;
}
Also used : WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) Partitioner(org.apache.spark.Partitioner) WriteStatus(org.apache.hudi.client.WriteStatus)
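
The storage-level check at the top of execute() is a reusable guard: persist the input RDD only if the caller has not already chosen a storage level for it. A minimal sketch of that guard, extracted for an arbitrary JavaRDD and using only standard Spark APIs, might look like this:

// Sketch of the "persist only if not already persisted" guard from execute() above.
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.storage.StorageLevel;

public class PersistIfNeeded {

    static <T> JavaRDD<T> persistIfNeeded(JavaRDD<T> rdd) {
        // StorageLevel.NONE() means the caller has not asked Spark to cache this RDD yet.
        if (StorageLevel.NONE().equals(rdd.getStorageLevel())) {
            // Serialized memory-and-disk caching, matching the level chosen in execute().
            return rdd.persist(StorageLevel.MEMORY_AND_DISK_SER());
        }
        return rdd;
    }
}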

Aggregations

HoodieData (org.apache.hudi.common.data.HoodieData): 36
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 24
WriteStatus (org.apache.hudi.client.WriteStatus): 22
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 22
List (java.util.List): 21
HoodieTable (org.apache.hudi.table.HoodieTable): 20
HoodieKey (org.apache.hudi.common.model.HoodieKey): 18
LogManager (org.apache.log4j.LogManager): 18
Logger (org.apache.log4j.Logger): 18
IOException (java.io.IOException): 17
Collectors (java.util.stream.Collectors): 17
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 17
Option (org.apache.hudi.common.util.Option): 17
Map (java.util.Map): 16
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16
HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata): 16
JavaRDD (org.apache.spark.api.java.JavaRDD): 16
Pair (org.apache.hudi.common.util.collection.Pair): 15
HoodieJavaRDD (org.apache.hudi.data.HoodieJavaRDD): 15
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 14