Search in sources :

Example 1 with BulkInsertMapFunction

Use of org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction in the Apache Hudi project.

Class SparkBulkInsertHelper, method bulkInsert.

/**
 * Bulk-inserts the given records and returns the resulting write statuses.
 *
 * <p>Steps, in order: optionally combine the input records, pick a
 * {@link BulkInsertPartitioner}, repartition the records with it, generate
 * {@code parallelism} file ID prefixes, and map every partition through a
 * {@link BulkInsertMapFunction}, flattening the per-partition lists of
 * {@link WriteStatus} into a single RDD.
 *
 * @param inputRecords                    records to write
 * @param instantTime                     instant time handed to the {@link BulkInsertMapFunction}
 * @param table                           table handed to the combine step and the map function
 * @param config                          write config; supplies the combine-before-insert flag and
 *                                        the bulk insert sort mode
 * @param performDedupe                   when {@code true}, the records are first passed through
 *                                        {@code HoodieWriteHelper.combineOnCondition}
 * @param userDefinedBulkInsertPartitioner partitioner to use when present; otherwise one is obtained
 *                                        from {@link BulkInsertInternalPartitionerFactory}
 * @param useWriterSchema                 forwarded unchanged to the map function
 * @param parallelism                     passed to the combine step and the partitioner; also the
 *                                        number of file ID prefixes generated
 * @param writeHandleFactory              forwarded unchanged to the map function
 * @return the write statuses produced by the map function, wrapped as {@link HoodieData}
 */
@Override
public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords, String instantTime, HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table, HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner, boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
    // Step 1: de-dupe/merge only when the caller asked for it; otherwise write the input as-is.
    final HoodieData<HoodieRecord<T>> recordsToWrite;
    if (performDedupe) {
        recordsToWrite = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance()
            .combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
    } else {
        recordsToWrite = inputRecords;
    }

    // Step 2: a caller-supplied partitioner wins over the one derived from the configured sort mode.
    final BulkInsertPartitioner activePartitioner;
    if (userDefinedBulkInsertPartitioner.isPresent()) {
        activePartitioner = userDefinedBulkInsertPartitioner.get();
    } else {
        activePartitioner = BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
    }

    // Step 3: repartition. Only JavaRDD is supported for the Spark partitioner, but the
    // BulkInsertPartitioner API does not enforce it, hence the cast. TODO HUDI-3463
    final HoodieData<HoodieRecord<T>> partitionedRecords = HoodieJavaRDD.of(
        (JavaRDD<HoodieRecord<T>>) activePartitioner.repartitionRecords(
            HoodieJavaRDD.getJavaRDD(recordsToWrite), parallelism));

    // Step 4: one freshly generated file ID prefix per index in [0, parallelism).
    final List<String> filePrefixes = IntStream.range(0, parallelism)
        .mapToObj(partitionIndex -> FSUtils.createNewFileIdPfx())
        .collect(Collectors.toList());

    // Step 5: write each partition and flatten the per-partition status lists.
    JavaRDD<WriteStatus> statuses = HoodieJavaRDD.getJavaRDD(partitionedRecords)
        .mapPartitionsWithIndex(
            new BulkInsertMapFunction<>(instantTime, activePartitioner.arePartitionRecordsSorted(), config, table, filePrefixes, useWriterSchema, writeHandleFactory),
            true)
        .flatMap(List::iterator);
    return HoodieJavaRDD.of(statuses);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) IntStream(java.util.stream.IntStream) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) Option(org.apache.hudi.common.util.Option) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Collectors(java.util.stream.Collectors) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) WriteHandleFactory(org.apache.hudi.io.WriteHandleFactory) List(java.util.List) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) BulkInsertInternalPartitionerFactory(org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory) BulkInsertMapFunction(org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) FSUtils(org.apache.hudi.common.fs.FSUtils) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BulkInsertMapFunction(org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction) List(java.util.List) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD)

Aggregations

List (java.util.List)1 Collectors (java.util.stream.Collectors)1 IntStream (java.util.stream.IntStream)1 WriteStatus (org.apache.hudi.client.WriteStatus)1 HoodieData (org.apache.hudi.common.data.HoodieData)1 FSUtils (org.apache.hudi.common.fs.FSUtils)1 HoodieKey (org.apache.hudi.common.model.HoodieKey)1 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)1 HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload)1 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)1 Option (org.apache.hudi.common.util.Option)1 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)1 HoodieJavaRDD (org.apache.hudi.data.HoodieJavaRDD)1 BulkInsertInternalPartitionerFactory (org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory)1 BulkInsertMapFunction (org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction)1 CreateHandleFactory (org.apache.hudi.io.CreateHandleFactory)1 WriteHandleFactory (org.apache.hudi.io.WriteHandleFactory)1 BulkInsertPartitioner (org.apache.hudi.table.BulkInsertPartitioner)1 HoodieTable (org.apache.hudi.table.HoodieTable)1 HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata)1