Use of org.apache.hudi.io.WriteHandleFactory in project hudi by apache.
Example from the class SparkBulkInsertHelper, method bulkInsert.
@Override
public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords, String instantTime,
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
    HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
    boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
  // De-dupe/merge the incoming records if requested
  HoodieData<HoodieRecord<T>> dedupedRecords = inputRecords;
  if (performDedupe) {
    dedupedRecords = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance()
        .combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
  }
  // Use the user-supplied partitioner if present; otherwise derive one from the configured bulk-insert sort mode
  final HoodieData<HoodieRecord<T>> repartitionedRecords;
  BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
      ? userDefinedBulkInsertPartitioner.get()
      : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
  // Only JavaRDD is supported for the Spark partitioner, but the BulkInsertPartitioner API does not enforce this; TODO HUDI-3463
  repartitionedRecords = HoodieJavaRDD.of((JavaRDD<HoodieRecord<T>>) partitioner
      .repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism));
  // Generate a new file ID prefix for each output partition
  final List<String> fileIDPrefixes =
      IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());
  // Write each partition with the supplied WriteHandleFactory and flatten the per-partition write statuses
  JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords)
      .mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, partitioner.arePartitionRecordsSorted(),
          config, table, fileIDPrefixes, useWriterSchema, writeHandleFactory), true)
      .flatMap(List::iterator);
  return HoodieJavaRDD.of(writeStatusRDD);
}
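For context, below is a minimal sketch of how this overload might be invoked. It assumes the input records, table, and write config already exist in the caller (inputRecords, table, writeConfig, and instantTime are illustrative names, not taken from the original source), and it passes CreateHandleFactory as the WriteHandleFactory so that new base files are created for the bulk-inserted records.

// Hedged caller sketch: inputRecords, table, writeConfig and instantTime are assumed to exist in scope.
HoodieData<WriteStatus> statuses = SparkBulkInsertHelper.newInstance().bulkInsert(
    inputRecords,                                  // HoodieData<HoodieRecord<T>> to write
    instantTime,                                   // instant time of the bulk-insert commit
    table,                                         // HoodieTable bound to the Spark engine
    writeConfig,                                   // HoodieWriteConfig
    true,                                          // performDedupe: combine duplicates before insert
    Option.empty(),                                // no custom BulkInsertPartitioner; sort-mode default is used
    false,                                         // useWriterSchema
    writeConfig.getBulkInsertShuffleParallelism(), // number of output partitions / file ID prefixes
    new CreateHandleFactory<>());                  // WriteHandleFactory that opens new file handles

Passing Option.empty() exercises the fallback branch above, where the partitioner is chosen from the configured bulk-insert sort mode rather than supplied by the caller.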