Use of org.apache.hudi.table.BulkInsertPartitioner in project hudi by apache.
Class SparkBulkInsertHelper, method bulkInsert:
@Override
public HoodieData<WriteStatus> bulkInsert(HoodieData<HoodieRecord<T>> inputRecords, String instantTime,
    HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
    HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
    boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
  // De-dupe/merge if needed
  HoodieData<HoodieRecord<T>> dedupedRecords = inputRecords;
  if (performDedupe) {
    dedupedRecords = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance()
        .combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
  }

  final HoodieData<HoodieRecord<T>> repartitionedRecords;
  BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
      ? userDefinedBulkInsertPartitioner.get()
      : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
  // Only JavaRDD is supported for the Spark partitioner, but this is not enforced by the BulkInsertPartitioner API. To improve this: TODO HUDI-3463.
  repartitionedRecords = HoodieJavaRDD.of(
      (JavaRDD<HoodieRecord<T>>) partitioner.repartitionRecords(HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism));

  // Generate new file ID prefixes for each output partition
  final List<String> fileIDPrefixes = IntStream.range(0, parallelism)
      .mapToObj(i -> FSUtils.createNewFileIdPfx())
      .collect(Collectors.toList());

  JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords)
      .mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, partitioner.arePartitionRecordsSorted(),
          config, table, fileIDPrefixes, useWriterSchema, writeHandleFactory), true)
      .flatMap(List::iterator);
  return HoodieJavaRDD.of(writeStatusRDD);
}
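For reference, the userDefinedBulkInsertPartitioner option above lets callers plug in their own repartitioning strategy. The following is a minimal sketch, with a hypothetical class name, of a JavaRDD-based BulkInsertPartitioner that globally sorts records by partition path and record key, similar in spirit to Hudi's built-in sort modes; it is an illustration, not code from the Hudi repository.

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.spark.api.java.JavaRDD;

// Hypothetical example: a Spark-side partitioner that could be supplied as
// userDefinedBulkInsertPartitioner; it is not part of the Hudi codebase.
public class ExampleGlobalSortPartitioner<T> implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {

  @Override
  public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputPartitions) {
    // Sort globally by "partitionPath + recordKey" into the requested number of Spark partitions,
    // so records for the same Hudi partition end up next to each other.
    return records.sortBy(record -> record.getPartitionPath() + "+" + record.getRecordKey(), true, outputPartitions);
  }

  @Override
  public boolean arePartitionRecordsSorted() {
    // Each output partition is sorted, which the write path above passes on to BulkInsertMapFunction.
    return true;
  }
}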
Use of org.apache.hudi.table.BulkInsertPartitioner in project hudi by apache.
Class HoodieDatasetBulkInsertHelper, method prepareHoodieDatasetForBulkInsert:
/**
 * Prepares the input hoodie Spark dataset for bulk insert. It performs the following steps:
 *  1. Uses the KeyGenerator to generate hoodie record keys and partition paths.
 *  2. Adds hoodie columns to the input Spark dataset.
 *  3. Reorders the input dataset columns so that the hoodie columns appear at the beginning.
 *  4. Sorts the input dataset by hoodie partition path and record key.
 *
 * @param sqlContext SQL context
 * @param config     Hoodie write config
 * @param rows       Spark input dataset
 * @return hoodie dataset which is ready for bulk insert.
 */
public static Dataset<Row> prepareHoodieDatasetForBulkInsert(SQLContext sqlContext, HoodieWriteConfig config,
    Dataset<Row> rows, String structName, String recordNamespace, BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows,
    boolean isGlobalIndex, boolean dropPartitionColumns) {
  List<Column> originalFields = Arrays.stream(rows.schema().fields())
      .map(f -> new Column(f.name()))
      .collect(Collectors.toList());

  TypedProperties properties = new TypedProperties();
  properties.putAll(config.getProps());
  String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key());
  BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties);
  String tableName = properties.getString(HoodieWriteConfig.TBL_NAME.key());
  String recordKeyUdfFn = RECORD_KEY_UDF_FN + tableName;
  String partitionPathUdfFn = PARTITION_PATH_UDF_FN + tableName;
  sqlContext.udf().register(recordKeyUdfFn, (UDF1<Row, String>) keyGenerator::getRecordKey, DataTypes.StringType);
  sqlContext.udf().register(partitionPathUdfFn, (UDF1<Row, String>) keyGenerator::getPartitionPath, DataTypes.StringType);

  final Dataset<Row> rowDatasetWithRecordKeys = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD,
      callUDF(recordKeyUdfFn, org.apache.spark.sql.functions.struct(
          JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
  final Dataset<Row> rowDatasetWithRecordKeysAndPartitionPath = rowDatasetWithRecordKeys.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD,
      callUDF(partitionPathUdfFn, org.apache.spark.sql.functions.struct(
          JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));

  // Add other empty hoodie fields which will be populated before writing to parquet.
  Dataset<Row> rowDatasetWithHoodieColumns = rowDatasetWithRecordKeysAndPartitionPath
      .withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
      .withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
      .withColumn(HoodieRecord.FILENAME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType));

  Dataset<Row> processedDf = rowDatasetWithHoodieColumns;
  if (dropPartitionColumns) {
    String partitionColumns = String.join(",", keyGenerator.getPartitionPathFields());
    for (String partitionField : keyGenerator.getPartitionPathFields()) {
      originalFields.remove(new Column(partitionField));
    }
    processedDf = rowDatasetWithHoodieColumns.drop(partitionColumns);
  }

  Dataset<Row> dedupedDf = processedDf;
  if (config.shouldCombineBeforeInsert()) {
    dedupedDf = SparkRowWriteHelper.newInstance().deduplicateRows(processedDf, config.getPreCombineField(), isGlobalIndex);
  }

  List<Column> orderedFields = Stream.concat(
      HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new),
      originalFields.stream()).collect(Collectors.toList());
  Dataset<Row> colOrderedDataset = dedupedDf.select(
      JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq());
  return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
}
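The bulkInsertPartitionerRows argument receives the fully prepared, meta-column-ordered dataset. As an illustration only (hypothetical class name, assuming the BulkInsertPartitioner<Dataset<Row>> contract used above), a row-based partitioner that performs step 4 of the javadoc might look like this:

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

// Hypothetical example of a row-based partitioner; not code from the Hudi repository.
public class ExampleRowSortPartitioner implements BulkInsertPartitioner<Dataset<Row>> {

  @Override
  public Dataset<Row> repartitionRecords(Dataset<Row> rows, int outputPartitions) {
    // Sort by the hoodie meta columns populated in prepareHoodieDatasetForBulkInsert,
    // then reduce to the configured bulk-insert shuffle parallelism.
    return rows
        .sort(HoodieRecord.PARTITION_PATH_METADATA_FIELD, HoodieRecord.RECORD_KEY_METADATA_FIELD)
        .coalesce(outputPartitions);
  }

  @Override
  public boolean arePartitionRecordsSorted() {
    return true;
  }
}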
Use of org.apache.hudi.table.BulkInsertPartitioner in project hudi by apache.
Class JavaBulkInsertHelper, method bulkInsert:
@Override
public List<WriteStatus> bulkInsert(List<HoodieRecord<T>> inputRecords, String instantTime,
    HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table, HoodieWriteConfig config,
    boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner, boolean useWriterSchema,
    int parallelism, WriteHandleFactory writeHandleFactory) {
  // De-dupe/merge if needed
  List<HoodieRecord<T>> dedupedRecords = inputRecords;
  if (performDedupe) {
    dedupedRecords = (List<HoodieRecord<T>>) JavaWriteHelper.newInstance()
        .combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
  }

  final List<HoodieRecord<T>> repartitionedRecords;
  BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
      ? userDefinedBulkInsertPartitioner.get()
      : JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
  // Only List is supported for the Java partitioner, but this is not enforced by the BulkInsertPartitioner API. To improve this: TODO HUDI-3463.
  repartitionedRecords = (List<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);

  FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass(
      config.getFileIdPrefixProviderClassName(), config.getProps());
  List<WriteStatus> writeStatuses = new ArrayList<>();
  new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, config, instantTime, table,
      fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(), new CreateHandleFactory<>())
      .forEachRemaining(writeStatuses::addAll);
  return writeStatuses;
}
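Because the Java engine works on in-memory lists rather than RDDs, a custom partitioner for this path implements BulkInsertPartitioner<List<HoodieRecord<T>>>. Below is a minimal hypothetical sketch (not from the Hudi codebase) that sorts records by partition path and record key:

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.table.BulkInsertPartitioner;

// Hypothetical example of a List-based partitioner for the Java engine; not code from the Hudi repository.
public class ExampleJavaSortPartitioner<T> implements BulkInsertPartitioner<List<HoodieRecord<T>>> {

  @Override
  public List<HoodieRecord<T>> repartitionRecords(List<HoodieRecord<T>> records, int outputPartitions) {
    // The Java engine keeps all records in a single in-memory list, so "repartitioning" reduces to
    // sorting; the outputPartitions hint is ignored here.
    return records.stream()
        .sorted(Comparator.comparing((HoodieRecord<T> r) -> r.getPartitionPath() + "+" + r.getRecordKey()))
        .collect(Collectors.toList());
  }

  @Override
  public boolean arePartitionRecordsSorted() {
    return true;
  }
}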