
Example 1 with BulkInsertPartitioner

Use of org.apache.hudi.table.BulkInsertPartitioner in the apache/hudi project.

From the class SparkBulkInsertHelper, method bulkInsert:

@Override
public HoodieData<WriteStatus> bulkInsert(
        HoodieData<HoodieRecord<T>> inputRecords, String instantTime,
        HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table,
        HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
        boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
    // De-dupe/merge if needed
    HoodieData<HoodieRecord<T>> dedupedRecords = inputRecords;
    if (performDedupe) {
        dedupedRecords = (HoodieData<HoodieRecord<T>>) HoodieWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
    }
    final HoodieData<HoodieRecord<T>> repartitionedRecords;
    BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
        ? userDefinedBulkInsertPartitioner.get()
        : BulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
    // Only JavaRDD is supported by the Spark partitioners, but this is not enforced by the
    // BulkInsertPartitioner API; improving that is tracked in HUDI-3463.
    repartitionedRecords = HoodieJavaRDD.of((JavaRDD<HoodieRecord<T>>) partitioner.repartitionRecords(
        HoodieJavaRDD.getJavaRDD(dedupedRecords), parallelism));
    // Generate new file ID prefixes for each output partition.
    final List<String> fileIDPrefixes =
        IntStream.range(0, parallelism).mapToObj(i -> FSUtils.createNewFileIdPfx()).collect(Collectors.toList());
    JavaRDD<WriteStatus> writeStatusRDD = HoodieJavaRDD.getJavaRDD(repartitionedRecords)
        .mapPartitionsWithIndex(new BulkInsertMapFunction<>(instantTime, partitioner.arePartitionRecordsSorted(),
            config, table, fileIDPrefixes, useWriterSchema, writeHandleFactory), true)
        .flatMap(List::iterator);
    return HoodieJavaRDD.of(writeStatusRDD);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) IntStream(java.util.stream.IntStream) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) Option(org.apache.hudi.common.util.Option) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) Collectors(java.util.stream.Collectors) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) WriteHandleFactory(org.apache.hudi.io.WriteHandleFactory) List(java.util.List) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) BulkInsertInternalPartitionerFactory(org.apache.hudi.execution.bulkinsert.BulkInsertInternalPartitionerFactory) BulkInsertMapFunction(org.apache.hudi.execution.bulkinsert.BulkInsertMapFunction) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) FSUtils(org.apache.hudi.common.fs.FSUtils) JavaRDD(org.apache.spark.api.java.JavaRDD)
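
The partitioning step above is pluggable: a user-defined partitioner passed through userDefinedBulkInsertPartitioner takes precedence, otherwise the sort-mode default from BulkInsertInternalPartitionerFactory is used. Below is a minimal sketch of what a custom Spark-side partitioner could look like; the class name SortByRecordKeyPartitioner is hypothetical, and the sketch assumes BulkInsertPartitioner is generic over the record container (as Example 2 suggests) and requires only the two methods exercised above.

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.spark.api.java.JavaRDD;

// Hypothetical user-defined partitioner: globally sorts the records by record key so that
// each output Spark partition holds a contiguous, sorted key range.
public class SortByRecordKeyPartitioner<T extends HoodieRecordPayload>
        implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {

    @Override
    public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputPartitions) {
        // sortBy shuffles into the requested number of partitions while ordering by record key.
        return records.sortBy(HoodieRecord::getRecordKey, true, outputPartitions);
    }

    @Override
    public boolean arePartitionRecordsSorted() {
        // Records within each Spark partition are sorted after repartitionRecords.
        return true;
    }
}

Such a partitioner would then be passed in through the userDefinedBulkInsertPartitioner option of bulkInsert above.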

Example 2 with BulkInsertPartitioner

Use of org.apache.hudi.table.BulkInsertPartitioner in the apache/hudi project.

From the class HoodieDatasetBulkInsertHelper, method prepareHoodieDatasetForBulkInsert:

/**
 * Prepares the input hoodie Spark dataset for bulk insert. It performs the following steps:
 *  1. Uses the KeyGenerator to generate hoodie record keys and partition paths.
 *  2. Adds the hoodie metadata columns to the input Spark dataset.
 *  3. Reorders the input dataset columns so that the hoodie columns appear at the beginning.
 *  4. Sorts the input dataset by hoodie partition path and record key.
 *
 * @param sqlContext SQL context
 * @param config Hoodie write config
 * @param rows input Spark dataset
 * @return a hoodie dataset which is ready for bulk insert.
 */
public static Dataset<Row> prepareHoodieDatasetForBulkInsert(
        SQLContext sqlContext, HoodieWriteConfig config, Dataset<Row> rows, String structName, String recordNamespace,
        BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows, boolean isGlobalIndex, boolean dropPartitionColumns) {
    List<Column> originalFields = Arrays.stream(rows.schema().fields()).map(f -> new Column(f.name())).collect(Collectors.toList());
    TypedProperties properties = new TypedProperties();
    properties.putAll(config.getProps());
    String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key());
    BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties);
    String tableName = properties.getString(HoodieWriteConfig.TBL_NAME.key());
    String recordKeyUdfFn = RECORD_KEY_UDF_FN + tableName;
    String partitionPathUdfFn = PARTITION_PATH_UDF_FN + tableName;
    sqlContext.udf().register(recordKeyUdfFn, (UDF1<Row, String>) keyGenerator::getRecordKey, DataTypes.StringType);
    sqlContext.udf().register(partitionPathUdfFn, (UDF1<Row, String>) keyGenerator::getPartitionPath, DataTypes.StringType);
    final Dataset<Row> rowDatasetWithRecordKeys = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD,
        callUDF(recordKeyUdfFn, functions.struct(JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
    final Dataset<Row> rowDatasetWithRecordKeysAndPartitionPath = rowDatasetWithRecordKeys.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD,
        callUDF(partitionPathUdfFn, functions.struct(JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
    // Add other empty hoodie fields which will be populated before writing to parquet.
    Dataset<Row> rowDatasetWithHoodieColumns = rowDatasetWithRecordKeysAndPartitionPath
        .withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
        .withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
        .withColumn(HoodieRecord.FILENAME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType));
    Dataset<Row> processedDf = rowDatasetWithHoodieColumns;
    if (dropPartitionColumns) {
        // Drop the partition path columns from both the ordered field list and the output dataset.
        for (String partitionField : keyGenerator.getPartitionPathFields()) {
            originalFields.remove(new Column(partitionField));
        }
        processedDf = rowDatasetWithHoodieColumns.drop(keyGenerator.getPartitionPathFields().toArray(new String[0]));
    }
    Dataset<Row> dedupedDf = processedDf;
    if (config.shouldCombineBeforeInsert()) {
        dedupedDf = SparkRowWriteHelper.newInstance().deduplicateRows(processedDf, config.getPreCombineField(), isGlobalIndex);
    }
    List<Column> orderedFields = Stream.concat(HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new), originalFields.stream()).collect(Collectors.toList());
    Dataset<Row> colOrderedDataset = dedupedDf.select(JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq());
    return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
}
Also used : DataTypes(org.apache.spark.sql.types.DataTypes) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Arrays(java.util.Arrays) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Dataset(org.apache.spark.sql.Dataset) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) BuiltinKeyGenerator(org.apache.hudi.keygen.BuiltinKeyGenerator) Row(org.apache.spark.sql.Row) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) List(java.util.List) UDF1(org.apache.spark.sql.api.java.UDF1) Stream(java.util.stream.Stream) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) JavaConverters(scala.collection.JavaConverters) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) LogManager(org.apache.log4j.LogManager) org.apache.spark.sql.functions(org.apache.spark.sql.functions) org.apache.spark.sql.functions.callUDF(org.apache.spark.sql.functions.callUDF)
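
prepareHoodieDatasetForBulkInsert leaves the final shuffle to bulkInsertPartitionerRows, so a row-level partitioner only has to reshape a Dataset<Row>. A minimal sketch that orders rows by the metadata columns populated earlier in the method follows; the class name GlobalSortRowPartitioner is illustrative, not the project's own implementation.

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.functions;

// Illustrative row-level partitioner: range-partitions the prepared dataset by the hoodie
// metadata columns added above, then sorts rows within each partition.
public class GlobalSortRowPartitioner implements BulkInsertPartitioner<Dataset<Row>> {

    @Override
    public Dataset<Row> repartitionRecords(Dataset<Row> rows, int outputPartitions) {
        return rows
            .repartitionByRange(outputPartitions,
                functions.col(HoodieRecord.PARTITION_PATH_METADATA_FIELD),
                functions.col(HoodieRecord.RECORD_KEY_METADATA_FIELD))
            .sortWithinPartitions(
                functions.col(HoodieRecord.PARTITION_PATH_METADATA_FIELD),
                functions.col(HoodieRecord.RECORD_KEY_METADATA_FIELD));
    }

    @Override
    public boolean arePartitionRecordsSorted() {
        return true;
    }
}

Sorting by partition path keeps rows of the same hoodie partition together before the write, which is what step 4 of the Javadoc above aims at.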

Example 3 with BulkInsertPartitioner

Use of org.apache.hudi.table.BulkInsertPartitioner in the apache/hudi project.

From the class JavaBulkInsertHelper, method bulkInsert:

@Override
public List<WriteStatus> bulkInsert(
        List<HoodieRecord<T>> inputRecords, String instantTime,
        HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
        HoodieWriteConfig config, boolean performDedupe, Option<BulkInsertPartitioner> userDefinedBulkInsertPartitioner,
        boolean useWriterSchema, int parallelism, WriteHandleFactory writeHandleFactory) {
    // De-dupe/merge if needed
    List<HoodieRecord<T>> dedupedRecords = inputRecords;
    if (performDedupe) {
        dedupedRecords = (List<HoodieRecord<T>>) JavaWriteHelper.newInstance().combineOnCondition(config.shouldCombineBeforeInsert(), inputRecords, parallelism, table);
    }
    final List<HoodieRecord<T>> repartitionedRecords;
    BulkInsertPartitioner partitioner = userDefinedBulkInsertPartitioner.isPresent()
        ? userDefinedBulkInsertPartitioner.get()
        : JavaBulkInsertInternalPartitionerFactory.get(config.getBulkInsertSortMode());
    // Only List is supported by the Java partitioners, but this is not enforced by the
    // BulkInsertPartitioner API; improving that is tracked in HUDI-3463.
    repartitionedRecords = (List<HoodieRecord<T>>) partitioner.repartitionRecords(dedupedRecords, parallelism);
    FileIdPrefixProvider fileIdPrefixProvider = (FileIdPrefixProvider) ReflectionUtils.loadClass(
        config.getFileIdPrefixProviderClassName(), config.getProps());
    List<WriteStatus> writeStatuses = new ArrayList<>();
    new JavaLazyInsertIterable<>(repartitionedRecords.iterator(), true, config, instantTime, table,
        fileIdPrefixProvider.createFilePrefix(""), table.getTaskContextSupplier(), new CreateHandleFactory<>())
        .forEachRemaining(writeStatuses::addAll);
    return writeStatuses;
}
Also used : FileIdPrefixProvider(org.apache.hudi.table.FileIdPrefixProvider) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) WriteStatus(org.apache.hudi.client.WriteStatus)
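
In the Java engine everything stays in memory, so a custom partitioner reduces to reordering a List before it is fed to JavaLazyInsertIterable. A hypothetical sketch (SortedListPartitioner is not a class from the project) that satisfies the two calls made above:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.table.BulkInsertPartitioner;

// Hypothetical list-based partitioner: returns the input records sorted by record key;
// the parallelism hint is ignored because all work is local to one JVM.
public class SortedListPartitioner<T extends HoodieRecordPayload>
        implements BulkInsertPartitioner<List<HoodieRecord<T>>> {

    @Override
    public List<HoodieRecord<T>> repartitionRecords(List<HoodieRecord<T>> records, int outputPartitions) {
        List<HoodieRecord<T>> sorted = new ArrayList<>(records);
        sorted.sort(Comparator.comparing(HoodieRecord::getRecordKey));
        return sorted;
    }

    @Override
    public boolean arePartitionRecordsSorted() {
        return true;
    }
}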

Aggregations

HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 3 usages
BulkInsertPartitioner (org.apache.hudi.table.BulkInsertPartitioner): 3 usages
ArrayList (java.util.ArrayList): 2 usages
List (java.util.List): 2 usages
Collectors (java.util.stream.Collectors): 2 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 2 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 2 usages
CreateHandleFactory (org.apache.hudi.io.CreateHandleFactory): 2 usages
Arrays (java.util.Arrays): 1 usage
IntStream (java.util.stream.IntStream): 1 usage
Stream (java.util.stream.Stream): 1 usage
TypedProperties (org.apache.hudi.common.config.TypedProperties): 1 usage
HoodieData (org.apache.hudi.common.data.HoodieData): 1 usage
FSUtils (org.apache.hudi.common.fs.FSUtils): 1 usage
HoodieKey (org.apache.hudi.common.model.HoodieKey): 1 usage
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 1 usage
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 1 usage
Option (org.apache.hudi.common.util.Option): 1 usage
ReflectionUtils (org.apache.hudi.common.util.ReflectionUtils): 1 usage
HoodieJavaRDD (org.apache.hudi.data.HoodieJavaRDD): 1 usage