
Example 1 with BuiltinKeyGenerator

Use of org.apache.hudi.keygen.BuiltinKeyGenerator in the apache/hudi project.

From the class HoodieDatasetBulkInsertHelper, the method prepareHoodieDatasetForBulkInsert:

/**
 * Prepares the input Spark dataset for bulk insert. It performs the following steps:
 *  1. Uses the configured KeyGenerator to generate hoodie record keys and partition paths.
 *  2. Adds the hoodie meta columns to the input dataset.
 *  3. Reorders the dataset columns so that the hoodie meta columns appear at the beginning.
 *  4. Repartitions (and, depending on the partitioner, sorts) the dataset by hoodie partition path and record key.
 *
 * @param sqlContext SQL context
 * @param config Hoodie write config
 * @param rows Spark input dataset
 * @param structName struct name for the target records
 * @param recordNamespace namespace for the target records
 * @param bulkInsertPartitionerRows partitioner that repartitions the rows for bulk insert
 * @param isGlobalIndex whether a global index is configured; affects deduplication
 * @param dropPartitionColumns whether partition-path columns are dropped from the output
 * @return hoodie dataset which is ready for bulk insert.
 */
public static Dataset<Row> prepareHoodieDatasetForBulkInsert(
        SQLContext sqlContext, HoodieWriteConfig config, Dataset<Row> rows,
        String structName, String recordNamespace,
        BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows,
        boolean isGlobalIndex, boolean dropPartitionColumns) {
    List<Column> originalFields = Arrays.stream(rows.schema().fields()).map(f -> new Column(f.name())).collect(Collectors.toList());
    TypedProperties properties = new TypedProperties();
    properties.putAll(config.getProps());
    String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key());
    BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties);
    String tableName = properties.getString(HoodieWriteConfig.TBL_NAME.key());
    String recordKeyUdfFn = RECORD_KEY_UDF_FN + tableName;
    String partitionPathUdfFn = PARTITION_PATH_UDF_FN + tableName;
    sqlContext.udf().register(recordKeyUdfFn, (UDF1<Row, String>) keyGenerator::getRecordKey, DataTypes.StringType);
    sqlContext.udf().register(partitionPathUdfFn, (UDF1<Row, String>) keyGenerator::getPartitionPath, DataTypes.StringType);
    // Pass all of the original columns to the key generator UDFs as a single struct.
    Column originalFieldsStruct = org.apache.spark.sql.functions.struct(
        JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq());
    final Dataset<Row> rowDatasetWithRecordKeys = rows.withColumn(
        HoodieRecord.RECORD_KEY_METADATA_FIELD, callUDF(recordKeyUdfFn, originalFieldsStruct));
    final Dataset<Row> rowDatasetWithRecordKeysAndPartitionPath = rowDatasetWithRecordKeys.withColumn(
        HoodieRecord.PARTITION_PATH_METADATA_FIELD, callUDF(partitionPathUdfFn, originalFieldsStruct));
    // Add other empty hoodie fields which will be populated before writing to parquet.
    Dataset<Row> rowDatasetWithHoodieColumns = rowDatasetWithRecordKeysAndPartitionPath
        .withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
        .withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
        .withColumn(HoodieRecord.FILENAME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType));
    Dataset<Row> processedDf = rowDatasetWithHoodieColumns;
    if (dropPartitionColumns) {
        // Dataset#drop matches exact column names, so drop each partition-path field
        // individually rather than one comma-joined string (which would match nothing
        // when there are multiple partition fields).
        for (String partitionField : keyGenerator.getPartitionPathFields()) {
            originalFields.remove(new Column(partitionField));
        }
        processedDf = rowDatasetWithHoodieColumns.drop(
            keyGenerator.getPartitionPathFields().toArray(new String[0]));
    }
    Dataset<Row> dedupedDf = processedDf;
    if (config.shouldCombineBeforeInsert()) {
        dedupedDf = SparkRowWriteHelper.newInstance().deduplicateRows(processedDf, config.getPreCombineField(), isGlobalIndex);
    }
    List<Column> orderedFields = Stream.concat(HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new), originalFields.stream()).collect(Collectors.toList());
    Dataset<Row> colOrderedDataset = dedupedDf.select(JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq());
    return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
}
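
How a caller might invoke this helper: the following is a minimal, hypothetical sketch, not code from the Hudi sources. The table path, table name, record-key and partition-path field names, and the choice of NonSortPartitionerWithRows (one of Hudi's built-in row partitioners) are all assumptions.

// Hypothetical usage sketch -- paths, table/field names, and the partitioner are assumptions.
// sqlContext and inputRows are assumed to be in scope.
TypedProperties props = new TypedProperties();
props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
    "org.apache.hudi.keygen.SimpleKeyGenerator");
props.setProperty("hoodie.datasource.write.recordkey.field", "uuid");
props.setProperty("hoodie.datasource.write.partitionpath.field", "region");

HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hoodie_table")   // assumed base path
    .forTable("hoodie_table")        // sets hoodie.table.name, which the helper reads
    .withProps(props)
    .build();

Dataset<Row> prepared = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(
    sqlContext, writeConfig, inputRows, "hoodie_table_record", "hoodie.example",
    new NonSortPartitionerWithRows(), /* isGlobalIndex */ false, /* dropPartitionColumns */ false);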
Also used : Arrays (java.util.Arrays), ArrayList (java.util.ArrayList), List (java.util.List), Collectors (java.util.stream.Collectors), Stream (java.util.stream.Stream), TypedProperties (org.apache.hudi.common.config.TypedProperties), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), ReflectionUtils (org.apache.hudi.common.util.ReflectionUtils), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), BuiltinKeyGenerator (org.apache.hudi.keygen.BuiltinKeyGenerator), BulkInsertPartitioner (org.apache.hudi.table.BulkInsertPartitioner), LogManager (org.apache.log4j.LogManager), Logger (org.apache.log4j.Logger), Column (org.apache.spark.sql.Column), Dataset (org.apache.spark.sql.Dataset), Row (org.apache.spark.sql.Row), SQLContext (org.apache.spark.sql.SQLContext), UDF1 (org.apache.spark.sql.api.java.UDF1), DataTypes (org.apache.spark.sql.types.DataTypes), functions (org.apache.spark.sql.functions), callUDF (org.apache.spark.sql.functions.callUDF), JavaConverters (scala.collection.JavaConverters)
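
The two UDFs registered in the method above delegate straight to the key generator, one Row at a time. Here is a standalone sketch of that delegation, assuming a SimpleKeyGenerator and hypothetical field names; it is an illustration, not code from the project.

// Hypothetical sketch of what the registered UDFs delegate to; field names are assumptions.
// "row" is assumed to be a Row whose schema contains "uuid" and "region" columns.
TypedProperties keyGenProps = new TypedProperties();
keyGenProps.setProperty("hoodie.datasource.write.recordkey.field", "uuid");
keyGenProps.setProperty("hoodie.datasource.write.partitionpath.field", "region");
BuiltinKeyGenerator keyGen = new SimpleKeyGenerator(keyGenProps);

// For each input Row, the UDFs produce the values of the two hoodie meta columns:
String recordKey = keyGen.getRecordKey(row);          // derived from the "uuid" column
String partitionPath = keyGen.getPartitionPath(row);  // derived from the "region" column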
