Use of org.apache.hudi.keygen.BuiltinKeyGenerator in project hudi by apache.
The example below is the prepareHoodieDatasetForBulkInsert method of the HoodieDatasetBulkInsertHelper class.
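Before the listing, here is a minimal sketch of the key-generator configuration this method expects to find in the write config. The table name, record key field, and partition path field values below are assumptions for illustration, and SimpleKeyGenerator stands in for any BuiltinKeyGenerator subclass.

import org.apache.hudi.common.config.TypedProperties;

// Hypothetical configuration: all property values are placeholders.
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.table.name", "my_table");
// Any BuiltinKeyGenerator subclass can be configured here; SimpleKeyGenerator is one example.
props.setProperty("hoodie.datasource.write.keygenerator.class",
    "org.apache.hudi.keygen.SimpleKeyGenerator");
props.setProperty("hoodie.datasource.write.recordkey.field", "uuid");
props.setProperty("hoodie.datasource.write.partitionpath.field", "partition");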
/**
* Prepares the input hoodie spark dataset for bulk insert. It performs the following steps:
* 1. Uses the KeyGenerator to generate hoodie record keys and partition paths.
* 2. Adds hoodie columns to the input spark dataset.
* 3. Reorders the input dataset columns so that the hoodie columns appear at the beginning.
* 4. Sorts the input dataset by hoodie partition path and record key.
*
* @param sqlContext SQL Context
* @param config Hoodie Write Config
* @param rows Spark Input dataset
* @return hoodie dataset which is ready for bulk insert.
*/
public static Dataset<Row> prepareHoodieDatasetForBulkInsert(
    SQLContext sqlContext, HoodieWriteConfig config, Dataset<Row> rows,
    String structName, String recordNamespace,
    BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows,
    boolean isGlobalIndex, boolean dropPartitionColumns) {
  List<Column> originalFields = Arrays.stream(rows.schema().fields())
      .map(f -> new Column(f.name()))
      .collect(Collectors.toList());

  TypedProperties properties = new TypedProperties();
  properties.putAll(config.getProps());
  String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key());
  BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties);
  String tableName = properties.getString(HoodieWriteConfig.TBL_NAME.key());
  String recordKeyUdfFn = RECORD_KEY_UDF_FN + tableName;
  String partitionPathUdfFn = PARTITION_PATH_UDF_FN + tableName;
  sqlContext.udf().register(recordKeyUdfFn, (UDF1<Row, String>) keyGenerator::getRecordKey, DataTypes.StringType);
  sqlContext.udf().register(partitionPathUdfFn, (UDF1<Row, String>) keyGenerator::getPartitionPath, DataTypes.StringType);

  final Dataset<Row> rowDatasetWithRecordKeys = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD,
      callUDF(recordKeyUdfFn,
          org.apache.spark.sql.functions.struct(
              JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
  final Dataset<Row> rowDatasetWithRecordKeysAndPartitionPath = rowDatasetWithRecordKeys.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD,
      callUDF(partitionPathUdfFn,
          org.apache.spark.sql.functions.struct(
              JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));

  // Add other empty hoodie fields which will be populated before writing to parquet.
  Dataset<Row> rowDatasetWithHoodieColumns = rowDatasetWithRecordKeysAndPartitionPath
      .withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
      .withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType))
      .withColumn(HoodieRecord.FILENAME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType));

  Dataset<Row> processedDf = rowDatasetWithHoodieColumns;
  if (dropPartitionColumns) {
    String partitionColumns = String.join(",", keyGenerator.getPartitionPathFields());
    for (String partitionField : keyGenerator.getPartitionPathFields()) {
      originalFields.remove(new Column(partitionField));
    }
    processedDf = rowDatasetWithHoodieColumns.drop(partitionColumns);
  }

  Dataset<Row> dedupedDf = processedDf;
  if (config.shouldCombineBeforeInsert()) {
    dedupedDf = SparkRowWriteHelper.newInstance().deduplicateRows(processedDf, config.getPreCombineField(), isGlobalIndex);
  }

  List<Column> orderedFields = Stream.concat(
      HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new),
      originalFields.stream()).collect(Collectors.toList());
  Dataset<Row> colOrderedDataset = dedupedDf.select(
      JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq());

  return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
}
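For context, a hedged sketch of how this helper might be invoked follows. The SparkSession setup, input path, table base path, struct name, and record namespace are illustrative assumptions, and NonSortPartitionerWithRows is used only as an example of a row-based BulkInsertPartitioner implementation.

import org.apache.hudi.HoodieDatasetBulkInsertHelper;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.execution.bulkinsert.NonSortPartitionerWithRows;
import org.apache.hudi.table.BulkInsertPartitioner;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

// Hypothetical driver code: paths and names are placeholders.
SparkSession spark = SparkSession.builder().appName("hudi-bulk-insert-prep").getOrCreate();
Dataset<Row> source = spark.read().parquet("/tmp/source_data");

HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hudi/my_table")   // hypothetical table base path
    .forTable("my_table")
    .withProps(props)                 // the TypedProperties sketched earlier
    .build();

// Any row-based partitioner can be passed; NonSortPartitionerWithRows repartitions
// the rows without sorting them.
BulkInsertPartitioner<Dataset<Row>> partitioner = new NonSortPartitionerWithRows();

Dataset<Row> prepared = HoodieDatasetBulkInsertHelper.prepareHoodieDatasetForBulkInsert(
    spark.sqlContext(), writeConfig, source,
    "hoodie_source_struct", "hoodie.source",   // structName, recordNamespace (illustrative)
    partitioner, /* isGlobalIndex */ false, /* dropPartitionColumns */ false);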