
Example 1 with UDF1

Use of org.apache.spark.sql.api.java.UDF1 in project hudi by apache.

From the class TestBootstrap, method generateTestRawTripDataset:

public static Dataset<Row> generateTestRawTripDataset(long timestamp, int from, int to, List<String> partitionPaths, JavaSparkContext jsc, SQLContext sqlContext) {
    boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty();
    final List<String> records = new ArrayList<>();
    // Generate raw trip records as JSON strings for ids in the range [from, to).
    IntStream.range(from, to).forEach(i -> {
        String id = String.valueOf(i);
        records.add(new HoodieTestDataGenerator().generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, timestamp, false, false).toString());
    });
    if (isPartitioned) {
        // Register a UDF1 that maps a record key like "trip_42" to one of the supplied
        // partition paths (escaped so it is safe to use as a path segment).
        sqlContext.udf().register("partgen", (UDF1<String, String>) (val) -> PartitionPathEncodeUtils.escapePathName(partitionPaths.get(Integer.parseInt(val.split("_")[1]) % partitionPaths.size())), DataTypes.StringType);
    }
    JavaRDD<String> rdd = jsc.parallelize(records);
    Dataset<Row> df = sqlContext.read().json(rdd);
    if (isPartitioned) {
        // Derive the partition column ("datestr") from the record key via the registered UDF.
        df = df.withColumn("datestr", callUDF("partgen", new Column("_row_key")));
        // Order the columns to ensure generated avro schema aligns with Hive schema
        df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted", "datestr");
    } else {
        // Order the columns to ensure generated avro schema aligns with Hive schema
        df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted");
    }
    return df;
}
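
For reference, here is a minimal usage sketch (not part of the Hudi test suite) showing how generateTestRawTripDataset could be invoked. The Spark setup, record count, and partition values are illustrative assumptions, and the same imports as in the test class above are assumed.

SparkSession spark = SparkSession.builder().master("local[2]").appName("bootstrap-test-data").getOrCreate();
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
SQLContext sqlContext = spark.sqlContext();

long timestamp = Instant.now().toEpochMilli();
// Hypothetical partition values; any non-empty list makes the method register the "partgen" UDF1.
List<String> partitionPaths = Arrays.asList("2020/03/18", "2020/03/19");

// Generates 100 synthetic trip records (ids 0..99); the "datestr" partition column is
// derived from each record key by the registered UDF1.
Dataset<Row> tripDf = generateTestRawTripDataset(timestamp, 0, 100, partitionPaths, jsc, sqlContext);
tripDf.printSchema();
tripDf.show(5, false);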

Example 2 with UDF1

Use of org.apache.spark.sql.api.java.UDF1 in project hudi by apache.

From the class TestOrcBootstrap, method generateTestRawTripDataset. This is the ORC bootstrap counterpart of Example 1; the code is nearly identical except that the final select does not include the partition_path column:

public static Dataset<Row> generateTestRawTripDataset(long timestamp, int from, int to, List<String> partitionPaths, JavaSparkContext jsc, SQLContext sqlContext) {
    boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty();
    final List<String> records = new ArrayList<>();
    IntStream.range(from, to).forEach(i -> {
        String id = String.valueOf(i);
        records.add(new HoodieTestDataGenerator().generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, timestamp, false, false).toString());
    });
    if (isPartitioned) {
        sqlContext.udf().register("partgen", (UDF1<String, String>) (val) -> PartitionPathEncodeUtils.escapePathName(partitionPaths.get(Integer.parseInt(val.split("_")[1]) % partitionPaths.size())), DataTypes.StringType);
    }
    JavaRDD<String> rdd = jsc.parallelize(records);
    Dataset<Row> df = sqlContext.read().json(rdd);
    if (isPartitioned) {
        df = df.withColumn("datestr", callUDF("partgen", new Column("_row_key")));
        // Order the columns to ensure generated avro schema aligns with Hive schema
        df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted", "datestr");
    } else {
        // Order the columns to ensure generated avro schema aligns with Hive schema
        df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted");
    }
    return df;
}
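
The mapping performed by the partgen UDF can be exercised on its own. Below is a small plain-Java sketch (the class name and sample values are made up for illustration) of the same modulo-based assignment used by the UDF1 in both test classes; the real UDF additionally escapes the result with PartitionPathEncodeUtils.escapePathName.

import java.util.Arrays;
import java.util.List;

public class PartgenMappingSketch {
    public static void main(String[] args) {
        List<String> partitionPaths = Arrays.asList("2020/03/18", "2020/03/19", "2020/03/20");
        for (String rowKey : Arrays.asList("trip_0", "trip_1", "trip_7")) {
            // Same logic as the UDF1 body: parse the numeric suffix of the record key
            // and pick a partition path by modulo.
            int idx = Integer.parseInt(rowKey.split("_")[1]) % partitionPaths.size();
            System.out.println(rowKey + " -> " + partitionPaths.get(idx));
        }
    }
}

Running it prints trip_0 -> 2020/03/18, trip_1 -> 2020/03/19 and trip_7 -> 2020/03/19.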

Example 3 with UDF1

Use of org.apache.spark.sql.api.java.UDF1 in project hudi by apache.

From the class HoodieDatasetBulkInsertHelper, method prepareHoodieDatasetForBulkInsert:

/**
 * Prepares the input Spark dataset for Hudi bulk insert. It performs the following steps:
 *  1. Uses the KeyGenerator to generate hoodie record keys and partition paths.
 *  2. Adds the hoodie meta columns to the input dataset.
 *  3. Reorders the columns so that the hoodie meta columns appear at the beginning.
 *  4. Repartitions the dataset by hoodie partition path and record key via the supplied bulk-insert partitioner.
 *
 * @param sqlContext SQL context
 * @param config hoodie write config
 * @param rows input Spark dataset
 * @return hoodie dataset ready for bulk insert.
 */
public static Dataset<Row> prepareHoodieDatasetForBulkInsert(SQLContext sqlContext, HoodieWriteConfig config, Dataset<Row> rows, String structName, String recordNamespace, BulkInsertPartitioner<Dataset<Row>> bulkInsertPartitionerRows, boolean isGlobalIndex, boolean dropPartitionColumns) {
    List<Column> originalFields = Arrays.stream(rows.schema().fields()).map(f -> new Column(f.name())).collect(Collectors.toList());
    TypedProperties properties = new TypedProperties();
    properties.putAll(config.getProps());
    String keyGeneratorClass = properties.getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key());
    BuiltinKeyGenerator keyGenerator = (BuiltinKeyGenerator) ReflectionUtils.loadClass(keyGeneratorClass, properties);
    String tableName = properties.getString(HoodieWriteConfig.TBL_NAME.key());
    String recordKeyUdfFn = RECORD_KEY_UDF_FN + tableName;
    String partitionPathUdfFn = PARTITION_PATH_UDF_FN + tableName;
    // Register per-table UDFs that delegate record key and partition path generation to the key generator.
    sqlContext.udf().register(recordKeyUdfFn, (UDF1<Row, String>) keyGenerator::getRecordKey, DataTypes.StringType);
    sqlContext.udf().register(partitionPathUdfFn, (UDF1<Row, String>) keyGenerator::getPartitionPath, DataTypes.StringType);
    final Dataset<Row> rowDatasetWithRecordKeys = rows.withColumn(HoodieRecord.RECORD_KEY_METADATA_FIELD, callUDF(recordKeyUdfFn, org.apache.spark.sql.functions.struct(JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
    final Dataset<Row> rowDatasetWithRecordKeysAndPartitionPath = rowDatasetWithRecordKeys.withColumn(HoodieRecord.PARTITION_PATH_METADATA_FIELD, callUDF(partitionPathUdfFn, org.apache.spark.sql.functions.struct(JavaConverters.collectionAsScalaIterableConverter(originalFields).asScala().toSeq())));
    // Add other empty hoodie fields which will be populated before writing to parquet.
    Dataset<Row> rowDatasetWithHoodieColumns = rowDatasetWithRecordKeysAndPartitionPath.withColumn(HoodieRecord.COMMIT_TIME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType)).withColumn(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType)).withColumn(HoodieRecord.FILENAME_METADATA_FIELD, functions.lit("").cast(DataTypes.StringType));
    Dataset<Row> processedDf = rowDatasetWithHoodieColumns;
    if (dropPartitionColumns) {
        // Drop the physical partition columns; their values are carried by the
        // _hoodie_partition_path meta column populated above. Dropping field by field
        // also handles key generators with multiple partition path fields.
        for (String partitionField : keyGenerator.getPartitionPathFields()) {
            originalFields.remove(new Column(partitionField));
            processedDf = processedDf.drop(partitionField);
        }
    }
    Dataset<Row> dedupedDf = processedDf;
    if (config.shouldCombineBeforeInsert()) {
        dedupedDf = SparkRowWriteHelper.newInstance().deduplicateRows(processedDf, config.getPreCombineField(), isGlobalIndex);
    }
    // Put the hoodie meta columns first, followed by the (possibly reduced) original columns.
    List<Column> orderedFields = Stream.concat(HoodieRecord.HOODIE_META_COLUMNS.stream().map(Column::new), originalFields.stream()).collect(Collectors.toList());
    Dataset<Row> colOrderedDataset = dedupedDf.select(JavaConverters.collectionAsScalaIterableConverter(orderedFields).asScala().toSeq());
    return bulkInsertPartitionerRows.repartitionRecords(colOrderedDataset, config.getBulkInsertShuffleParallelism());
}
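
The pattern worth noting here is a UDF1 applied to a whole Row: every original column is packed into a single struct and handed to the key generator. Below is a minimal, self-contained sketch of that pattern using plain Spark (not Hudi); the class name, the UDF name "demo_key_udf", and the sample columns are made up for illustration.

import static org.apache.spark.sql.functions.callUDF;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.struct;

import java.util.Arrays;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.api.java.UDF1;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

public class RowUdf1Sketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[1]").appName("udf1-row-sketch").getOrCreate();

        StructType schema = new StructType()
            .add("_row_key", DataTypes.StringType)
            .add("rider", DataTypes.StringType);
        Dataset<Row> df = spark.createDataFrame(
            Arrays.asList(RowFactory.create("trip_1", "rider_1"), RowFactory.create("trip_2", "rider_2")),
            schema);

        // The UDF receives the original columns packed into one struct-typed Row,
        // mirroring how the key generator UDFs above are fed via functions.struct(...).
        spark.udf().register("demo_key_udf",
            (UDF1<Row, String>) row -> row.getString(row.fieldIndex("_row_key")) + "#" + row.getString(row.fieldIndex("rider")),
            DataTypes.StringType);

        Dataset<Row> withKey = df.withColumn("_hoodie_record_key",
            callUDF("demo_key_udf", struct(col("_row_key"), col("rider"))));
        withKey.show(false);

        spark.stop();
    }
}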

Aggregations

Classes that appear in more than one of the examples above (count of examples in which each is used):

ArrayList (java.util.ArrayList): 3
Arrays (java.util.Arrays): 3
List (java.util.List): 3
Collectors (java.util.stream.Collectors): 3
TypedProperties (org.apache.hudi.common.config.TypedProperties): 3
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 3
IOException (java.io.IOException): 2
Instant (java.time.Instant): 2
Iterator (java.util.Iterator): 2
Map (java.util.Map): 2
Random (java.util.Random): 2
Spliterators (java.util.Spliterators): 2
Collectors.mapping (java.util.stream.Collectors.mapping): 2
Collectors.toList (java.util.stream.Collectors.toList): 2
IntStream (java.util.stream.IntStream): 2
StreamSupport (java.util.stream.StreamSupport): 2
Schema (org.apache.avro.Schema): 2
GenericRecord (org.apache.avro.generic.GenericRecord): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
Path (org.apache.hadoop.fs.Path): 2