Example 51 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestTableSchemaEvolution, the method generateInsertsWithSchema.

private List<HoodieRecord> generateInsertsWithSchema(String commitTime, int numRecords, String schemaStr) {
    HoodieTestDataGenerator gen = schemaStr.equals(TRIP_EXAMPLE_SCHEMA_EVOLVED) ? dataGenEvolved : dataGenDevolved;
    List<HoodieRecord> records = gen.generateInserts(commitTime, numRecords);
    return convertToSchema(records, schemaStr);
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)

Example 52 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestTableSchemaEvolution, the method generateUpdatesWithSchema.

private List<HoodieRecord> generateUpdatesWithSchema(String commitTime, int numRecords, String schemaStr) {
    HoodieTestDataGenerator gen = schemaStr.equals(TRIP_EXAMPLE_SCHEMA_EVOLVED) ? dataGenEvolved : dataGenDevolved;
    List<HoodieRecord> records = gen.generateUniqueUpdates(commitTime, numRecords);
    return convertToSchema(records, schemaStr);
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
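
Taken together, the two helpers above support a round-trip schema-evolution check: insert a batch under one schema, then update the same keys under the other. A minimal sketch of that flow, assuming the surrounding test's fixtures; writeClient (a SparkRDDWriteClient) and jsc are placeholder names for the harness's write client and Spark context, and only the two helpers plus the schema constants come from the class above:

String firstCommit = "001";
List<HoodieRecord> inserts = generateInsertsWithSchema(firstCommit, 100, TRIP_EXAMPLE_SCHEMA);
writeClient.startCommitWithTime(firstCommit);
// Write the batch under the original schema.
writeClient.insert(jsc.parallelize(inserts, 1), firstCommit).collect();

String secondCommit = "002";
// generateUniqueUpdates (used by the helper) emits at most one update per record key.
List<HoodieRecord> updates = generateUpdatesWithSchema(secondCommit, 50, TRIP_EXAMPLE_SCHEMA_EVOLVED);
writeClient.startCommitWithTime(secondCommit);
// Upserting with the evolved schema should succeed when the change is backwards-compatible.
writeClient.upsert(jsc.parallelize(updates, 1), secondCommit).collect();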

Example 53 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class HoodieJavaGenerateApp, the method insert.

private void insert(SparkSession spark) throws IOException {
    HoodieTestDataGenerator dataGen = getDataGenerate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    // Generate some input..
    String instantTime = HoodieActiveTimeline.createNewInstantTime();
    List<HoodieRecord> recordsSoFar = new ArrayList<>(dataGen.generateInserts(instantTime, /* ignore */ 100));
    List<String> records1 = recordsToStrings(recordsSoFar);
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
    // Save as a Hudi dataset; the table type (copy-on-write or merge-on-read) comes from tableType.
    DataFrameWriter<Row> writer = inputDF1.write().format("org.apache.hudi") // the Hudi datasource
        .option("hoodie.insert.shuffle.parallelism", "2")
        .option("hoodie.upsert.shuffle.parallelism", "2")
        .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
        .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
        .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
        .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                : SimpleKeyGenerator.class.getCanonicalName())
        // commitType decides whether to create a new dataset or append to an existing one
        .mode(commitType);
    updateHiveSyncConfig(writer);
    // tablePath is ultimately where the dataset will be placed
    writer.save(tablePath);
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    LOG.info("Commit at instant time :" + commitInstantTime1);
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) FileSystem(org.apache.hadoop.fs.FileSystem) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
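
To verify the write, the commit captured at the end of insert() can be paired with a snapshot read through the Hudi datasource. A minimal sketch that could follow the LOG.info call above; the four-level path glob assumes the generator's default three-level partition paths, so it would need adjusting for a non-partitioned table:

    // Snapshot-read the table back and log the visible row count.
    Dataset<Row> readDF = spark.read().format("org.apache.hudi").load(tablePath + "/*/*/*/*");
    LOG.info("Rows visible after commit " + commitInstantTime1 + ": " + readDF.count());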

Example 54 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class HoodieJavaStreamingApp, the method run.

/**
 * Runs the end-to-end flow: an upsert stream over two input batches, a commit-count check,
 * then a delete stream in a fresh Spark session.
 *
 * @throws Exception if streaming ingestion or validation fails
 */
public void run() throws Exception {
    // Spark session setup..
    SparkSession spark = SparkSession.builder()
        .appName("Hoodie Spark Streaming APP")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .master("local[1]")
        .getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    // folder path clean up and creation, preparing the environment
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
    fs.delete(new Path(streamingSourcePath), true);
    fs.delete(new Path(streamingCheckpointingPath), true);
    fs.delete(new Path(tablePath), true);
    fs.mkdirs(new Path(streamingSourcePath));
    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    List<String> records1 = recordsToStrings(dataGen.generateInserts("001", 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
    List<String> records2 = recordsToStrings(dataGen.generateUpdatesForAllRecords("002"));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
    String ckptPath = streamingCheckpointingPath + "/stream1";
    String srcPath = streamingSourcePath + "/stream1";
    fs.mkdirs(new Path(ckptPath));
    fs.mkdirs(new Path(srcPath));
    // setup the input for streaming
    Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(srcPath + "/*");
    // start streaming and showing
    ExecutorService executor = Executors.newFixedThreadPool(2);
    int numInitialCommits = 0;
    // thread for spark structured streaming
    try {
        Future<Void> streamFuture = executor.submit(() -> {
            LOG.info("===== Streaming Starting =====");
            stream(streamingInput, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL(), ckptPath);
            LOG.info("===== Streaming Ends =====");
            return null;
        });
        // thread for adding data to the streaming source and showing results over time
        Future<Integer> showFuture = executor.submit(() -> {
            LOG.info("===== Showing Starting =====");
            int numCommits = addInputAndValidateIngestion(spark, fs, srcPath, 0, 100, inputDF1, inputDF2, true);
            LOG.info("===== Showing Ends =====");
            return numCommits;
        });
        // let the threads run
        streamFuture.get();
        numInitialCommits = showFuture.get();
    } finally {
        executor.shutdownNow();
    }
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
        .setConf(jssc.hadoopConfiguration())
        .setBasePath(tablePath)
        .build();
    if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
        // Ensure we have successfully completed one compaction commit
        ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() == 1);
    } else {
        ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() >= 1);
    }
    // Deletes stream: restart the application so Spark does not assume multiple streams are active.
    spark.close();
    SparkSession newSpark = SparkSession.builder()
        .appName("Hoodie Spark Streaming APP")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .master("local[1]")
        .getOrCreate();
    jssc = new JavaSparkContext(newSpark.sparkContext());
    String ckptPath2 = streamingCheckpointingPath + "/stream2";
    String srcPath2 = srcPath + "/stream2";
    fs.mkdirs(new Path(ckptPath2));
    fs.mkdirs(new Path(srcPath2));
    Dataset<Row> delStreamingInput = newSpark.readStream().schema(inputDF1.schema()).json(srcPath2 + "/*");
    List<String> deletes = recordsToStrings(dataGen.generateUniqueUpdates("002", 20));
    Dataset<Row> inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2));
    executor = Executors.newFixedThreadPool(2);
    // thread for spark structured streaming
    try {
        Future<Void> streamFuture = executor.submit(() -> {
            LOG.info("===== Streaming Starting =====");
            stream(delStreamingInput, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), ckptPath2);
            LOG.info("===== Streaming Ends =====");
            return null;
        });
        final int numCommits = numInitialCommits;
        // thread for adding data to the streaming source and showing results over time
        Future<Void> showFuture = executor.submit(() -> {
            LOG.info("===== Showing Starting =====");
            addInputAndValidateIngestion(newSpark, fs, srcPath2, numCommits, 80, inputDF3, null, false);
            LOG.info("===== Showing Ends =====");
            return null;
        });
        // let the threads run
        streamFuture.get();
        showFuture.get();
    } finally {
        executor.shutdown();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SparkSession(org.apache.spark.sql.SparkSession) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) FileSystem(org.apache.hadoop.fs.FileSystem) ExecutorService(java.util.concurrent.ExecutorService) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Row(org.apache.spark.sql.Row) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
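
The stream(...) helper invoked from both streaming threads is defined elsewhere in HoodieJavaStreamingApp. As a rough illustration only (not the verbatim method), a structured-streaming Hudi writer of that shape can be sketched as follows; option keys mirror the batch example above, streamingDurationInMs is a placeholder timeout, and OutputMode/Trigger/DataStreamWriter come from org.apache.spark.sql.streaming:

private void stream(Dataset<Row> streamingInput, String operationType, String checkpointLocation) throws Exception {
    // Continuously upsert (or delete) micro-batches into the Hudi table.
    DataStreamWriter<Row> writer = streamingInput.writeStream().format("org.apache.hudi")
        .option("hoodie.insert.shuffle.parallelism", "2")
        .option("hoodie.upsert.shuffle.parallelism", "2")
        .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
        .option(DataSourceWriteOptions.OPERATION().key(), operationType)
        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
        .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
        .option("checkpointLocation", checkpointLocation)
        .outputMode(OutputMode.Append());
    // Run micro-batches every 500 ms until the driving thread finishes, then time out.
    writer.trigger(Trigger.ProcessingTime(500)).start(tablePath).awaitTermination(streamingDurationInMs);
}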

Example 55 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

From the class TestBootstrap, the method generateTestRawTripDataset.

public static Dataset<Row> generateTestRawTripDataset(long timestamp, int from, int to, List<String> partitionPaths, JavaSparkContext jsc, SQLContext sqlContext) {
    boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty();
    final List<String> records = new ArrayList<>();
    IntStream.range(from, to).forEach(i -> {
        String id = "" + i;
        records.add(new HoodieTestDataGenerator()
            .generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id,
                timestamp, false, false)
            .toString());
    });
    if (isPartitioned) {
        sqlContext.udf().register("partgen",
            (UDF1<String, String>) (val) -> PartitionPathEncodeUtils.escapePathName(
                partitionPaths.get(Integer.parseInt(val.split("_")[1]) % partitionPaths.size())),
            DataTypes.StringType);
    }
    JavaRDD<String> rdd = jsc.parallelize(records);
    Dataset<Row> df = sqlContext.read().json(rdd);
    if (isPartitioned) {
        df = df.withColumn("datestr", callUDF("partgen", new Column("_row_key")));
        // Order the columns to ensure generated avro schema aligns with Hive schema
        df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted", "datestr");
    } else {
        // Order the columns to ensure generated avro schema aligns with Hive schema
        df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted");
    }
    return df;
}
Also used : Dataset(org.apache.spark.sql.Dataset) Row(org.apache.spark.sql.Row) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) DataTypes(org.apache.spark.sql.types.DataTypes) UDF1(org.apache.spark.sql.api.java.UDF1) org.apache.spark.sql.functions.callUDF(org.apache.spark.sql.functions.callUDF) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) JavaRDD(org.apache.spark.api.java.JavaRDD) List(java.util.List) ArrayList(java.util.ArrayList) IntStream(java.util.stream.IntStream) PartitionPathEncodeUtils(org.apache.hudi.common.util.PartitionPathEncodeUtils) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
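
A bootstrap test typically materializes this dataset as the parquet source that the bootstrap run will reference. A hedged usage sketch; srcPath and the partition values are illustrative placeholders, and jsc/sqlContext come from the test fixture:

// Write 100 raw trip rows across three date partitions as the bootstrap source table.
List<String> partitions = Arrays.asList("2020/01/01", "2020/01/02", "2020/01/03");
Dataset<Row> srcDF = generateTestRawTripDataset(Instant.now().toEpochMilli(), 0, 100, partitions, jsc, sqlContext);
srcDF.write().partitionBy("datestr").format("parquet").mode(SaveMode.Overwrite).save(srcPath);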

Aggregations

HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 97 uses
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 52 uses
Test (org.junit.jupiter.api.Test): 51 uses
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 44 uses
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 38 uses
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 31 uses
TypedProperties (org.apache.hudi.common.config.TypedProperties): 29 uses
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26 uses
GenericRecord (org.apache.avro.generic.GenericRecord): 25 uses
JavaRDD (org.apache.spark.api.java.JavaRDD): 25 uses
Path (org.apache.hadoop.fs.Path): 24 uses
WriteStatus (org.apache.hudi.client.WriteStatus): 22 uses
ArrayList (java.util.ArrayList): 21 uses
Properties (java.util.Properties): 21 uses
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 18 uses
HoodieTable (org.apache.hudi.table.HoodieTable): 18 uses
List (java.util.List): 17 uses
ValueSource (org.junit.jupiter.params.provider.ValueSource): 17 uses
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 16 uses
IOException (java.io.IOException): 15 uses