
Example 91 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

In class TestNonpartitionedKeyGenerator, the method testSingleValueKeyGeneratorNonPartitioned:

@Test
public void testSingleValueKeyGeneratorNonPartitioned() {
    TypedProperties properties = new TypedProperties();
    properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "timestamp");
    properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "");
    NonpartitionedKeyGenerator keyGenerator = new NonpartitionedKeyGenerator(properties);
    assertEquals(1, keyGenerator.getRecordKeyFields().size());
    assertEquals(0, keyGenerator.getPartitionPathFields().size());
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    GenericRecord record = dataGenerator.generateGenericRecords(1).get(0);
    String rowKey = record.get("timestamp").toString();
    HoodieKey hoodieKey = keyGenerator.getKey(record);
    assertEquals(rowKey, hoodieKey.getRecordKey());
    assertEquals("", hoodieKey.getPartitionPath());
}
Also used : HoodieKey(org.apache.hudi.common.model.HoodieKey) TypedProperties(org.apache.hudi.common.config.TypedProperties) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
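
For contrast, here is a minimal sketch (not taken from the Hudi test sources) of the same generator feeding a partitioned SimpleKeyGenerator instead. It assumes SimpleKeyGenerator accepts the same TypedProperties-based configuration and that partitioning on the "driver" field (which the generated schema contains, per Example 92) is acceptable for illustration:

@Test
public void testSimpleKeyGeneratorPartitionedSketch() {
    TypedProperties properties = new TypedProperties();
    properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "timestamp");
    // Unlike the non-partitioned case above, a real partition-path field is set.
    properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "driver");
    SimpleKeyGenerator keyGenerator = new SimpleKeyGenerator(properties);
    GenericRecord record = new HoodieTestDataGenerator().generateGenericRecords(1).get(0);
    HoodieKey hoodieKey = keyGenerator.getKey(record);
    // The partition path is now the driver value rather than the empty string.
    assertEquals(record.get("driver").toString(), hoodieKey.getPartitionPath());
}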

Example 92 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

In class TestNonpartitionedKeyGenerator, the method testMultipleValueKeyGeneratorNonPartitioned1:

@Test
public void testMultipleValueKeyGeneratorNonPartitioned1() {
    TypedProperties properties = new TypedProperties();
    properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "timestamp,driver");
    properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "");
    NonpartitionedKeyGenerator keyGenerator = new NonpartitionedKeyGenerator(properties);
    assertEquals(2, keyGenerator.getRecordKeyFields().size());
    assertEquals(0, keyGenerator.getPartitionPathFields().size());
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    GenericRecord record = dataGenerator.generateGenericRecords(1).get(0);
    String rowKey = "timestamp" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("timestamp").toString() + "," + "driver" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("driver").toString();
    String partitionPath = "";
    HoodieKey hoodieKey = keyGenerator.getKey(record);
    assertEquals(rowKey, hoodieKey.getRecordKey());
    assertEquals(partitionPath, hoodieKey.getPartitionPath());
}
Also used : HoodieKey(org.apache.hudi.common.model.HoodieKey) TypedProperties(org.apache.hudi.common.config.TypedProperties) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
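
The expected key above spells out the complex-key format: each record-key field is rendered as its name, the separator, then its value, with fields joined by commas. A minimal helper sketch that builds the same expected string (the helper name is illustrative, not from the Hudi sources; assumes java.util.Arrays and java.util.stream.Collectors are imported):

private static String expectedComplexKey(GenericRecord record, String... fields) {
    // Renders each field as "name<separator>value" and joins the pairs with
    // commas, mirroring the format asserted in the test above.
    return Arrays.stream(fields)
        .map(f -> f + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get(f).toString())
        .collect(Collectors.joining(","));
}

With it, the assertion reduces to assertEquals(expectedComplexKey(record, "timestamp", "driver"), hoodieKey.getRecordKey()).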

Example 93 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

In class TestBufferedConnectWriter, the method testSimpleWriteAndFlush:

@Test
public void testSimpleWriteAndFlush() throws Exception {
    String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
    List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS);
    BufferedConnectWriter writer = new BufferedConnectWriter(
        javaEngineContext, mockHoodieJavaWriteClient, COMMIT_TIME, configs, writeConfig, null, schemaProvider);
    for (int i = 0; i < NUM_RECORDS; i++) {
        writer.writeHudiRecord(records.get(i));
    }
    // Nothing should reach the write client until flushRecords() is called.
    Mockito.verify(mockHoodieJavaWriteClient, times(0)).bulkInsertPreppedRecords(anyList(), eq(COMMIT_TIME), eq(Option.empty()));
    writer.flushRecords();
    final ArgumentCaptor<List<HoodieRecord>> actualRecords = ArgumentCaptor.forClass(List.class);
    Mockito.verify(mockHoodieJavaWriteClient, times(1)).bulkInsertPreppedRecords(actualRecords.capture(), eq(COMMIT_TIME), eq(Option.empty()));
    actualRecords.getValue().sort(Comparator.comparing(HoodieRecord::getRecordKey));
    records.sort(Comparator.comparing(HoodieRecord::getRecordKey));
    assertEquals(records, actualRecords.getValue());
}
Also used : BufferedConnectWriter(org.apache.hudi.connect.writers.BufferedConnectWriter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArgumentMatchers.anyList(org.mockito.ArgumentMatchers.anyList) List(java.util.List) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
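
As Example 93 shows, the generator can be pinned to specific partitions so every produced record lands in a known place. A minimal sketch of a test exercising that guarantee (not from the Hudi sources; assumes the usual JUnit and Hudi test imports plus java.util.Arrays):

@Test
public void testGeneratorHonorsRequestedPartitions() {
    String[] partitions = {
        HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0],
        HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[1]
    };
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(partitions);
    List<HoodieRecord> records = dataGen.generateInserts("001", 10);
    // Every record's partition path should be one of the two requested paths.
    for (HoodieRecord record : records) {
        assertTrue(Arrays.asList(partitions).contains(record.getPartitionPath()));
    }
}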

Example 94 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

In class HoodieJavaApp, the method run:

public void run() throws Exception {
    // Spark session setup.
    SparkSession spark = SparkSession.builder()
        .appName("Hoodie Spark APP")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .master("local[1]")
        .getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    spark.sparkContext().setLogLevel("WARN");
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
    // Generator of some records to be loaded in.
    // For a non-partitioned table, all data goes to the base path.
    HoodieTestDataGenerator dataGen = nonPartitionedTable
        ? new HoodieTestDataGenerator(new String[] { "" })
        : new HoodieTestDataGenerator();
    // Explicitly clean up the Hudi table path if it exists.
    fs.delete(new Path(tablePath), true);
    /**
     * Commit with only inserts
     */
    // Generate some input.
    List<HoodieRecord> recordsSoFar = new ArrayList<>(dataGen.generateInserts("001", /* ignore */ 100));
    List<String> records1 = recordsToStrings(recordsSoFar);
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
    // Save as a Hudi dataset (copy on write), specifying "org.apache.hudi" as the source format.
    DataFrameWriter<Row> writer = inputDF1.write().format("org.apache.hudi")
        .option("hoodie.insert.shuffle.parallelism", "2")
        .option("hoodie.upsert.shuffle.parallelism", "2")
        .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
        .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
        .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
        .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName())
        .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
        .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
        .mode(SaveMode.Overwrite);
    updateHiveSyncConfig(writer);
    // Creates a new dataset if needed; tablePath is ultimately where the dataset will be placed.
    writer.save(tablePath);
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    LOG.info("First commit at instant time :" + commitInstantTime1);
    /**
     * Commit that updates records
     */
    List<HoodieRecord> recordsToBeUpdated = dataGen.generateUpdates("002", /* ignore */ 100);
    recordsSoFar.addAll(recordsToBeUpdated);
    List<String> records2 = recordsToStrings(recordsToBeUpdated);
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
    writer = inputDF2.write().format("org.apache.hudi")
        .option("hoodie.insert.shuffle.parallelism", "2")
        .option("hoodie.upsert.shuffle.parallelism", "2")
        // Hoodie table type
        .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
        // Key extractor
        .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName())
        .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1")
        .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
        .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
        .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
        .mode(SaveMode.Append);
    updateHiveSyncConfig(writer);
    writer.save(tablePath);
    String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    LOG.info("Second commit at instant time :" + commitInstantTime2);
    /**
     * Commit that Deletes some records
     */
    List<String> deletes = randomSelectAsHoodieKeys(recordsSoFar, 20).stream()
        .map(hr -> "{\"_row_key\":\"" + hr.getRecordKey() + "\",\"partition\":\"" + hr.getPartitionPath() + "\"}")
        .collect(Collectors.toList());
    Dataset<Row> inputDF3 = spark.read().json(jssc.parallelize(deletes, 2));
    writer = inputDF3.write().format("org.apache.hudi")
        .option("hoodie.insert.shuffle.parallelism", "2")
        .option("hoodie.upsert.shuffle.parallelism", "2")
        .option("hoodie.delete.shuffle.parallelism", "2")
        // Hoodie table type
        .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
        .option(DataSourceWriteOptions.OPERATION().key(), "delete")
        .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
        .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
        .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
        // Key extractor
        .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
            nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName())
        .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1")
        .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
        .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
        .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
        .mode(SaveMode.Append);
    updateHiveSyncConfig(writer);
    writer.save(tablePath);
    String commitInstantTime3 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    LOG.info("Third commit at instant time :" + commitInstantTime3);
    /**
     * Read & do some queries
     */
    Dataset<Row> snapshotQueryDF = spark.read().format("org.apache.hudi").load(tablePath + (nonPartitionedTable ? "/*" : "/*/*/*/*"));
    snapshotQueryDF.createOrReplaceTempView("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // All trips whose fare amount was greater than 2.0.
    spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();
    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
        /**
         * Consume incrementally: only the changes in commit 2 above. Currently only supported for COPY_ON_WRITE tables.
         */
        Dataset<Row> incQueryDF = spark.read().format("org.apache.hudi")
            .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
            .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), commitInstantTime1)
            .load(tablePath);
        LOG.info("You will only see records from : " + commitInstantTime2);
        incQueryDF.groupBy(incQueryDF.col("_hoodie_commit_time")).count().show();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Dataset(org.apache.spark.sql.Dataset) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) RawTripTestPayload.recordsToStrings(org.apache.hudi.common.testutils.RawTripTestPayload.recordsToStrings) SlashEncodedDayPartitionValueExtractor(org.apache.hudi.hive.SlashEncodedDayPartitionValueExtractor) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) MultiPartKeysValueExtractor(org.apache.hudi.hive.MultiPartKeysValueExtractor) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) SaveMode(org.apache.spark.sql.SaveMode) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) JCommander(com.beust.jcommander.JCommander) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) Row(org.apache.spark.sql.Row) Collectors(java.util.stream.Collectors) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) List(java.util.List) HoodieDataSourceHelpers(org.apache.hudi.HoodieDataSourceHelpers) DataSourceReadOptions(org.apache.hudi.DataSourceReadOptions) DataFrameWriter(org.apache.spark.sql.DataFrameWriter) NonPartitionedExtractor(org.apache.hudi.hive.NonPartitionedExtractor) LogManager(org.apache.log4j.LogManager) Transformations.randomSelectAsHoodieKeys(org.apache.hudi.common.testutils.Transformations.randomSelectAsHoodieKeys)
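
When the incremental read above runs as part of a polling loop, it can be guarded by a commit check first. A minimal sketch, assuming HoodieDataSourceHelpers.hasNewCommits(FileSystem, String, String) is available with this shape in the Hudi release in use:

// Only issue the incremental query if commits newer than commitInstantTime1 exist.
if (HoodieDataSourceHelpers.hasNewCommits(fs, tablePath, commitInstantTime1)) {
    Dataset<Row> changes = spark.read().format("org.apache.hudi")
        .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
        .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), commitInstantTime1)
        .load(tablePath);
    changes.show();
}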

Example 95 with HoodieTestDataGenerator

Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.

In class TestJavaBulkInsertInternalPartitioner, the method generateTestRecordsForBulkInsert:

public static List<HoodieRecord> generateTestRecordsForBulkInsert(int numRecords) {
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    return dataGenerator.generateInserts("0", numRecords);
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
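
Because a single HoodieTestDataGenerator instance remembers the keys it has handed out, the same helper pattern extends to update batches whose keys collide with earlier inserts, which is useful for upsert tests. A minimal sketch, not taken from the Hudi sources (assumes java.util.ArrayList and java.util.List are imported, and that numUpdates does not exceed numInserts):

public static List<HoodieRecord> generateInsertsThenUpdates(int numInserts, int numUpdates) {
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    // The "001" inserts register their keys with the generator...
    List<HoodieRecord> batch = new ArrayList<>(dataGenerator.generateInserts("001", numInserts));
    // ...so the "002" updates target keys drawn from the insert batch.
    batch.addAll(dataGenerator.generateUpdates("002", numUpdates));
    return batch;
}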

Aggregations

HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 97
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 52
Test (org.junit.jupiter.api.Test): 51
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 44
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 38
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 31
TypedProperties (org.apache.hudi.common.config.TypedProperties): 29
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26
GenericRecord (org.apache.avro.generic.GenericRecord): 25
JavaRDD (org.apache.spark.api.java.JavaRDD): 25
Path (org.apache.hadoop.fs.Path): 24
WriteStatus (org.apache.hudi.client.WriteStatus): 22
ArrayList (java.util.ArrayList): 21
Properties (java.util.Properties): 21
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 18
HoodieTable (org.apache.hudi.table.HoodieTable): 18
List (java.util.List): 17
ValueSource (org.junit.jupiter.params.provider.ValueSource): 17
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 16
IOException (java.io.IOException): 15