Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestNonpartitionedKeyGenerator, method testSingleValueKeyGeneratorNonPartitioned.
@Test
public void testSingleValueKeyGeneratorNonPartitioned() {
  TypedProperties properties = new TypedProperties();
  properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "timestamp");
  properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "");
  NonpartitionedKeyGenerator keyGenerator = new NonpartitionedKeyGenerator(properties);
  assertEquals(1, keyGenerator.getRecordKeyFields().size());
  assertEquals(0, keyGenerator.getPartitionPathFields().size());
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  GenericRecord record = dataGenerator.generateGenericRecords(1).get(0);
  String rowKey = record.get("timestamp").toString();
  HoodieKey hoodieKey = keyGenerator.getKey(record);
  assertEquals(rowKey, hoodieKey.getRecordKey());
  assertEquals("", hoodieKey.getPartitionPath());
}
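For reference, a minimal standalone sketch (not taken from the Hudi test suite; the class name GenericRecordSketch is hypothetical) of using HoodieTestDataGenerator on its own to produce the kind of Avro GenericRecord the test above feeds into the key generator. Field names such as "timestamp" and "driver" are assumed to follow the generator's built-in trip schema, as in the tests.

import java.util.List;

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;

public class GenericRecordSketch {
  public static void main(String[] args) {
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    // generateGenericRecords(n) returns raw Avro records following the generator's trip schema.
    List<GenericRecord> records = dataGenerator.generateGenericRecords(3);
    for (GenericRecord record : records) {
      // Print the two fields the key-generator tests above rely on.
      System.out.println(record.get("driver") + " @ " + record.get("timestamp"));
    }
  }
}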
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestNonpartitionedKeyGenerator, method testMultipleValueKeyGeneratorNonPartitioned1.
@Test
public void testMultipleValueKeyGeneratorNonPartitioned1() {
  TypedProperties properties = new TypedProperties();
  properties.setProperty(KeyGeneratorOptions.RECORDKEY_FIELD_NAME.key(), "timestamp,driver");
  properties.setProperty(KeyGeneratorOptions.PARTITIONPATH_FIELD_NAME.key(), "");
  NonpartitionedKeyGenerator keyGenerator = new NonpartitionedKeyGenerator(properties);
  assertEquals(2, keyGenerator.getRecordKeyFields().size());
  assertEquals(0, keyGenerator.getPartitionPathFields().size());
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  GenericRecord record = dataGenerator.generateGenericRecords(1).get(0);
  // With multiple record key fields, the key is built from "field:value" pairs joined by commas,
  // while the partition path stays empty.
  String rowKey = "timestamp" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("timestamp").toString()
      + "," + "driver" + ComplexAvroKeyGenerator.DEFAULT_RECORD_KEY_SEPARATOR + record.get("driver").toString();
  String partitionPath = "";
  HoodieKey hoodieKey = keyGenerator.getKey(record);
  assertEquals(rowKey, hoodieKey.getRecordKey());
  assertEquals(partitionPath, hoodieKey.getPartitionPath());
}
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestBufferedConnectWriter, method testSimpleWriteAndFlush.
@Test
public void testSimpleWriteAndFlush() throws Exception {
  String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] {partitionPath});
  List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS);
  BufferedConnectWriter writer = new BufferedConnectWriter(javaEngineContext, mockHoodieJavaWriteClient, COMMIT_TIME,
      configs, writeConfig, null, schemaProvider);
  for (int i = 0; i < NUM_RECORDS; i++) {
    writer.writeHudiRecord(records.get(i));
  }
  // Nothing should reach the write client until flushRecords() is called.
  Mockito.verify(mockHoodieJavaWriteClient, times(0))
      .bulkInsertPreppedRecords(anyList(), eq(COMMIT_TIME), eq(Option.empty()));
  writer.flushRecords();
  // After the flush, all buffered records are bulk-inserted in a single call.
  final ArgumentCaptor<List<HoodieRecord>> actualRecords = ArgumentCaptor.forClass(List.class);
  Mockito.verify(mockHoodieJavaWriteClient, times(1))
      .bulkInsertPreppedRecords(actualRecords.capture(), eq(COMMIT_TIME), eq(Option.empty()));
  actualRecords.getValue().sort(Comparator.comparing(HoodieRecord::getRecordKey));
  records.sort(Comparator.comparing(HoodieRecord::getRecordKey));
  assertEquals(records, actualRecords.getValue());
}
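As an aside, generateInserts(...) used here returns fully formed HoodieRecords (key plus payload), whereas generateGenericRecords(...) in the key-generator tests above returns raw Avro records. A minimal sketch (hypothetical class name, illustrative commit time "001") of inspecting what the generator hands back:

import java.util.List;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;

public class InsertRecordsSketch {
  public static void main(String[] args) {
    // Restrict the generator to a single partition, as the test above does.
    String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] {partitionPath});
    List<HoodieRecord> records = dataGen.generateInserts("001", 5);
    // Each record already carries its HoodieKey, so key and partition path can be read directly.
    records.forEach(r -> System.out.println(r.getRecordKey() + " -> " + r.getPartitionPath()));
  }
}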
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class HoodieJavaApp, method run.
public void run() throws Exception {
  // Spark session setup.
  SparkSession spark = SparkSession.builder()
      .appName("Hoodie Spark APP")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .master("local[1]")
      .getOrCreate();
  JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
  spark.sparkContext().setLogLevel("WARN");
  FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
  // Generator of some records to be loaded in.
  HoodieTestDataGenerator dataGen = null;
  if (nonPartitionedTable) {
    // All data goes to the base path.
    dataGen = new HoodieTestDataGenerator(new String[] {""});
  } else {
    dataGen = new HoodieTestDataGenerator();
  }
  // Explicitly clean up the Hudi table path if it exists.
  fs.delete(new Path(tablePath), true);

  /**
   * Commit with only inserts
   */
  // Generate some input.
  List<HoodieRecord> recordsSoFar = new ArrayList<>(dataGen.generateInserts("001", /* ignore */ 100));
  List<String> records1 = recordsToStrings(recordsSoFar);
  Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
  // Save as a Hudi dataset (copy on write), specifying the Hudi source.
  DataFrameWriter<Row> writer = inputDF1.write().format("org.apache.hudi")
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
      .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
      .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
      .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
      .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
      .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
          nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName())
      .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
      .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
      .mode(SaveMode.Overwrite);
  updateHiveSyncConfig(writer);
  // Creates a new dataset if needed; tablePath is ultimately where the dataset will be placed.
  writer.save(tablePath);
  String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("First commit at instant time :" + commitInstantTime1);

  /**
   * Commit that updates records
   */
  List<HoodieRecord> recordsToBeUpdated = dataGen.generateUpdates("002", /* ignore */ 100);
  recordsSoFar.addAll(recordsToBeUpdated);
  List<String> records2 = recordsToStrings(recordsToBeUpdated);
  Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
  writer = inputDF2.write().format("org.apache.hudi")
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // Hoodie Table Type
      .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
      .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
      .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), // Add Key Extractor
          nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName())
      .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1")
      .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
      .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
      .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
      .mode(SaveMode.Append);
  updateHiveSyncConfig(writer);
  writer.save(tablePath);
  String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("Second commit at instant time :" + commitInstantTime2);

  /**
   * Commit that deletes some records
   */
  List<String> deletes = randomSelectAsHoodieKeys(recordsSoFar, 20).stream()
      .map(hr -> "{\"_row_key\":\"" + hr.getRecordKey() + "\",\"partition\":\"" + hr.getPartitionPath() + "\"}")
      .collect(Collectors.toList());
  Dataset<Row> inputDF3 = spark.read().json(jssc.parallelize(deletes, 2));
  writer = inputDF3.write().format("org.apache.hudi")
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      .option("hoodie.delete.shuffle.parallelism", "2")
      .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType) // Hoodie Table Type
      .option(DataSourceWriteOptions.OPERATION().key(), "delete")
      .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
      .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
      .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(), // Add Key Extractor
          nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName() : SimpleKeyGenerator.class.getCanonicalName())
      .option(HoodieCompactionConfig.INLINE_COMPACT_NUM_DELTA_COMMITS.key(), "1")
      .option(DataSourceWriteOptions.ASYNC_COMPACT_ENABLE().key(), "false")
      .option(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), "true")
      .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
      .mode(SaveMode.Append);
  updateHiveSyncConfig(writer);
  writer.save(tablePath);
  String commitInstantTime3 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("Third commit at instant time :" + commitInstantTime3);

  /**
   * Read & run some queries
   */
  Dataset<Row> snapshotQueryDF = spark.read().format("org.apache.hudi")
      .load(tablePath + (nonPartitionedTable ? "/*" : "/*/*/*/*"));
  snapshotQueryDF.registerTempTable("hoodie_ro");
  spark.sql("describe hoodie_ro").show();
  // All trips whose fare amount was greater than 2.
  spark.sql("select fare.amount, begin_lon, begin_lat, timestamp from hoodie_ro where fare.amount > 2.0").show();
  if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
    /**
     * Consume incrementally, only the changes from commit 2 above. Currently only supported for COPY_ON_WRITE tables.
     */
    Dataset<Row> incQueryDF = spark.read().format("org.apache.hudi")
        .option(DataSourceReadOptions.QUERY_TYPE().key(), DataSourceReadOptions.QUERY_TYPE_INCREMENTAL_OPT_VAL())
        .option(DataSourceReadOptions.BEGIN_INSTANTTIME().key(), commitInstantTime1)
        .load(tablePath);
    LOG.info("You will only see records from : " + commitInstantTime2);
    incQueryDF.groupBy(incQueryDF.col("_hoodie_commit_time")).count().show();
  }
}
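The app above prepares every commit the same way: generated HoodieRecords are converted to JSON strings and read back into a DataFrame that Hudi can write. A minimal sketch of just that input-preparation step, assuming a local Spark classpath and the same statically imported recordsToStrings helper the app uses:

SparkSession spark = SparkSession.builder()
    .appName("Hudi input prep sketch")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .master("local[1]")
    .getOrCreate();
JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());

HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
// Each generated record becomes one JSON string; Spark infers the trip schema from them.
List<String> jsonRecords = recordsToStrings(dataGen.generateInserts("001", 10));
Dataset<Row> inputDF = spark.read().json(jssc.parallelize(jsonRecords, 2));
inputDF.printSchema(); // fields such as _row_key, partition, timestamp and fare, as queried above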
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestJavaBulkInsertInternalPartitioner, method generateTestRecordsForBulkInsert.
public static List<HoodieRecord> generateTestRecordsForBulkInsert(int numRecords) {
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  List<HoodieRecord> records = dataGenerator.generateInserts("0", numRecords);
  return records;
}