Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestTableSchemaEvolution, method generateInsertsWithSchema.
private List<HoodieRecord> generateInsertsWithSchema(String commitTime, int numRecords, String schemaStr) {
  // Pick the generator whose schema matches the one requested (evolved vs. devolved).
  HoodieTestDataGenerator gen = schemaStr.equals(TRIP_EXAMPLE_SCHEMA_EVOLVED) ? dataGenEvolved : dataGenDevolved;
  List<HoodieRecord> records = gen.generateInserts(commitTime, numRecords);
  // Rewrite the generated records against the requested schema.
  return convertToSchema(records, schemaStr);
}
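For context, a minimal standalone sketch of driving HoodieTestDataGenerator directly (the class name and commit-time values here are assumptions, not part of the test above); the generator defaults to its trip example schema:

import java.util.List;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;

public class InsertGenerationSketch {
  public static void main(String[] args) {
    // Defaults to the trip example schema and its default partition paths.
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    // Ten insert records stamped with commit time "001".
    List<HoodieRecord> inserts = dataGen.generateInserts("001", 10);
    System.out.println("Generated " + inserts.size() + " insert records");
  }
}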
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestTableSchemaEvolution, method generateUpdatesWithSchema.
private List<HoodieRecord> generateUpdatesWithSchema(String commitTime, int numRecords, String schemaStr) {
  // Pick the generator whose schema matches the one requested (evolved vs. devolved).
  HoodieTestDataGenerator gen = schemaStr.equals(TRIP_EXAMPLE_SCHEMA_EVOLVED) ? dataGenEvolved : dataGenDevolved;
  // generateUniqueUpdates returns at most one update per previously inserted key.
  List<HoodieRecord> records = gen.generateUniqueUpdates(commitTime, numRecords);
  return convertToSchema(records, schemaStr);
}
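A companion sketch (again with an assumed class name and commit times) of the insert-then-update cycle that generateUniqueUpdates depends on: updates can only be produced for keys the generator has already handed out as inserts:

import java.util.List;

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;

public class UpdateGenerationSketch {
  public static void main(String[] args) {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    // Seed the generator with 50 keys at commit "001".
    List<HoodieRecord> inserts = dataGen.generateInserts("001", 50);
    // Ask for 20 updates at commit "002"; each existing key is used at most once.
    List<HoodieRecord> updates = dataGen.generateUniqueUpdates("002", 20);
    System.out.println(inserts.size() + " inserts, " + updates.size() + " unique updates");
  }
}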
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class HoodieJavaGenerateApp, method insert.
private void insert(SparkSession spark) throws IOException {
  HoodieTestDataGenerator dataGen = getDataGenerate();
  JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
  // Generate some input..
  String instantTime = HoodieActiveTimeline.createNewInstantTime();
  List<HoodieRecord> recordsSoFar = new ArrayList<>(dataGen.generateInserts(instantTime, /* ignore */ 100));
  List<String> records1 = recordsToStrings(recordsSoFar);
  Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
  // Save as hoodie dataset (copy on write)
  DataFrameWriter<Row> writer = inputDF1.write()
      // specify the hoodie source
      .format("org.apache.hudi")
      .option("hoodie.insert.shuffle.parallelism", "2")
      .option("hoodie.upsert.shuffle.parallelism", "2")
      .option(DataSourceWriteOptions.TABLE_TYPE().key(), tableType)
      .option(DataSourceWriteOptions.OPERATION().key(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL())
      .option(DataSourceWriteOptions.RECORDKEY_FIELD().key(), "_row_key")
      .option(DataSourceWriteOptions.PARTITIONPATH_FIELD().key(), "partition")
      .option(DataSourceWriteOptions.PRECOMBINE_FIELD().key(), "timestamp")
      .option(HoodieWriteConfig.TBL_NAME.key(), tableName)
      .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key(),
          nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
              : SimpleKeyGenerator.class.getCanonicalName())
      .mode(commitType);
  updateHiveSyncConfig(writer);
  // new dataset if needed
  // ultimately where the dataset will be placed
  writer.save(tablePath);
  FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
  String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
  LOG.info("Commit at instant time :" + commitInstantTime1);
}
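Once written, the table can be read back through the same datasource. A minimal sketch follows; the class name, table path, session settings, and selected columns are assumptions for illustration:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadBackSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("hudi-read-back-sketch")
        .master("local[1]")
        .getOrCreate();
    String tablePath = "/tmp/hoodie/sample-table"; // hypothetical path
    // Snapshot query over a table written with the Hudi datasource; older Hudi
    // releases may need a partition glob such as tablePath + "/*/*" instead.
    Dataset<Row> df = spark.read().format("org.apache.hudi").load(tablePath);
    df.select("_hoodie_commit_time", "_hoodie_partition_path", "_row_key").show(10, false);
    spark.stop();
  }
}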
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class HoodieJavaStreamingApp, method run.
/**
 * Ingests inserts and updates through Spark structured streaming, restarts the session,
 * then streams deletes, validating the commit timeline along the way.
 *
 * @throws Exception
 */
public void run() throws Exception {
  // Spark session setup..
  SparkSession spark = SparkSession.builder()
      .appName("Hoodie Spark Streaming APP")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .master("local[1]")
      .getOrCreate();
  JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
  // folder path clean up and creation, preparing the environment
  FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());
  fs.delete(new Path(streamingSourcePath), true);
  fs.delete(new Path(streamingCheckpointingPath), true);
  fs.delete(new Path(tablePath), true);
  fs.mkdirs(new Path(streamingSourcePath));
  // Generator of some records to be loaded in.
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
  List<String> records1 = recordsToStrings(dataGen.generateInserts("001", 100));
  Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));
  List<String> records2 = recordsToStrings(dataGen.generateUpdatesForAllRecords("002"));
  Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
  String ckptPath = streamingCheckpointingPath + "/stream1";
  String srcPath = streamingSourcePath + "/stream1";
  fs.mkdirs(new Path(ckptPath));
  fs.mkdirs(new Path(srcPath));
  // setup the input for streaming
  Dataset<Row> streamingInput = spark.readStream().schema(inputDF1.schema()).json(srcPath + "/*");
  // start streaming and showing
  ExecutorService executor = Executors.newFixedThreadPool(2);
  int numInitialCommits = 0;
  // thread for spark structured streaming
  try {
    Future<Void> streamFuture = executor.submit(() -> {
      LOG.info("===== Streaming Starting =====");
      stream(streamingInput, DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL(), ckptPath);
      LOG.info("===== Streaming Ends =====");
      return null;
    });
    // thread for adding data to the streaming source and showing results over time
    Future<Integer> showFuture = executor.submit(() -> {
      LOG.info("===== Showing Starting =====");
      int numCommits = addInputAndValidateIngestion(spark, fs, srcPath, 0, 100, inputDF1, inputDF2, true);
      LOG.info("===== Showing Ends =====");
      return numCommits;
    });
    // let the threads run
    streamFuture.get();
    numInitialCommits = showFuture.get();
  } finally {
    executor.shutdownNow();
  }
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(jssc.hadoopConfiguration())
      .setBasePath(tablePath)
      .build();
  if (tableType.equals(HoodieTableType.MERGE_ON_READ.name())) {
    // Ensure we have successfully completed one compaction commit
    ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() == 1);
  } else {
    ValidationUtils.checkArgument(metaClient.getActiveTimeline().getCommitTimeline().getInstants().count() >= 1);
  }
  // Deletes Stream
  // Need to restart application to ensure spark does not assume there are multiple streams active.
  spark.close();
  SparkSession newSpark = SparkSession.builder()
      .appName("Hoodie Spark Streaming APP")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .master("local[1]")
      .getOrCreate();
  jssc = new JavaSparkContext(newSpark.sparkContext());
  String ckptPath2 = streamingCheckpointingPath + "/stream2";
  String srcPath2 = srcPath + "/stream2";
  fs.mkdirs(new Path(ckptPath2));
  fs.mkdirs(new Path(srcPath2));
  Dataset<Row> delStreamingInput = newSpark.readStream().schema(inputDF1.schema()).json(srcPath2 + "/*");
  List<String> deletes = recordsToStrings(dataGen.generateUniqueUpdates("002", 20));
  Dataset<Row> inputDF3 = newSpark.read().json(jssc.parallelize(deletes, 2));
  executor = Executors.newFixedThreadPool(2);
  // thread for spark structured streaming
  try {
    Future<Void> streamFuture = executor.submit(() -> {
      LOG.info("===== Streaming Starting =====");
      stream(delStreamingInput, DataSourceWriteOptions.DELETE_OPERATION_OPT_VAL(), ckptPath2);
      LOG.info("===== Streaming Ends =====");
      return null;
    });
    final int numCommits = numInitialCommits;
    // thread for adding data to the streaming source and showing results over time
    Future<Void> showFuture = executor.submit(() -> {
      LOG.info("===== Showing Starting =====");
      addInputAndValidateIngestion(newSpark, fs, srcPath2, numCommits, 80, inputDF3, null, false);
      LOG.info("===== Showing Ends =====");
      return null;
    });
    // let the threads run
    streamFuture.get();
    showFuture.get();
  } finally {
    executor.shutdown();
  }
}
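The stream(...) helper invoked above is not shown here. As a rough guide, a structured-streaming write into a Hudi table looks like the sketch below; this is not the app's actual helper, and the class name, table name, trigger interval, and string-keyed options are assumptions chosen to mirror the batch writer shown earlier:

import java.util.concurrent.TimeUnit;

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.streaming.StreamingQuery;
import org.apache.spark.sql.streaming.Trigger;

public class StreamToHudiSketch {
  // Starts a continuous upsert stream into the Hudi table at tablePath.
  public static StreamingQuery streamToHudi(Dataset<Row> input, String tablePath,
      String checkpointPath) throws Exception {
    return input.writeStream()
        .format("org.apache.hudi")
        .option("hoodie.insert.shuffle.parallelism", "2")
        .option("hoodie.upsert.shuffle.parallelism", "2")
        .option("hoodie.datasource.write.recordkey.field", "_row_key")
        .option("hoodie.datasource.write.partitionpath.field", "partition")
        .option("hoodie.datasource.write.precombine.field", "timestamp")
        .option("hoodie.table.name", "streaming_sketch_table") // hypothetical table name
        .option("checkpointLocation", checkpointPath)
        .outputMode("append")
        .trigger(Trigger.ProcessingTime(500, TimeUnit.MILLISECONDS))
        .start(tablePath);
  }
}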
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
The class TestBootstrap, method generateTestRawTripDataset.
public static Dataset<Row> generateTestRawTripDataset(long timestamp, int from, int to,
    List<String> partitionPaths, JavaSparkContext jsc, SQLContext sqlContext) {
  boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty();
  final List<String> records = new ArrayList<>();
  IntStream.range(from, to).forEach(i -> {
    String id = "" + i;
    records.add(new HoodieTestDataGenerator()
        .generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, timestamp, false, false)
        .toString());
  });
  if (isPartitioned) {
    sqlContext.udf().register("partgen",
        (UDF1<String, String>) (val) -> PartitionPathEncodeUtils.escapePathName(
            partitionPaths.get(Integer.parseInt(val.split("_")[1]) % partitionPaths.size())),
        DataTypes.StringType);
  }
  JavaRDD<String> rdd = jsc.parallelize(records);
  Dataset<Row> df = sqlContext.read().json(rdd);
  if (isPartitioned) {
    df = df.withColumn("datestr", callUDF("partgen", new Column("_row_key")));
    // Order the columns to ensure generated avro schema aligns with Hive schema
    df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon",
        "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted", "datestr");
  } else {
    // Order the columns to ensure generated avro schema aligns with Hive schema
    df = df.select("timestamp", "_row_key", "partition_path", "rider", "driver", "begin_lat", "begin_lon",
        "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted");
  }
  return df;
}
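A short sketch of how the returned frame might be materialized as a partitioned parquet source for a bootstrap-style test. It assumes the method lives in the same class as generateTestRawTripDataset above (so that method is callable unqualified, with java.util.Arrays added to the class imports); the partition values and output path are hypothetical:

static void writeBootstrapSource(JavaSparkContext jsc, SQLContext sqlContext) {
  List<String> partitions = Arrays.asList("2020/01/01", "2020/01/02");
  // 100 raw trip rows (keys trip_0 .. trip_99) spread across the two partitions.
  Dataset<Row> df = generateTestRawTripDataset(System.currentTimeMillis(), 0, 100, partitions, jsc, sqlContext);
  // Partitioned parquet is the layout a bootstrap test typically points Hudi at.
  df.write().partitionBy("datestr").format("parquet").mode("overwrite").save("/tmp/bootstrap-source"); // hypothetical path
}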