Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
Class TestOrcBootstrap, method generateTestRawTripDataset.
public static Dataset<Row> generateTestRawTripDataset(long timestamp, int from, int to, List<String> partitionPaths, JavaSparkContext jsc, SQLContext sqlContext) {
  boolean isPartitioned = partitionPaths != null && !partitionPaths.isEmpty();
  final List<String> records = new ArrayList<>();
  IntStream.range(from, to).forEach(i -> {
    String id = "" + i;
    records.add(new HoodieTestDataGenerator().generateGenericRecord("trip_" + id, Long.toString(timestamp), "rider_" + id, "driver_" + id, timestamp, false, false).toString());
  });
  if (isPartitioned) {
    // UDF that maps the suffix of a record's "_row_key" to one of the given partition paths
    sqlContext.udf().register("partgen", (UDF1<String, String>) (val) -> PartitionPathEncodeUtils.escapePathName(partitionPaths.get(Integer.parseInt(val.split("_")[1]) % partitionPaths.size())), DataTypes.StringType);
  }
  JavaRDD<String> rdd = jsc.parallelize(records);
  Dataset<Row> df = sqlContext.read().json(rdd);
  if (isPartitioned) {
    df = df.withColumn("datestr", callUDF("partgen", new Column("_row_key")));
    // Order the columns so the generated Avro schema aligns with the Hive schema
    df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted", "datestr");
  } else {
    // Order the columns so the generated Avro schema aligns with the Hive schema
    df = df.select("timestamp", "_row_key", "rider", "driver", "begin_lat", "begin_lon", "end_lat", "end_lon", "fare", "tip_history", "_hoodie_is_deleted");
  }
  return df;
}
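For context, a call site for this helper could look like the sketch below. The partition values, the srcPath variable, and the jsc/sqlContext fields are assumptions about the surrounding test fixture, not part of the snippet above.

// Hypothetical call site: `jsc`, `sqlContext`, and `srcPath` are assumed to be provided by the test fixture.
List<String> partitions = Arrays.asList("2020-04-01", "2020-04-02", "2020-04-03");
Dataset<Row> rawTrips = generateTestRawTripDataset(System.currentTimeMillis(), 0, 100, partitions, jsc, sqlContext);
// Persisting the frame (e.g. as Parquet) yields a raw source table for bootstrap tests to read.
rawTrips.write().format("parquet").save(srcPath);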
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
Class TestDFSHoodieDatasetInputReader, method testSimpleHoodieDatasetReader.
@Test
public void testSimpleHoodieDatasetReader() throws Exception {
  HoodieWriteConfig config = makeHoodieClientConfig();
  SparkRDDWriteClient client = new SparkRDDWriteClient(new HoodieSparkEngineContext(jsc), config);
  String commitTime = client.startCommit();
  HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
  // Insert 100 records across 3 partitions
  List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
  JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
  writeStatuses.count();
  DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(), HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
  // Try to read 100 records from a single partition path and a single file ID
  JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
  assertTrue(records.count() <= 100);
  assertEquals(1, new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size());
  assertEquals(1, new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size());
  // Try to read 100 records across 3 partition paths and 3 different file IDs
  records = reader.read(3, 3, 100L);
  assertTrue(records.count() <= 100);
  assertEquals(3, new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size());
  assertEquals(3, new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size());
  // Try to read 50% of the records from each file across 3 partition paths and 3 file IDs
  records = reader.read(3, 3, 0.5);
  assertTrue(records.count() <= 100);
  assertEquals(3, new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size());
  assertEquals(3, new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size());
}
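The two read(...) overloads exercised above differ only in how the sample size is expressed. The sketch below restates that contract with arbitrarily chosen numbers; it is not part of the original test.

// Absolute sample: at most 200 records drawn from 2 partition paths and 2 file IDs.
JavaRDD<GenericRecord> sampledByCount = reader.read(2, 2, 200L);
// Relative sample: roughly 25% of the records from each selected file.
JavaRDD<GenericRecord> sampledByRatio = reader.read(2, 2, 0.25);
// Both return Avro GenericRecords carrying the Hoodie metadata fields asserted on above.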
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
Class TestJsonKafkaSource, method testJsonKafkaSourceFilterNullMsg.
// Test that empty (null) messages are filtered out
@Test
public void testJsonKafkaSourceFilterNullMsg() {
  // Topic setup
  final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceFilterNullMsg";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  // 1. Extract without any checkpoint => nothing has been produced yet, so the batch is empty
  assertEquals(Option.empty(), kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch());
  // Send 1000 non-null messages to Kafka
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  // Send 100 null messages to Kafka
  testUtils.sendMessages(topic, new String[100]);
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
  // Verify that the null messages are filtered out: only the 1000 non-null records come back
  assertEquals(1000, fetch1.getBatch().get().count());
}
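As a hedged continuation (not part of the original test), a second fetch from the returned checkpoint should come back empty while no further messages are produced, assuming the adapter reports the absence of new data as Option.empty():

// Fetch again from the last checkpoint; with nothing new on the topic, the batch is expected to be empty.
InputBatch<JavaRDD<GenericRecord>> fetch2 =
    kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
assertEquals(Option.empty(), fetch2.getBatch());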
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
Class TestJsonKafkaSource, method testJsonKafkaSourceInsertRecordsLessSourceLimit.
@Test
public void testJsonKafkaSourceInsertRecordsLessSourceLimit() {
  // Topic setup
  final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceInsertRecordsLessSourceLimit";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, Long.MAX_VALUE, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", "500");
  /*
   * 1. maxEventsFromKafkaSourceProp (500) is larger than the number of generated insert records (400),
   *    while the sourceLimit (300) is smaller than that number, so the sourceLimit is the effective cap.
   */
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 400)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 300);
  assertEquals(300, fetch1.getBatch().get().count());
  /*
   * 2. Produce new data and extract from the previous checkpoint; again the sourceLimit (300) is
   *    smaller than the number of newly generated insert records (600), so only 300 come back.
   */
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 600)));
  InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), 300);
  assertEquals(300, fetch2.getBatch().get().count());
}
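A hedged continuation could make the interplay explicit: once the sourceLimit is effectively unbounded, the maxEvents property (500) becomes the cap on the remaining unconsumed messages. This step is illustrative and not part of the snippet above.

// 3. Illustrative only: with sourceLimit at Long.MAX_VALUE, the
//    hoodie.deltastreamer.kafka.source.maxEvents property (500) caps the fetch,
//    so at most 500 of the still-unconsumed messages are returned.
InputBatch<JavaRDD<GenericRecord>> fetch3 =
    kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
assertTrue(fetch3.getBatch().get().count() <= 500);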
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
Class TestJsonKafkaSource, method testJsonKafkaSourceWithDefaultUpperCap.
@Test
public void testJsonKafkaSourceWithDefaultUpperCap() {
  // Topic setup
  final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceWithDefaultUpperCap";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, Long.MAX_VALUE, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  /*
   * 1. Extract without any checkpoint => get all the data, respecting the default upper cap,
   *    since both sourceLimit and maxEventsFromKafkaSourceProp are set to Long.MAX_VALUE.
   */
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
  assertEquals(1000, fetch1.getBatch().get().count());
  // 2. Produce new data and extract it from the previous checkpoint, based on sourceLimit
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 1000)));
  InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), 1500);
  assertEquals(1000, fetch2.getBatch().get().count());
}
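All three Kafka tests above seed data the same way: HoodieTestDataGenerator produces HoodieRecords, jsonifyRecords serializes them to JSON, and the embedded Kafka test utils publish them to the topic. The sketch below distills that pattern; the batch name "002" and the count 250 are arbitrary illustrations, not values from the tests.

// Common seeding pattern used by the Kafka source tests above.
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("002", 250)));
// A subsequent fetch picks the new messages up from the last checkpoint, respecting the source limit.
InputBatch<JavaRDD<GenericRecord>> fetch3 =
    kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);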