Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestJsonKafkaSource, method testCommitOffsetToKafka.
@Test
public void testCommitOffsetToKafka() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testCommitOffsetToKafka";
  testUtils.createTopic(topic, 2);
  List<TopicPartition> topicPartitions = new ArrayList<>();
  TopicPartition topicPartition0 = new TopicPartition(topic, 0);
  topicPartitions.add(topicPartition0);
  TopicPartition topicPartition1 = new TopicPartition(topic, 1);
  topicPartitions.add(topicPartition1);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  props.put(ENABLE_KAFKA_COMMIT_OFFSET.key(), "true");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
  assertEquals(Option.empty(), kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch());
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 599);
  // commit to Kafka after the first batch
  kafkaSource.getSource().onCommit(fetch1.getCheckpointForNextBatch());
  try (KafkaConsumer<String, String> consumer = new KafkaConsumer<>(props)) {
    consumer.assign(topicPartitions);
    OffsetAndMetadata offsetAndMetadata = consumer.committed(topicPartition0);
    assertNotNull(offsetAndMetadata);
    assertEquals(300, offsetAndMetadata.offset());
    offsetAndMetadata = consumer.committed(topicPartition1);
    assertNotNull(offsetAndMetadata);
    assertEquals(299, offsetAndMetadata.offset());
    // end offsets point to 500 for each partition because fewer messages were consumed in the first batch
    Map<TopicPartition, Long> endOffsets = consumer.endOffsets(topicPartitions);
    assertEquals(500L, endOffsets.get(topicPartition0));
    assertEquals(500L, endOffsets.get(topicPartition1));
    testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 500)));
    InputBatch<Dataset<Row>> fetch2 =
        kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
    // commit to Kafka after the second batch is processed completely
    kafkaSource.getSource().onCommit(fetch2.getCheckpointForNextBatch());
    offsetAndMetadata = consumer.committed(topicPartition0);
    assertNotNull(offsetAndMetadata);
    assertEquals(750, offsetAndMetadata.offset());
    offsetAndMetadata = consumer.committed(topicPartition1);
    assertNotNull(offsetAndMetadata);
    assertEquals(750, offsetAndMetadata.offset());
    endOffsets = consumer.endOffsets(topicPartitions);
    assertEquals(750L, endOffsets.get(topicPartition0));
    assertEquals(750L, endOffsets.get(topicPartition1));
  }
  // check failure case
  props.remove(ConsumerConfig.GROUP_ID_CONFIG);
  assertThrows(HoodieNotSupportedException.class, () -> kafkaSource.getSource().onCommit(""));
}
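The committed-offset assertions above follow from simple arithmetic over the two partitions. The sketch below walks through those numbers; it is back-of-the-envelope illustration only, not Hudi's internal offset-range allocation code, and the helper method name is made up.

// Hedged illustration of the numbers the assertions above rely on.
static void illustrateExpectedOffsets() {
  int messagesPerPartition = 1000 / 2;               // 1000 inserts round-robin over 2 partitions -> 500 each
  int firstFetchLimit = 599;                         // sourceLimit of the first fetch
  int committedP0 = 300;                             // the 599-record cap is split across the two partitions...
  int committedP1 = firstFetchLimit - committedP0;   // ...so 300 and 299, the committed offsets asserted above
  int finalPerPartition = messagesPerPartition + 500 / 2;   // 500 more inserts -> 750 messages per partition
  // The second fetch is unbounded (Long.MAX_VALUE), so after onCommit the committed offsets
  // catch up to the end offsets: 750 on both partitions, matching the final assertions.
  System.out.println(committedP0 + "/" + committedP1 + " after batch 1, "
      + finalPerPartition + "/" + finalPerPartition + " after batch 2");
}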
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestJsonKafkaSourcePostProcessor, method testChainedJsonKafkaSourcePostProcessor.
@Test
public void testChainedJsonKafkaSourcePostProcessor() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testChainedJsonKafkaSourcePostProcessor";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  // processor class name setup
  props.setProperty(JSON_KAFKA_PROCESSOR_CLASS_OPT.key(),
      SampleJsonKafkaSourcePostProcessor.class.getName() + "," + DummyJsonKafkaSourcePostProcessor.class.getName());
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
  assertEquals(0, fetch1.getBatch().get().count());
}
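The zero-record assertion makes sense once you see what the last processor in the chain does: per the assertion, the chained output ends up empty, i.e. the dummy processor discards everything it is given. Below is a hedged sketch of such a processor. The base-class contract (a constructor taking TypedProperties and a single process(JavaRDD<String>) hook over the raw JSON records) and the import locations are assumptions inferred from how the Sample and Dummy processors are used above; treat it as an illustration, not the project's actual implementation.

import org.apache.hudi.common.config.TypedProperties;   // package locations may differ by Hudi version
import org.apache.spark.api.java.JavaRDD;

// Hypothetical processor illustrating why a chain ending in a "drop everything" step yields a count of 0.
public class DropAllJsonKafkaSourcePostProcessor extends JsonKafkaSourcePostProcessor {

  public DropAllJsonKafkaSourcePostProcessor(TypedProperties props) {
    super(props);
  }

  @Override
  public JavaRDD<String> process(JavaRDD<String> inputJsonRecords) {
    // Discard every incoming JSON record, regardless of what earlier processors produced.
    return inputJsonRecords.filter(record -> false);
  }
}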
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestJsonKafkaSourcePostProcessor, method testNoPostProcessor.
@Test
public void testNoPostProcessor() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testNoPostProcessor";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
  assertEquals(900, fetch1.getBatch().get().count());
}
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestHoodieSnapshotExporter, method init.
@BeforeEach
public void init() throws Exception {
  // Initialize test data dirs
  sourcePath = Paths.get(basePath(), "source").toString();
  targetPath = Paths.get(basePath(), "target").toString();
  lfs = (LocalFileSystem) FSUtils.getFs(basePath(), jsc().hadoopConfiguration());
  HoodieTableMetaClient.withPropertyBuilder()
      .setTableType(HoodieTableType.COPY_ON_WRITE)
      .setTableName(TABLE_NAME)
      .setPayloadClass(HoodieAvroPayload.class)
      .initTable(jsc().hadoopConfiguration(), sourcePath);
  // Prepare data as source Hudi dataset
  HoodieWriteConfig cfg = getHoodieWriteConfig(sourcePath);
  SparkRDDWriteClient writeClient = getHoodieWriteClient(cfg);
  writeClient.startCommitWithTime(COMMIT_TIME);
  HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] {PARTITION_PATH});
  List<HoodieRecord> records = dataGen.generateInserts(COMMIT_TIME, NUM_RECORDS);
  JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
  writeClient.bulkInsert(recordsRDD, COMMIT_TIME);
  writeClient.close();
  RemoteIterator<LocatedFileStatus> itr = lfs.listFiles(new Path(sourcePath), true);
  while (itr.hasNext()) {
    LOG.info(">>> Prepared test file: " + itr.next().getPath());
  }
}
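The helper getHoodieWriteConfig(sourcePath) used in init() is defined elsewhere in the test class; a minimal sketch of what such a helper might build follows. The builder calls shown are standard HoodieWriteConfig API, but the specific options are an assumption for illustration, not the test's actual configuration.

// Hypothetical version of the getHoodieWriteConfig(...) helper used in init() above.
private HoodieWriteConfig getHoodieWriteConfig(String basePath) {
  return HoodieWriteConfig.newBuilder()
      .withPath(basePath)
      // TRIP_EXAMPLE_SCHEMA is the Avro schema that HoodieTestDataGenerator's records conform to
      .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
      .forTable(TABLE_NAME)
      .build();
}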
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From the class TestHoodieDeltaStreamer, method prepareJsonKafkaDFSFiles.
private static void prepareJsonKafkaDFSFiles(int numRecords, boolean createTopic, String topicName) throws IOException {
  if (createTopic) {
    try {
      testUtils.createTopic(topicName, 2);
    } catch (TopicExistsException e) {
      // no op
    }
  }
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  testUtils.sendMessages(topicName,
      Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", numRecords, HoodieTestDataGenerator.TRIP_SCHEMA)));
}
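For context, a hypothetical call site for this helper: the first call creates the topic and seeds it, and later calls append more batches without recreating it. The topic name and record counts below are made up for illustration.

// Hypothetical usage of prepareJsonKafkaDFSFiles(...).
prepareJsonKafkaDFSFiles(1000, true, "topic_test_json_kafka");   // first batch: create the topic, send 1000 records
prepareJsonKafkaDFSFiles(500, false, "topic_test_json_kafka");   // later batch: topic exists, just send 500 more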