Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
From the class TestJsonKafkaSource, method testJsonKafkaSourceFilterNullMsg.
// test whether empty messages can be filtered
@Test
public void testJsonKafkaSourceFilterNullMsg() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceFilterNullMsg";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
  assertEquals(Option.empty(), kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch());
  // Send 1000 non-null messages to Kafka
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  // Send 100 null messages to Kafka
  testUtils.sendMessages(topic, new String[100]);
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
  // Verify that messages with null values are filtered
  assertEquals(1000, fetch1.getBatch().get().count());
}
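Every test on this page relies on a createPropsForJsonSource(topic, maxEvents, resetStrategy) helper that is not part of the excerpt. Below is a minimal sketch of what it plausibly configures; the Hudi property keys are the ones visible elsewhere on this page plus standard Kafka consumer settings, while the broker-address accessor and the group-id scheme are assumptions for illustration, not Hudi's verbatim helper.

// Hypothetical reconstruction of the createPropsForJsonSource helper used by these tests.
// Key names follow the usages visible in the excerpts; testUtils.brokerAddress() and the
// group.id scheme are assumptions.
private TypedProperties createPropsForJsonSource(String topic, Long maxEventsToRead, String resetStrategy) {
  TypedProperties props = new TypedProperties();
  // topic the JsonKafkaSource reads from
  props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic);
  // embedded test cluster broker list (accessor assumed)
  props.setProperty("bootstrap.servers", testUtils.brokerAddress());
  // where to start when no checkpoint is supplied ("earliest" in these tests)
  props.setProperty("auto.offset.reset", resetStrategy);
  // consumer group, so committed offsets can be inspected later
  props.setProperty("group.id", "hudi-test-" + topic);
  // optional per-batch cap on events read from Kafka
  if (maxEventsToRead != null) {
    props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", String.valueOf(maxEventsToRead));
  }
  return props;
}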
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
From the class TestJsonKafkaSource, method testJsonKafkaSourceInsertRecordsLessSourceLimit.
@Test
public void testJsonKafkaSourceInsertRecordsLessSourceLimit() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceInsertRecordsLessSourceLimit";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, Long.MAX_VALUE, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", "500");
  /*
   1. maxEventsFromKafkaSourceProp is set higher than the number of generated insert records,
   while sourceLimit is lower than the number of generated insert records.
   */
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 400)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 300);
  assertEquals(300, fetch1.getBatch().get().count());
  /*
   2. Produce new data and extract it based on sourceLimit,
   with sourceLimit again lower than the number of generated insert records.
   */
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 600)));
  InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), 300);
  assertEquals(300, fetch2.getBatch().get().count());
}
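The second fetch above starts from fetch1.getCheckpointForNextBatch(), which is the same checkpoint-chaining pattern a streaming driver would follow across batches. A minimal loop sketch under that assumption is shown below; the process call and the loop structure are invented for illustration and are not DeltaStreamer's actual driver code.

// Illustrative only: pull successive batches through the adapter, carrying the
// checkpoint forward between calls until no new data is returned.
Option<String> checkpoint = Option.empty();
long sourceLimit = 300;
while (true) {
  InputBatch<JavaRDD<GenericRecord>> batch = kafkaSource.fetchNewDataInAvroFormat(checkpoint, sourceLimit);
  if (!batch.getBatch().isPresent()) {
    break; // nothing new to read from Kafka
  }
  process(batch.getBatch().get()); // hypothetical downstream handling
  checkpoint = Option.of(batch.getCheckpointForNextBatch());
}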
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
From the class TestJsonKafkaSource, method testJsonKafkaSourceWithDefaultUpperCap.
@Test
public void testJsonKafkaSourceWithDefaultUpperCap() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceWithDefaultUpperCap";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, Long.MAX_VALUE, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  /*
   1. Extract without any checkpoint => get all the data, respecting the default upper cap since both sourceLimit and
   maxEventsFromKafkaSourceProp are set to Long.MAX_VALUE
   */
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
  assertEquals(1000, fetch1.getBatch().get().count());
  // 2. Produce new data, extract new data based on sourceLimit
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 1000)));
  InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), 1500);
  assertEquals(1000, fetch2.getBatch().get().count());
}
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
From the class TestJsonKafkaSource, method testCommitOffsetToKafka.
@Test
public void testCommitOffsetToKafka() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testCommitOffsetToKafka";
  testUtils.createTopic(topic, 2);
  List<TopicPartition> topicPartitions = new ArrayList<>();
  TopicPartition topicPartition0 = new TopicPartition(topic, 0);
  topicPartitions.add(topicPartition0);
  TopicPartition topicPartition1 = new TopicPartition(topic, 1);
  topicPartitions.add(topicPartition1);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  props.put(ENABLE_KAFKA_COMMIT_OFFSET.key(), "true");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
  assertEquals(Option.empty(), kafkaSource.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch());
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 599);
  // commit to Kafka after the first batch
  kafkaSource.getSource().onCommit(fetch1.getCheckpointForNextBatch());
  try (KafkaConsumer consumer = new KafkaConsumer(props)) {
    consumer.assign(topicPartitions);
    OffsetAndMetadata offsetAndMetadata = consumer.committed(topicPartition0);
    assertNotNull(offsetAndMetadata);
    assertEquals(300, offsetAndMetadata.offset());
    offsetAndMetadata = consumer.committed(topicPartition1);
    assertNotNull(offsetAndMetadata);
    assertEquals(299, offsetAndMetadata.offset());
    // end offsets still point to 500 for each partition because fewer messages were consumed in the first batch
    Map endOffsets = consumer.endOffsets(topicPartitions);
    assertEquals(500L, endOffsets.get(topicPartition0));
    assertEquals(500L, endOffsets.get(topicPartition1));
    testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 500)));
    InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
    // commit to Kafka after the second batch is processed completely
    kafkaSource.getSource().onCommit(fetch2.getCheckpointForNextBatch());
    offsetAndMetadata = consumer.committed(topicPartition0);
    assertNotNull(offsetAndMetadata);
    assertEquals(750, offsetAndMetadata.offset());
    offsetAndMetadata = consumer.committed(topicPartition1);
    assertNotNull(offsetAndMetadata);
    assertEquals(750, offsetAndMetadata.offset());
    endOffsets = consumer.endOffsets(topicPartitions);
    assertEquals(750L, endOffsets.get(topicPartition0));
    assertEquals(750L, endOffsets.get(topicPartition1));
  }
  // check the failure case
  props.remove(ConsumerConfig.GROUP_ID_CONFIG);
  assertThrows(HoodieNotSupportedException.class, () -> kafkaSource.getSource().onCommit(""));
}
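The expected numbers in this test follow from how the 1000 initial messages land on the 2 partitions and how the 599-event sourceLimit is split across them: each partition ends at offset 500, while the first batch advances partition 0 to 300 and partition 1 to 299. The exact allocation strategy inside KafkaOffsetGen is not shown on this page; the snippet below merely reproduces the arithmetic behind those assertions, with the even message distribution and the ceiling-based split as assumptions.

// Back-of-the-envelope check of the asserted offsets; distribution and split are assumptions
// that happen to match the assertions above.
long messagesSent = 1000;
int numPartitions = 2;
long endOffsetPerPartition = messagesSent / numPartitions;                  // 500

long sourceLimit = 599;
long partition0Events = (sourceLimit + numPartitions - 1) / numPartitions;  // 300 (ceiling)
long partition1Events = sourceLimit - partition0Events;                     // 299
System.out.println(endOffsetPerPartition + ", " + partition0Events + ", " + partition1Events);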
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
From the class TestJsonKafkaSourcePostProcessor, method testChainedJsonKafkaSourcePostProcessor.
@Test
public void testChainedJsonKafkaSourcePostProcessor() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testChainedJsonKafkaSourcePostProcessor";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  // processor class name setup
  props.setProperty(JSON_KAFKA_PROCESSOR_CLASS_OPT.key(),
      SampleJsonKafkaSourcePostProcessor.class.getName() + "," + DummyJsonKafkaSourcePostProcessor.class.getName());
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
  assertEquals(0, fetch1.getBatch().get().count());
}
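The chained test registers two post processors by joining their class names with a comma under JSON_KAFKA_PROCESSOR_CLASS_OPT, and the final count of 0 indicates that at least one processor in the chain discards every record. Neither processor's source is included on this page; the sketch below is a hypothetical processor in that spirit, with the constructor and the abstract method signature of JsonKafkaSourcePostProcessor assumed from the test's behaviour rather than copied from Hudi.

// Hypothetical record-dropping post processor; constructor and process(...) signature are assumptions.
public static class DropAllJsonKafkaSourcePostProcessor extends JsonKafkaSourcePostProcessor {

  public DropAllJsonKafkaSourcePostProcessor(TypedProperties props) {
    super(props);
  }

  @Override
  public JavaRDD<String> process(JavaRDD<String> inputJsonRecords) {
    // drop every incoming JSON record; a real processor would transform or validate the payloads
    return inputJsonRecords.filter(record -> false);
  }
}

To chain such a processor with another one, its fully qualified class name would be appended, comma-separated, to the JSON_KAFKA_PROCESSOR_CLASS_OPT value exactly as the test above does.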