Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From class TestJsonKafkaSource, method testJsonKafkaSourceWithConfigurableUpperCap:
@Test
public void testJsonKafkaSourceWithConfigurableUpperCap() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testJsonKafkaSourceWithConfigurableUpperCap";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, 500L, "earliest");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);

  // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
  assertEquals(900, fetch1.getBatch().get().count());

  // 2. Produce new data; with no explicit sourceLimit (Long.MAX_VALUE), the fetch is bounded by the 500-event upper cap
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("001", 1000)));
  InputBatch<Dataset<Row>> fetch2 = kafkaSource.fetchNewDataInRowFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
  assertEquals(500, fetch2.getBatch().get().count());

  // fetch data respecting sourceLimit where upper cap > sourceLimit
  InputBatch<JavaRDD<GenericRecord>> fetch3 = kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), 400);
  assertEquals(400, fetch3.getBatch().get().count());

  // fetch data where upper cap < sourceLimit: an explicitly passed sourceLimit takes precedence over the cap
  InputBatch<JavaRDD<GenericRecord>> fetch4 = kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), 600);
  assertEquals(600, fetch4.getBatch().get().count());

  // 3. Extract with previous checkpoint => gives same data back (idempotent)
  InputBatch<JavaRDD<GenericRecord>> fetch5 = kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
  assertEquals(fetch2.getBatch().get().count(), fetch5.getBatch().get().count());
  assertEquals(fetch2.getCheckpointForNextBatch(), fetch5.getCheckpointForNextBatch());

  // 4. Extract with latest checkpoint => no new data returned
  InputBatch<JavaRDD<GenericRecord>> fetch6 = kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch4.getCheckpointForNextBatch()), Long.MAX_VALUE);
  assertEquals(Option.empty(), fetch6.getBatch());
}
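The 500L passed to createPropsForJsonSource above becomes the Kafka source's per-fetch upper cap. A minimal sketch of what such a helper might set, assuming Hudi's DeltaStreamer Kafka source config keys (the property key strings are assumptions, not taken from this snippet):

// Hypothetical helper: builds the TypedProperties the tests above pass to JsonKafkaSource.
// Property keys are assumed from Hudi's DeltaStreamer Kafka source configs.
private TypedProperties createPropsForJsonSource(String topic, Long maxEvents, String resetStrategy) {
  TypedProperties props = new TypedProperties();
  props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic);  // topic under test
  props.setProperty("bootstrap.servers", testUtils.brokerAddress());    // embedded Kafka broker
  props.setProperty("auto.offset.reset", resetStrategy);                // e.g. "earliest"
  if (maxEvents != null) {
    // per-fetch upper cap; as the test shows, it only binds when no explicit sourceLimit is given
    props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", String.valueOf(maxEvents));
  }
  return props;
}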
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From class TestJsonKafkaSourcePostProcessor, method testInvalidJsonKafkaSourcePostProcessor:
@Test
public void testInvalidJsonKafkaSourcePostProcessor() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testInvalidJsonKafkaSourcePostProcessor";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  // processor class name setup
  props.setProperty(JSON_KAFKA_PROCESSOR_CLASS_OPT.key(), "InvalidJsonKafkaSourcePostProcessor");
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  Assertions.assertThrows(HoodieSourcePostProcessException.class, () -> kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900));
}
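The test expects a failure because the processor class is resolved reflectively when the first batch is fetched, not when the source is constructed. A hypothetical sketch of that lookup, only to illustrate why an unresolvable name like "InvalidJsonKafkaSourcePostProcessor" surfaces as HoodieSourcePostProcessException (the actual wiring lives inside JsonKafkaSource; the exception constructor shown is an assumption):

// Hypothetical sketch of the reflective post-processor lookup.
String processorClass = props.getString(JSON_KAFKA_PROCESSOR_CLASS_OPT.key());
try {
  JsonKafkaSourcePostProcessor processor =
      (JsonKafkaSourcePostProcessor) ReflectionUtils.loadClass(processorClass, props);
  // processor.process(...) would then transform the fetched JSON batch
} catch (Throwable t) {
  // an unresolvable class name lands here, matching the assertThrows above
  throw new HoodieSourcePostProcessException("Failed to load post processor class: " + processorClass, t);
}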
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From class TestJsonKafkaSourcePostProcessor, method testSampleJsonKafkaSourcePostProcessor:
@Test
public void testSampleJsonKafkaSourcePostProcessor() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testSampleJsonKafkaSourcePostProcessor";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
  // processor class name setup
  props.setProperty(JSON_KAFKA_PROCESSOR_CLASS_OPT.key(), SampleJsonKafkaSourcePostProcessor.class.getName());
  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
  assertNotEquals(900, fetch1.getBatch().get().count());
}
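assertNotEquals passes because SampleJsonKafkaSourcePostProcessor drops part of the batch, so fewer than the 900 requested records come back. A plausible sketch of such a processor, assuming the post-processor contract is a JavaRDD&lt;String&gt;-to-JavaRDD&lt;String&gt; transform (the constructor, method signature, and filter predicate are assumptions):

// Hypothetical sample processor: filters records so the fetched count differs from 900.
public static class SampleJsonKafkaSourcePostProcessor extends JsonKafkaSourcePostProcessor {
  public SampleJsonKafkaSourcePostProcessor(TypedProperties props) {
    super(props);
  }

  @Override
  public JavaRDD<String> process(JavaRDD<String> inputJsonRecords) {
    // keep roughly half the records; dropping anything at all is enough
    // to make assertNotEquals(900, ...) hold
    return inputJsonRecords.filter(record -> record.hashCode() % 2 == 0);
  }
}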
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From class AbstractBaseTestSource, method initDataGen:
public static void initDataGen(TypedProperties props, int partition) {
  try {
    boolean useRocksForTestDataGenKeys = props.getBoolean(SourceConfigs.USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS,
        SourceConfigs.DEFAULT_USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS);
    String baseStoreDir = props.getString(SourceConfigs.ROCKSDB_BASE_DIR_FOR_TEST_DATAGEN_KEYS,
        File.createTempFile("test_data_gen", ".keys").getParent()) + "/" + partition;
    LOG.info("useRocksForTestDataGenKeys=" + useRocksForTestDataGenKeys + ", BaseStoreDir=" + baseStoreDir);
    dataGeneratorMap.put(partition, new HoodieTestDataGenerator(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS,
        useRocksForTestDataGenKeys ? new RocksDBBasedMap<>(baseStoreDir) : new HashMap<>()));
  } catch (IOException e) {
    throw new HoodieIOException(e.getMessage(), e);
  }
}
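A caller opts into the RocksDB-backed key store purely through configuration; the generated keys then spill to disk instead of living in an in-memory HashMap. A brief usage sketch (the base directory is illustrative):

// Usage sketch: back partition 0's generated keys with RocksDB.
TypedProperties props = new TypedProperties();
props.setProperty(SourceConfigs.USE_ROCKSDB_FOR_TEST_DATAGEN_KEYS, "true");
props.setProperty(SourceConfigs.ROCKSDB_BASE_DIR_FOR_TEST_DATAGEN_KEYS, "/tmp/test_data_gen_keys");
AbstractBaseTestSource.initDataGen(props, 0);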
Use of org.apache.hudi.common.testutils.HoodieTestDataGenerator in project hudi by apache.
From class AbstractBaseTestSource, method initDataGen:
public static void initDataGen(SQLContext sqlContext, String globParquetPath, int partition) {
  List<Row> rows = sqlContext.read().format("hudi").load(globParquetPath)
      .select("_hoodie_record_key", "_hoodie_partition_path").collectAsList();
  Map<Integer, HoodieTestDataGenerator.KeyPartition> keyPartitionMap = IntStream.range(0, rows.size()).boxed()
      .collect(Collectors.toMap(Function.identity(), i -> {
        Row r = rows.get(i);
        HoodieTestDataGenerator.KeyPartition kp = new HoodieTestDataGenerator.KeyPartition();
        kp.key = new HoodieKey(r.getString(0), r.getString(1));
        kp.partitionPath = r.getString(1);
        return kp;
      }));
  dataGeneratorMap.put(partition, new HoodieTestDataGenerator(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, keyPartitionMap));
}
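This overload seeds the generator with the record keys already present in a Hudi table, so subsequently generated batches can target existing records rather than fresh ones. A brief usage sketch (the glob path and partition index are illustrative):

// Usage sketch: seed partition 0's generator from an existing table's keys.
AbstractBaseTestSource.initDataGen(sqlContext, "/tmp/hoodie/output/*/*/*/*", 0);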