Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
The class TestJsonKafkaSourcePostProcessor, method testNoPostProcessor.
@Test
public void testNoPostProcessor() {
  // topic setup.
  final String topic = TEST_TOPIC_PREFIX + "testNoPostProcessor";
  testUtils.createTopic(topic, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  TypedProperties props = createPropsForJsonSource(topic, null, "earliest");

  Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
  SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);

  testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
  InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
  assertEquals(900, fetch1.getBatch().get().count());
}
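The snippet relies on a createPropsForJsonSource helper that is not shown here. Below is a minimal sketch of what such a helper might set; the property keys, the meaning of the second (here null) argument, and the brokerAddress() accessor are assumptions based on Hudi's Kafka source configuration, not code taken from this test.

// Hypothetical helper, sketched for illustration; verify key names against the Hudi version in use.
private TypedProperties createPropsForJsonSource(String topic, Long maxEventsToRead, String resetStrategy) {
  TypedProperties props = new TypedProperties();
  // Topic the JsonKafkaSource reads from (assumed key).
  props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic);
  // Broker address of the embedded Kafka started by the test utils (assumed accessor).
  props.setProperty("bootstrap.servers", testUtils.brokerAddress());
  // Where to start reading when no checkpoint is supplied ("earliest" in the test above).
  props.setProperty("auto.offset.reset", resetStrategy);
  if (maxEventsToRead != null) {
    // Optional cap on events read per fetch (assumed key).
    props.setProperty("hoodie.deltastreamer.kafka.source.maxEvents", String.valueOf(maxEventsToRead));
  }
  return props;
}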
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
The class TestSqlSource, method testSqlSourceCheckpoint.
/**
 * Runs the test scenario of reading data from the source in row format.
 * Source has no records, so the checkpoint for the next batch should be null.
 *
 * @throws IOException
 */
@Test
public void testSqlSourceCheckpoint() throws IOException {
  props.setProperty(sqlSourceConfig, "select * from test_sql_table where 1=0");
  sqlSource = new SqlSource(props, jsc, sparkSession, schemaProvider);
  sourceFormatAdapter = new SourceFormatAdapter(sqlSource);

  InputBatch<Dataset<Row>> fetch1AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE);
  assertNull(fetch1AsRows.getCheckpointForNextBatch());
}
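The sqlSourceConfig constant used by both SqlSource tests is not shown in these snippets; it is presumably the SqlSource query property. The key below is an assumption and may differ across Hudi versions.

// Assumed property key for the SQL query that SqlSource executes (not taken from this snippet).
String sqlSourceConfig = "hoodie.deltastreamer.source.sql.sql.query";

Because the query here matches no rows, the adapter has nothing to read and no position to record, which is why the test asserts a null checkpoint rather than comparing row counts.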
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
The class TestSqlSource, method testSqlSourceRowFormat.
/**
 * Runs the test scenario of reading data from the source in row format.
 * Source has fewer records than the source limit.
 *
 * @throws IOException
 */
@Test
public void testSqlSourceRowFormat() throws IOException {
  props.setProperty(sqlSourceConfig, "select * from test_sql_table");
  sqlSource = new SqlSource(props, jsc, sparkSession, schemaProvider);
  sourceFormatAdapter = new SourceFormatAdapter(sqlSource);

  // Test fetching Row format
  InputBatch<Dataset<Row>> fetch1AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE);
  assertEquals(10000, fetch1AsRows.getBatch().get().count());
}
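Both SqlSource tests assume a test_sql_table with 10,000 records was registered during test setup, which is not shown here. A minimal, hypothetical stand-in for that setup could look like the sketch below; the real TestSqlSource setup presumably writes generated Hoodie test records instead.

// Hypothetical setup sketch: register a 10,000-row temp view for the queries above to read.
sparkSession.range(10000)
    .toDF("id")
    .createOrReplaceTempView("test_sql_table");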
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
The class TestAbstractDebeziumSource, method testDebeziumEvents.
@ParameterizedTest
@MethodSource("testArguments")
public void testDebeziumEvents(Operation operation) throws Exception {
  String sourceClass = getSourceClass();

  // topic setup.
  testUtils.createTopic(TEST_TOPIC_NAME, 2);
  TypedProperties props = createPropsForJsonSource();
  SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
  SourceFormatAdapter debeziumSource = new SourceFormatAdapter(UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));

  testUtils.sendMessages(TEST_TOPIC_NAME, new String[] { generateDebeziumEvent(operation).toString() });
  InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
  assertEquals(1, fetch.getBatch().get().count());

  // Ensure the before fields are picked for DELETE CDC Events,
  // and after fields are picked for INSERT and UPDATE CDC Events.
  final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
  assertTrue(fetch.getBatch().get().select("type").collectAsList().stream().allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
  // Validate DB specific meta fields
  validateMetaFields(fetch.getBatch().get());
}
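For context, a Debezium change event carries both a before and an after image of the row plus an op code; for deletes the after image is null, which is why the assertion above expects before_-prefixed values only for Operation.DELETE. The snippet below illustrates that general envelope shape; it is not the exact payload produced by generateDebeziumEvent, and the field names are generic Debezium conventions rather than this test's schema.

// Illustrative delete event showing the generic Debezium envelope.
String deleteEvent = "{"
    + "\"before\": {\"id\": 1, \"type\": \"before_type_value\"}," // image surfaced for deletes
    + "\"after\": null,"                                          // no after image on delete
    + "\"op\": \"d\","                                            // c = insert, u = update, d = delete
    + "\"ts_ms\": 1624016366000"
    + "}";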
Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in project hudi by apache.
The class AbstractDFSSourceTestBase, method testReadingFromSource.
/**
 * Runs the test scenario of reading data from the source.
 *
 * @throws IOException
 */
@Test
public void testReadingFromSource() throws IOException {
  dfs.mkdirs(new Path(dfsRoot));
  SourceFormatAdapter sourceFormatAdapter = new SourceFormatAdapter(prepareDFSSource());

  // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
  assertEquals(Option.empty(), sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch());
  // Test respecting sourceLimit
  int sourceLimit = 10;
  RemoteIterator<LocatedFileStatus> files = dfs.listFiles(generateOneFile("1", "000", 100), true);
  FileStatus file1Status = files.next();
  assertTrue(file1Status.getLen() > sourceLimit);
  assertEquals(Option.empty(), sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), sourceLimit).getBatch());
  // Test fetching Avro format
  InputBatch<JavaRDD<GenericRecord>> fetch1 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
  assertEquals(100, fetch1.getBatch().get().count());
  // Test fetching Row format
  InputBatch<Dataset<Row>> fetch1AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE);
  assertEquals(100, fetch1AsRows.getBatch().get().count());
  // Test Avro to Row format
  Dataset<Row> fetch1Rows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), schemaProvider.getSourceSchema().toString(), sparkSession);
  assertEquals(100, fetch1Rows.count());

  // 2. Produce new data, extract new data
  generateOneFile("2", "001", 10000);
  // Test fetching Avro format
  InputBatch<JavaRDD<GenericRecord>> fetch2 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
  assertEquals(10000, fetch2.getBatch().get().count());
  // Test fetching Row format
  InputBatch<Dataset<Row>> fetch2AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.of(fetch1AsRows.getCheckpointForNextBatch()), Long.MAX_VALUE);
  assertEquals(10000, fetch2AsRows.getBatch().get().count());

  // 3. Extract with previous checkpoint => gives same data back (idempotent)
  InputBatch<Dataset<Row>> fetch3AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.of(fetch1AsRows.getCheckpointForNextBatch()), Long.MAX_VALUE);
  assertEquals(10000, fetch3AsRows.getBatch().get().count());
  assertEquals(fetch2AsRows.getCheckpointForNextBatch(), fetch3AsRows.getCheckpointForNextBatch());
  fetch3AsRows.getBatch().get().createOrReplaceTempView("test_dfs_table");
  Dataset<Row> rowDataset = SparkSession.builder().sparkContext(jsc.sc()).getOrCreate().sql("select * from test_dfs_table");
  assertEquals(10000, rowDataset.count());

  // 4. Extract with latest checkpoint => no new data returned
  InputBatch<JavaRDD<GenericRecord>> fetch4 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
  assertEquals(Option.empty(), fetch4.getBatch());

  // 5. Extract from the beginning
  InputBatch<JavaRDD<GenericRecord>> fetch5 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
  assertEquals(10100, fetch5.getBatch().get().count());

  // 6. Should skip files/directories whose names start with prefixes ("_", ".")
  generateOneFile(".checkpoint/3", "002", 100);
  generateOneFile("_checkpoint/3", "002", 100);
  generateOneFile(".3", "002", 100);
  generateOneFile("_3", "002", 100);
  // the same rule applies within nested directories
  // skipped: parent directory is hidden
  generateOneFile("foo/.bar/3", "002", 1);
  // picked up: regular nested directory
  generateOneFile("foo/bar/3", "002", 1);
  // fetch everything from the beginning: only the one record under foo/bar is added to the earlier 10,100
  InputBatch<JavaRDD<GenericRecord>> fetch6 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
  assertEquals(10101, fetch6.getBatch().get().count());
}
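This base class leaves prepareDFSSource and generateOneFile to subclasses: they pick the file format and write a file with the given name, commit time, and record count under dfsRoot. A hypothetical JSON-backed implementation of prepareDFSSource might look like the sketch below; the property key and constructor wiring are assumptions, not code from this class.

// Hypothetical subclass hook, sketched for a JSON-on-DFS source.
@Override
protected Source prepareDFSSource() {
  TypedProperties props = new TypedProperties();
  // Root path the DFS path selector scans for new files (assumed key).
  props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsRoot);
  return new JsonDFSSource(props, jsc, sparkSession, schemaProvider);
}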