
Example 16 with SourceFormatAdapter

Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in the Apache Hudi project.

From class TestJsonKafkaSourcePostProcessor, method testNoPostProcessor:

@Test
public void testNoPostProcessor() {
    // topic setup.
    final String topic = TEST_TOPIC_PREFIX + "testNoPostProcessor";
    testUtils.createTopic(topic, 2);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
    Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
    SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
    testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
    InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
    assertEquals(900, fetch1.getBatch().get().count());
}
Also used : TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) JavaRDD(org.apache.spark.api.java.JavaRDD) Test(org.junit.jupiter.api.Test)
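
The 900-record limit above leaves 100 of the 1,000 published records unread. As a rough sketch only (not part of the original test, and assuming no further messages are published to the topic), the checkpoint returned with fetch1 would drive the next incremental fetch like this:

// Hypothetical continuation: feed the previous batch's checkpoint back into the adapter
// so the next fetch starts from the stored Kafka offsets rather than from "earliest".
InputBatch<JavaRDD<GenericRecord>> fetch2 = kafkaSource.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
// The remaining 100 records are expected, assuming nothing new arrived in between.
assertEquals(100, fetch2.getBatch().get().count());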

Example 17 with SourceFormatAdapter

Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in the Apache Hudi project.

From class TestSqlSource, method testSqlSourceCheckpoint:

/**
 * Runs the test scenario of reading data from the source in row format
 * when the source has no records; the checkpoint for the next batch should be null.
 *
 * @throws IOException
 */
@Test
public void testSqlSourceCheckpoint() throws IOException {
    props.setProperty(sqlSourceConfig, "select * from test_sql_table where 1=0");
    sqlSource = new SqlSource(props, jsc, sparkSession, schemaProvider);
    sourceFormatAdapter = new SourceFormatAdapter(sqlSource);
    InputBatch<Dataset<Row>> fetch1AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE);
    assertNull(fetch1AsRows.getCheckpointForNextBatch());
}
Also used : Dataset(org.apache.spark.sql.Dataset) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) Test(org.junit.jupiter.api.Test)
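
Because the checkpoint comes back null when the SQL source yields nothing, a caller that chains fetches has to guard against it. A minimal sketch (not part of the original test; Option.ofNullable on org.apache.hudi.common.util.Option is assumed here):

// Hypothetical next fetch: wrapping the possibly-null checkpoint lets an empty source
// simply fall back to reading from the beginning on the following call.
InputBatch<Dataset<Row>> fetch2AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.ofNullable(fetch1AsRows.getCheckpointForNextBatch()), Long.MAX_VALUE);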

Example 18 with SourceFormatAdapter

Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in the Apache Hudi project.

From class TestSqlSource, method testSqlSourceRowFormat:

/**
 * Runs the test scenario of reading data from the source in row format.
 * Source has fewer records than the source limit.
 *
 * @throws IOException
 */
@Test
public void testSqlSourceRowFormat() throws IOException {
    props.setProperty(sqlSourceConfig, "select * from test_sql_table");
    sqlSource = new SqlSource(props, jsc, sparkSession, schemaProvider);
    sourceFormatAdapter = new SourceFormatAdapter(sqlSource);
    // Test fetching Row format
    InputBatch<Dataset<Row>> fetch1AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE);
    assertEquals(10000, fetch1AsRows.getBatch().get().count());
}
Also used : Dataset(org.apache.spark.sql.Dataset) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) Test(org.junit.jupiter.api.Test)
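
The same adapter can also surface the SQL source as Avro records. A minimal sketch of the Avro-format counterpart of the fetch above (not part of the original test, assuming the same 10,000-row test table):

// Hypothetical Avro-format fetch through the same SourceFormatAdapter; the adapter
// converts the row-based SqlSource output into an RDD of GenericRecords.
InputBatch<JavaRDD<GenericRecord>> fetch1AsAvro = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
assertEquals(10000, fetch1AsAvro.getBatch().get().count());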

Example 19 with SourceFormatAdapter

Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in the Apache Hudi project.

From class TestAbstractDebeziumSource, method testDebeziumEvents:

@ParameterizedTest
@MethodSource("testArguments")
public void testDebeziumEvents(Operation operation) throws Exception {
    String sourceClass = getSourceClass();
    // topic setup.
    testUtils.createTopic(TEST_TOPIC_NAME, 2);
    TypedProperties props = createPropsForJsonSource();
    SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
    SourceFormatAdapter debeziumSource = new SourceFormatAdapter(UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));
    testUtils.sendMessages(TEST_TOPIC_NAME, new String[] { generateDebeziumEvent(operation).toString() });
    InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
    assertEquals(1, fetch.getBatch().get().count());
    // Ensure the before fields are picked for DELETE CDC Events,
    // and after fields are picked for INSERT and UPDATE CDC Events.
    final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
    assertTrue(fetch.getBatch().get().select("type").collectAsList().stream().allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
    // Validate DB specific meta fields
    validateMetaFields(fetch.getBatch().get());
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) Dataset(org.apache.spark.sql.Dataset) SchemaRegistryProvider(org.apache.hudi.utilities.schema.SchemaRegistryProvider) DebeziumConstants(org.apache.hudi.common.model.debezium.DebeziumConstants) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) GenericData(org.apache.avro.generic.GenericData) AfterAll(org.junit.jupiter.api.AfterAll) StringDeserializer(org.apache.kafka.common.serialization.StringDeserializer) BeforeAll(org.junit.jupiter.api.BeforeAll) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) MethodSource(org.junit.jupiter.params.provider.MethodSource) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) TypedProperties(org.apache.hudi.common.config.TypedProperties) UtilHelpers(org.apache.hudi.utilities.UtilHelpers) UtilitiesTestBase(org.apache.hudi.utilities.testutils.UtilitiesTestBase) ConsumerConfig(org.apache.kafka.clients.consumer.ConsumerConfig) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) KafkaTestUtils(org.apache.spark.streaming.kafka010.KafkaTestUtils) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) InputBatch(org.apache.hudi.utilities.sources.InputBatch) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) Mockito.mock(org.mockito.Mockito.mock) Dataset(org.apache.spark.sql.Dataset) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) TypedProperties(org.apache.hudi.common.config.TypedProperties) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 20 with SourceFormatAdapter

Use of org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter in the Apache Hudi project.

From class AbstractDFSSourceTestBase, method testReadingFromSource:

/**
 * Runs the test scenario of reading data from the source.
 *
 * @throws IOException
 */
@Test
public void testReadingFromSource() throws IOException {
    dfs.mkdirs(new Path(dfsRoot));
    SourceFormatAdapter sourceFormatAdapter = new SourceFormatAdapter(prepareDFSSource());
    // 1. Extract without any checkpoint => get all the data, respecting sourceLimit
    assertEquals(Option.empty(), sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE).getBatch());
    // Test respecting sourceLimit
    int sourceLimit = 10;
    RemoteIterator<LocatedFileStatus> files = dfs.listFiles(generateOneFile("1", "000", 100), true);
    FileStatus file1Status = files.next();
    assertTrue(file1Status.getLen() > sourceLimit);
    assertEquals(Option.empty(), sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), sourceLimit).getBatch());
    // Test fetching Avro format
    InputBatch<JavaRDD<GenericRecord>> fetch1 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
    assertEquals(100, fetch1.getBatch().get().count());
    // Test fetching Row format
    InputBatch<Dataset<Row>> fetch1AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.empty(), Long.MAX_VALUE);
    assertEquals(100, fetch1AsRows.getBatch().get().count());
    // Test Avro to Row format
    Dataset<Row> fetch1Rows = AvroConversionUtils.createDataFrame(JavaRDD.toRDD(fetch1.getBatch().get()), schemaProvider.getSourceSchema().toString(), sparkSession);
    assertEquals(100, fetch1Rows.count());
    // 2. Produce new data, extract new data
    generateOneFile("2", "001", 10000);
    // Test fetching Avro format
    InputBatch<JavaRDD<GenericRecord>> fetch2 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.of(fetch1.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(10000, fetch2.getBatch().get().count());
    // Test fetching Row format
    InputBatch<Dataset<Row>> fetch2AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.of(fetch1AsRows.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(10000, fetch2AsRows.getBatch().get().count());
    // 3. Extract with previous checkpoint => gives same data back (idempotent)
    InputBatch<Dataset<Row>> fetch3AsRows = sourceFormatAdapter.fetchNewDataInRowFormat(Option.of(fetch1AsRows.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(10000, fetch3AsRows.getBatch().get().count());
    assertEquals(fetch2AsRows.getCheckpointForNextBatch(), fetch3AsRows.getCheckpointForNextBatch());
    fetch3AsRows.getBatch().get().createOrReplaceTempView("test_dfs_table");
    Dataset<Row> rowDataset = SparkSession.builder().sparkContext(jsc.sc()).getOrCreate().sql("select * from test_dfs_table");
    assertEquals(10000, rowDataset.count());
    // 4. Extract with latest checkpoint => no new data returned
    InputBatch<JavaRDD<GenericRecord>> fetch4 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.of(fetch2.getCheckpointForNextBatch()), Long.MAX_VALUE);
    assertEquals(Option.empty(), fetch4.getBatch());
    // 5. Extract from the beginning
    InputBatch<JavaRDD<GenericRecord>> fetch5 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
    assertEquals(10100, fetch5.getBatch().get().count());
    // 6. Should skip files/directories whose names start with prefixes ("_", ".")
    generateOneFile(".checkpoint/3", "002", 100);
    generateOneFile("_checkpoint/3", "002", 100);
    generateOneFile(".3", "002", 100);
    generateOneFile("_3", "002", 100);
    // also work with nested directory
    // not ok
    generateOneFile("foo/.bar/3", "002", 1);
    // ok
    generateOneFile("foo/bar/3", "002", 1);
    // fetch everything from the beginning
    InputBatch<JavaRDD<GenericRecord>> fetch6 = sourceFormatAdapter.fetchNewDataInAvroFormat(Option.empty(), Long.MAX_VALUE);
    assertEquals(10101, fetch6.getBatch().get().count());
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Dataset(org.apache.spark.sql.Dataset) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Row(org.apache.spark.sql.Row) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) JavaRDD(org.apache.spark.api.java.JavaRDD) Test(org.junit.jupiter.api.Test)

Aggregations

SourceFormatAdapter (org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter): 20
Test (org.junit.jupiter.api.Test): 19
JavaRDD (org.apache.spark.api.java.JavaRDD): 13
TypedProperties (org.apache.hudi.common.config.TypedProperties): 12
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 11
Dataset (org.apache.spark.sql.Dataset): 11
Row (org.apache.spark.sql.Row): 4
GenericRecord (org.apache.avro.generic.GenericRecord): 2
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1
Map (java.util.Map): 1
UUID (java.util.UUID): 1
Stream (java.util.stream.Stream): 1
Schema (org.apache.avro.Schema): 1
GenericData (org.apache.avro.generic.GenericData): 1
FileStatus (org.apache.hadoop.fs.FileStatus): 1
LocatedFileStatus (org.apache.hadoop.fs.LocatedFileStatus): 1
Path (org.apache.hadoop.fs.Path): 1
DebeziumConstants (org.apache.hudi.common.model.debezium.DebeziumConstants): 1
Option (org.apache.hudi.common.util.Option): 1
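
Taken together, the examples follow one pattern: wrap a Source in a SourceFormatAdapter, fetch a batch in the desired format, and feed the returned checkpoint into the next call. A condensed, illustrative sketch of that loop using only the calls shown above (sourceProps, jsc, sparkSession, schemaProvider and metrics stand in for whatever the surrounding application provides; the 1000-record limit per fetch is arbitrary):

// Illustrative incremental-pull loop; not taken from any single test above.
Source source = new JsonKafkaSource(sourceProps, jsc, sparkSession, schemaProvider, metrics);
SourceFormatAdapter adapter = new SourceFormatAdapter(source);
Option<String> checkpoint = Option.empty();
while (true) {
    InputBatch<Dataset<Row>> batch = adapter.fetchNewDataInRowFormat(checkpoint, 1000);
    if (!batch.getBatch().isPresent()) {
        // No data beyond the last checkpoint (see Example 20, step 4).
        break;
    }
    // Consume the micro-batch, then remember where to resume.
    batch.getBatch().get().show();
    checkpoint = Option.of(batch.getCheckpointForNextBatch());
}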