Search in sources :

Example 81 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestJsonKafkaSourcePostProcessor method testMaxwellJsonKafkaSourcePostProcessor.

@Test
public void testMaxwellJsonKafkaSourcePostProcessor() throws IOException {
    // ------------------------------------------------------------------------
    // Maxwell data
    // ------------------------------------------------------------------------
    // database hudi, table hudi_maxwell_01 (insert, update and delete)
    String hudiMaxwell01Insert = "{\"database\":\"hudi\",\"table\":\"hudi_maxwell_01\",\"type\":\"insert\"," + "\"ts\":1647074402,\"xid\":6233,\"commit\":true,\"data\":{\"id\":\"6018220e39e74477b45c7cf42f66bdc0\"," + "\"name\":\"mathieu\",\"age\":18,\"insert_time\":\"2022-03-12 08:40:02\"," + "\"update_time\":\"2022-03-12 08:40:02\"}}";
    String hudiMaxwell01Update = "{\"database\":\"hudi\",\"table\":\"hudi_maxwell_01\",\"type\":\"update\"," + "\"ts\":1647074482,\"xid\":6440,\"commit\":true,\"data\":{\"id\":\"6018220e39e74477b45c7cf42f66bdc0\"," + "\"name\":\"mathieu\",\"age\":20,\"insert_time\":\"2022-03-12 04:40:02\",\"update_time\":\"2022-03-12 04:42:25\"}," + "\"old\":{\"age\":18,\"insert_time\":\"2022-03-12 08:40:02\",\"update_time\":\"2022-03-12 08:40:02\"}}";
    String hudiMaxwell01Delete = "{\"database\":\"hudi\",\"table\":\"hudi_maxwell_01\",\"type\":\"delete\"," + "\"ts\":1647074555,\"xid\":6631,\"commit\":true,\"data\":{\"id\":\"6018220e39e74477b45c7cf42f66bdc0\"," + "\"name\":\"mathieu\",\"age\":20,\"insert_time\":\"2022-03-12 04:40:02\",\"update_time\":\"2022-03-12 04:42:25\"}}";
    String hudiMaxwell01Ddl = "{\"type\":\"table-alter\",\"database\":\"hudi\",\"table\":\"hudi_maxwell_01\"," + "\"old\":{\"database\":\"hudi\",\"charset\":\"utf8\",\"table\":\"hudi_maxwell_01\"," + "\"primary-key\":[\"id\"],\"columns\":[{\"type\":\"varchar\",\"name\":\"id\",\"charset\":\"utf8\"}," + "{\"type\":\"varchar\",\"name\":\"name\",\"charset\":\"utf8\"},{\"type\":\"int\",\"name\":\"age\"," + "\"signed\":true},{\"type\":\"timestamp\",\"name\":\"insert_time\",\"column-length\":0}," + "{\"type\":\"timestamp\",\"name\":\"update_time\",\"column-length\":0}]},\"def\":{\"database\":\"hudi\"," + "\"charset\":\"utf8\",\"table\":\"hudi_maxwell_01\",\"primary-key\":[\"id\"]," + "\"columns\":[{\"type\":\"varchar\",\"name\":\"id\",\"charset\":\"utf8\"},{\"type\":\"varchar\"," + "\"name\":\"name\",\"charset\":\"utf8\"},{\"type\":\"int\",\"name\":\"age\",\"signed\":true}," + "{\"type\":\"timestamp\",\"name\":\"insert_time\",\"column-length\":0},{\"type\":\"timestamp\"," + "\"name\":\"update_time\",\"column-length\":0}]},\"ts\":1647072305000,\"sql\":\"/* ApplicationName=DBeaver " + "21.0.4 - Main */ ALTER TABLE hudi.hudi_maxwell_01 MODIFY COLUMN age int(3) NULL\"}";
    // database hudi, table hudi_maxwell_010, insert
    String hudiMaxwell010Insert = "{\"database\":\"hudi\",\"table\":\"hudi_maxwell_010\",\"type\":\"insert\"," + "\"ts\":1647073982,\"xid\":5164,\"commit\":true,\"data\":{\"id\":\"f3eaf4cdf7534e47a88cdf93d19b2ee6\"," + "\"name\":\"wangxianghu\",\"age\":18,\"insert_time\":\"2022-03-12 08:33:02\"," + "\"update_time\":\"2022-03-12 08:33:02\"}}";
    // database hudi_02, table hudi_maxwell_02, insert
    String hudi02Maxwell02Insert = "{\"database\":\"hudi_02\",\"table\":\"hudi_maxwell_02\",\"type\":\"insert\"," + "\"ts\":1647073916,\"xid\":4990,\"commit\":true,\"data\":{\"id\":\"9bb17f316ee8488cb107621ddf0f3cb0\"," + "\"name\":\"andy\",\"age\":17,\"insert_time\":\"2022-03-12 08:31:56\"," + "\"update_time\":\"2022-03-12 08:31:56\"}}";
    // ------------------------------------------------------------------------
    // Tests
    // ------------------------------------------------------------------------
    ObjectMapper mapper = new ObjectMapper();
    TypedProperties props = new TypedProperties();
    props.setProperty(MaxwellJsonKafkaSourcePostProcessor.Config.DATABASE_NAME_REGEX_PROP.key(), "hudi(_)?[0-9]{0,2}");
    props.setProperty(MaxwellJsonKafkaSourcePostProcessor.Config.TABLE_NAME_REGEX_PROP.key(), "hudi_maxwell(_)?[0-9]{0,2}");
    // test insert and update
    JavaRDD<String> inputInsertAndUpdate = jsc().parallelize(Arrays.asList(hudiMaxwell01Insert, hudiMaxwell01Update));
    MaxwellJsonKafkaSourcePostProcessor processor = new MaxwellJsonKafkaSourcePostProcessor(props);
    processor.process(inputInsertAndUpdate).map(mapper::readTree).foreach(record -> {
        // database name should be null
        JsonNode database = record.get("database");
        // insert and update records should be tagged as no delete
        boolean isDelete = record.get(HoodieRecord.HOODIE_IS_DELETED).booleanValue();
        assertFalse(isDelete);
        assertNull(database);
    });
    // test delete
    props.setProperty(MaxwellJsonKafkaSourcePostProcessor.Config.PRECOMBINE_FIELD_TYPE_PROP.key(), "DATE_STRING");
    props.setProperty(MaxwellJsonKafkaSourcePostProcessor.Config.PRECOMBINE_FIELD_FORMAT_PROP.key(), "yyyy-MM-dd HH:mm:ss");
    props.setProperty(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "update_time");
    JavaRDD<String> inputDelete = jsc().parallelize(Collections.singletonList(hudiMaxwell01Delete));
    long ts = mapper.readTree(hudiMaxwell01Delete).get("ts").longValue();
    String formatTs = DateTimeUtils.formatUnixTimestamp(ts, "yyyy-MM-dd HH:mm:ss");
    new MaxwellJsonKafkaSourcePostProcessor(props).process(inputDelete).map(mapper::readTree).foreach(record -> {
        // delete records should be tagged as delete
        boolean isDelete = record.get(HoodieRecord.HOODIE_IS_DELETED).booleanValue();
        // update_time should equals ts
        String updateTime = record.get("update_time").textValue();
        assertEquals(formatTs, updateTime);
        assertTrue(isDelete);
    });
    // test preCombine field is not time
    props.setProperty(MaxwellJsonKafkaSourcePostProcessor.Config.PRECOMBINE_FIELD_TYPE_PROP.key(), "NON_TIMESTAMP");
    props.setProperty(HoodieWriteConfig.PRECOMBINE_FIELD_NAME.key(), "id");
    JavaRDD<String> inputDelete2 = jsc().parallelize(Collections.singletonList(hudiMaxwell01Delete));
    String updateTimeInUpdate = mapper.readTree(hudiMaxwell01Update).get("data").get("update_time").textValue();
    new MaxwellJsonKafkaSourcePostProcessor(props).process(inputDelete2).map(mapper::readTree).foreach(record -> {
        // updateTimeInUpdate should updateTimeInDelete
        String updateTimeInDelete = record.get("update_time").textValue();
        assertEquals(updateTimeInUpdate, updateTimeInDelete);
    });
    // test database, table regex
    JavaRDD<String> dirtyData = jsc().parallelize(Arrays.asList(hudiMaxwell01Insert, hudiMaxwell010Insert, hudi02Maxwell02Insert));
    long validDataNum = processor.process(dirtyData).count();
    // hudiMaxwell010Insert is dirty data
    assertEquals(2, validDataNum);
    // test ddl
    JavaRDD<String> ddlData = jsc().parallelize(Collections.singletonList(hudiMaxwell01Ddl));
    // ddl data will be ignored, ths count should be 0
    long ddlDataNum = processor.process(ddlData).count();
    assertEquals(0, ddlDataNum);
}
Also used : JsonNode(com.fasterxml.jackson.databind.JsonNode) TypedProperties(org.apache.hudi.common.config.TypedProperties) MaxwellJsonKafkaSourcePostProcessor(org.apache.hudi.utilities.sources.processor.maxwell.MaxwellJsonKafkaSourcePostProcessor) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Test(org.junit.jupiter.api.Test)

Example 82 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestJsonKafkaSourcePostProcessor method testNoPostProcessor.

@Test
public void testNoPostProcessor() {
    // topic setup.
    final String topic = TEST_TOPIC_PREFIX + "testNoPostProcessor";
    testUtils.createTopic(topic, 2);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    TypedProperties props = createPropsForJsonSource(topic, null, "earliest");
    Source jsonSource = new JsonKafkaSource(props, jsc(), spark(), schemaProvider, metrics);
    SourceFormatAdapter kafkaSource = new SourceFormatAdapter(jsonSource);
    testUtils.sendMessages(topic, jsonifyRecords(dataGenerator.generateInserts("000", 1000)));
    InputBatch<JavaRDD<GenericRecord>> fetch1 = kafkaSource.fetchNewDataInAvroFormat(Option.empty(), 900);
    assertEquals(900, fetch1.getBatch().get().count());
}
Also used : TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) JavaRDD(org.apache.spark.api.java.JavaRDD) Test(org.junit.jupiter.api.Test)

Example 83 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestS3EventsSource method prepareCloudObjectSource.

@Override
public Source prepareCloudObjectSource() {
    TypedProperties props = new TypedProperties();
    props.setProperty(S3_SOURCE_QUEUE_URL, sqsUrl);
    props.setProperty(S3_SOURCE_QUEUE_REGION, regionName);
    props.setProperty(S3_SOURCE_QUEUE_FS, "hdfs");
    S3EventsSource dfsSource = new S3EventsSource(props, jsc, sparkSession, null);
    dfsSource.sqs = this.sqs;
    return dfsSource;
}
Also used : TypedProperties(org.apache.hudi.common.config.TypedProperties)

Example 84 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestSqlSource method setup.

@BeforeEach
public void setup() throws Exception {
    dfsRoot = UtilitiesTestBase.dfsBasePath + "/parquetFiles";
    UtilitiesTestBase.dfs.mkdirs(new Path(dfsRoot));
    props = new TypedProperties();
    super.setup();
    schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFS(), jsc);
    // Produce new data, extract new data
    generateTestTable("1", "001", 10000);
}
Also used : Path(org.apache.hadoop.fs.Path) TypedProperties(org.apache.hudi.common.config.TypedProperties) FilebasedSchemaProvider(org.apache.hudi.utilities.schema.FilebasedSchemaProvider) BeforeEach(org.junit.jupiter.api.BeforeEach)

Example 85 with TypedProperties

use of org.apache.hudi.common.config.TypedProperties in project hudi by apache.

the class TestAbstractDebeziumSource method testDebeziumEvents.

@ParameterizedTest
@MethodSource("testArguments")
public void testDebeziumEvents(Operation operation) throws Exception {
    String sourceClass = getSourceClass();
    // topic setup.
    testUtils.createTopic(TEST_TOPIC_NAME, 2);
    TypedProperties props = createPropsForJsonSource();
    SchemaProvider schemaProvider = new MockSchemaRegistryProvider(props, jsc, this);
    SourceFormatAdapter debeziumSource = new SourceFormatAdapter(UtilHelpers.createSource(sourceClass, props, jsc, sparkSession, schemaProvider, metrics));
    testUtils.sendMessages(TEST_TOPIC_NAME, new String[] { generateDebeziumEvent(operation).toString() });
    InputBatch<Dataset<Row>> fetch = debeziumSource.fetchNewDataInRowFormat(Option.empty(), 10);
    assertEquals(1, fetch.getBatch().get().count());
    // Ensure the before fields are picked for DELETE CDC Events,
    // and after fields are picked for INSERT and UPDATE CDC Events.
    final String fieldPrefix = (operation.equals(Operation.DELETE)) ? "before_" : "after_";
    assertTrue(fetch.getBatch().get().select("type").collectAsList().stream().allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
    assertTrue(fetch.getBatch().get().select("type").collectAsList().stream().allMatch(r -> r.getString(0).startsWith(fieldPrefix)));
    // Validate DB specific meta fields
    validateMetaFields(fetch.getBatch().get());
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) Dataset(org.apache.spark.sql.Dataset) SchemaRegistryProvider(org.apache.hudi.utilities.schema.SchemaRegistryProvider) DebeziumConstants(org.apache.hudi.common.model.debezium.DebeziumConstants) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieDeltaStreamerMetrics(org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamerMetrics) GenericData(org.apache.avro.generic.GenericData) AfterAll(org.junit.jupiter.api.AfterAll) StringDeserializer(org.apache.kafka.common.serialization.StringDeserializer) BeforeAll(org.junit.jupiter.api.BeforeAll) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) MethodSource(org.junit.jupiter.params.provider.MethodSource) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) TypedProperties(org.apache.hudi.common.config.TypedProperties) UtilHelpers(org.apache.hudi.utilities.UtilHelpers) UtilitiesTestBase(org.apache.hudi.utilities.testutils.UtilitiesTestBase) ConsumerConfig(org.apache.kafka.clients.consumer.ConsumerConfig) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) KafkaTestUtils(org.apache.spark.streaming.kafka010.KafkaTestUtils) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) InputBatch(org.apache.hudi.utilities.sources.InputBatch) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Stream(java.util.stream.Stream) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) Mockito.mock(org.mockito.Mockito.mock) Dataset(org.apache.spark.sql.Dataset) SchemaProvider(org.apache.hudi.utilities.schema.SchemaProvider) TypedProperties(org.apache.hudi.common.config.TypedProperties) SourceFormatAdapter(org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Aggregations

TypedProperties (org.apache.hudi.common.config.TypedProperties)143 Test (org.junit.jupiter.api.Test)47 HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)22 JavaRDD (org.apache.spark.api.java.JavaRDD)16 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)15 IOException (java.io.IOException)14 Path (org.apache.hadoop.fs.Path)14 Properties (java.util.Properties)13 GenericRecord (org.apache.avro.generic.GenericRecord)13 SourceFormatAdapter (org.apache.hudi.utilities.deltastreamer.SourceFormatAdapter)12 Row (org.apache.spark.sql.Row)12 BeforeEach (org.junit.jupiter.api.BeforeEach)11 ArrayList (java.util.ArrayList)10 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)10 HoodieKey (org.apache.hudi.common.model.HoodieKey)9 DFSPropertiesConfiguration (org.apache.hudi.common.config.DFSPropertiesConfiguration)8 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)8 HoodieIOException (org.apache.hudi.exception.HoodieIOException)8 Dataset (org.apache.spark.sql.Dataset)8 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)7