Search in sources:

Example 1 with TableExecutionContext

Use of org.apache.hudi.utilities.deltastreamer.TableExecutionContext in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testMultiTableExecutionWithParquetSource.

@Test
public void testMultiTableExecutionWithParquetSource() throws IOException {
    // Seed two parquet source directories with initial test data (10 and 5 records).
    String srcPath1 = dfsBasePath + "/parquetSrcPath1/";
    String srcPath2 = dfsBasePath + "/parquetSrcPath2/";
    prepareParquetDFSFiles(10, srcPath1);
    prepareParquetDFSFiles(5, srcPath2);
    // Only common props for now; per-table source props are attached below.
    String propsFile = populateCommonPropsAndWriteToFile();
    HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(propsFile, dfsBasePath + "/config", ParquetDFSSource.class.getName(), false, false, false, "multi_table_parquet", null);
    HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    List<TableExecutionContext> contexts = streamer.getTableExecutionContexts();
    // Inject a per-table source path into each table's execution context.
    ingestPerParquetSourceProps(contexts, Arrays.asList(new String[] { srcPath1, srcPath2 }));
    String basePath1 = contexts.get(0).getConfig().targetBasePath;
    String basePath2 = contexts.get(1).getConfig().targetBasePath;
    // First round: sync both tables and verify the seeded record counts.
    syncAndVerify(streamer, basePath1, basePath2, 10, 5);
    int expectedTable1Records = 10;
    int expectedTable2Records = 5;
    // Three more ingestion rounds with randomized batch sizes; counts accumulate.
    for (int round = 0; round < 3; round++) {
        int batch1 = 10 + RANDOM.nextInt(100);
        int batch2 = 15 + RANDOM.nextInt(100);
        String fileName = (round + 2) + ".parquet";
        prepareParquetDFSFiles(batch1, srcPath1, fileName, false, null, null);
        prepareParquetDFSFiles(batch2, srcPath2, fileName, false, null, null);
        expectedTable1Records += batch1;
        expectedTable2Records += batch2;
        // Sync again and verify cumulative totals per table.
        syncAndVerify(streamer, basePath1, basePath2, expectedTable1Records, expectedTable2Records);
    }
}
Also used : HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TableExecutionContext(org.apache.hudi.utilities.deltastreamer.TableExecutionContext) ParquetDFSSource(org.apache.hudi.utilities.sources.ParquetDFSSource) Test(org.junit.jupiter.api.Test)

Example 2 with TableExecutionContext

Use of org.apache.hudi.utilities.deltastreamer.TableExecutionContext in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testMultiTableExecutionWithKafkaSource.

// 0 corresponds to fg
// End-to-end multi-table test over two Kafka topics: insert one batch per topic,
// sync, verify counts; then send updates and verify the counts are unchanged.
@Test
public void testMultiTableExecutionWithKafkaSource() throws IOException {
    // create topics for each table
    String topicName1 = "topic" + testNum++;
    String topicName2 = "topic" + testNum;
    testUtils.createTopic(topicName1, 2);
    testUtils.createTopic(topicName2, 2);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    // topicName1 receives 5 TRIP-schema records, topicName2 receives 10 SHORT_TRIP-schema records.
    testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
    HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false, false, null);
    HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    List<TableExecutionContext> executionContexts = streamer.getTableExecutionContexts();
    // NOTE(review): context 1 is pointed at topicName2 (SHORT_TRIP records) but given
    // the uber schema files, while context 0 is pointed at topicName1 (TRIP records)
    // with the short-trip schema files. Presumably the context ordering makes this
    // pairing correct — confirm against HoodieMultiTableDeltaStreamer's table ordering.
    TypedProperties properties = executionContexts.get(1).getProperties();
    properties.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_uber.avsc");
    properties.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_uber.avsc");
    properties.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName2);
    executionContexts.get(1).setProperties(properties);
    TypedProperties properties1 = executionContexts.get(0).getProperties();
    properties1.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_short_trip_uber.avsc");
    properties1.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_short_trip_uber.avsc");
    properties1.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName1);
    executionContexts.get(0).setProperties(properties1);
    String targetBasePath1 = executionContexts.get(0).getConfig().targetBasePath;
    String targetBasePath2 = executionContexts.get(1).getConfig().targetBasePath;
    streamer.sync();
    // Initial sync: table 0 ingested topicName1's 5 records, table 1 topicName2's 10.
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
    // insert updates for already existing records in kafka topics
    testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
    // A fresh streamer instance re-reads the props file, so the per-table
    // overrides must be re-applied before the second sync.
    streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    streamer.getTableExecutionContexts().get(1).setProperties(properties);
    streamer.getTableExecutionContexts().get(0).setProperties(properties1);
    streamer.sync();
    // Both tables synced successfully, none failed.
    assertEquals(2, streamer.getSuccessTables().size());
    assertTrue(streamer.getFailedTables().isEmpty());
    // assert the record count matches now
    // Updates overwrite existing keys, so counts must be unchanged.
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
    testNum++;
}
Also used : JsonKafkaSource(org.apache.hudi.utilities.sources.JsonKafkaSource) HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TableExecutionContext(org.apache.hudi.utilities.deltastreamer.TableExecutionContext) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)

Example 3 with TableExecutionContext

Use of org.apache.hudi.utilities.deltastreamer.TableExecutionContext in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testCustomConfigProps.

@Test
public void testCustomConfigProps() throws IOException {
    // Build a multi-table streamer from the common props, using the test data
    // source and a schema-registry-backed schema provider.
    HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false, SchemaRegistryProvider.class);
    HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    // Exactly two tables should be discovered from the props file.
    assertEquals(2, streamer.getTableExecutionContexts().size());
    TableExecutionContext uberContext = streamer.getTableExecutionContexts().get(1);
    // Table-level config resolved for the uber table.
    assertEquals(dfsBasePath + "/multi_table_dataset/uber_db/dummy_table_uber", uberContext.getConfig().targetBasePath);
    assertEquals("uber_db.dummy_table_uber", uberContext.getConfig().targetTableName);
    // Per-table properties layered on top of the common ones.
    assertEquals("topic1", uberContext.getProperties().getString(HoodieMultiTableDeltaStreamer.Constants.KAFKA_TOPIC_PROP));
    assertEquals("_row_key", uberContext.getProperties().getString(DataSourceWriteOptions.RECORDKEY_FIELD().key()));
    assertEquals(TestHoodieDeltaStreamer.TestGenerator.class.getName(), uberContext.getProperties().getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key()));
    assertEquals("uber_hive_dummy_table", uberContext.getProperties().getString(HoodieMultiTableDeltaStreamer.Constants.HIVE_SYNC_TABLE_PROP));
    // Each table resolves its own schema registry URL (subject differs per table).
    assertEquals("http://localhost:8081/subjects/random-value/versions/latest", uberContext.getProperties().getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP));
    assertEquals("http://localhost:8081/subjects/topic2-value/versions/latest", streamer.getTableExecutionContexts().get(0).getProperties().getString(SchemaRegistryProvider.Config.SRC_SCHEMA_REGISTRY_URL_PROP));
}
Also used : HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TableExecutionContext(org.apache.hudi.utilities.deltastreamer.TableExecutionContext) TestDataSource(org.apache.hudi.utilities.sources.TestDataSource) Test(org.junit.jupiter.api.Test)

Example 4 with TableExecutionContext

Use of org.apache.hudi.utilities.deltastreamer.TableExecutionContext in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testTableLevelProperties.

@Test
public void testTableLevelProperties() throws IOException {
    HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", TestDataSource.class.getName(), false, false, null);
    HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
    // Each table resolves a key generator: the short-trip table carries a
    // table-level override, every other table falls back to the default one.
    for (TableExecutionContext context : streamer.getTableExecutionContexts()) {
        String keyGeneratorClass = context.getProperties().getString(DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME().key());
        if (context.getTableName().equals("dummy_table_short_trip")) {
            assertEquals(TestHoodieDeltaStreamer.TestTableLevelGenerator.class.getName(), keyGeneratorClass);
        } else {
            assertEquals(TestHoodieDeltaStreamer.TestGenerator.class.getName(), keyGeneratorClass);
        }
    }
}
Also used : HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TableExecutionContext(org.apache.hudi.utilities.deltastreamer.TableExecutionContext) TestDataSource(org.apache.hudi.utilities.sources.TestDataSource) Test(org.junit.jupiter.api.Test)

Aggregations

HoodieMultiTableDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer)4 TableExecutionContext (org.apache.hudi.utilities.deltastreamer.TableExecutionContext)4 Test (org.junit.jupiter.api.Test)4 TestDataSource (org.apache.hudi.utilities.sources.TestDataSource)2 TypedProperties (org.apache.hudi.common.config.TypedProperties)1 HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)1 JsonKafkaSource (org.apache.hudi.utilities.sources.JsonKafkaSource)1 ParquetDFSSource (org.apache.hudi.utilities.sources.ParquetDFSSource)1