Search in sources :

Example 1 with HoodieMultiTableDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testMultiTableExecutionWithParquetSource.

/**
 * Runs the multi-table delta streamer over two parquet source directories and verifies the
 * expected record totals for both target tables: once after the initial sync, and again after
 * each of three additional rounds that add a randomly sized parquet file per source.
 *
 * @throws IOException declared by the test signature; propagated from the helpers it calls
 */
@Test
public void testMultiTableExecutionWithParquetSource() throws IOException {
    // Write the first batch of test data into each of the two parquet source paths.
    final String firstSourceDir = dfsBasePath + "/parquetSrcPath1/";
    prepareParquetDFSFiles(10, firstSourceDir);
    final String secondSourceDir = dfsBasePath + "/parquetSrcPath2/";
    prepareParquetDFSFiles(5, secondSourceDir);

    // Only the common props go into the file; per-table props are added further down.
    final String commonPropsFile = populateCommonPropsAndWriteToFile();
    final HoodieMultiTableDeltaStreamer.Config streamerConfig = TestHelpers.getConfig(
        commonPropsFile,
        dfsBasePath + "/config",
        ParquetDFSSource.class.getName(),
        false,
        false,
        false,
        "multi_table_parquet",
        null);
    final HoodieMultiTableDeltaStreamer multiTableStreamer = new HoodieMultiTableDeltaStreamer(streamerConfig, jsc);
    final List<TableExecutionContext> contexts = multiTableStreamer.getTableExecutionContexts();

    // Attach the per-table properties, one parquet source root per execution context.
    ingestPerParquetSourceProps(contexts, Arrays.asList(firstSourceDir, secondSourceDir));
    final String firstTargetPath = contexts.get(0).getConfig().targetBasePath;
    final String secondTargetPath = contexts.get(1).getConfig().targetBasePath;

    // Running totals start at the sizes of the initial batches written above.
    int expectedFirstTableRecords = 10;
    int expectedSecondTableRecords = 5;
    syncAndVerify(multiTableStreamer, firstTargetPath, secondTargetPath, expectedFirstTableRecords, expectedSecondTableRecords);

    // Three more rounds; the new files are named 2.parquet, 3.parquet and 4.parquet.
    for (int fileNumber = 2; fileNumber < 5; fileNumber++) {
        final int firstBatchSize = 10 + RANDOM.nextInt(100);
        final int secondBatchSize = 15 + RANDOM.nextInt(100);
        final String fileName = fileNumber + ".parquet";
        prepareParquetDFSFiles(firstBatchSize, firstSourceDir, fileName, false, null, null);
        prepareParquetDFSFiles(secondBatchSize, secondSourceDir, fileName, false, null, null);
        expectedFirstTableRecords += firstBatchSize;
        expectedSecondTableRecords += secondBatchSize;
        syncAndVerify(multiTableStreamer, firstTargetPath, secondTargetPath, expectedFirstTableRecords, expectedSecondTableRecords);
    }
}
Also used : HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TableExecutionContext(org.apache.hudi.utilities.deltastreamer.TableExecutionContext) ParquetDFSSource(org.apache.hudi.utilities.sources.ParquetDFSSource) Test(org.junit.jupiter.api.Test)

Example 2 with HoodieMultiTableDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testMultiTableExecutionWithKafkaSource.

// 0 corresponds to fg
/**
 * Runs the multi-table delta streamer against two Kafka topics, one per table.
 *
 * <p>Sends 5 and 10 inserts to the two topics, syncs, and asserts those record counts in the
 * target tables. Then sends updates for the same records, syncs with a freshly constructed
 * streamer, and asserts that both tables succeeded, none failed, and the counts are unchanged.
 *
 * @throws IOException declared by the test signature; propagated from the helpers it calls
 */
@Test
public void testMultiTableExecutionWithKafkaSource() throws IOException {
    final String sourceSchemaKey = "hoodie.deltastreamer.schemaprovider.source.schema.file";
    final String targetSchemaKey = "hoodie.deltastreamer.schemaprovider.target.schema.file";
    final String kafkaTopicKey = "hoodie.deltastreamer.source.kafka.topic";

    // One topic per table; the shared counter is bumped once here and once more at the end.
    final String firstTopic = "topic" + testNum++;
    final String secondTopic = "topic" + testNum;
    testUtils.createTopic(firstTopic, 2);
    testUtils.createTopic(secondTopic, 2);

    final HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
    testUtils.sendMessages(firstTopic,
        Helpers.jsonifyRecords(generator.generateInsertsAsPerSchema("000", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(secondTopic,
        Helpers.jsonifyRecords(generator.generateInsertsAsPerSchema("000", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));

    final HoodieMultiTableDeltaStreamer.Config streamerConfig = TestHelpers.getConfig(
        PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config", JsonKafkaSource.class.getName(), false, false, null);
    HoodieMultiTableDeltaStreamer firstRunStreamer = new HoodieMultiTableDeltaStreamer(streamerConfig, jsc);
    final List<TableExecutionContext> contexts = firstRunStreamer.getTableExecutionContexts();

    // Context at index 1: uber schema files, reading from the second topic.
    final TableExecutionContext contextAtOne = contexts.get(1);
    final TypedProperties propsForIndexOne = contextAtOne.getProperties();
    propsForIndexOne.setProperty(sourceSchemaKey, dfsBasePath + "/source_uber.avsc");
    propsForIndexOne.setProperty(targetSchemaKey, dfsBasePath + "/target_uber.avsc");
    propsForIndexOne.setProperty(kafkaTopicKey, secondTopic);
    contextAtOne.setProperties(propsForIndexOne);

    // Context at index 0: short-trip uber schema files, reading from the first topic.
    final TableExecutionContext contextAtZero = contexts.get(0);
    final TypedProperties propsForIndexZero = contextAtZero.getProperties();
    propsForIndexZero.setProperty(sourceSchemaKey, dfsBasePath + "/source_short_trip_uber.avsc");
    propsForIndexZero.setProperty(targetSchemaKey, dfsBasePath + "/target_short_trip_uber.avsc");
    propsForIndexZero.setProperty(kafkaTopicKey, firstTopic);
    contextAtZero.setProperties(propsForIndexZero);

    final String firstTableFiles = contexts.get(0).getConfig().targetBasePath + "/*/*.parquet";
    final String secondTableFiles = contexts.get(1).getConfig().targetBasePath + "/*/*.parquet";

    firstRunStreamer.sync();
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, firstTableFiles, sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, secondTableFiles, sqlContext);

    // Publish updates for the records that already exist in the Kafka topics.
    testUtils.sendMessages(firstTopic,
        Helpers.jsonifyRecords(generator.generateUpdatesAsPerSchema("001", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
    testUtils.sendMessages(secondTopic,
        Helpers.jsonifyRecords(generator.generateUpdatesAsPerSchema("001", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));

    // A new streamer is built from the same config and given the same per-table properties.
    final HoodieMultiTableDeltaStreamer secondRunStreamer = new HoodieMultiTableDeltaStreamer(streamerConfig, jsc);
    secondRunStreamer.getTableExecutionContexts().get(1).setProperties(propsForIndexOne);
    secondRunStreamer.getTableExecutionContexts().get(0).setProperties(propsForIndexZero);
    secondRunStreamer.sync();

    assertEquals(2, secondRunStreamer.getSuccessTables().size());
    assertTrue(secondRunStreamer.getFailedTables().isEmpty());

    // The asserted record counts are the same as after the first sync.
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, firstTableFiles, sqlContext);
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, secondTableFiles, sqlContext);
    testNum++;
}
Also used : JsonKafkaSource(org.apache.hudi.utilities.sources.JsonKafkaSource) HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TableExecutionContext(org.apache.hudi.utilities.deltastreamer.TableExecutionContext) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)

Example 3 with HoodieMultiTableDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testInvalidHiveSyncProps.

/**
 * Verifies that constructing the multi-table delta streamer from the invalid hive-sync props
 * file throws a {@code HoodieException} whose message mentions the missing meta sync table.
 *
 * @throws IOException declared by the test signature; propagated from building the config
 */
@Test
public void testInvalidHiveSyncProps() throws IOException {
    final HoodieMultiTableDeltaStreamer.Config streamerConfig = TestHelpers.getConfig(
        PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1,
        dfsBasePath + "/config",
        TestDataSource.class.getName(),
        true,
        true,
        null);

    // Only the constructor call is expected to throw; the config itself is built above.
    final Exception thrown = assertThrows(
        HoodieException.class,
        () -> new HoodieMultiTableDeltaStreamer(streamerConfig, jsc),
        "Should fail when hive sync table not provided with enableHiveSync flag");

    log.debug("Expected error when creating table execution objects", thrown);
    final String failureMessage = thrown.getMessage();
    assertTrue(failureMessage.contains("Meta sync table field not provided!"));
}
Also used : HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TestDataSource(org.apache.hudi.utilities.sources.TestDataSource) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) Test(org.junit.jupiter.api.Test)

Example 4 with HoodieMultiTableDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testInvalidTableConfigFilePath.

/**
 * Verifies that constructing the multi-table delta streamer from the props file with an
 * invalid table config path throws an {@code IllegalArgumentException} asking for a valid
 * table config file path.
 *
 * @throws IOException declared by the test signature; propagated from building the config
 */
@Test
public void testInvalidTableConfigFilePath() throws IOException {
    final HoodieMultiTableDeltaStreamer.Config streamerConfig = TestHelpers.getConfig(
        PROPS_INVALID_TABLE_CONFIG_FILE,
        dfsBasePath + "/config",
        TestDataSource.class.getName(),
        true,
        true,
        null);

    // Only the constructor call is expected to throw; the config itself is built above.
    final Exception thrown = assertThrows(
        IllegalArgumentException.class,
        () -> new HoodieMultiTableDeltaStreamer(streamerConfig, jsc),
        "Should fail when invalid table config props file path is provided");

    log.debug("Expected error when creating table execution objects", thrown);
    final String failureMessage = thrown.getMessage();
    assertTrue(failureMessage.contains("Please provide valid table config file path!"));
}
Also used : HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TestDataSource(org.apache.hudi.utilities.sources.TestDataSource) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) Test(org.junit.jupiter.api.Test)

Example 5 with HoodieMultiTableDeltaStreamer

Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in the Apache Hudi project.

From the class TestHoodieMultiTableDeltaStreamer, method testInvalidIngestionProps.

/**
 * Currently disabled. Expects that building the config and constructing the multi-table delta
 * streamer throws, with a message asking for valid table config arguments.
 */
@Test
@Disabled
public void testInvalidIngestionProps() {
    // Both the config creation and the constructor run inside the lambda, so a failure in
    // either one satisfies assertThrows.
    final Exception thrown = assertThrows(
        Exception.class,
        () -> {
            HoodieMultiTableDeltaStreamer.Config streamerConfig = TestHelpers.getConfig(
                PROPS_FILENAME_TEST_SOURCE1,
                dfsBasePath + "/config",
                TestDataSource.class.getName(),
                true,
                true,
                null);
            new HoodieMultiTableDeltaStreamer(streamerConfig, jsc);
        },
        "Creation of execution object should fail without kafka topic");

    final String failureMessage = thrown.getMessage();
    log.debug("Creation of execution object failed with error: " + failureMessage, thrown);
    assertTrue(failureMessage.contains("Please provide valid table config arguments!"));
}
Also used : HoodieMultiTableDeltaStreamer(org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer) TestDataSource(org.apache.hudi.utilities.sources.TestDataSource) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) Test(org.junit.jupiter.api.Test) Disabled(org.junit.jupiter.api.Disabled)

Aggregations

HoodieMultiTableDeltaStreamer (org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer)8 Test (org.junit.jupiter.api.Test)8 TestDataSource (org.apache.hudi.utilities.sources.TestDataSource)6 IOException (java.io.IOException)4 HoodieException (org.apache.hudi.exception.HoodieException)4 TableExecutionContext (org.apache.hudi.utilities.deltastreamer.TableExecutionContext)4 TypedProperties (org.apache.hudi.common.config.TypedProperties)1 HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)1 JsonKafkaSource (org.apache.hudi.utilities.sources.JsonKafkaSource)1 ParquetDFSSource (org.apache.hudi.utilities.sources.ParquetDFSSource)1 Disabled (org.junit.jupiter.api.Disabled)1