Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in project hudi by apache.
Class TestHoodieMultiTableDeltaStreamer, method testMultiTableExecutionWithParquetSource.
@Test
public void testMultiTableExecutionWithParquetSource() throws IOException {
  // ingest test data into 2 parquet source paths
  String parquetSourceRoot1 = dfsBasePath + "/parquetSrcPath1/";
  prepareParquetDFSFiles(10, parquetSourceRoot1);
  String parquetSourceRoot2 = dfsBasePath + "/parquetSrcPath2/";
  prepareParquetDFSFiles(5, parquetSourceRoot2);
  // add only common props here; per-table props are added further below
  String parquetPropsFile = populateCommonPropsAndWriteToFile();
  HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(parquetPropsFile, dfsBasePath + "/config",
      ParquetDFSSource.class.getName(), false, false, false, "multi_table_parquet", null);
  HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
  List<TableExecutionContext> executionContexts = streamer.getTableExecutionContexts();
  // fetch per-table properties and point each table at its parquet source path
  ingestPerParquetSourceProps(executionContexts, Arrays.asList(parquetSourceRoot1, parquetSourceRoot2));
  String targetBasePath1 = executionContexts.get(0).getConfig().targetBasePath;
  String targetBasePath2 = executionContexts.get(1).getConfig().targetBasePath;
  // sync and verify the initial ingestion
  syncAndVerify(streamer, targetBasePath1, targetBasePath2, 10, 5);
  int totalTable1Records = 10;
  int totalTable2Records = 5;
  // ingest multiple rounds and verify the running totals
  for (int i = 0; i < 3; i++) {
    int table1Records = 10 + RANDOM.nextInt(100);
    int table2Records = 15 + RANDOM.nextInt(100);
    prepareParquetDFSFiles(table1Records, parquetSourceRoot1, (i + 2) + ".parquet", false, null, null);
    prepareParquetDFSFiles(table2Records, parquetSourceRoot2, (i + 2) + ".parquet", false, null, null);
    totalTable1Records += table1Records;
    totalTable2Records += table2Records;
    // sync and verify
    syncAndVerify(streamer, targetBasePath1, targetBasePath2, totalTable1Records, totalTable2Records);
  }
}
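The per-table wiring done by ingestPerParquetSourceProps is not shown in the snippet. As a rough sketch only, a helper along these lines could set each table's parquet source root on its execution context; the helper name is hypothetical, and hoodie.deltastreamer.source.dfs.root is assumed here as the root-path property read by DFS-backed sources such as ParquetDFSSource:

// Hypothetical helper (not part of the snippet above): point each table's
// DFS source at its own source root via the root-path property.
private static void setParquetSourceRoots(List<TableExecutionContext> contexts, List<String> sourceRoots) {
  for (int i = 0; i < contexts.size(); i++) {
    TypedProperties props = contexts.get(i).getProperties();
    props.setProperty("hoodie.deltastreamer.source.dfs.root", sourceRoots.get(i));
    contexts.get(i).setProperties(props);
  }
}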
Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in project hudi by apache.
Class TestHoodieMultiTableDeltaStreamer, method testMultiTableExecutionWithKafkaSource.
@Test
public void testMultiTableExecutionWithKafkaSource() throws IOException {
  // create topics for each table
  String topicName1 = "topic" + testNum++;
  String topicName2 = "topic" + testNum;
  testUtils.createTopic(topicName1, 2);
  testUtils.createTopic(topicName2, 2);
  HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
  testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
  testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateInsertsAsPerSchema("000", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
  HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config",
      JsonKafkaSource.class.getName(), false, false, null);
  HoodieMultiTableDeltaStreamer streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
  List<TableExecutionContext> executionContexts = streamer.getTableExecutionContexts();
  // wire the second table to topic 2 and its schema files
  TypedProperties properties = executionContexts.get(1).getProperties();
  properties.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_uber.avsc");
  properties.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_uber.avsc");
  properties.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName2);
  executionContexts.get(1).setProperties(properties);
  // wire the first table to topic 1 and its schema files
  TypedProperties properties1 = executionContexts.get(0).getProperties();
  properties1.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source_short_trip_uber.avsc");
  properties1.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/target_short_trip_uber.avsc");
  properties1.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName1);
  executionContexts.get(0).setProperties(properties1);
  String targetBasePath1 = executionContexts.get(0).getConfig().targetBasePath;
  String targetBasePath2 = executionContexts.get(1).getConfig().targetBasePath;
  streamer.sync();
  TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
  TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
  // send updates for the already existing records to the kafka topics
  testUtils.sendMessages(topicName1, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 5, HoodieTestDataGenerator.TRIP_SCHEMA)));
  testUtils.sendMessages(topicName2, Helpers.jsonifyRecords(dataGenerator.generateUpdatesAsPerSchema("001", 10, HoodieTestDataGenerator.SHORT_TRIP_SCHEMA)));
  streamer = new HoodieMultiTableDeltaStreamer(cfg, jsc);
  streamer.getTableExecutionContexts().get(1).setProperties(properties);
  streamer.getTableExecutionContexts().get(0).setProperties(properties1);
  streamer.sync();
  assertEquals(2, streamer.getSuccessTables().size());
  assertTrue(streamer.getFailedTables().isEmpty());
  // record counts stay the same since only updates were ingested
  TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(5, targetBasePath1 + "/*/*.parquet", sqlContext);
  TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(10, targetBasePath2 + "/*/*.parquet", sqlContext);
  testNum++;
}
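The two per-context property blocks above repeat the same three keys. A small helper could factor that out; this is a sketch only, and configureKafkaTable is a hypothetical name rather than part of the Hudi API (the property keys are taken verbatim from the test):

// Hypothetical helper: bind one table's execution context to a kafka topic
// and its avro schema files.
private static void configureKafkaTable(TableExecutionContext context, String topic,
    String sourceSchemaFile, String targetSchemaFile) {
  TypedProperties props = context.getProperties();
  props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", sourceSchemaFile);
  props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", targetSchemaFile);
  props.setProperty("hoodie.deltastreamer.source.kafka.topic", topic);
  context.setProperties(props);
}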
Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in project hudi by apache.
Class TestHoodieMultiTableDeltaStreamer, method testInvalidHiveSyncProps.
@Test
public void testInvalidHiveSyncProps() throws IOException {
  HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_HIVE_SYNC_TEST_SOURCE1, dfsBasePath + "/config",
      TestDataSource.class.getName(), true, true, null);
  Exception e = assertThrows(HoodieException.class, () -> {
    new HoodieMultiTableDeltaStreamer(cfg, jsc);
  }, "Should fail when hive sync is enabled but no hive sync table is provided");
  log.debug("Expected error when creating table execution objects", e);
  assertTrue(e.getMessage().contains("Meta sync table field not provided!"));
}
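For context, the constructor check asserted here fires when meta/hive sync is enabled but the per-table sync table name is absent. A property along these lines in the table's config should satisfy it; hoodie.datasource.hive_sync.table is the standard hive sync table option, while the value below is purely illustrative:

// Illustrative fix (not from the test): supply a hive sync table name so the
// "Meta sync table field not provided!" check passes.
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.datasource.hive_sync.table", "target_table1"); // illustrative value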
Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in project hudi by apache.
Class TestHoodieMultiTableDeltaStreamer, method testInvalidTableConfigFilePath.
@Test
public void testInvalidTableConfigFilePath() throws IOException {
  HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_INVALID_TABLE_CONFIG_FILE, dfsBasePath + "/config",
      TestDataSource.class.getName(), true, true, null);
  Exception e = assertThrows(IllegalArgumentException.class, () -> {
    new HoodieMultiTableDeltaStreamer(cfg, jsc);
  }, "Should fail when an invalid table config props file path is provided");
  log.debug("Expected error when creating table execution objects", e);
  assertTrue(e.getMessage().contains("Please provide valid table config file path!"));
}
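The invalid path here arrives through the streamer config built by TestHelpers.getConfig. As a sketch of the plain-Java equivalent, assuming Config exposes the propsFilePath and configFolder fields behind the --props and --config-folder CLI arguments (the path value is illustrative):

// Sketch: constructing the config directly instead of via TestHelpers.getConfig.
HoodieMultiTableDeltaStreamer.Config cfg = new HoodieMultiTableDeltaStreamer.Config();
cfg.propsFilePath = dfsBasePath + "/non_existent_table_config.properties"; // invalid on purpose
cfg.configFolder = dfsBasePath + "/config";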
Use of org.apache.hudi.utilities.deltastreamer.HoodieMultiTableDeltaStreamer in project hudi by apache.
Class TestHoodieMultiTableDeltaStreamer, method testInvalidIngestionProps.
@Test
@Disabled
public void testInvalidIngestionProps() {
  Exception e = assertThrows(Exception.class, () -> {
    HoodieMultiTableDeltaStreamer.Config cfg = TestHelpers.getConfig(PROPS_FILENAME_TEST_SOURCE1, dfsBasePath + "/config",
        TestDataSource.class.getName(), true, true, null);
    new HoodieMultiTableDeltaStreamer(cfg, jsc);
  }, "Creation of the execution object should fail without a kafka topic");
  log.debug("Creation of execution object failed with error: " + e.getMessage(), e);
  assertTrue(e.getMessage().contains("Please provide valid table config arguments!"));
}
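This (currently @Disabled) test expects construction to fail when a table's ingestion props lack a kafka topic. Supplying the topic key used in the kafka test above would be the corresponding fix; the value here is illustrative:

// Illustrative: the per-table property whose absence this test expects to reject.
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.source.kafka.topic", "topic_for_table1"); // illustrative value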