use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
the class TestHoodieDeltaStreamer method testORCDFSSource.
private void testORCDFSSource(boolean useSchemaProvider, List<String> transformerClassNames) throws Exception {
  // prepare ORCDFSSource
  TypedProperties orcProps = new TypedProperties();
  // Properties used for testing delta-streamer with orc source
  orcProps.setProperty("include", "base.properties");
  orcProps.setProperty("hoodie.embed.timeline.server", "false");
  orcProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
  orcProps.setProperty("hoodie.datasource.write.partitionpath.field", "not_there");
  if (useSchemaProvider) {
    orcProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/" + "source.avsc");
    if (transformerClassNames != null) {
      orcProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/" + "target.avsc");
    }
  }
  orcProps.setProperty("hoodie.deltastreamer.source.dfs.root", ORC_SOURCE_ROOT);
  UtilitiesTestBase.Helpers.savePropsToDFS(orcProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_ORC);
  String tableBasePath = dfsBasePath + "/test_orc_source_table" + testNum;
  HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(
      TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, ORCDFSSource.class.getName(),
          transformerClassNames, PROPS_FILENAME_TEST_ORC, false, useSchemaProvider, 100000, false,
          null, null, "timestamp", null), jsc);
  deltaStreamer.sync();
  TestHelpers.assertRecordCount(ORC_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
  testNum++;
}
use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
the class TestHoodieDeltaStreamer method insertInTable.
private void insertInTable(String tableBasePath, int count, WriteOperationType operationType) throws Exception {
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, operationType,
      Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE, false);
  if (cfg.configs == null) {
    cfg.configs = new ArrayList<>();
  }
  cfg.configs.add("hoodie.cleaner.commits.retained=3");
  cfg.configs.add("hoodie.keep.min.commits=4");
  cfg.configs.add("hoodie.keep.max.commits=5");
  cfg.configs.add("hoodie.test.source.generate.inserts=true");
  for (int i = 0; i < count; i++) {
    new HoodieDeltaStreamer(cfg, jsc).sync();
  }
}
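For context, a hypothetical call site is shown below; the path, count, and operation type are illustrative. Running six syncs against the same table exceeds the thresholds configured above (3 retained commits, keep between 4 and 5 on the active timeline), so cleaning and archival are exercised along the way.

// Hypothetical usage: six back-to-back syncs push past the cleaner and
// archival thresholds set inside insertInTable.
insertInTable(dfsBasePath + "/test_table", 6, WriteOperationType.INSERT);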
use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
the class TestHoodieDeltaStreamer method testPayloadClassUpdate.
@Test
public void testPayloadClassUpdate() throws Exception {
  String dataSetBasePath = dfsBasePath + "/test_dataset_mor";
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
      Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE,
      true, true, false, null, "MERGE_ON_READ");
  new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
  TestHelpers.assertRecordCount(1000, dataSetBasePath + "/*/*.parquet", sqlContext);
  // now create one more deltaStreamer instance with an updated payload class;
  // note that constructing it is enough: no sync() is needed for hoodie.properties to be rewritten
  cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
      Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE,
      true, true, true, DummyAvroPayload.class.getName(), "MERGE_ON_READ");
  new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf());
  // assert that the hoodie.properties file has the updated payload class name
  Properties props = new Properties();
  String metaPath = dataSetBasePath + "/.hoodie/hoodie.properties";
  FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jsc.hadoopConfiguration());
  try (FSDataInputStream inputStream = fs.open(new Path(metaPath))) {
    props.load(inputStream);
  }
  assertEquals(new HoodieConfig(props).getString(HoodieTableConfig.PAYLOAD_CLASS_NAME), DummyAvroPayload.class.getName());
}
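DummyAvroPayload's definition is not shown on this page. A plausible sketch follows, assuming it uses the common test pattern of extending OverwriteWithLatestAvroPayload, Hudi's default payload class; treat the body as illustrative rather than the test's actual code.

// Plausible sketch of DummyAvroPayload (assumed): a custom payload typically
// just delegates to a built-in payload such as OverwriteWithLatestAvroPayload.
public static class DummyAvroPayload extends OverwriteWithLatestAvroPayload {
  public DummyAvroPayload(GenericRecord record, Comparable orderingVal) {
    super(record, orderingVal);
  }
}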
use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
the class TestHoodieDeltaStreamer method testDeltaStreamerTransitionFromParquetToKafkaSource.
/**
 * Tests Deltastreamer with a parquet dfs source and a transition to JsonKafkaSource.
 *
 * @param autoResetToLatest true if the auto reset value should be set to LATEST; false to leave it at the default (i.e. EARLIEST)
 * @throws Exception
 */
private void testDeltaStreamerTransitionFromParquetToKafkaSource(boolean autoResetToLatest) throws Exception {
  // prep parquet source
  PARQUET_SOURCE_ROOT = dfsBasePath + "/parquetFilesDfsToKafka" + testNum;
  int parquetRecords = 10;
  prepareParquetDFSFiles(parquetRecords, PARQUET_SOURCE_ROOT, FIRST_PARQUET_FILE_NAME, true, HoodieTestDataGenerator.TRIP_SCHEMA, HoodieTestDataGenerator.AVRO_TRIP_SCHEMA);
  prepareParquetDFSSource(true, false, "source_uber.avsc", "target_uber.avsc", PROPS_FILENAME_TEST_PARQUET, PARQUET_SOURCE_ROOT, false);
  // delta streamer w/ parquet source
  String tableBasePath = dfsBasePath + "/test_dfs_to_kafka" + testNum;
  HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(
      TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, ParquetDFSSource.class.getName(),
          Collections.EMPTY_LIST, PROPS_FILENAME_TEST_PARQUET, false, false, 100000, false,
          null, null, "timestamp", null), jsc);
  deltaStreamer.sync();
  TestHelpers.assertRecordCount(parquetRecords, tableBasePath + "/*/*.parquet", sqlContext);
  deltaStreamer.shutdownGracefully();
  // prep json kafka source
  topicName = "topic" + testNum;
  prepareJsonKafkaDFSFiles(JSON_KAFKA_NUM_RECORDS, true, topicName);
  prepareJsonKafkaDFSSource(PROPS_FILENAME_TEST_JSON_KAFKA, autoResetToLatest ? "latest" : "earliest", topicName);
  // delta streamer w/ json kafka source
  deltaStreamer = new HoodieDeltaStreamer(
      TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT, JsonKafkaSource.class.getName(),
          Collections.EMPTY_LIST, PROPS_FILENAME_TEST_JSON_KAFKA, false, true, 100000, false,
          null, null, "timestamp", null), jsc);
  deltaStreamer.sync();
  // if the auto reset value is set to LATEST, the kafka records published so far may not be synced
  int totalExpectedRecords = parquetRecords + (autoResetToLatest ? 0 : JSON_KAFKA_NUM_RECORDS);
  TestHelpers.assertRecordCount(totalExpectedRecords, tableBasePath + "/*/*.parquet", sqlContext);
  // publish and sync a 2nd batch to exercise the LATEST auto reset value
  prepareJsonKafkaDFSFiles(20, false, topicName);
  totalExpectedRecords += 20;
  deltaStreamer.sync();
  TestHelpers.assertRecordCount(totalExpectedRecords, tableBasePath + "/*/*.parquet", sqlContext);
  testNum++;
}
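prepareJsonKafkaDFSSource is a helper whose body is not shown on this page; presumably it writes a properties file analogous to the ORC one above, pointing the source at the Kafka topic. A hedged sketch, assuming Hudi's standard Kafka source key ("hoodie.deltastreamer.source.kafka.topic") plus the usual Kafka consumer settings; the broker address is illustrative:

// Hypothetical equivalent of prepareJsonKafkaDFSSource(...): the exact key set
// the helper writes is an assumption based on Hudi's Kafka source configuration.
TypedProperties kafkaProps = new TypedProperties();
kafkaProps.setProperty("include", "base.properties");
kafkaProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
kafkaProps.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName);
kafkaProps.setProperty("bootstrap.servers", "localhost:9092"); // illustrative broker
kafkaProps.setProperty("auto.offset.reset", autoResetToLatest ? "latest" : "earliest");
UtilitiesTestBase.Helpers.savePropsToDFS(kafkaProps, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_JSON_KAFKA);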
use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
the class TestHoodieDeltaStreamer method testAsyncClusteringServiceWithConflicts.
/**
 * When deltastreamer writes clash with pending clustering, deltastreamer should keep retrying and eventually succeed (once clustering completes)
 * without failing midway.
 *
 * @throws Exception
 */
@Test
public void testAsyncClusteringServiceWithConflicts() throws Exception {
  String tableBasePath = dfsBasePath + "/asyncClusteringWithConflicts";
  // Keep it higher than batch-size to test continuous mode
  int totalRecords = 2000;
  // Initial bulk insert
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT);
  cfg.continuousMode = true;
  cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
  cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "3"));
  HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
  deltaStreamerTestRunner(ds, cfg, (r) -> {
    TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
    return true;
  });
  // There should be at least 4 commits, at least one of which should be a replace commit
  TestHelpers.assertAtLeastNCommits(4, tableBasePath, dfs);
  TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
  TestHelpers.assertDistinctRecordCount(1900, tableBasePath + "/*/*.parquet", sqlContext);
}
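getAsyncServicesConfigs is not shown on this page; given the arguments ("true" and "3" in the clustering positions), a plausible expansion is sketched below using Hudi's standard async clustering keys. The exact key set the helper emits is an assumption.

// Plausible expansion of getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "3"):
// enable async clustering and schedule it every 3 commits. The helper's actual
// output may include additional source/test configs not shown here.
List<String> configs = new ArrayList<>();
configs.add("hoodie.clustering.async.enabled=true");
configs.add("hoodie.clustering.async.max.commits=3");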