Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
The class TestHoodieDeltaStreamer, method testUpsertsContinuousMode.
private void testUpsertsContinuousMode(HoodieTableType tableType, String tempDir) throws Exception {
  String tableBasePath = dfsBasePath + "/" + tempDir;
  // Keep it higher than batch-size to test continuous mode
  int totalRecords = 3000;
  // Initial bulk insert
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.UPSERT);
  cfg.continuousMode = true;
  cfg.tableType = tableType.name();
  cfg.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
  cfg.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
  HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
  deltaStreamerTestRunner(ds, cfg, (r) -> {
    if (tableType.equals(HoodieTableType.MERGE_ON_READ)) {
      TestHelpers.assertAtleastNDeltaCommits(5, tableBasePath, dfs);
      TestHelpers.assertAtleastNCompactionCommits(2, tableBasePath, dfs);
    } else {
      TestHelpers.assertAtleastNCompactionCommits(5, tableBasePath, dfs);
    }
    TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
    TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
    return true;
  });
}
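The continuous-mode assertions above are driven by deltaStreamerTestRunner, which is not part of this excerpt. Below is a minimal sketch of what such a runner could look like, assuming JUnit 5 assertions and a shutdownGracefully() hook on HoodieDeltaStreamer; the method name, timeout, and polling interval are illustrative only, not the project's actual helper.

// Sketch only: requires java.util.concurrent.* and java.util.function.Function.
static void deltaStreamerTestRunnerSketch(HoodieDeltaStreamer ds, HoodieDeltaStreamer.Config cfg,
    Function<Boolean, Boolean> condition) throws Exception {
  ExecutorService executor = Executors.newSingleThreadExecutor();
  // With cfg.continuousMode = true, sync() keeps ingesting until the streamer is shut down
  executor.submit(() -> {
    try {
      ds.sync();
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  });
  try {
    // Poll until the condition's assertions stop throwing, or give up after a deadline
    boolean passed = false;
    long deadline = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(5);
    while (!passed && System.currentTimeMillis() < deadline) {
      try {
        passed = condition.apply(true);
      } catch (AssertionError e) {
        Thread.sleep(3000); // not enough commits yet; wait and re-check
      }
    }
    assertTrue(passed, "Continuous-mode assertions did not pass before the timeout");
  } finally {
    ds.shutdownGracefully(); // assumed shutdown hook on HoodieDeltaStreamer
    executor.shutdownNow();
  }
}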
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
The class TestHoodieDeltaStreamer, method testFilterDupes.
@Test
public void testFilterDupes() throws Exception {
  String tableBasePath = dfsBasePath + "/test_dupes_table";
  // Initial bulk insert
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT);
  new HoodieDeltaStreamer(cfg, jsc).sync();
  TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);

  // Generate the same 1000 records + 1000 new ones for upsert
  cfg.filterDupes = true;
  cfg.sourceLimit = 2000;
  cfg.operation = WriteOperationType.INSERT;
  new HoodieDeltaStreamer(cfg, jsc).sync();
  TestHelpers.assertRecordCount(2000, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
  // 1000 records for commit 00000 & 1000 for commit 00001
  List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
  assertEquals(1000, counts.get(0).getLong(1));
  assertEquals(1000, counts.get(1).getLong(1));

  // Test with empty commits
  HoodieTableMetaClient mClient = HoodieTableMetaClient.builder()
      .setConf(jsc.hadoopConfiguration())
      .setBasePath(tableBasePath)
      .setLoadActiveTimelineOnLoad(true)
      .build();
  HoodieInstant lastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get();
  HoodieDeltaStreamer.Config cfg2 = TestHelpers.makeDropAllConfig(tableBasePath, WriteOperationType.UPSERT);
  cfg2.filterDupes = false;
  cfg2.sourceLimit = 2000;
  cfg2.operation = WriteOperationType.UPSERT;
  cfg2.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
  HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg2, jsc);
  ds2.sync();
  mClient = HoodieTableMetaClient.builder()
      .setConf(jsc.hadoopConfiguration())
      .setBasePath(tableBasePath)
      .setLoadActiveTimelineOnLoad(true)
      .build();
  HoodieInstant newLastFinished = mClient.getCommitsTimeline().filterCompletedInstants().lastInstant().get();
  assertTrue(HoodieTimeline.compareTimestamps(newLastFinished.getTimestamp(), HoodieTimeline.GREATER_THAN, lastFinished.getTimestamp()));
  // Ensure it is empty
  HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
      mClient.getActiveTimeline().getInstantDetails(newLastFinished).get(), HoodieCommitMetadata.class);
  System.out.println("New Commit Metadata=" + commitMetadata);
  assertTrue(commitMetadata.getPartitionToWriteStats().isEmpty());

  // Try UPSERT with filterDupes true. Expect exception
  cfg2.filterDupes = true;
  cfg2.operation = WriteOperationType.UPSERT;
  try {
    new HoodieDeltaStreamer(cfg2, jsc).sync();
  } catch (IllegalArgumentException e) {
    assertTrue(e.getMessage().contains("'--filter-dupes' needs to be disabled when '--op' is 'UPSERT' to ensure updates are not missed."));
  }
}
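Note that the final try/catch only checks the error message if the exception is actually thrown; if the UPSERT were accepted, the test would still pass. Since this test class already uses JUnit 5's assertThrows (see testHoodieIncrFallback below), the same expectation could be written so that a missing exception fails the test; a small variant:

cfg2.filterDupes = true;
cfg2.operation = WriteOperationType.UPSERT;
IllegalArgumentException e = assertThrows(IllegalArgumentException.class,
    () -> new HoodieDeltaStreamer(cfg2, jsc).sync());
assertTrue(e.getMessage().contains(
    "'--filter-dupes' needs to be disabled when '--op' is 'UPSERT' to ensure updates are not missed."));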
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
The class TestHoodieDeltaStreamer, method testHoodieIncrFallback.
@Test
public void testHoodieIncrFallback() throws Exception {
  String tableBasePath = dfsBasePath + "/incr_test_table";
  String downstreamTableBasePath = dfsBasePath + "/incr_test_downstream_table";
  insertInTable(tableBasePath, 1, WriteOperationType.BULK_INSERT);
  HoodieDeltaStreamer.Config downstreamCfg =
      TestHelpers.makeConfigForHudiIncrSrc(tableBasePath, downstreamTableBasePath, WriteOperationType.BULK_INSERT, true, null);
  new HoodieDeltaStreamer(downstreamCfg, jsc).sync();

  insertInTable(tableBasePath, 9, WriteOperationType.UPSERT);
  // No change as this fails with Path not exist error
  assertThrows(org.apache.spark.sql.AnalysisException.class, () -> new HoodieDeltaStreamer(downstreamCfg, jsc).sync());
  TestHelpers.assertRecordCount(1000, downstreamTableBasePath + "/*/*", sqlContext);

  if (downstreamCfg.configs == null) {
    downstreamCfg.configs = new ArrayList<>();
  }
  downstreamCfg.configs.add(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key() + "=true");
  // Adding this conf to make testing easier :)
  downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.num_instants=10");
  downstreamCfg.operation = WriteOperationType.UPSERT;
  new HoodieDeltaStreamer(downstreamCfg, jsc).sync();
  new HoodieDeltaStreamer(downstreamCfg, jsc).sync();

  long baseTableRecords = sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*.parquet").count();
  long downStreamTableRecords = sqlContext.read().format("org.apache.hudi").load(downstreamTableBasePath + "/*/*.parquet").count();
  assertEquals(baseTableRecords, downStreamTableRecords);
}
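TestHelpers.makeConfigForHudiIncrSrc is also not part of this excerpt. As a rough sketch of what such a helper has to wire together for HoodieIncrSource (the HoodieDeltaStreamer.Config field names are real; the hoodieincr path key, table name, and ordering field below are assumptions for illustration):

HoodieDeltaStreamer.Config downstreamCfg = new HoodieDeltaStreamer.Config();
downstreamCfg.targetBasePath = downstreamTableBasePath;
downstreamCfg.targetTableName = "incr_test_downstream_table";
downstreamCfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
downstreamCfg.sourceClassName = HoodieIncrSource.class.getName();
downstreamCfg.operation = WriteOperationType.BULK_INSERT;
downstreamCfg.sourceOrderingField = "timestamp";
downstreamCfg.configs = new ArrayList<>();
// Point the incremental source at the upstream Hudi table (assumed HoodieIncrSource key)
downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.path=" + tableBasePath);
// Consume several instants per run so the downstream table catches up quickly
downstreamCfg.configs.add("hoodie.deltastreamer.source.hoodieincr.num_instants=10");
// Fall back to a full table scan when the requested commit files no longer exist
downstreamCfg.configs.add(DataSourceReadOptions.INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES().key() + "=true");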
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
The class TestHoodieDeltaStreamer, method testPayloadClassUpdateWithCOWTable.
@Test
public void testPayloadClassUpdateWithCOWTable() throws Exception {
  String dataSetBasePath = dfsBasePath + "/test_dataset_cow";
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
      Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE,
      true, true, false, null, null);
  new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf()).sync();
  TestHelpers.assertRecordCount(1000, dataSetBasePath + "/*/*.parquet", sqlContext);

  // now create one more deltaStreamer instance and update payload class
  cfg = TestHelpers.makeConfig(dataSetBasePath, WriteOperationType.BULK_INSERT,
      Collections.singletonList(SqlQueryBasedTransformer.class.getName()), PROPS_FILENAME_TEST_SOURCE,
      true, true, true, DummyAvroPayload.class.getName(), null);
  new HoodieDeltaStreamer(cfg, jsc, dfs, hiveServer.getHiveConf());

  // now assert that hoodie.properties file does not have payload class prop since it is a COW table
  Properties props = new Properties();
  String metaPath = dataSetBasePath + "/.hoodie/hoodie.properties";
  FileSystem fs = FSUtils.getFs(cfg.targetBasePath, jsc.hadoopConfiguration());
  try (FSDataInputStream inputStream = fs.open(new Path(metaPath))) {
    props.load(inputStream);
  }
  assertFalse(props.containsKey(HoodieTableConfig.PAYLOAD_CLASS_NAME.key()));
}
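The same check can be made through the table's meta client instead of opening hoodie.properties by hand; a sketch, assuming HoodieTableConfig exposes its backing properties via getProps():

HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(jsc.hadoopConfiguration())
    .setBasePath(dataSetBasePath)
    .build();
// PAYLOAD_CLASS_NAME should be absent from the table config for a COW table
assertFalse(metaClient.getTableConfig().getProps().containsKey(HoodieTableConfig.PAYLOAD_CLASS_NAME.key()));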
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
The class TestHoodieDeltaStreamer, method testSqlSourceSource.
@Test
public void testSqlSourceSource() throws Exception {
  prepareSqlSource();
  String tableBasePath = dfsBasePath + "/test_sql_source_table" + testNum++;
  HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(
      TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT, SqlSource.class.getName(),
          Collections.emptyList(), PROPS_FILENAME_TEST_SQL_SOURCE, false, false, 1000, false, null, null,
          "timestamp", null, true),
      jsc);
  deltaStreamer.sync();
  TestHelpers.assertRecordCount(SQL_SOURCE_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
}
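prepareSqlSource is not shown in this excerpt either. A minimal sketch of the setup SqlSource needs (the view name, source path, property key, and helper calls below are assumptions, not the project's actual code):

// Register a Spark SQL view holding the records SqlSource should pull
sparkSession.read().format("org.apache.hudi").load(dfsBasePath + "/some_source_table")
    .createOrReplaceTempView("sql_source_input");
// Point SqlSource at that view through the properties file the delta streamer reads
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.deltastreamer.source.sql.sql.query", "SELECT * FROM sql_source_input");
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/" + PROPS_FILENAME_TEST_SQL_SOURCE);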