Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
Class TestHoodieDeltaStreamer, method testDeltaSyncWithPendingClustering.
@Test
public void testDeltaSyncWithPendingClustering() throws Exception {
  String tableBasePath = dfsBasePath + "/inlineClusteringPending";
  // ingest data
  int totalRecords = 2000;
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
  cfg.continuousMode = false;
  cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
  HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
  ds.sync();
  // assert ingest successful
  TestHelpers.assertAtLeastNCommits(1, tableBasePath, dfs);
  // schedule a clustering job to build a clustering plan and transition to inflight
  HoodieClusteringJob clusteringJob = initialHoodieClusteringJob(tableBasePath, null, false, "schedule");
  clusteringJob.cluster(0);
  HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build();
  List<HoodieInstant> hoodieClusteringInstants = meta.getActiveTimeline().filterPendingReplaceTimeline().getInstants().collect(Collectors.toList());
  HoodieInstant clusteringRequest = hoodieClusteringInstants.get(0);
  meta.getActiveTimeline().transitionReplaceRequestedToInflight(clusteringRequest, Option.empty());
  // do another ingestion with inline clustering enabled
  cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", ""));
  cfg.retryLastPendingInlineClusteringJob = true;
  HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg, jsc);
  ds2.sync();
  String completeClusteringTimeStamp = meta.reloadActiveTimeline().getCompletedReplaceTimeline().lastInstant().get().getTimestamp();
  assertEquals(clusteringRequest.getTimestamp(), completeClusteringTimeStamp);
  TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
  TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
}
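The timeline calls above are reusable on their own when deciding whether a table has clustering plans waiting to be retried. A minimal sketch, assuming a Hadoop configuration hadoopConf and a table basePath (both placeholders), that lists pending clustering (replace) instants the same way the test does:
// Sketch: list pending clustering (replace) instants on a table's active timeline.
// `hadoopConf` and `basePath` are assumed to be provided by the caller.
HoodieTableMetaClient meta = HoodieTableMetaClient.builder()
    .setConf(hadoopConf)
    .setBasePath(basePath)
    .build();
List<HoodieInstant> pendingClustering = meta.getActiveTimeline()
    .filterPendingReplaceTimeline()
    .getInstants()
    .collect(Collectors.toList());
pendingClustering.forEach(instant -> LOG.info("Pending clustering instant: " + instant.getTimestamp()));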
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
Class TestHoodieDeltaStreamer, method testBulkInsertsAndUpsertsWithBootstrap.
@Test
public void testBulkInsertsAndUpsertsWithBootstrap() throws Exception {
  String tableBasePath = dfsBasePath + "/test_table";
  // Initial bulk insert
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.BULK_INSERT);
  new HoodieDeltaStreamer(cfg, jsc).sync();
  TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
  // No new data => no commits.
  cfg.sourceLimit = 0;
  new HoodieDeltaStreamer(cfg, jsc).sync();
  TestHelpers.assertRecordCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertDistanceCount(1000, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertCommitMetadata("00000", tableBasePath, dfs, 1);
  // upsert() #1
  cfg.sourceLimit = 2000;
  cfg.operation = WriteOperationType.UPSERT;
  new HoodieDeltaStreamer(cfg, jsc).sync();
  TestHelpers.assertRecordCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertDistanceCount(1950, tableBasePath + "/*/*.parquet", sqlContext);
  TestHelpers.assertCommitMetadata("00001", tableBasePath, dfs, 2);
  List<Row> counts = TestHelpers.countsPerCommit(tableBasePath + "/*/*.parquet", sqlContext);
  assertEquals(1950, counts.stream().mapToLong(entry -> entry.getLong(1)).sum());
  // Perform bootstrap with tableBasePath as source
  String bootstrapSourcePath = dfsBasePath + "/src_bootstrapped";
  Dataset<Row> sourceDf = sqlContext.read().format("org.apache.hudi").load(tableBasePath + "/*/*.parquet");
  sourceDf.write().format("parquet").save(bootstrapSourcePath);
  String newDatasetBasePath = dfsBasePath + "/test_dataset_bootstrapped";
  cfg.runBootstrap = true;
  cfg.configs.add(String.format("hoodie.bootstrap.base.path=%s", bootstrapSourcePath));
  cfg.configs.add(String.format("hoodie.bootstrap.keygen.class=%s", SimpleKeyGenerator.class.getName()));
  cfg.configs.add("hoodie.bootstrap.parallelism=5");
  cfg.targetBasePath = newDatasetBasePath;
  new HoodieDeltaStreamer(cfg, jsc).sync();
  Dataset<Row> res = sqlContext.read().format("org.apache.hudi").load(newDatasetBasePath + "/*.parquet");
  LOG.info("Schema :");
  res.printSchema();
  TestHelpers.assertRecordCount(1950, newDatasetBasePath + "/*.parquet", sqlContext);
  res.registerTempTable("bootstrapped");
  assertEquals(1950, sqlContext.sql("select distinct _hoodie_record_key from bootstrapped").count());
  StructField[] fields = res.schema().fields();
  List<String> fieldNames = Arrays.asList(res.schema().fieldNames());
  List<String> expectedFieldNames = Arrays.asList(sourceDf.schema().fieldNames());
  assertEquals(expectedFieldNames.size(), fields.length);
  assertTrue(fieldNames.containsAll(HoodieRecord.HOODIE_META_COLUMNS));
  assertTrue(fieldNames.containsAll(expectedFieldNames));
}
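For reference, the bootstrap-specific wiring above can be expressed on a standalone HoodieDeltaStreamer.Config, as sketched below. Only runBootstrap, targetBasePath, tableType and configs appear in the test; targetTableName is an assumed field mirroring the --target-table CLI option, the paths are placeholders, and the source/properties settings that TestHelpers.makeConfig normally supplies are omitted, so this shows the bootstrap knobs rather than a complete job.
// Sketch only: bootstrap-related DeltaStreamer settings mirroring the test above.
// `targetTableName` is an assumed Config field; both paths are hypothetical.
HoodieDeltaStreamer.Config bootstrapCfg = new HoodieDeltaStreamer.Config();
bootstrapCfg.targetBasePath = "/tmp/test_dataset_bootstrapped";
bootstrapCfg.targetTableName = "test_dataset_bootstrapped";
bootstrapCfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
bootstrapCfg.runBootstrap = true;
bootstrapCfg.configs = new ArrayList<>(Arrays.asList(
    "hoodie.bootstrap.base.path=/tmp/src_bootstrapped",
    "hoodie.bootstrap.keygen.class=" + SimpleKeyGenerator.class.getName(),
    "hoodie.bootstrap.parallelism=5"));
// The source and props settings provided by TestHelpers.makeConfig would still be
// needed before calling new HoodieDeltaStreamer(bootstrapCfg, jsc).sync().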
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
Class TestHoodieDeltaStreamer, method testAsyncClusteringServiceWithCompaction.
@Test
public void testAsyncClusteringServiceWithCompaction() throws Exception {
  String tableBasePath = dfsBasePath + "/asyncClusteringCompaction";
  // Keep it higher than batch-size to test continuous mode
  int totalRecords = 2000;
  // Initial bulk insert
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
  cfg.continuousMode = true;
  cfg.tableType = HoodieTableType.MERGE_ON_READ.name();
  cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "", "", "true", "3"));
  HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
  deltaStreamerTestRunner(ds, cfg, (r) -> {
    TestHelpers.assertAtleastNCompactionCommits(2, tableBasePath, dfs);
    TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
    return true;
  });
  // There should be 4 commits, one of which should be a replace commit
  TestHelpers.assertAtLeastNCommits(4, tableBasePath, dfs);
  TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
  TestHelpers.assertDistinctRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
}
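The getAsyncServicesConfigs helper is not shown in this snippet; judging from its arguments here ("true" and "3" for async clustering), it presumably assembles standard writer configs along the lines of the sketch below. This is an assumption about the helper's output, not its actual implementation.
// Presumed shape of the async-clustering settings passed above: auto-clean off,
// async clustering on, schedule a clustering plan every 3 commits.
List<String> asyncClusteringConfigs = Arrays.asList(
    "hoodie.clean.automatic=false",
    "hoodie.clustering.async.enabled=true",
    "hoodie.clustering.async.max.commits=3");
cfg.configs.addAll(asyncClusteringConfigs);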
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
Class TestHoodieDeltaStreamer, method testJsonKafkaDFSSource.
@Test
public void testJsonKafkaDFSSource() throws Exception {
  topicName = "topic" + testNum;
  prepareJsonKafkaDFSFiles(JSON_KAFKA_NUM_RECORDS, true, topicName);
  prepareJsonKafkaDFSSource(PROPS_FILENAME_TEST_JSON_KAFKA, "earliest", topicName);
  String tableBasePath = dfsBasePath + "/test_json_kafka_table" + testNum;
  HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(
      tableBasePath, WriteOperationType.UPSERT, JsonKafkaSource.class.getName(),
      Collections.EMPTY_LIST, PROPS_FILENAME_TEST_JSON_KAFKA, false, true, 100000, false,
      null, null, "timestamp", null), jsc);
  deltaStreamer.sync();
  TestHelpers.assertRecordCount(JSON_KAFKA_NUM_RECORDS, tableBasePath + "/*/*.parquet", sqlContext);
  int totalRecords = JSON_KAFKA_NUM_RECORDS;
  int records = 10;
  totalRecords += records;
  prepareJsonKafkaDFSFiles(records, false, topicName);
  deltaStreamer.sync();
  TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext);
}
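The Kafka connection details live in the properties file referenced by PROPS_FILENAME_TEST_JSON_KAFKA, written by prepareJsonKafkaDFSSource. A minimal sketch of the kind of properties a JsonKafkaSource consumes; the broker address, field names and the exact contents of the test's props file are assumptions, not copied from the helpers:
// Illustrative JsonKafkaSource properties; all values are placeholders.
TypedProperties kafkaProps = new TypedProperties();
kafkaProps.setProperty("hoodie.deltastreamer.source.kafka.topic", topicName);
kafkaProps.setProperty("bootstrap.servers", "localhost:9092");
kafkaProps.setProperty("auto.offset.reset", "earliest");
kafkaProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
kafkaProps.setProperty("hoodie.datasource.write.partitionpath.field", "partition_path");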
Use of org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer in project hudi by apache.
Class TestHoodieDeltaStreamer, method testTableCreation.
@Test
public void testTableCreation() throws Exception {
  Exception e = assertThrows(TableNotFoundException.class, () -> {
    dfs.mkdirs(new Path(dfsBasePath + "/not_a_table"));
    HoodieDeltaStreamer deltaStreamer = new HoodieDeltaStreamer(TestHelpers.makeConfig(dfsBasePath + "/not_a_table", WriteOperationType.BULK_INSERT), jsc);
    deltaStreamer.sync();
  }, "Should error out when pointed at a dir that's not a table");
  // expected
  LOG.debug("Expected error during table creation", e);
}
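The same exception can be used defensively before launching a sync: probing a base path with the HoodieTableMetaClient builder (seen in the first snippet) tells you whether it already holds a Hudi table. A minimal sketch, with somePath as a placeholder:
// Probe for an existing Hudi table: the meta client builder throws
// TableNotFoundException when the base path has no .hoodie metadata directory.
boolean isHudiTable;
try {
  HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(somePath).build();
  isHudiTable = true;
} catch (TableNotFoundException e) {
  isHudiTable = false;
}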