Use of org.apache.hudi.utilities.HoodieClusteringJob in project hudi by apache.
From class SparkMain, method cluster():
private static int cluster(JavaSparkContext jsc, String basePath, String tableName, String clusteringInstant,
                           int parallelism, String sparkMemory, int retry, String runningMode,
                           String propsFilePath, List<String> configs) {
  HoodieClusteringJob.Config cfg = new HoodieClusteringJob.Config();
  cfg.basePath = basePath;
  cfg.tableName = tableName;
  cfg.clusteringInstantTime = clusteringInstant;
  cfg.parallelism = parallelism;
  cfg.runningMode = runningMode;
  cfg.propsFilePath = propsFilePath;
  cfg.configs = configs;
  jsc.getConf().set("spark.executor.memory", sparkMemory);
  return new HoodieClusteringJob(jsc, cfg).cluster(retry);
}
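For comparison, the same job can be driven without the CLI wrapper. The sketch below is a minimal standalone example, assuming an already-initialized JavaSparkContext; the base path, table name, and method name are illustrative placeholders, not values from the Hudi sources.

import org.apache.hudi.utilities.HoodieClusteringJob;
import org.apache.spark.api.java.JavaSparkContext;

// Sketch: schedule a clustering plan and execute it in one pass.
static int runClustering(JavaSparkContext jsc) {
  HoodieClusteringJob.Config cfg = new HoodieClusteringJob.Config();
  cfg.basePath = "/tmp/hudi/trips";         // placeholder table base path
  cfg.tableName = "trips";                  // placeholder table name
  cfg.runningMode = "scheduleAndExecute";   // one of: schedule, execute, scheduleAndExecute
  cfg.parallelism = 2;
  return new HoodieClusteringJob(jsc, cfg).cluster(1); // allow one retry; 0 means success
}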
Use of org.apache.hudi.utilities.HoodieClusteringJob in project hudi by apache.
From class TestHoodieDeltaStreamer, method testHoodieAsyncClusteringJob():
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testHoodieAsyncClusteringJob(boolean shouldPassInClusteringInstantTime) throws Exception {
  String tableBasePath = dfsBasePath + "/asyncClusteringJob";
  HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "true");
  deltaStreamerTestRunner(ds, (r) -> {
    TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
    Option<String> scheduleClusteringInstantTime = Option.empty();
    try {
      HoodieClusteringJob scheduleClusteringJob = initialHoodieClusteringJob(tableBasePath, null, true, null);
      scheduleClusteringInstantTime = scheduleClusteringJob.doSchedule();
    } catch (Exception e) {
      LOG.warn("Schedule clustering failed", e);
      return false;
    }
    if (scheduleClusteringInstantTime.isPresent()) {
      LOG.info("Schedule clustering success, now cluster with instant time " + scheduleClusteringInstantTime.get());
      HoodieClusteringJob.Config clusterClusteringConfig = buildHoodieClusteringUtilConfig(tableBasePath,
          shouldPassInClusteringInstantTime ? scheduleClusteringInstantTime.get() : null, false);
      HoodieClusteringJob clusterClusteringJob = new HoodieClusteringJob(jsc, clusterClusteringConfig);
      clusterClusteringJob.cluster(clusterClusteringConfig.retry);
      LOG.info("Cluster success");
    } else {
      LOG.warn("Schedule clustering failed");
    }
    TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs);
    return true;
  });
}
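The helpers initialHoodieClusteringJob and buildHoodieClusteringUtilConfig are defined elsewhere in the test class. A plausible reconstruction of the config builder, restricted to the Config fields that appear in these snippets, might look like the following; the boolean-to-running-mode mapping is an assumption about the helper's contract, not the actual Hudi source.

// Hypothetical sketch of the test helper used above.
private HoodieClusteringJob.Config buildHoodieClusteringUtilConfig(String basePath, String clusteringInstantTime, boolean runSchedule) {
  HoodieClusteringJob.Config config = new HoodieClusteringJob.Config();
  config.basePath = basePath;
  // May be null: the false branch of the test relies on the job finding a pending plan on its own.
  config.clusteringInstantTime = clusteringInstantTime;
  config.runningMode = runSchedule ? HoodieClusteringJob.SCHEDULE : HoodieClusteringJob.EXECUTE;
  return config;
}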
Use of org.apache.hudi.utilities.HoodieClusteringJob in project hudi by apache.
From class TestHoodieDeltaStreamer, method testHoodieAsyncClusteringJobWithScheduleAndExecute():
@ParameterizedTest
@ValueSource(strings = {"execute", "schedule", "scheduleAndExecute"})
public void testHoodieAsyncClusteringJobWithScheduleAndExecute(String runningMode) throws Exception {
  String tableBasePath = dfsBasePath + "/asyncClustering2";
  HoodieDeltaStreamer ds = initialHoodieDeltaStreamer(tableBasePath, 3000, "false");
  HoodieClusteringJob scheduleClusteringJob = initialHoodieClusteringJob(tableBasePath, null, true, runningMode);
  deltaStreamerTestRunner(ds, (r) -> {
    TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
    try {
      int result = scheduleClusteringJob.cluster(0);
      if (result == 0) {
        LOG.info("Cluster success");
      } else {
        LOG.warn("Cluster failed");
        if (!runningMode.equalsIgnoreCase(HoodieClusteringJob.EXECUTE)) {
          return false;
        }
      }
    } catch (Exception e) {
      LOG.warn("ScheduleAndExecute clustering failed", e);
      if (!runningMode.equalsIgnoreCase(HoodieClusteringJob.EXECUTE)) {
        return false;
      }
    }
    switch (runningMode.toLowerCase()) {
      case HoodieClusteringJob.SCHEDULE_AND_EXECUTE: {
        TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs);
        return true;
      }
      case HoodieClusteringJob.SCHEDULE: {
        TestHelpers.assertAtLeastNReplaceRequests(2, tableBasePath, dfs);
        TestHelpers.assertNoReplaceCommits(tableBasePath, dfs);
        return true;
      }
      case HoodieClusteringJob.EXECUTE: {
        TestHelpers.assertNoReplaceCommits(tableBasePath, dfs);
        return true;
      }
      default:
        throw new IllegalStateException("Unexpected value: " + runningMode);
    }
  });
}
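The same schedule/execute split can also be driven in two explicit steps, which makes the timeline effect of each mode visible. A minimal sketch, assuming an existing JavaSparkContext and using a placeholder base path:

import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.HoodieClusteringJob;

// Step 1: schedule only; this leaves a pending replacecommit on the timeline.
HoodieClusteringJob.Config scheduleCfg = new HoodieClusteringJob.Config();
scheduleCfg.basePath = "/tmp/hudi/trips";   // placeholder base path
scheduleCfg.runningMode = HoodieClusteringJob.SCHEDULE;
Option<String> instant = new HoodieClusteringJob(jsc, scheduleCfg).doSchedule();

// Step 2: execute the plan scheduled above by its instant time.
if (instant.isPresent()) {
  HoodieClusteringJob.Config executeCfg = new HoodieClusteringJob.Config();
  executeCfg.basePath = "/tmp/hudi/trips";  // placeholder base path
  executeCfg.runningMode = HoodieClusteringJob.EXECUTE;
  executeCfg.clusteringInstantTime = instant.get();
  new HoodieClusteringJob(jsc, executeCfg).cluster(0); // no retries
}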
Use of org.apache.hudi.utilities.HoodieClusteringJob in project hudi by apache.
From class TestHoodieDeltaStreamer, method testDeltaSyncWithPendingClustering():
@Test
public void testDeltaSyncWithPendingClustering() throws Exception {
  String tableBasePath = dfsBasePath + "/inlineClusteringPending";
  // Ingest data.
  int totalRecords = 2000;
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
  cfg.continuousMode = false;
  cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
  HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
  ds.sync();
  // Assert that the ingestion succeeded.
  TestHelpers.assertAtLeastNCommits(1, tableBasePath, dfs);
  // Schedule a clustering job to build a clustering plan, then transition it to inflight.
  HoodieClusteringJob clusteringJob = initialHoodieClusteringJob(tableBasePath, null, false, "schedule");
  clusteringJob.cluster(0);
  HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build();
  List<HoodieInstant> hoodieClusteringInstants = meta.getActiveTimeline().filterPendingReplaceTimeline().getInstants().collect(Collectors.toList());
  HoodieInstant clusteringRequest = hoodieClusteringInstants.get(0);
  meta.getActiveTimeline().transitionReplaceRequestedToInflight(clusteringRequest, Option.empty());
  // Run another ingestion with inline clustering enabled.
  cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", ""));
  cfg.retryLastPendingInlineClusteringJob = true;
  HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg, jsc);
  ds2.sync();
  // The pending clustering plan should have been completed with its original instant time.
  String completeClusteringTimeStamp = meta.reloadActiveTimeline().getCompletedReplaceTimeline().lastInstant().get().getTimestamp();
  assertEquals(clusteringRequest.getTimestamp(), completeClusteringTimeStamp);
  TestHelpers.assertAtLeastNCommits(2, tableBasePath, dfs);
  TestHelpers.assertAtLeastNReplaceCommits(1, tableBasePath, dfs);
}
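The timeline inspection in this test is also useful on its own, for example to check whether a table has a pending clustering plan before deciding how to ingest. A short sketch using only the calls that appear above, with meta, dfs, and tableBasePath as in the test fixture:

// Sketch: list pending clustering (replacecommit) instants on a table.
HoodieTableMetaClient meta = HoodieTableMetaClient.builder()
    .setConf(dfs.getConf())
    .setBasePath(tableBasePath)
    .build();
List<HoodieInstant> pending = meta.getActiveTimeline()
    .filterPendingReplaceTimeline()
    .getInstants()
    .collect(Collectors.toList());
pending.forEach(instant -> LOG.info("Pending clustering instant: " + instant.getTimestamp()));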
Use of org.apache.hudi.utilities.HoodieClusteringJob in project hudi by apache.
From class TestHoodieDeltaStreamer, method testAsyncClusteringJobWithRetry():
@ParameterizedTest
@ValueSource(booleans = {true, false})
public void testAsyncClusteringJobWithRetry(boolean retryLastFailedClusteringJob) throws Exception {
  String tableBasePath = dfsBasePath + "/asyncClustering3";
  // Ingest data.
  int totalRecords = 3000;
  HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
  cfg.continuousMode = false;
  cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
  cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "false", "0", "false", "0"));
  HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
  ds.sync();
  // Assert that the ingestion succeeded.
  TestHelpers.assertAtLeastNCommits(1, tableBasePath, dfs);
  // Schedule a clustering job to build a clustering plan.
  HoodieClusteringJob schedule = initialHoodieClusteringJob(tableBasePath, null, false, "schedule");
  schedule.cluster(0);
  // Run another ingestion.
  HoodieDeltaStreamer ds2 = new HoodieDeltaStreamer(cfg, jsc);
  ds2.sync();
  // Transition the clustering request to inflight to simulate a failed clustering run.
  HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build();
  List<HoodieInstant> hoodieClusteringInstants = meta.getActiveTimeline().filterPendingReplaceTimeline().getInstants().collect(Collectors.toList());
  HoodieInstant clusteringRequest = hoodieClusteringInstants.get(0);
  meta.getActiveTimeline().transitionReplaceRequestedToInflight(clusteringRequest, Option.empty());
  // Trigger a scheduleAndExecute clustering job:
  // when retryLastFailedClusteringJob is true, it rolls back and re-executes the failed clustering plan with the same instant timestamp;
  // when retryLastFailedClusteringJob is false, it builds and executes a new clustering plan with a new instant timestamp.
  HoodieClusteringJob scheduleAndExecute = initialHoodieClusteringJob(tableBasePath, null, false, "scheduleAndExecute", retryLastFailedClusteringJob);
  scheduleAndExecute.cluster(0);
  String completeClusteringTimeStamp = meta.getActiveTimeline().reload().getCompletedReplaceTimeline().lastInstant().get().getTimestamp();
  if (retryLastFailedClusteringJob) {
    assertEquals(clusteringRequest.getTimestamp(), completeClusteringTimeStamp);
  } else {
    assertFalse(clusteringRequest.getTimestamp().equalsIgnoreCase(completeClusteringTimeStamp));
  }
}
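Condensed, the failure-and-retry flow this test exercises looks like the sketch below. The retryLastFailedClusteringJob field name is an assumption about what the five-argument initialHoodieClusteringJob helper forwards to HoodieClusteringJob.Config; everything else uses only calls shown above.

// Simulate a crashed clustering run: move the requested plan to inflight
// without completing it, then re-run with retry enabled so the same
// instant timestamp is rolled back and re-executed.
HoodieInstant requested = meta.getActiveTimeline()
    .filterPendingReplaceTimeline()
    .getInstants()
    .collect(Collectors.toList())
    .get(0);
meta.getActiveTimeline().transitionReplaceRequestedToInflight(requested, Option.empty());

HoodieClusteringJob.Config cfg = new HoodieClusteringJob.Config();
cfg.basePath = tableBasePath;
cfg.runningMode = HoodieClusteringJob.SCHEDULE_AND_EXECUTE;
cfg.retryLastFailedClusteringJob = true; // assumed flag name, see lead-in above
new HoodieClusteringJob(jsc, cfg).cluster(0);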