Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
Class StatsCommand, method writeAmplificationStats.
@CliCommand(value = "stats wa", help = "Write Amplification. Ratio of how many records were upserted to how many " + "records were actually written")
public String writeAmplificationStats(@CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
long totalRecordsUpserted = 0;
long totalRecordsWritten = 0;
HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCommitTimeline().filterCompletedInstants();
List<Comparable[]> rows = new ArrayList<>();
DecimalFormat df = new DecimalFormat("#.00");
for (HoodieInstant instantTime : timeline.getInstants().collect(Collectors.toList())) {
String waf = "0";
HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(instantTime).get(), HoodieCommitMetadata.class);
if (commit.fetchTotalUpdateRecordsWritten() > 0) {
waf = df.format((float) commit.fetchTotalRecordsWritten() / commit.fetchTotalUpdateRecordsWritten());
}
rows.add(new Comparable[] { instantTime.getTimestamp(), commit.fetchTotalUpdateRecordsWritten(), commit.fetchTotalRecordsWritten(), waf });
totalRecordsUpserted += commit.fetchTotalUpdateRecordsWritten();
totalRecordsWritten += commit.fetchTotalRecordsWritten();
}
String waf = "0";
if (totalRecordsUpserted > 0) {
waf = df.format((float) totalRecordsWritten / totalRecordsUpserted);
}
rows.add(new Comparable[] { "Total", totalRecordsUpserted, totalRecordsWritten, waf });
TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_COMMIT_TIME).addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_UPSERTED).addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_WRITTEN).addTableHeaderField(HoodieTableHeaderFields.HEADER_WRITE_AMPLIFICATION_FACTOR);
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
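The pattern this command relies on is: obtain the active timeline from the table meta client, narrow it to completed commit instants, and decode each instant's details into HoodieCommitMetadata. A minimal standalone sketch of that loop follows; the helper name computeWriteAmplification and the metaClient parameter are illustrative assumptions rather than part of the CLI command, and the types mirror those already imported by StatsCommand.

// Minimal sketch: iterate completed commits and print a per-instant write-amplification
// factor, assuming `metaClient` is an already-initialized HoodieTableMetaClient.
static void computeWriteAmplification(HoodieTableMetaClient metaClient) throws IOException {
  HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
  HoodieTimeline completedCommits = activeTimeline.getCommitTimeline().filterCompletedInstants();
  for (HoodieInstant instant : completedCommits.getInstants().collect(Collectors.toList())) {
    HoodieCommitMetadata commit = HoodieCommitMetadata.fromBytes(
        activeTimeline.getInstantDetails(instant).get(), HoodieCommitMetadata.class);
    long updated = commit.fetchTotalUpdateRecordsWritten();
    long written = commit.fetchTotalRecordsWritten();
    double waf = updated > 0 ? (double) written / updated : 0.0;
    System.out.println(instant.getTimestamp() + " -> write amplification " + waf);
  }
}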
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
Class CommitsCommand, method rollbackCommit.
@CliCommand(value = "commit rollback", help = "Rollback a commit")
public String rollbackCommit(@CliOption(key = { "commit" }, help = "Commit to rollback") final String instantTime, @CliOption(key = { "sparkProperties" }, help = "Spark Properties File Path") final String sparkPropertiesPath, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory, @CliOption(key = "rollbackUsingMarkers", unspecifiedDefaultValue = "true", help = "Enabling marker based rollback") final String rollbackUsingMarkers) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline();
HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
HoodieTimeline filteredTimeline = completedTimeline.filter(instant -> instant.getTimestamp().equals(instantTime));
if (filteredTimeline.empty()) {
return "Commit " + instantTime + " not found in Commits " + completedTimeline;
}
SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK.toString(), master, sparkMemory, instantTime, HoodieCLI.getTableMetaClient().getBasePath(), rollbackUsingMarkers);
Process process = sparkLauncher.launch();
InputStreamConsumer.captureOutput(process);
int exitCode = process.waitFor();
// Refresh the current
HoodieCLI.refreshTableMetadata();
if (exitCode != 0) {
return "Commit " + instantTime + " failed to roll back";
}
return "Commit " + instantTime + " rolled back";
}
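Before launching the Spark rollback job, the command only checks that the requested instant exists among completed commits on the active timeline. A minimal sketch of that guard is shown below; metaClient and commitTime are assumed, illustratively named inputs, and the calls are the same ones used in the method above.

// Minimal sketch: verify that a commit instant is present on the completed-commits
// timeline before attempting a rollback.
static boolean isCompletedCommit(HoodieTableMetaClient metaClient, String commitTime) {
  HoodieTimeline completed = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  return !completed.filter(instant -> instant.getTimestamp().equals(commitTime)).empty();
}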
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
Class CleansCommand, method showCleanPartitions.
@CliCommand(value = "clean showpartitions", help = "Show partition level details of a clean")
public String showCleanPartitions(@CliOption(key = { "clean" }, help = "clean to show") final String instantTime, @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit, @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField, @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending, @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws Exception {
HoodieActiveTimeline activeTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline();
HoodieTimeline timeline = activeTimeline.getCleanerTimeline().filterCompletedInstants();
HoodieInstant cleanInstant = new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, instantTime);
if (!timeline.containsInstant(cleanInstant)) {
return "Clean " + instantTime + " not found in metadata " + timeline;
}
HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils.deserializeHoodieCleanMetadata(timeline.getInstantDetails(cleanInstant).get());
List<Comparable[]> rows = new ArrayList<>();
for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : cleanMetadata.getPartitionMetadata().entrySet()) {
String path = entry.getKey();
HoodieCleanPartitionMetadata stats = entry.getValue();
String policy = stats.getPolicy();
int totalSuccessDeletedFiles = stats.getSuccessDeleteFiles().size();
int totalFailedDeletedFiles = stats.getFailedDeleteFiles().size();
rows.add(new Comparable[] { path, policy, totalSuccessDeletedFiles, totalFailedDeletedFiles });
}
TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION_PATH).addTableHeaderField(HoodieTableHeaderFields.HEADER_CLEANING_POLICY).addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_SUCCESSFULLY_DELETED).addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FAILED_DELETIONS);
return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
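The same access pattern works outside the CLI: take the cleaner timeline, confirm the instant is present, then deserialize its HoodieCleanMetadata. Below is a hedged sketch of that flow with metaClient and cleanTime as assumed inputs and plain System.out output in place of the CLI table printer.

// Minimal sketch: load HoodieCleanMetadata for a completed clean instant and print
// per-partition deletion counts, assuming `metaClient` and `cleanTime` are supplied by the caller.
static void printCleanStats(HoodieTableMetaClient metaClient, String cleanTime) throws IOException {
  HoodieTimeline cleans = metaClient.getActiveTimeline().getCleanerTimeline().filterCompletedInstants();
  HoodieInstant cleanInstant = new HoodieInstant(false, HoodieTimeline.CLEAN_ACTION, cleanTime);
  if (!cleans.containsInstant(cleanInstant)) {
    return; // nothing to report for this instant
  }
  HoodieCleanMetadata cleanMetadata =
      TimelineMetadataUtils.deserializeHoodieCleanMetadata(cleans.getInstantDetails(cleanInstant).get());
  cleanMetadata.getPartitionMetadata().forEach((path, stats) ->
      System.out.println(path + ": deleted=" + stats.getSuccessDeleteFiles().size()
          + ", failed=" + stats.getFailedDeleteFiles().size()));
}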
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
Class BaseCommitActionExecutor, method saveWorkloadProfileMetadataToInflight.
/**
 * Save the workload profile in an intermediate file (here re-using commit files). This is useful when performing
 * rollback for MOR tables. Only updates are recorded in the workload profile metadata, since updates to log blocks
 * are unknown across batches; inserts (which are new parquet files) are rolled back based on commit time.
 * TODO: create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata.
 */
void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, String instantTime) throws HoodieCommitException {
  try {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();
    profile.getOutputPartitionPaths().forEach(path -> {
      WorkloadStat partitionStat = profile.getOutputWorkloadStat(path);
      HoodieWriteStat insertStat = new HoodieWriteStat();
      insertStat.setNumInserts(partitionStat.getNumInserts());
      insertStat.setFileId("");
      insertStat.setPrevCommit(HoodieWriteStat.NULL_COMMIT);
      metadata.addWriteStat(path, insertStat);
      Map<String, Pair<String, Long>> updateLocationMap = partitionStat.getUpdateLocationToCount();
      Map<String, Pair<String, Long>> insertLocationMap = partitionStat.getInsertLocationToCount();
      Stream.concat(updateLocationMap.keySet().stream(), insertLocationMap.keySet().stream())
          .distinct()
          .forEach(fileId -> {
            HoodieWriteStat writeStat = new HoodieWriteStat();
            writeStat.setFileId(fileId);
            Pair<String, Long> updateLocation = updateLocationMap.get(fileId);
            Pair<String, Long> insertLocation = insertLocationMap.get(fileId);
            // TODO : Write baseCommitTime is possible here ?
            writeStat.setPrevCommit(updateLocation != null ? updateLocation.getKey() : insertLocation.getKey());
            if (updateLocation != null) {
              writeStat.setNumUpdateWrites(updateLocation.getValue());
            }
            if (insertLocation != null) {
              writeStat.setNumInserts(insertLocation.getValue());
            }
            metadata.addWriteStat(path, writeStat);
          });
    });
    metadata.setOperationType(operationType);
    HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
    String commitActionType = getCommitActionType();
    HoodieInstant requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime);
    activeTimeline.transitionRequestedToInflight(requested,
        Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)),
        config.shouldAllowMultiWriteOnSameInstant());
  } catch (IOException io) {
    throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", io);
  }
}
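The timeline interaction in the final step above is the part that involves HoodieActiveTimeline: a REQUESTED instant is transitioned to INFLIGHT with the serialized commit metadata attached. A minimal sketch of just that step follows; activeTimeline, commitActionType, and instantTime are assumed to come from the surrounding executor context, and the boolean flag is hard-coded where the original reads config.shouldAllowMultiWriteOnSameInstant().

// Minimal sketch: move a REQUESTED commit instant to INFLIGHT, stashing intermediate
// metadata in the instant file, using the same call as the executor above.
static void markInflight(HoodieActiveTimeline activeTimeline, String commitActionType, String instantTime)
    throws IOException {
  HoodieCommitMetadata metadata = new HoodieCommitMetadata();
  HoodieInstant requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime);
  activeTimeline.transitionRequestedToInflight(requested,
      Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)),
      false); // the original passes config.shouldAllowMultiWriteOnSameInstant() here
}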
Use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.
Class TestHoodieClientOnCopyOnWriteStorage, method testRollbackFailedCommits.
@ParameterizedTest
@MethodSource("rollbackFailedCommitsParams")
public void testRollbackFailedCommits(HoodieFailedWritesCleaningPolicy cleaningPolicy, boolean populateMetaFields) throws Exception {
  HoodieTestUtils.init(hadoopConf, basePath);
  SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
  // Perform 1 successful commit
  writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100,
      dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
  // Perform 2 failed writes to the table
  writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "100", 100,
      dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
  client.close();
  client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
  writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", 100,
      dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, false);
  client.close();
  // Refresh the data generator to drop records generated by the failed commits
  dataGen = new HoodieTestDataGenerator();
  // Perform 1 successful write
  client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
  writeBatch(client, "400", "300", Option.of(Arrays.asList("400")), "400", 100,
      dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
  assertTrue(metaClient.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0);
  assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
  assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 2);
  // Wait until the heartbeats of the first 2 failed commits have expired
  boolean conditionMet = false;
  while (!conditionMet) {
    conditionMet = client.getHeartbeatClient().isHeartbeatExpired("300");
    Thread.sleep(2000);
  }
  client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, populateMetaFields));
  // Perform 1 successful write
  writeBatch(client, "500", "400", Option.of(Arrays.asList("500")), "500", 100,
      dataGen::generateInserts, SparkRDDWriteClient::bulkInsert, false, 100, 300, 0, true);
  client.clean();
  HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
  if (cleaningPolicy.isLazy()) {
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 2);
    // Since rollbacks (not cleans) were written, there should be no clean action on the timeline
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0);
    assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3);
  } else if (cleaningPolicy.isNever()) {
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0);
    // There should be no clean or rollback action on the timeline
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0);
    assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3);
  }
}
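The assertions in this test boil down to reloading the active timeline and counting instants per action type. A minimal sketch of those counts is shown below, assuming an initialized metaClient (illustrative variable name) and using the fully qualified HoodieTimeline action constants instead of the test's static imports.

// Minimal sketch: reload the active timeline and count instants by action,
// mirroring the assertions in the test above.
HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
long rollbacks = timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).countInstants();
long cleans = timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.CLEAN_ACTION)).countInstants();
long completedCommits = timeline.getCommitsTimeline().filterCompletedInstants().countInstants();
long inflights = timeline.filterInflights().countInstants();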