Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.
The class HoodieTable, method reconcileAgainstMarkers.
/**
* Reconciles WriteStats and marker files to detect and safely delete duplicate data files created because of Spark
* retries.
*
* @param context HoodieEngineContext
* @param instantTs Instant Timestamp
* @param stats Hoodie Write Stat
* @param consistencyCheckEnabled Consistency Check Enabled
* @throws HoodieIOException
*/
protected void reconcileAgainstMarkers(HoodieEngineContext context, String instantTs, List<HoodieWriteStat> stats, boolean consistencyCheckEnabled) throws HoodieIOException {
  try {
    // Reconcile marker and data files with WriteStats so that partially written data files due to failed
    // (but succeeded on retry) tasks are removed.
    String basePath = getMetaClient().getBasePath();
    WriteMarkers markers = WriteMarkersFactory.get(config.getMarkersType(), this, instantTs);
    if (!markers.doesMarkerDirExist()) {
      // can happen if it was an empty write, say.
      return;
    }

    // We are not including log appends here, since they are already fail-safe.
    Set<String> invalidDataPaths = getInvalidDataPaths(markers);
    Set<String> validDataPaths = stats.stream()
        .map(HoodieWriteStat::getPath)
        .filter(p -> p.endsWith(this.getBaseFileExtension()))
        .collect(Collectors.toSet());

    // Contains the list of partially created files. These need to be cleaned up.
    invalidDataPaths.removeAll(validDataPaths);
    if (!invalidDataPaths.isEmpty()) {
      LOG.info("Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths);
      Map<String, List<Pair<String, String>>> invalidPathsByPartition = invalidDataPaths.stream()
          .map(dp -> Pair.of(new Path(basePath, dp).getParent().toString(), new Path(basePath, dp).toString()))
          .collect(Collectors.groupingBy(Pair::getKey));

      // Ensure all files in the delete list are actually present; otherwise, we may miss deleting such files.
      // If files are not found even after retries, fail the commit.
      if (consistencyCheckEnabled) {
        // This ensures all files to be deleted are present.
        waitForAllFiles(context, invalidPathsByPartition, FileVisibility.APPEAR);
      }

      // Now delete partially written files
      context.setJobStatus(this.getClass().getSimpleName(), "Delete all partially written files");
      deleteInvalidFilesByPartitions(context, invalidPathsByPartition);

      // Now ensure the deleted files disappear
      if (consistencyCheckEnabled) {
        // This ensures all files to be deleted are absent.
        waitForAllFiles(context, invalidPathsByPartition, FileVisibility.DISAPPEAR);
      }
    }
  } catch (IOException ioe) {
    throw new HoodieIOException(ioe.getMessage(), ioe);
  }
}
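The heart of the reconciliation above is a set difference: every data path referenced by a marker file that does not also appear in the commit's write stats is treated as a partially written duplicate and scheduled for deletion. The following standalone sketch illustrates just that step with plain string sets; the paths and file names are made-up examples, not values from the Hudi codebase.

import java.util.HashSet;
import java.util.Set;

public class ReconcileSketch {
  public static void main(String[] args) {
    // Data paths reconstructed from marker files: every base file some task started writing.
    Set<String> markerDataPaths = new HashSet<>();
    markerDataPaths.add("2021/01/01/file-1_0-1-1_001.parquet"); // written by a failed attempt
    markerDataPaths.add("2021/01/01/file-1_0-2-2_001.parquet"); // written by the successful retry

    // Data paths reported in the write stats for the commit: only the successful attempt.
    Set<String> committedDataPaths = new HashSet<>();
    committedDataPaths.add("2021/01/01/file-1_0-2-2_001.parquet");

    // Invalid paths = marker paths minus committed paths; these are the duplicates to delete.
    Set<String> invalidDataPaths = new HashSet<>(markerDataPaths);
    invalidDataPaths.removeAll(committedDataPaths);

    System.out.println("Partially written files to delete: " + invalidDataPaths);
    // Prints: Partially written files to delete: [2021/01/01/file-1_0-1-1_001.parquet]
  }
}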
Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.
The class OneToZeroDowngradeHandler, method downgrade.
@Override
public Map<ConfigProperty, String> downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) {
  HoodieTable table = upgradeDowngradeHelper.getTable(config, context);
  // fetch pending commit info
  HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction();
  List<HoodieInstant> commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList());
  for (HoodieInstant inflightInstant : commits) {
    // delete existing markers
    WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), table, inflightInstant.getTimestamp());
    writeMarkers.quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
  }
  return Collections.EMPTY_MAP;
}
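The downgrade path exercises only two WriteMarkers calls: obtaining the marker handle for an instant via WriteMarkersFactory and quietly deleting its marker directory. A minimal helper along the same lines is sketched below; the class and method names are illustrative rather than part of the Hudi API, it assumes a HoodieTable, HoodieWriteConfig and HoodieEngineContext are already available, and the import paths follow the hudi-client package layout referenced in these snippets (they may differ across versions).

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieTable;
import org.apache.hudi.table.marker.WriteMarkers;
import org.apache.hudi.table.marker.WriteMarkersFactory;

public class MarkerCleanupSketch {

  /** Deletes the marker directory of every pending (non-compaction) instant on the table. */
  public static void deletePendingMarkerDirs(HoodieTable table, HoodieWriteConfig config, HoodieEngineContext context) {
    List<HoodieInstant> pendingInstants = table.getMetaClient().getCommitsTimeline()
        .filterPendingExcludingCompaction()
        .getReverseOrderedInstants()
        .collect(Collectors.toList());
    for (HoodieInstant instant : pendingInstants) {
      // Obtain the marker handle for this instant and delete its marker directory, swallowing failures.
      WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), table, instant.getTimestamp());
      writeMarkers.quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
    }
  }
}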
Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.
The class ZeroToOneUpgradeHandler, method recreateMarkers.
/**
 * Recreate markers in the new format.
 * Step 1: Delete existing markers.
 * Step 2: Collect all rollback file info.
 * Step 3: Recreate markers for all interested files.
 *
 * @param commitInstantTime instant of interest for which markers need to be recreated.
 * @param table instance of {@link HoodieTable} to use
 * @param context instance of {@link HoodieEngineContext} to use
 * @param parallelism parallelism to use when deleting existing marker files
 * @throws HoodieRollbackException on any exception during upgrade.
 */
protected void recreateMarkers(final String commitInstantTime, HoodieTable table, HoodieEngineContext context, int parallelism) throws HoodieRollbackException {
  try {
    // fetch hoodie instant
    Option<HoodieInstant> commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants()
        .filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime))
        .findFirst());
    if (commitInstantOpt.isPresent()) {
      // delete existing markers
      WriteMarkers writeMarkers = WriteMarkersFactory.get(MarkerType.DIRECT, table, commitInstantTime);
      writeMarkers.quietDeleteMarkerDir(context, parallelism);

      // generate rollback stats
      List<ListingBasedRollbackRequest> rollbackRequests;
      if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
        rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context, table.getMetaClient().getBasePath());
      } else {
        rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context);
      }
      List<HoodieRollbackStat> rollbackStats = getListBasedRollBackStats(table.getMetaClient(), table.getConfig(), context, commitInstantOpt, rollbackRequests);

      // recreate markers adhering to marker-based rollback
      for (HoodieRollbackStat rollbackStat : rollbackStats) {
        for (String path : rollbackStat.getSuccessDeleteFiles()) {
          String dataFileName = path.substring(path.lastIndexOf("/") + 1);
          // not feasible to differentiate MERGE from CREATE; hence creating with MERGE IOType for all base files.
          writeMarkers.create(rollbackStat.getPartitionPath(), dataFileName, IOType.MERGE);
        }
        for (FileStatus fileStatus : rollbackStat.getCommandBlocksCount().keySet()) {
          writeMarkers.create(rollbackStat.getPartitionPath(), getFileNameForMarkerFromLogFile(fileStatus.getPath().toString(), table), IOType.APPEND);
        }
      }
    }
  } catch (Exception e) {
    throw new HoodieRollbackException("Exception thrown while upgrading Hoodie Table from version 0 to 1", e);
  }
}
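The key transformation in the recreation loop above is turning the fully qualified path of a deleted base file back into the bare file name that writeMarkers.create expects. A standalone illustration of that substring step follows; the example path is invented, and the marker naming mentioned in the final comment is an assumption for illustration only.

public class MarkerNameSketch {
  public static void main(String[] args) {
    // A successfully deleted base file, as it might be reported in a HoodieRollbackStat (made-up path).
    String deletedFilePath = "/tmp/hoodie/tbl/2021/01/01/file-1_0-1-1_001.parquet";

    // writeMarkers.create expects only the file name, so strip everything up to the last '/'.
    String dataFileName = deletedFilePath.substring(deletedFilePath.lastIndexOf("/") + 1);
    System.out.println(dataFileName); // file-1_0-1-1_001.parquet

    // The recreated direct marker then lives under the instant's marker directory, typically named
    // "<dataFileName>.marker.MERGE" (naming assumed here for illustration only).
  }
}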
Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.
The class TestUpgradeDowngrade, method assertMarkerFilesForDowngrade.
private void assertMarkerFilesForDowngrade(HoodieTable table, HoodieInstant commitInstant, boolean assertExists) throws IOException {
  // Verify recreated marker files are as expected
  WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp());
  if (assertExists) {
    assertTrue(writeMarkers.doesMarkerDirExist());
    assertEquals(0, getTimelineServerBasedMarkerFileCount(table.getMetaClient().getMarkerFolderPath(commitInstant.getTimestamp()), table.getMetaClient().getFs()));
  } else {
    assertFalse(writeMarkers.doesMarkerDirExist());
  }
}
Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.
The class TestUpgradeDowngrade, method assertMarkerFilesForUpgrade.
private void assertMarkerFilesForUpgrade(HoodieTable table, HoodieInstant commitInstant, List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices) throws IOException {
  // Verify recreated marker files are as expected
  WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp());
  assertTrue(writeMarkers.doesMarkerDirExist());
  Set<String> files = writeMarkers.allMarkerFilePaths();
  assertEquals(2, files.size());
  List<String> actualFiles = new ArrayList<>();
  for (String file : files) {
    String fileName = WriteMarkers.stripMarkerSuffix(file);
    actualFiles.add(fileName);
  }

  List<FileSlice> expectedFileSlices = new ArrayList<>();
  expectedFileSlices.addAll(firstPartitionCommit2FileSlices);
  expectedFileSlices.addAll(secondPartitionCommit2FileSlices);

  List<String> expectedPaths = new ArrayList<>();
  List<Pair<String, String>> expectedLogFilePaths = new ArrayList<>();
  for (FileSlice fileSlice : expectedFileSlices) {
    String partitionPath = fileSlice.getPartitionPath();
    if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
      for (HoodieLogFile logFile : fileSlice.getLogFiles().collect(Collectors.toList())) {
        // The log file name format can't be matched as is, since the write token can't be asserted.
        // Hence asserting on partition path, fileId and base commit time.
        String logBaseCommitTime = logFile.getBaseCommitTime();
        expectedLogFilePaths.add(Pair.of(partitionPath + "/" + logFile.getFileId(), logBaseCommitTime));
      }
    }
    if (fileSlice.getBaseInstantTime().equals(commitInstant.getTimestamp())) {
      String path = fileSlice.getBaseFile().get().getPath();
      // for base files, the path can be asserted as is.
      expectedPaths.add(path.substring(path.indexOf(partitionPath)));
    }
  }

  // Trim log file paths only
  List<String> trimmedActualFiles = new ArrayList<>();
  for (String actualFile : actualFiles) {
    if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
      trimmedActualFiles.add(actualFile.substring(0, actualFile.lastIndexOf('.')));
    } else {
      trimmedActualFiles.add(actualFile);
    }
  }

  // assert for base files
  for (String expected : expectedPaths) {
    if (trimmedActualFiles.contains(expected)) {
      trimmedActualFiles.remove(expected);
    }
  }

  if (expectedLogFilePaths.size() > 0) {
    // assert for log files
    List<Pair<String, String>> actualLogFiles = new ArrayList<>();
    for (String actual : trimmedActualFiles) {
      actualLogFiles.add(Pair.of(actual.substring(0, actual.indexOf('_')), actual.substring(actual.lastIndexOf('_') + 1)));
    }
    assertEquals(expectedLogFilePaths.size(), actualLogFiles.size());
    for (Pair<String, String> entry : expectedLogFilePaths) {
      assertTrue(actualLogFiles.contains(entry));
    }
  } else {
    assertTrue(trimmedActualFiles.size() == 0);
  }
}
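All of the assertions above boil down to string manipulation on marker paths: strip the marker suffix to recover the data file name, and for MERGE_ON_READ tables drop the trailing extension before comparing against the expected file slices. A toy illustration of that path handling, using a made-up marker path and assuming a ".marker.<IOType>" suffix convention:

public class MarkerPathSketch {
  public static void main(String[] args) {
    // A marker path as it might be returned by allMarkerFilePaths() (made-up example).
    String markerPath = "2021/01/01/file-1_0-1-1_001.parquet.marker.MERGE";

    // Equivalent of WriteMarkers.stripMarkerSuffix: drop the ".marker.<IOType>" tail.
    String dataFilePath = markerPath.substring(0, markerPath.indexOf(".marker"));
    System.out.println(dataFilePath); // 2021/01/01/file-1_0-1-1_001.parquet

    // COPY_ON_WRITE: compared directly against the base file path relative to the partition.
    // MERGE_ON_READ: the extension is trimmed first, as in the test above.
    String trimmed = dataFilePath.substring(0, dataFilePath.lastIndexOf('.'));
    System.out.println(trimmed); // 2021/01/01/file-1_0-1-1_001
  }
}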