Example 6 with WriteMarkers

Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.

In class HoodieTable, method reconcileAgainstMarkers:

/**
 * Reconciles WriteStats and marker files to detect and safely delete duplicate data files created because of Spark
 * task retries.
 *
 * @param context HoodieEngineContext
 * @param instantTs Instant timestamp
 * @param stats List of HoodieWriteStat for the write
 * @param consistencyCheckEnabled Whether consistency checks are enabled
 * @throws HoodieIOException if reconciling markers or deleting invalid files fails
 */
protected void reconcileAgainstMarkers(HoodieEngineContext context, String instantTs, List<HoodieWriteStat> stats, boolean consistencyCheckEnabled) throws HoodieIOException {
    try {
        // Reconcile marker and data files with WriteStats so that partially written data-files due to failed
        // (but succeeded on retry) tasks are removed.
        String basePath = getMetaClient().getBasePath();
        WriteMarkers markers = WriteMarkersFactory.get(config.getMarkersType(), this, instantTs);
        if (!markers.doesMarkerDirExist()) {
            // This can happen if it was an empty write, say.
            return;
        }
        // we are not including log appends here, since they are already fail-safe.
        Set<String> invalidDataPaths = getInvalidDataPaths(markers);
        Set<String> validDataPaths = stats.stream().map(HoodieWriteStat::getPath).filter(p -> p.endsWith(this.getBaseFileExtension())).collect(Collectors.toSet());
        // Contains the list of partially created files. These need to be cleaned up.
        invalidDataPaths.removeAll(validDataPaths);
        if (!invalidDataPaths.isEmpty()) {
            LOG.info("Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths);
            Map<String, List<Pair<String, String>>> invalidPathsByPartition = invalidDataPaths.stream().map(dp -> Pair.of(new Path(basePath, dp).getParent().toString(), new Path(basePath, dp).toString())).collect(Collectors.groupingBy(Pair::getKey));
            // Ensure all files to be deleted are actually present; otherwise we may miss deleting them.
            // If files are still not visible after retries, fail the commit.
            if (consistencyCheckEnabled) {
                waitForAllFiles(context, invalidPathsByPartition, FileVisibility.APPEAR);
            }
            // Now delete partially written files
            context.setJobStatus(this.getClass().getSimpleName(), "Delete all partially written files");
            deleteInvalidFilesByPartitions(context, invalidPathsByPartition);
            // Now ensure the deleted files are actually gone
            if (consistencyCheckEnabled) {
                // Wait until all the deleted files are absent.
                waitForAllFiles(context, invalidPathsByPartition, FileVisibility.DISAPPEAR);
            }
        }
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
Also used : HoodieRestorePlan(org.apache.hudi.avro.model.HoodieRestorePlan) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodiePendingRollbackInfo(org.apache.hudi.common.HoodiePendingRollbackInfo) ConsistencyGuard(org.apache.hudi.common.fs.ConsistencyGuard) TimeoutException(java.util.concurrent.TimeoutException) HoodieSavepointMetadata(org.apache.hudi.avro.model.HoodieSavepointMetadata) Logger(org.apache.log4j.Logger) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieLayoutFactory(org.apache.hudi.table.storage.HoodieLayoutFactory) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers) Schema(org.apache.avro.Schema) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) HoodieRollbackPlan(org.apache.hudi.avro.model.HoodieRollbackPlan) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) FileSystemViewManager(org.apache.hudi.common.table.view.FileSystemViewManager) Serializable(java.io.Serializable) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) OptimisticConsistencyGuard(org.apache.hudi.common.fs.OptimisticConsistencyGuard) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieStorageLayout(org.apache.hudi.table.storage.HoodieStorageLayout) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) HoodieInsertException(org.apache.hudi.exception.HoodieInsertException) HoodieBootstrapWriteMetadata(org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileVisibility(org.apache.hudi.common.fs.ConsistencyGuard.FileVisibility) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Function(java.util.function.Function) FailSafeConsistencyGuard(org.apache.hudi.common.fs.FailSafeConsistencyGuard) ArrayList(java.util.ArrayList) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) 
BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) SpecificRecordBase(org.apache.avro.specific.SpecificRecordBase) IOException(java.io.IOException) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieKey(org.apache.hudi.common.model.HoodieKey) Functions(org.apache.hudi.common.util.Functions) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
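
The helper getInvalidDataPaths referenced above is not shown in this snippet. Conceptually, it asks the markers for every data-file path that has a CREATE or MERGE marker for the instant, so those paths can be diffed against the committed WriteStats. A minimal sketch, assuming the table exposes its engine context and write config as fields and that WriteMarkers#createdAndMergedDataPaths and HoodieWriteConfig#getFinalizeWriteParallelism behave as their names suggest:

protected Set<String> getInvalidDataPaths(WriteMarkers markers) throws IOException {
    // Collect every data-file path that has a CREATE or MERGE marker for this instant.
    // APPEND markers (log files) are deliberately excluded, since log appends are already fail-safe.
    // "context" and "config" are assumed here to be the table's HoodieEngineContext and HoodieWriteConfig.
    return markers.createdAndMergedDataPaths(context, config.getFinalizeWriteParallelism());
}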

Example 7 with WriteMarkers

Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.

In class OneToZeroDowngradeHandler, method downgrade:

@Override
public Map<ConfigProperty, String> downgrade(HoodieWriteConfig config, HoodieEngineContext context, String instantTime, SupportsUpgradeDowngrade upgradeDowngradeHelper) {
    HoodieTable table = upgradeDowngradeHelper.getTable(config, context);
    // fetch pending commit info
    HoodieTimeline inflightTimeline = table.getMetaClient().getCommitsTimeline().filterPendingExcludingCompaction();
    List<HoodieInstant> commits = inflightTimeline.getReverseOrderedInstants().collect(Collectors.toList());
    for (HoodieInstant inflightInstant : commits) {
        // delete existing markers
        WriteMarkers writeMarkers = WriteMarkersFactory.get(config.getMarkersType(), table, inflightInstant.getTimestamp());
        writeMarkers.quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
    }
    return Collections.EMPTY_MAP;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTable(org.apache.hudi.table.HoodieTable) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers)
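
For orientation, the factory call above resolves the configured marker type to a concrete WriteMarkers implementation. A simplified, hypothetical sketch of that dispatch (the real WriteMarkersFactory also validates that timeline-server-based markers are usable for the table's storage scheme):

public static WriteMarkers get(MarkerType markerType, HoodieTable table, String instantTime) {
    switch (markerType) {
        case DIRECT:
            // one marker file per data file, written directly under the instant's temp marker folder
            return new DirectWriteMarkers(table, instantTime);
        case TIMELINE_SERVER_BASED:
            // marker entries tracked via the embedded timeline server instead of individual files
            return new TimelineServerBasedWriteMarkers(table, instantTime);
        default:
            throw new HoodieException("The marker type \"" + markerType.name() + "\" is not supported.");
    }
}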

Example 8 with WriteMarkers

Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.

In class ZeroToOneUpgradeHandler, method recreateMarkers:

/**
 * Recreate markers in the new format.
 * Step 1: Delete existing markers.
 * Step 2: Collect all rollback file info.
 * Step 3: Recreate markers for all files of interest.
 *
 * @param commitInstantTime instant of interest for which markers need to be recreated.
 * @param table             instance of {@link HoodieTable} to use
 * @param context           instance of {@link HoodieEngineContext} to use
 * @param parallelism       parallelism to use when deleting and recreating markers
 * @throws HoodieRollbackException on any exception during upgrade.
 */
protected void recreateMarkers(final String commitInstantTime, HoodieTable table, HoodieEngineContext context, int parallelism) throws HoodieRollbackException {
    try {
        // fetch hoodie instant
        Option<HoodieInstant> commitInstantOpt = Option.fromJavaOptional(table.getActiveTimeline().getCommitsTimeline().getInstants().filter(instant -> HoodieActiveTimeline.EQUALS.test(instant.getTimestamp(), commitInstantTime)).findFirst());
        if (commitInstantOpt.isPresent()) {
            // delete existing markers
            WriteMarkers writeMarkers = WriteMarkersFactory.get(MarkerType.DIRECT, table, commitInstantTime);
            writeMarkers.quietDeleteMarkerDir(context, parallelism);
            // generate rollback stats
            List<ListingBasedRollbackRequest> rollbackRequests;
            if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
                rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context, table.getMetaClient().getBasePath());
            } else {
                rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context);
            }
            List<HoodieRollbackStat> rollbackStats = getListBasedRollBackStats(table.getMetaClient(), table.getConfig(), context, commitInstantOpt, rollbackRequests);
            // recreate markers adhering to marker based rollback
            for (HoodieRollbackStat rollbackStat : rollbackStats) {
                for (String path : rollbackStat.getSuccessDeleteFiles()) {
                    String dataFileName = path.substring(path.lastIndexOf("/") + 1);
                    // Not feasible to differentiate MERGE from CREATE; hence markers are created with the MERGE IOType for all base files.
                    writeMarkers.create(rollbackStat.getPartitionPath(), dataFileName, IOType.MERGE);
                }
                for (FileStatus fileStatus : rollbackStat.getCommandBlocksCount().keySet()) {
                    writeMarkers.create(rollbackStat.getPartitionPath(), getFileNameForMarkerFromLogFile(fileStatus.getPath().toString(), table), IOType.APPEND);
                }
            }
        }
    } catch (Exception e) {
        throw new HoodieRollbackException("Exception thrown while upgrading Hoodie Table from version 0 to 1", e);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieRollbackStat(org.apache.hudi.common.HoodieRollbackStat) HoodieRollbackException(org.apache.hudi.exception.HoodieRollbackException) FileStatus(org.apache.hadoop.fs.FileStatus) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers) ListingBasedRollbackRequest(org.apache.hudi.table.action.rollback.ListingBasedRollbackRequest)
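
The markers recreated above follow Hudi's direct-marker naming convention: a marker lives under the instant's temporary marker folder and is named after the data file plus a ".marker.<IOType>" suffix. An illustrative (not verbatim) example, with made-up file names:

// Hypothetical names, shown only to illustrate the convention:
// data file : 2016/03/15/f1_1-0-1_20220101000000000.parquet
// marker    : <basePath>/.hoodie/.temp/20220101000000000/2016/03/15/f1_1-0-1_20220101000000000.parquet.marker.MERGE
writeMarkers.create("2016/03/15", "f1_1-0-1_20220101000000000.parquet", IOType.MERGE);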

Example 9 with WriteMarkers

Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.

In class TestUpgradeDowngrade, method assertMarkerFilesForDowngrade:

private void assertMarkerFilesForDowngrade(HoodieTable table, HoodieInstant commitInstant, boolean assertExists) throws IOException {
    // Verify recreated marker files are as expected
    WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp());
    if (assertExists) {
        assertTrue(writeMarkers.doesMarkerDirExist());
        assertEquals(0, getTimelineServerBasedMarkerFileCount(table.getMetaClient().getMarkerFolderPath(commitInstant.getTimestamp()), table.getMetaClient().getFs()));
    } else {
        assertFalse(writeMarkers.doesMarkerDirExist());
    }
}
Also used : WriteMarkers(org.apache.hudi.table.marker.WriteMarkers)

Example 10 with WriteMarkers

Use of org.apache.hudi.table.marker.WriteMarkers in project hudi by apache.

In class TestUpgradeDowngrade, method assertMarkerFilesForUpgrade:

private void assertMarkerFilesForUpgrade(HoodieTable table, HoodieInstant commitInstant, List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices) throws IOException {
    // Verify recreated marker files are as expected
    WriteMarkers writeMarkers = WriteMarkersFactory.get(getConfig().getMarkersType(), table, commitInstant.getTimestamp());
    assertTrue(writeMarkers.doesMarkerDirExist());
    Set<String> files = writeMarkers.allMarkerFilePaths();
    assertEquals(2, files.size());
    List<String> actualFiles = new ArrayList<>();
    for (String file : files) {
        String fileName = WriteMarkers.stripMarkerSuffix(file);
        actualFiles.add(fileName);
    }
    List<FileSlice> expectedFileSlices = new ArrayList<>();
    expectedFileSlices.addAll(firstPartitionCommit2FileSlices);
    expectedFileSlices.addAll(secondPartitionCommit2FileSlices);
    List<String> expectedPaths = new ArrayList<>();
    List<Pair<String, String>> expectedLogFilePaths = new ArrayList<>();
    for (FileSlice fileSlice : expectedFileSlices) {
        String partitionPath = fileSlice.getPartitionPath();
        if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
            for (HoodieLogFile logFile : fileSlice.getLogFiles().collect(Collectors.toList())) {
                // The log file name can't be matched as is, since the write token can't be asserted. Hence assert on partition path, fileId and base commit time.
                String logBaseCommitTime = logFile.getBaseCommitTime();
                expectedLogFilePaths.add(Pair.of(partitionPath + "/" + logFile.getFileId(), logBaseCommitTime));
            }
        }
        if (fileSlice.getBaseInstantTime().equals(commitInstant.getTimestamp())) {
            String path = fileSlice.getBaseFile().get().getPath();
            // for base files, path can be asserted as is.
            expectedPaths.add(path.substring(path.indexOf(partitionPath)));
        }
    }
    // Trim log file paths only
    List<String> trimmedActualFiles = new ArrayList<>();
    for (String actualFile : actualFiles) {
        if (table.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ) {
            trimmedActualFiles.add(actualFile.substring(0, actualFile.lastIndexOf('.')));
        } else {
            trimmedActualFiles.add(actualFile);
        }
    }
    // assert for base files.
    for (String expected : expectedPaths) {
        if (trimmedActualFiles.contains(expected)) {
            trimmedActualFiles.remove(expected);
        }
    }
    if (expectedLogFilePaths.size() > 0) {
        // assert for log files
        List<Pair<String, String>> actualLogFiles = new ArrayList<>();
        for (String actual : trimmedActualFiles) {
            actualLogFiles.add(Pair.of(actual.substring(0, actual.indexOf('_')), actual.substring(actual.lastIndexOf('_') + 1)));
        }
        assertEquals(expectedLogFilePaths.size(), actualLogFiles.size());
        for (Pair<String, String> entry : expectedLogFilePaths) {
            assertTrue(actualLogFiles.contains(entry));
        }
    } else {
        assertTrue(trimmedActualFiles.size() == 0);
    }
}
Also used : FileSlice(org.apache.hudi.common.model.FileSlice) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers) ArrayList(java.util.ArrayList) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Pair(org.apache.hudi.common.util.collection.Pair)
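
The path comparison above relies on WriteMarkers.stripMarkerSuffix to map a marker path back to the data-file path it tracks. A hedged illustration of the expected behavior (values are made up):

// stripMarkerSuffix drops the ".marker.<IOType>" suffix, leaving the relative data-file path.
String marker = "2016/03/15/f1_1-0-1_20220101000000000.parquet.marker.MERGE";
String dataFile = WriteMarkers.stripMarkerSuffix(marker);
// dataFile -> "2016/03/15/f1_1-0-1_20220101000000000.parquet"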

Aggregations

WriteMarkers (org.apache.hudi.table.marker.WriteMarkers) 11
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 5
ArrayList (java.util.ArrayList) 4
List (java.util.List) 3
Path (org.apache.hadoop.fs.Path) 3
FileSlice (org.apache.hudi.common.model.FileSlice) 3
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 3
HoodieTable (org.apache.hudi.table.HoodieTable) 3
HashMap (java.util.HashMap) 2
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) 2
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 2
TimelineLayoutVersion (org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion) 2
IOException (java.io.IOException) 1
Serializable (java.io.Serializable) 1
Map (java.util.Map) 1
Set (java.util.Set) 1
TimeoutException (java.util.concurrent.TimeoutException) 1
Function (java.util.function.Function) 1
Collectors (java.util.stream.Collectors) 1
Stream (java.util.stream.Stream) 1