
Example 6 with HoodieCleanerPlan

use of org.apache.hudi.avro.model.HoodieCleanerPlan in project hudi by apache.

the class CleanPlanActionExecutor method requestClean.

/**
 * Creates a cleaner plan if there are files to be cleaned and stores it in the instant file.
 * The cleaner plan contains absolute file paths.
 *
 * @param startCleanTime Cleaner Instant Time
 * @return Cleaner Plan if generated
 */
protected Option<HoodieCleanerPlan> requestClean(String startCleanTime) {
    final HoodieCleanerPlan cleanerPlan = requestClean(context);
    if ((cleanerPlan.getFilePathsToBeDeletedPerPartition() != null)
            && !cleanerPlan.getFilePathsToBeDeletedPerPartition().isEmpty()
            && cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).sum() > 0) {
        // Only create cleaner plan which does some work
        final HoodieInstant cleanInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.CLEAN_ACTION, startCleanTime);
        // Save to both aux and timeline folder
        try {
            table.getActiveTimeline().saveToCleanRequested(cleanInstant, TimelineMetadataUtils.serializeCleanerPlan(cleanerPlan));
            LOG.info("Requesting Cleaning with instant time " + cleanInstant);
        } catch (IOException e) {
            LOG.error("Got exception when saving cleaner requested file", e);
            throw new HoodieIOException(e.getMessage(), e);
        }
        return Option.of(cleanerPlan);
    }
    return Option.empty();
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieIOException(org.apache.hudi.exception.HoodieIOException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan)
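
The requested plan written above can later be read back from the timeline before it is executed. The following is a minimal sketch, not part of the Hudi sources quoted here: the class and method names are hypothetical, a HoodieTableMetaClient is assumed to be available, and only calls that already appear in these examples (the HoodieInstant constructor, CleanerUtils.getCleanerPlan) are used.

import java.io.IOException;

import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.exception.HoodieIOException;

class CleanPlanReader {

    /** Loads the cleaner plan serialized for the given clean instant time. */
    static HoodieCleanerPlan loadRequestedPlan(HoodieTableMetaClient metaClient, String cleanInstantTime) {
        HoodieInstant requested =
                new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.CLEAN_ACTION, cleanInstantTime);
        try {
            return CleanerUtils.getCleanerPlan(metaClient, requested);
        } catch (IOException e) {
            throw new HoodieIOException("Could not read cleaner plan for " + requested, e);
        }
    }
}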

Example 7 with HoodieCleanerPlan

use of org.apache.hudi.avro.model.HoodieCleanerPlan in project hudi by apache.

the class HoodieTestTable method getHoodieCleanMetadata.

public Pair<HoodieCleanerPlan, HoodieCleanMetadata> getHoodieCleanMetadata(String commitTime, HoodieTestTableState testTableState) {
    HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(
            new HoodieActionInstant(commitTime, CLEAN_ACTION, EMPTY_STRING),
            EMPTY_STRING, new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
    List<HoodieCleanStat> cleanStats = new ArrayList<>();
    for (Map.Entry<String, List<String>> entry : testTableState.getPartitionToFileIdMapForCleaner(commitTime).entrySet()) {
        cleanStats.add(new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS,
                entry.getKey(), entry.getValue(), entry.getValue(), Collections.emptyList(), commitTime));
    }
    return Pair.of(cleanerPlan, convertCleanMetadata(commitTime, Option.of(0L), cleanStats));
}
Also used : HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ArrayList(java.util.ArrayList) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) List(java.util.List) ArrayList(java.util.ArrayList) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) Map(java.util.Map) CollectionUtils.createImmutableMap(org.apache.hudi.common.util.CollectionUtils.createImmutableMap) HashMap(java.util.HashMap)
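
Given how the helper above builds its clean stats (delete path patterns and successful deletes share the same list, and failures are empty), the returned metadata should reflect that per partition. Below is a hedged sketch of such a sanity check, assuming convertCleanMetadata carries the stats through unchanged; the class and method names are hypothetical and only getters that appear in these examples are used.

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.util.collection.Pair;

class CleanMetadataSanityCheck {

    /** Every path listed for deletion should be reported as successfully deleted, with no failures. */
    static void verify(Pair<HoodieCleanerPlan, HoodieCleanMetadata> pair) {
        HoodieCleanMetadata metadata = pair.getValue();
        metadata.getPartitionMetadata().forEach((partition, partitionMetadata) -> {
            assertEquals(partitionMetadata.getDeletePathPatterns().size(),
                    partitionMetadata.getSuccessDeleteFiles().size());
            assertEquals(0, partitionMetadata.getFailedDeleteFiles().size());
        });
    }
}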

Example 8 with HoodieCleanerPlan

use of org.apache.hudi.avro.model.HoodieCleanerPlan in project hudi by apache.

the class CleanActionExecutor method clean.

/**
 * Performs cleaning of partition paths according to the cleaning policy and returns the clean
 * statistics for each partition. Handles skew across partitions by making individual files,
 * rather than partitions, the unit of task distribution.
 *
 * @throws IllegalArgumentException if unknown cleaning policy is provided
 */
List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
    int cleanerParallelism = Math.min(
            (int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()),
            config.getCleanerParallelism());
    LOG.info("Using cleanerParallelism: " + cleanerParallelism);
    context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
    Stream<Pair<String, CleanFileInfo>> filesToBeDeletedPerPartition =
            cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
                    .flatMap(x -> x.getValue().stream()
                            .map(y -> new ImmutablePair<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))));
    Stream<ImmutablePair<String, PartitionCleanStat>> partitionCleanStats =
            context.mapPartitionsToPairAndReduceByKey(filesToBeDeletedPerPartition,
                    iterator -> deleteFilesFunc(iterator, table), PartitionCleanStat::merge, cleanerParallelism);
    Map<String, PartitionCleanStat> partitionCleanStatsMap =
            partitionCleanStats.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Return PartitionCleanStat for each partition passed.
    return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
        PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath)
                ? partitionCleanStatsMap.get(partitionPath)
                : new PartitionCleanStat(partitionPath);
        HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
        return HoodieCleanStat.newBuilder()
                .withPolicy(config.getCleanerPolicy())
                .withPartitionPath(partitionPath)
                .withEarliestCommitRetained(Option.ofNullable(actionInstant != null
                        ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()),
                                actionInstant.getAction(), actionInstant.getTimestamp())
                        : null))
                .withDeletePathPattern(partitionCleanStat.deletePathPatterns())
                .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
                .withFailedDeletes(partitionCleanStat.failedDeleteFiles())
                .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
                .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
                .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
                .build();
    }).collect(Collectors.toList());
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) BaseActionExecutor(org.apache.hudi.table.action.BaseActionExecutor) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) TransactionManager(org.apache.hudi.client.transaction.TransactionManager) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Stream(java.util.stream.Stream) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) ArrayList(java.util.ArrayList) List(java.util.List) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)
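
The javadoc above stresses that individual files, not partitions, are the unit of distribution, which is what the flatMap into (partition, CleanFileInfo) pairs achieves before the engine reduces them back per partition. Below is a minimal single-process sketch of that flatten-then-regroup shape; the class and method names are hypothetical, no Spark engine context is involved, and only the plan getters and the Pair/CleanFileInfo constructors visible above are used.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.model.CleanFileInfo;
import org.apache.hudi.common.util.collection.Pair;

class LocalCleanSketch {

    /** Flattens the plan into (partition, file) pairs and regroups them per partition. */
    static Map<String, List<CleanFileInfo>> filesByPartition(HoodieCleanerPlan cleanerPlan) {
        return cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
                // one stream element per file, so a single large partition cannot dominate one task
                .flatMap(e -> e.getValue().stream()
                        .map(f -> Pair.of(e.getKey(), new CleanFileInfo(f.getFilePath(), f.getIsBootstrapBaseFile()))))
                .collect(Collectors.groupingBy(Pair::getKey,
                        Collectors.mapping(Pair::getValue, Collectors.toList())));
    }
}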

Example 9 with HoodieCleanerPlan

use of org.apache.hudi.avro.model.HoodieCleanerPlan in project hudi by apache.

the class HoodieMetadataTableValidator method doMetadataTableValidation.

public void doMetadataTableValidation() {
    boolean finalResult = true;
    metaClient.reloadActiveTimeline();
    String basePath = metaClient.getBasePath();
    Set<String> baseFilesForCleaning = Collections.emptySet();
    if (cfg.skipDataFilesForCleaning) {
        HoodieTimeline inflightCleaningTimeline = metaClient.getActiveTimeline().getCleanerTimeline().filterInflights();
        baseFilesForCleaning = inflightCleaningTimeline.getInstants().flatMap(instant -> {
            try {
                // convert inflight instant to requested and get clean plan
                instant = new HoodieInstant(HoodieInstant.State.REQUESTED, instant.getAction(), instant.getTimestamp());
                HoodieCleanerPlan cleanerPlan = CleanerUtils.getCleanerPlan(metaClient, instant);
                return cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().flatMap(cleanerFileInfoList -> {
                    return cleanerFileInfoList.stream().map(fileInfo -> {
                        return new Path(fileInfo.getFilePath()).getName();
                    });
                });
            } catch (IOException e) {
                throw new HoodieIOException("Error reading cleaner metadata for " + instant);
            }
        // only take care of base files here.
        }).filter(path -> {
            String fileExtension = FSUtils.getFileExtension(path);
            return HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(fileExtension);
        }).collect(Collectors.toSet());
    }
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    List<String> allPartitions = validatePartitions(engineContext, basePath);
    HoodieMetadataValidationContext metadataTableBasedContext = new HoodieMetadataValidationContext(engineContext, cfg, metaClient, true);
    HoodieMetadataValidationContext fsBasedContext = new HoodieMetadataValidationContext(engineContext, cfg, metaClient, false);
    Set<String> finalBaseFilesForCleaning = baseFilesForCleaning;
    List<Boolean> result = engineContext.parallelize(allPartitions, allPartitions.size()).map(partitionPath -> {
        try {
            validateFilesInPartition(metadataTableBasedContext, fsBasedContext, partitionPath, finalBaseFilesForCleaning);
            LOG.info("Metadata table validation succeeded for " + partitionPath);
            return true;
        } catch (HoodieValidationException e) {
            LOG.error("Metadata table validation failed for " + partitionPath + " due to HoodieValidationException", e);
            if (!cfg.ignoreFailed) {
                throw e;
            }
            return false;
        }
    }).collectAsList();
    for (Boolean res : result) {
        finalResult &= res;
    }
    if (finalResult) {
        LOG.info("Metadata table validation succeeded.");
    } else {
        LOG.warn("Metadata table validation failed.");
    }
}
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) ByteBuffer(java.nio.ByteBuffer) BloomFilterData(org.apache.hudi.utilities.util.BloomFilterData) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) BaseFile(org.apache.hudi.common.model.BaseFile) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) Set(java.util.Set) Collectors(java.util.stream.Collectors) Executors(java.util.concurrent.Executors) FileSystemViewManager(org.apache.hudi.common.table.view.FileSystemViewManager) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Parameter(com.beust.jcommander.Parameter) FileSlice(org.apache.hudi.common.model.FileSlice) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) CompletableFuture(java.util.concurrent.CompletableFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) ArrayList(java.util.ArrayList) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieAsyncService(org.apache.hudi.async.HoodieAsyncService) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) ExecutorService(java.util.concurrent.ExecutorService) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) Log(jline.internal.Log) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) JCommander(com.beust.jcommander.JCommander) SparkConf(org.apache.spark.SparkConf) IOException(java.io.IOException) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan)
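
The inflight-clean handling at the top of this method is a reusable pattern on its own: rewrite each inflight clean instant to its REQUESTED form, load the serialized plan, and collect the base file names it still intends to delete. A self-contained sketch of just that pattern follows; the class and method names are hypothetical, metaClient is assumed to be a HoodieTableMetaClient, and only calls that appear in this example are used.

import java.io.IOException;
import java.util.Set;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.CleanerUtils;
import org.apache.hudi.exception.HoodieIOException;

class PendingCleanFiles {

    /** Names of base files that inflight cleans still plan to delete. */
    static Set<String> baseFilesPendingClean(HoodieTableMetaClient metaClient) {
        return metaClient.getActiveTimeline().getCleanerTimeline().filterInflights().getInstants()
                .flatMap(instant -> {
                    // the plan was serialized under the REQUESTED instant, so rewrite the state before loading it
                    HoodieInstant requested =
                            new HoodieInstant(HoodieInstant.State.REQUESTED, instant.getAction(), instant.getTimestamp());
                    try {
                        HoodieCleanerPlan plan = CleanerUtils.getCleanerPlan(metaClient, requested);
                        return plan.getFilePathsToBeDeletedPerPartition().values().stream()
                                .flatMap(files -> files.stream().map(f -> new Path(f.getFilePath()).getName()));
                    } catch (IOException e) {
                        throw new HoodieIOException("Error reading cleaner metadata for " + requested, e);
                    }
                })
                // keep only base files, mirroring the extension filter in the validator above
                .filter(name -> HoodieFileFormat.BASE_FILE_EXTENSIONS.contains(FSUtils.getFileExtension(name)))
                .collect(Collectors.toSet());
    }
}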

Example 10 with HoodieCleanerPlan

use of org.apache.hudi.avro.model.HoodieCleanerPlan in project hudi by apache.

the class TestCleaner method testMultiClean.

/**
 * Tests that no more than one clean is scheduled/executed if the HoodieCompactionConfig.allowMultipleCleans config is disabled.
 */
@Test
public void testMultiClean() {
    HoodieWriteConfig writeConfig = getConfigBuilder()
            .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
                    .withEnableBackupForRemoteFileSystemView(false).build())
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                    .compactionSmallFileSize(1024 * 1024 * 1024)
                    .withInlineCompaction(false)
                    .withMaxNumDeltaCommitsBeforeCompaction(1)
                    .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER)
                    .allowMultipleCleans(false)
                    .withAutoClean(false)
                    .retainCommits(1)
                    .retainFileVersions(1)
                    .build())
            .withEmbeddedTimelineServerEnabled(false)
            .build();
    int index = 0;
    String cleanInstantTime;
    final String partition = "2015/03/16";
    try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) {
        // Three writes so we can initiate a clean
        for (; index < 3; ++index) {
            String newCommitTime = "00" + index;
            List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition);
            client.startCommitWithTime(newCommitTime);
            client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
        }
    }
    // mimic failed/leftover clean by scheduling a clean but not performing it
    cleanInstantTime = "00" + index++;
    HoodieTable table = HoodieSparkTable.create(writeConfig, context);
    Option<HoodieCleanerPlan> cleanPlan = table.scheduleCleaning(context, cleanInstantTime, Option.empty());
    assertEquals(cleanPlan.get().getFilePathsToBeDeletedPerPartition().get(partition).size(), 1);
    assertEquals(metaClient.reloadActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().countInstants(), 1);
    try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) {
        // Next commit. This is required so that there is an additional file version to clean.
        String newCommitTime = "00" + index++;
        List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition);
        client.startCommitWithTime(newCommitTime);
        client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Initiate another clean. The previous leftover clean will be attempted first, followed by another clean
        // due to the commit above.
        String newCleanInstantTime = "00" + index++;
        HoodieCleanMetadata cleanMetadata = client.clean(newCleanInstantTime);
        // subsequent clean should not be triggered since allowMultipleCleanSchedules is set to false
        assertNull(cleanMetadata);
        // let the old clean complete
        table = HoodieSparkTable.create(writeConfig, context);
        cleanMetadata = table.clean(context, cleanInstantTime, false);
        assertNotNull(cleanMetadata);
        // any new clean should go ahead
        cleanMetadata = client.clean(newCleanInstantTime);
        // this clean proceeds now that the earlier pending clean has completed
        assertNotNull(cleanMetadata);
        // 1 file cleaned
        assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getSuccessDeleteFiles().size(), 1);
        assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getFailedDeleteFiles().size(), 0);
        assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getDeletePathPatterns().size(), 1);
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
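
A natural follow-up check, not part of the test above and only a hedged sketch assuming both cleans have reached the completed state, is that nothing remains pending on the cleaner timeline. It relies only on metaClient and the timeline calls already used in this test.

        // hypothetical extra assertion at the end of the try-block above
        assertEquals(metaClient.reloadActiveTimeline().getCleanerTimeline()
                .filterInflightsAndRequested().countInstants(), 0);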

Aggregations

HoodieCleanerPlan (org.apache.hudi.avro.model.HoodieCleanerPlan) - 11
HoodieActionInstant (org.apache.hudi.avro.model.HoodieActionInstant) - 6
List (java.util.List) - 5
HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata) - 5
HoodieCleanStat (org.apache.hudi.common.HoodieCleanStat) - 5
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) - 5
IOException (java.io.IOException) - 4
ArrayList (java.util.ArrayList) - 4
HoodieIOException (org.apache.hudi.exception.HoodieIOException) - 4
HashMap (java.util.HashMap) - 3
Map (java.util.Map) - 3
Collectors (java.util.stream.Collectors) - 3
Path (org.apache.hadoop.fs.Path) - 3
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) - 3
CleanerUtils (org.apache.hudi.common.util.CleanerUtils) - 3
Option (org.apache.hudi.common.util.Option) - 3
Pair (org.apache.hudi.common.util.collection.Pair) - 3
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) - 3
LogManager (org.apache.log4j.LogManager) - 3
Logger (org.apache.log4j.Logger) - 3