Example 1 with HoodieCleanMetadata

Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.

From class TestMetadataConversionUtils, method createCleanMetadata:

private void createCleanMetadata(String instantTime) throws IOException {
    HoodieCleanerPlan cleanerPlan = new HoodieCleanerPlan(new HoodieActionInstant("", "", ""), "",
        new HashMap<>(), CleanPlanV2MigrationHandler.VERSION, new HashMap<>());
    HoodieCleanStat cleanStats = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS,
        HoodieTestUtils.DEFAULT_PARTITION_PATHS[new Random().nextInt(HoodieTestUtils.DEFAULT_PARTITION_PATHS.length)],
        Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), instantTime);
    HoodieCleanMetadata cleanMetadata = convertCleanMetadata(instantTime, Option.of(0L), Collections.singletonList(cleanStats));
    HoodieTestTable.of(metaClient).addClean(instantTime, cleanerPlan, cleanMetadata);
}
Also used: HoodieCleanStat (org.apache.hudi.common.HoodieCleanStat), Random (java.util.Random), HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata), HoodieActionInstant (org.apache.hudi.avro.model.HoodieActionInstant), HoodieCleanerPlan (org.apache.hudi.avro.model.HoodieCleanerPlan)
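
A hedged usage sketch: helpers like this are typically invoked from a test body to seed the timeline with completed clean actions before exercising the conversion logic. The test method name and instant times below are hypothetical; only createCleanMetadata and the metaClient set up by the test harness come from the example above.

@Test
public void testConversionWithCleanActions() throws IOException {
    // Seed two completed clean instants (illustrative instant times).
    createCleanMetadata("000");
    createCleanMetadata("001");
    // ... assertions against the converted metadata would follow here.
}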

Example 2 with HoodieCleanMetadata

Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.

From class TestCleaner, method testCleanMetadataUpgradeDowngrade:

@Test
public void testCleanMetadataUpgradeDowngrade() {
    String instantTime = "000";
    String partition1 = DEFAULT_PARTITION_PATHS[0];
    String partition2 = DEFAULT_PARTITION_PATHS[1];
    String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
    String fileName1 = "data1_1_000" + extension;
    String fileName2 = "data2_1_000" + extension;
    String filePath1 = metaClient.getBasePath() + "/" + partition1 + "/" + fileName1;
    String filePath2 = metaClient.getBasePath() + "/" + partition1 + "/" + fileName2;
    List<String> deletePathPatterns1 = Arrays.asList(filePath1, filePath2);
    List<String> successDeleteFiles1 = Collections.singletonList(filePath1);
    List<String> failedDeleteFiles1 = Collections.singletonList(filePath2);
    // create partition1 clean stat.
    HoodieCleanStat cleanStat1 = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS, partition1,
        deletePathPatterns1, successDeleteFiles1, failedDeleteFiles1, instantTime);
    List<String> deletePathPatterns2 = new ArrayList<>();
    List<String> successDeleteFiles2 = new ArrayList<>();
    List<String> failedDeleteFiles2 = new ArrayList<>();
    // create partition2 empty clean stat.
    HoodieCleanStat cleanStat2 = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_COMMITS, partition2,
        deletePathPatterns2, successDeleteFiles2, failedDeleteFiles2, instantTime);
    // map with absolute file path.
    Map<String, Tuple3> oldExpected = new HashMap<>();
    oldExpected.put(partition1, new Tuple3<>(deletePathPatterns1, successDeleteFiles1, failedDeleteFiles1));
    oldExpected.put(partition2, new Tuple3<>(deletePathPatterns2, successDeleteFiles2, failedDeleteFiles2));
    // map with relative path.
    Map<String, Tuple3> newExpected = new HashMap<>();
    newExpected.put(partition1, new Tuple3<>(Arrays.asList(fileName1, fileName2), Collections.singletonList(fileName1), Collections.singletonList(fileName2)));
    newExpected.put(partition2, new Tuple3<>(deletePathPatterns2, successDeleteFiles2, failedDeleteFiles2));
    HoodieCleanMetadata metadata = CleanerUtils.convertCleanMetadata(instantTime, Option.of(0L), Arrays.asList(cleanStat1, cleanStat2));
    metadata.setVersion(CleanerUtils.CLEAN_METADATA_VERSION_1);
    // Now upgrade and check
    CleanMetadataMigrator metadataMigrator = new CleanMetadataMigrator(metaClient);
    metadata = metadataMigrator.upgradeToLatest(metadata, metadata.getVersion());
    assertCleanMetadataPathEquals(newExpected, metadata);
    CleanMetadataMigrator migrator = new CleanMetadataMigrator(metaClient);
    HoodieCleanMetadata oldMetadata = migrator.migrateToVersion(metadata, metadata.getVersion(), CleanerUtils.CLEAN_METADATA_VERSION_1);
    assertEquals(CleanerUtils.CLEAN_METADATA_VERSION_1, oldMetadata.getVersion());
    assertCleanMetadataEquals(metadata, oldMetadata);
    assertCleanMetadataPathEquals(oldExpected, oldMetadata);
    HoodieCleanMetadata newMetadata = migrator.upgradeToLatest(oldMetadata, oldMetadata.getVersion());
    assertEquals(CleanerUtils.LATEST_CLEAN_METADATA_VERSION, newMetadata.getVersion());
    assertCleanMetadataEquals(oldMetadata, newMetadata);
    assertCleanMetadataPathEquals(newExpected, newMetadata);
    assertCleanMetadataPathEquals(oldExpected, oldMetadata);
}
Also used: HoodieCleanStat (org.apache.hudi.common.HoodieCleanStat), HashMap (java.util.HashMap), Tuple3 (scala.Tuple3), CleanMetadataMigrator (org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator), ArrayList (java.util.ArrayList), HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), Test (org.junit.jupiter.api.Test)
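
What the test pins down: CLEAN_METADATA_VERSION_1 addressed files by absolute path (the oldExpected map above), while the latest version addresses them relative to the partition (newExpected), so the migrator has to rewrite delete-path patterns, successful deletes, and failed deletes in both directions without losing information.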

Example 3 with HoodieCleanMetadata

Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.

From class BaseHoodieWriteClient, method clean:

/**
 * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the
 * configurations and cleaning policy used (typically, files that can no longer be used by a running query
 * are cleaned). This API also provides the flexibility to schedule the clean instant asynchronously via
 * {@link BaseHoodieWriteClient#scheduleTableService(String, Option, TableServiceType)} and to disable inline
 * scheduling of the clean.
 * @param cleanInstantTime instant time for the clean action.
 * @param scheduleInline true if the clean needs to be scheduled inline, false otherwise.
 * @param skipLocking if this is triggered by another parent transaction, locking can be skipped.
 */
public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline, boolean skipLocking) throws HoodieIOException {
    if (!tableServicesEnabled(config)) {
        return null;
    }
    final Timer.Context timerContext = metrics.getCleanCtx();
    CleanerUtils.rollbackFailedWrites(config.getFailedWritesCleanPolicy(), HoodieTimeline.CLEAN_ACTION, () -> rollbackFailedWrites(skipLocking));
    HoodieCleanMetadata metadata = null;
    HoodieTable table = createTable(config, hadoopConf);
    if (config.allowMultipleCleans() || !table.getActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().firstInstant().isPresent()) {
        LOG.info("Cleaner started");
        // proceed only if multiple clean schedules are enabled or if there are no pending cleans.
        if (scheduleInline) {
            scheduleTableServiceInternal(cleanInstantTime, Option.empty(), TableServiceType.CLEAN);
            table.getMetaClient().reloadActiveTimeline();
        }
        metadata = table.clean(context, cleanInstantTime, skipLocking);
        if (timerContext != null && metadata != null) {
            long durationMs = metrics.getDurationInMs(timerContext.stop());
            metrics.updateCleanMetrics(durationMs, metadata.getTotalFilesDeleted());
            LOG.info("Cleaned " + metadata.getTotalFilesDeleted() + " files" + " Earliest Retained Instant :" + metadata.getEarliestCommitToRetain() + " cleanerElapsedMs" + durationMs);
        }
    }
    return metadata;
}
Also used: Timer (com.codahale.metrics.Timer), HoodieTable (org.apache.hudi.table.HoodieTable), HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata)
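
A minimal usage sketch for this API, assuming writeClient is an already-initialized write client (any BaseHoodieWriteClient subclass) and LOG is the caller's logger; note that the method can return null when table services are disabled or a clean is already pending:

String cleanInstantTime = HoodieActiveTimeline.createNewInstantTime();
// scheduleInline = true, skipLocking = false (see the parameter docs above).
HoodieCleanMetadata result = writeClient.clean(cleanInstantTime, true, false);
if (result != null) {
    LOG.info("Deleted " + result.getTotalFilesDeleted() + " files, earliest retained instant: "
        + result.getEarliestCommitToRetain());
}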

Example 4 with HoodieCleanMetadata

Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.

From class CleanerUtils, method getCleanerMetadata:

/**
 * Get the latest version of Hoodie cleaner metadata - the output of a cleaner operation.
 * @param metaClient Hoodie table meta client
 * @param cleanInstant instant referring to the clean action
 * @return latest version of the clean metadata corresponding to the clean instant
 * @throws IOException if the cleaner metadata cannot be read or deserialized
 */
public static HoodieCleanMetadata getCleanerMetadata(HoodieTableMetaClient metaClient, HoodieInstant cleanInstant) throws IOException {
    CleanMetadataMigrator metadataMigrator = new CleanMetadataMigrator(metaClient);
    HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils.deserializeHoodieCleanMetadata(
        metaClient.getActiveTimeline().readCleanerInfoAsBytes(cleanInstant).get());
    return metadataMigrator.upgradeToLatest(cleanMetadata, cleanMetadata.getVersion());
}
Also used: CleanMetadataMigrator (org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator), HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata)
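
A sketch of a typical call site, assuming an initialized HoodieTableMetaClient and a surrounding method that declares throws IOException; getCleanerTimeline, filterCompletedInstants, and lastInstant are standard HoodieTimeline accessors:

Option<HoodieInstant> lastClean = metaClient.getActiveTimeline()
    .getCleanerTimeline().filterCompletedInstants().lastInstant();
if (lastClean.isPresent()) {
    // getCleanerMetadata returns metadata already migrated to the latest version.
    HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, lastClean.get());
}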

Example 5 with HoodieCleanMetadata

Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.

From class CleanerUtils, method convertCleanMetadata:

public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime, Option<Long> durationInMs, List<HoodieCleanStat> cleanStats) {
    Map<String, HoodieCleanPartitionMetadata> partitionMetadataMap = new HashMap<>();
    Map<String, HoodieCleanPartitionMetadata> partitionBootstrapMetadataMap = new HashMap<>();
    int totalDeleted = 0;
    String earliestCommitToRetain = null;
    for (HoodieCleanStat stat : cleanStats) {
        HoodieCleanPartitionMetadata metadata = new HoodieCleanPartitionMetadata(stat.getPartitionPath(),
            stat.getPolicy().name(), stat.getDeletePathPatterns(), stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles());
        partitionMetadataMap.put(stat.getPartitionPath(), metadata);
        if ((null != stat.getDeleteBootstrapBasePathPatterns()) && (!stat.getDeleteBootstrapBasePathPatterns().isEmpty())) {
            HoodieCleanPartitionMetadata bootstrapMetadata = new HoodieCleanPartitionMetadata(stat.getPartitionPath(),
                stat.getPolicy().name(), stat.getDeleteBootstrapBasePathPatterns(),
                stat.getSuccessDeleteBootstrapBaseFiles(), stat.getFailedDeleteBootstrapBaseFiles());
            partitionBootstrapMetadataMap.put(stat.getPartitionPath(), bootstrapMetadata);
        }
        totalDeleted += stat.getSuccessDeleteFiles().size();
        if (earliestCommitToRetain == null) {
            // This will be the same for all partitions
            earliestCommitToRetain = stat.getEarliestCommitToRetain();
        }
    }
    return new HoodieCleanMetadata(startCleanTime, durationInMs.orElseGet(() -> -1L), totalDeleted,
        earliestCommitToRetain, partitionMetadataMap, CLEAN_METADATA_VERSION_2, partitionBootstrapMetadataMap);
}
Also used: HoodieCleanStat (org.apache.hudi.common.HoodieCleanStat), HashMap (java.util.HashMap), HoodieCleanPartitionMetadata (org.apache.hudi.avro.model.HoodieCleanPartitionMetadata), HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata)
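
A sketch of calling this converter directly, with an illustrative single-partition stat; the HoodieCleanStat constructor arguments mirror the ones used in Examples 1 and 2, and the partition path, file name, and instant times are placeholders:

HoodieCleanStat stat = new HoodieCleanStat(HoodieCleaningPolicy.KEEP_LATEST_COMMITS,
    "2015/03/16",                               // partition path (placeholder)
    Collections.singletonList("file1.parquet"), // delete path patterns
    Collections.singletonList("file1.parquet"), // successfully deleted files
    Collections.emptyList(),                    // failed deletes
    "001");                                     // earliest commit to retain
HoodieCleanMetadata metadata = CleanerUtils.convertCleanMetadata(
    "002", Option.of(120L), Collections.singletonList(stat));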

Aggregations

HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata): 22
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 11
HoodieCleanStat (org.apache.hudi.common.HoodieCleanStat): 8
HashMap (java.util.HashMap): 7
Map (java.util.Map): 7
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 7
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 7
ArrayList (java.util.ArrayList): 6
List (java.util.List): 6
Collectors (java.util.stream.Collectors): 6
Path (org.apache.hadoop.fs.Path): 6
HoodieCleanPartitionMetadata (org.apache.hudi.avro.model.HoodieCleanPartitionMetadata): 6
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 6
IOException (java.io.IOException): 5
HoodieCleanerPlan (org.apache.hudi.avro.model.HoodieCleanerPlan): 5
FSUtils (org.apache.hudi.common.fs.FSUtils): 5
Pair (org.apache.hudi.common.util.collection.Pair): 5
Test (org.junit.jupiter.api.Test): 5
HoodieRollbackMetadata (org.apache.hudi.avro.model.HoodieRollbackMetadata): 4
CleanerUtils (org.apache.hudi.common.util.CleanerUtils): 4