Example 16 with HoodieReplaceCommitMetadata

use of org.apache.hudi.common.model.HoodieReplaceCommitMetadata in project hudi by apache.

the class AbstractTableFileSystemView method resetFileGroupsReplaced.

/**
 * Get replaced instant for each file group by looking at all commit instants.
 */
private void resetFileGroupsReplaced(HoodieTimeline timeline) {
    HoodieTimer hoodieTimer = new HoodieTimer();
    hoodieTimer.startTimer();
    // for each REPLACE instant, get map of (partitionPath -> deleteFileGroup)
    HoodieTimeline replacedTimeline = timeline.getCompletedReplaceTimeline();
    Stream<Map.Entry<HoodieFileGroupId, HoodieInstant>> resultStream = replacedTimeline.getInstants().flatMap(instant -> {
        try {
            HoodieReplaceCommitMetadata replaceMetadata = HoodieReplaceCommitMetadata.fromBytes(
                metaClient.getActiveTimeline().getInstantDetails(instant).get(),
                HoodieReplaceCommitMetadata.class);
            // get the replace instant mapping for each (partition, fileId) pair
            return replaceMetadata.getPartitionToReplaceFileIds().entrySet().stream()
                .flatMap(entry -> entry.getValue().stream()
                    .map(fileId -> new AbstractMap.SimpleEntry<>(
                        new HoodieFileGroupId(entry.getKey(), fileId), instant)));
        } catch (HoodieIOException ex) {
            if (ex.getIOException() instanceof FileNotFoundException) {
                // The replace instant could have been deleted by the archiver, in which case
                // getInstantDetails throws a FileNotFoundException; catch it here and continue.
                LOG.warn(ex.getMessage());
                return Stream.empty();
            } else {
                throw ex;
            }
        } catch (IOException e) {
            throw new HoodieIOException("error reading commit metadata for " + instant, e);
        }
    });
    Map<HoodieFileGroupId, HoodieInstant> replacedFileGroups = resultStream.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
    resetReplacedFileGroups(replacedFileGroups);
    LOG.info("Took " + hoodieTimer.endTimer() + " ms to read  " + replacedTimeline.countInstants() + " instants, " + replacedFileGroups.size() + " replaced file groups");
}
Also used : BootstrapBaseFileMapping(org.apache.hudi.common.model.BootstrapBaseFileMapping) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) ReadLock(java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) WriteLock(java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock) Predicate(java.util.function.Predicate) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) AbstractMap(java.util.AbstractMap) List(java.util.List) GREATER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS) Stream(java.util.stream.Stream) ClusteringUtils(org.apache.hudi.common.util.ClusteringUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) METADATA_BOOTSTRAP_INSTANT_TS(org.apache.hudi.common.table.timeline.HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) GREATER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
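
The same deserialization pattern works outside the file-system view. Below is a minimal, untested sketch (assuming an already-initialized HoodieTableMetaClient named metaClient) that reads the partition-to-replaced-file-IDs mapping of the latest completed replacecommit:

private static void printLatestReplacedFileIds(HoodieTableMetaClient metaClient) throws IOException {
    // Only completed replacecommits carry finalized replace metadata.
    HoodieTimeline replacedTimeline = metaClient.getActiveTimeline().getCompletedReplaceTimeline();
    Option<HoodieInstant> latest = replacedTimeline.lastInstant();
    if (!latest.isPresent()) {
        return;
    }
    // Deserialize the instant details, exactly as resetFileGroupsReplaced does above.
    HoodieReplaceCommitMetadata replaceMetadata = HoodieReplaceCommitMetadata.fromBytes(
        metaClient.getActiveTimeline().getInstantDetails(latest.get()).get(),
        HoodieReplaceCommitMetadata.class);
    replaceMetadata.getPartitionToReplaceFileIds()
        .forEach((partition, fileIds) -> System.out.println(partition + " -> " + fileIds));
}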

Example 17 with HoodieReplaceCommitMetadata

use of org.apache.hudi.common.model.HoodieReplaceCommitMetadata in project hudi by apache.

the class TestCleaner method testCleanWithReplaceCommits.

@Test
public void testCleanWithReplaceCommits() throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .withAssumeDatePartitioning(true).build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
            .retainCommits(2).build())
        .build();
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
    HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
    String p0 = "2020/01/01";
    String p1 = "2020/01/02";
    // make 1 commit, with 1 file per partition
    String file1P0C0 = UUID.randomUUID().toString();
    String file1P1C0 = UUID.randomUUID().toString();
    testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
    HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001", Collections.unmodifiableMap(new HashMap<String, List<String>>() {

        {
            put(p0, CollectionUtils.createImmutableList(file1P0C0));
            put(p1, CollectionUtils.createImmutableList(file1P1C0));
        }
    }));
    metadataWriter.update(commitMetadata, "00000000000001", false);
    metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    metaClient = HoodieTableMetaClient.reload(metaClient);
    List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
    assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions or clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. logically delete p0. No change to p1
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId002 = testTable.forReplaceCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0);
    String file2P0C1 = partitionAndFileId002.get(p0);
    Pair<HoodieRequestedReplaceMetadata, HoodieReplaceCommitMetadata> replaceMetadata = generateReplaceCommitMetadata("00000000000002", p0, file1P0C0, file2P0C1);
    testTable.addReplaceCommit("00000000000002", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    // run cleaner
    List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config);
    assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions or clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. Replace data in p1. No change to p0
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId003 = testTable.forReplaceCommit("00000000000003").getFileIdsWithBaseFilesInPartitions(p1);
    String file3P1C2 = partitionAndFileId003.get(p1);
    replaceMetadata = generateReplaceCommitMetadata("00000000000003", p1, file1P1C0, file3P1C2);
    testTable.addReplaceCommit("00000000000003", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    // run cleaner
    List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config);
    assertEquals(0, hoodieCleanStatsThree.size(), "Must not scan any partitions or clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. Replace data in p0 again
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId004 = testTable.forReplaceCommit("00000000000004").getFileIdsWithBaseFilesInPartitions(p0);
    String file4P0C3 = partitionAndFileId004.get(p0);
    replaceMetadata = generateReplaceCommitMetadata("00000000000004", p0, file2P0C1, file4P0C3);
    testTable.addReplaceCommit("00000000000004", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    // run cleaner
    List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config, 5);
    assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3));
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2));
    assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    // file1P1C0 still stays because it's not replaced until instant 3 and it's the only version available
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. Replace all data in p1. No change to p0
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId005 = testTable.forReplaceCommit("00000000000006").getFileIdsWithBaseFilesInPartitions(p1);
    String file4P1C4 = partitionAndFileId005.get(p1);
    replaceMetadata = generateReplaceCommitMetadata("00000000000006", p0, file3P1C2, file4P1C4);
    testTable.addReplaceCommit("00000000000006", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    List<HoodieCleanStat> hoodieCleanStatsFive = runCleaner(config, 7);
    assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3));
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2));
    assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HashMap(java.util.HashMap) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
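
The helper generateReplaceCommitMetadata used above is defined elsewhere in TestCleaner and returns both the requested (clustering plan) and completed replace metadata. A rough, hypothetical sketch of its completed-commit half, using only setters shown in the later examples (the partition, replacedFileId, and newFileId names mirror the call sites above and are assumptions):

// Hypothetical sketch: mark `replacedFileId` as replaced in `partition` and
// record `newFileId` as the new base file written by the clustering operation.
HoodieReplaceCommitMetadata replaceCommitMetadata = new HoodieReplaceCommitMetadata();
replaceCommitMetadata.setOperationType(WriteOperationType.CLUSTER);
Map<String, List<String>> partitionToReplaceFileIds = new HashMap<>();
partitionToReplaceFileIds.put(partition, Collections.singletonList(replacedFileId));
replaceCommitMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
HoodieWriteStat writeStat = new HoodieWriteStat();
writeStat.setPartitionPath(partition);
writeStat.setFileId(newFileId);
replaceCommitMetadata.addWriteStat(partition, writeStat);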

Example 18 with HoodieReplaceCommitMetadata

use of org.apache.hudi.common.model.HoodieReplaceCommitMetadata in project hudi by apache.

the class TestTimelineUtils method getReplaceCommitMetadata.

private byte[] getReplaceCommitMetadata(String basePath, String commitTs, String replacePartition, int replaceCount, String newFilePartition, int newFileCount, Map<String, String> extraMetadata, WriteOperationType operationType) throws IOException {
    HoodieReplaceCommitMetadata commit = new HoodieReplaceCommitMetadata();
    commit.setOperationType(operationType);
    for (int i = 1; i <= newFileCount; i++) {
        HoodieWriteStat stat = new HoodieWriteStat();
        stat.setFileId(i + "");
        stat.setPartitionPath(Paths.get(basePath, newFilePartition).toString());
        stat.setPath(commitTs + "." + i + metaClient.getTableConfig().getBaseFileFormat().getFileExtension());
        commit.addWriteStat(newFilePartition, stat);
    }
    Map<String, List<String>> partitionToReplaceFileIds = new HashMap<>();
    if (replaceCount > 0) {
        partitionToReplaceFileIds.put(replacePartition, new ArrayList<>());
    }
    for (int i = 1; i <= replaceCount; i++) {
        partitionToReplaceFileIds.get(replacePartition).add(FSUtils.createNewFileIdPfx());
    }
    commit.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
    for (Map.Entry<String, String> extraEntries : extraMetadata.entrySet()) {
        commit.addMetadata(extraEntries.getKey(), extraEntries.getValue());
    }
    return commit.toJsonString().getBytes(StandardCharsets.UTF_8);
}
Also used : HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata)
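
Because the method returns UTF-8 JSON bytes, callers can round-trip them through fromJsonString (inherited from HoodieCommitMetadata). A small usage sketch with made-up arguments:

// fromJsonString declares `throws Exception`, so call it from a test that does too.
byte[] bytes = getReplaceCommitMetadata(basePath, "001", "p0", 2, "p1", 3,
    Collections.emptyMap(), WriteOperationType.CLUSTER);
HoodieReplaceCommitMetadata parsed = HoodieReplaceCommitMetadata.fromJsonString(
    new String(bytes, StandardCharsets.UTF_8), HoodieReplaceCommitMetadata.class);
// Two file IDs were marked replaced in p0; three new files were written to p1.
assertEquals(2, parsed.getPartitionToReplaceFileIds().get("p0").size());
assertEquals(3, parsed.getPartitionToWriteStats().get("p1").size());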

Example 19 with HoodieReplaceCommitMetadata

use of org.apache.hudi.common.model.HoodieReplaceCommitMetadata in project hudi by apache.

the class HoodieTestReplaceCommitMetadataGenerator method generateReplaceCommitMetadata.

private static HoodieReplaceCommitMetadata generateReplaceCommitMetadata(HashMap<String, List<String>> partitionToFilePaths, Option<Integer> writes, Option<Integer> updates) {
    HoodieReplaceCommitMetadata metadata = new HoodieReplaceCommitMetadata();
    partitionToFilePaths.forEach((key, value) -> value.forEach(f -> {
        HoodieWriteStat writeStat = new HoodieWriteStat();
        writeStat.setPartitionPath(key);
        writeStat.setPath(DEFAULT_PATH);
        writeStat.setFileId(DEFAULT_FILEID);
        writeStat.setTotalWriteBytes(DEFAULT_TOTAL_WRITE_BYTES);
        writeStat.setPrevCommit(DEFAULT_PRE_COMMIT);
        writeStat.setNumWrites(writes.orElse(DEFAULT_NUM_WRITES));
        writeStat.setNumUpdateWrites(updates.orElse(DEFAULT_NUM_UPDATE_WRITES));
        writeStat.setTotalLogBlocks(DEFAULT_TOTAL_LOG_BLOCKS);
        writeStat.setTotalLogRecords(DEFAULT_TOTAL_LOG_RECORDS);
        metadata.addWriteStat(key, writeStat);
    }));
    metadata.setPartitionToReplaceFileIds(new HashMap<String, List<String>>() {

        {
            // TODO fix
            put(DEFAULT_FIRST_PARTITION_PATH, createImmutableList(baseFileName(DEFAULT_FIRST_PARTITION_PATH, "1")));
        }
    });
    return metadata;
}
Also used : HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) UUID(java.util.UUID) FileCreateUtils.baseFileName(org.apache.hudi.common.testutils.FileCreateUtils.baseFileName) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) List(java.util.List) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) CollectionUtils.createImmutableList(org.apache.hudi.common.util.CollectionUtils.createImmutableList) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) Collections(java.util.Collections)
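
A hypothetical call from within the same generator class; the file names and counts below are invented for illustration:

// Build replace metadata covering two base files in the first test partition,
// overriding the default write/update counts.
HashMap<String, List<String>> partitionToFilePaths = new HashMap<>();
partitionToFilePaths.put(DEFAULT_FIRST_PARTITION_PATH, createImmutableList("file-1", "file-2"));
HoodieReplaceCommitMetadata metadata =
    generateReplaceCommitMetadata(partitionToFilePaths, Option.of(100), Option.of(10));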

Aggregations

HoodieReplaceCommitMetadata (org.apache.hudi.common.model.HoodieReplaceCommitMetadata) 19
List (java.util.List) 14
HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat) 13
HashMap (java.util.HashMap) 12
ArrayList (java.util.ArrayList) 11
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) 10
Map (java.util.Map) 8
HoodieRequestedReplaceMetadata (org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) 8
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 6
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 6
Option (org.apache.hudi.common.util.Option) 6
LogManager (org.apache.log4j.LogManager) 6
Logger (org.apache.log4j.Logger) 6
Collectors (java.util.stream.Collectors) 5
WriteOperationType (org.apache.hudi.common.model.WriteOperationType) 5
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 5
IOException (java.io.IOException) 4
Path (org.apache.hadoop.fs.Path) 4
FSUtils (org.apache.hudi.common.fs.FSUtils) 4
Set (java.util.Set) 3