Search in sources :

Example 91 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class TestHoodieTimelineArchiver method archiveAndGetCommitsList.

private Pair<List<HoodieInstant>, List<HoodieInstant>> archiveAndGetCommitsList(HoodieWriteConfig writeConfig) throws IOException {
    metaClient.reloadActiveTimeline();
    HoodieTimeline timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
    List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
    HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
    archiver.archiveIfRequired(context);
    timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
    List<HoodieInstant> commitsAfterArchival = timeline.getInstants().collect(Collectors.toList());
    return Pair.of(originalCommits, commitsAfterArchival);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTable(org.apache.hudi.table.HoodieTable)

Example 92 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class TestHoodieTimelineArchiver method verifyInflightInstants.

private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) {
    HoodieTimeline timeline = metaClient.getActiveTimeline().reload().getTimelineOfActions(Collections.singleton(HoodieTimeline.CLEAN_ACTION)).filterInflights();
    assertEquals(expectedTotalInstants, timeline.countInstants(), "Loaded inflight clean actions and the count should match");
}
Also used : HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)

Example 93 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class TestHoodieSparkCopyOnWriteTableArchiveWithReplace method testDeletePartitionAndArchive.

@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testDeletePartitionAndArchive(boolean metadataEnabled) throws IOException {
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.COPY_ON_WRITE);
    HoodieWriteConfig writeConfig = getConfigBuilder(true).withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).retainCommits(1).build()).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(metadataEnabled).build()).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(writeConfig);
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(DEFAULT_PARTITION_PATHS)) {
        // 1st write batch; 3 commits for 3 partitions
        String instantTime1 = HoodieActiveTimeline.createNewInstantTime(1000);
        client.startCommitWithTime(instantTime1);
        client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime1, 10, DEFAULT_FIRST_PARTITION_PATH), 1), instantTime1);
        String instantTime2 = HoodieActiveTimeline.createNewInstantTime(2000);
        client.startCommitWithTime(instantTime2);
        client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime2, 10, DEFAULT_SECOND_PARTITION_PATH), 1), instantTime2);
        String instantTime3 = HoodieActiveTimeline.createNewInstantTime(3000);
        client.startCommitWithTime(instantTime3);
        client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime3, 1, DEFAULT_THIRD_PARTITION_PATH), 1), instantTime3);
        final HoodieTimeline timeline1 = metaClient.getCommitsTimeline().filterCompletedInstants();
        assertEquals(21, countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline1, Option.empty()));
        // delete the 1st and the 2nd partition; 1 replace commit
        final String instantTime4 = HoodieActiveTimeline.createNewInstantTime(4000);
        client.startCommitWithTime(instantTime4, HoodieActiveTimeline.REPLACE_COMMIT_ACTION);
        client.deletePartitions(Arrays.asList(DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH), instantTime4);
        // 2nd write batch; 4 commits for the 3rd partition; the 3rd commit to trigger archiving the replace commit
        for (int i = 5; i < 9; i++) {
            String instantTime = HoodieActiveTimeline.createNewInstantTime(i * 1000);
            client.startCommitWithTime(instantTime);
            client.insert(jsc().parallelize(dataGen.generateInsertsForPartition(instantTime, 1, DEFAULT_THIRD_PARTITION_PATH), 1), instantTime);
        }
        // verify archived timeline
        metaClient = HoodieTableMetaClient.reload(metaClient);
        final HoodieTimeline archivedTimeline = metaClient.getArchivedTimeline();
        assertTrue(archivedTimeline.containsInstant(instantTime1));
        assertTrue(archivedTimeline.containsInstant(instantTime2));
        assertTrue(archivedTimeline.containsInstant(instantTime3));
        assertTrue(archivedTimeline.containsInstant(instantTime4), "should contain the replace commit.");
        // verify records
        final HoodieTimeline timeline2 = metaClient.getCommitTimeline().filterCompletedInstants();
        assertEquals(5, countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline2, Option.empty()), "should only have the 4 records from the 3rd partition.");
    }
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 94 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class SparkUpsertDeltaCommitPartitioner method getSmallFiles.

@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
    // Init here since this class (and member variables) might not have been initialized
    HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
    if (commitTimeline.empty()) {
        return Collections.emptyList();
    }
    HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
    // Find out all eligible small file slices, looking for
    // smallest file in the partition to append to
    List<FileSlice> smallFileSlicesCandidates = getSmallFileCandidates(partitionPath, latestCommitTime);
    List<SmallFile> smallFileLocations = new ArrayList<>();
    // Create SmallFiles from the eligible file slices
    for (FileSlice smallFileSlice : smallFileSlicesCandidates) {
        SmallFile sf = new SmallFile();
        if (smallFileSlice.getBaseFile().isPresent()) {
            // TODO : Move logic of file name, file id, base commit time handling inside file slice
            String filename = smallFileSlice.getBaseFile().get().getFileName();
            sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
            sf.sizeBytes = getTotalFileSize(smallFileSlice);
            smallFileLocations.add(sf);
        } else {
            HoodieLogFile logFile = smallFileSlice.getLogFiles().findFirst().get();
            sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()), FSUtils.getFileIdFromLogPath(logFile.getPath()));
            sf.sizeBytes = getTotalFileSize(smallFileSlice);
            smallFileLocations.add(sf);
        }
    }
    return smallFileLocations;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)

Example 95 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class UpsertPartitioner method getSmallFiles.

/**
 * Returns a list of small files in the given partition path.
 */
protected List<SmallFile> getSmallFiles(String partitionPath) {
    // smallFiles only for partitionPath
    List<SmallFile> smallFileLocations = new ArrayList<>();
    HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants();
    if (!commitTimeline.empty()) {
        // if we have some commits
        HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
        List<HoodieBaseFile> allFiles = table.getBaseFileOnlyView().getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
        for (HoodieBaseFile file : allFiles) {
            if (file.getFileSize() < config.getParquetSmallFileLimit()) {
                String filename = file.getFileName();
                SmallFile sf = new SmallFile();
                sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
                sf.sizeBytes = file.getFileSize();
                smallFileLocations.add(sf);
            }
        }
    }
    return smallFileLocations;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Aggregations

HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)118 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)74 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)39 List (java.util.List)36 IOException (java.io.IOException)34 HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata)34 ArrayList (java.util.ArrayList)32 Option (org.apache.hudi.common.util.Option)30 Collectors (java.util.stream.Collectors)29 HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline)29 HoodieException (org.apache.hudi.exception.HoodieException)26 Map (java.util.Map)25 FileStatus (org.apache.hadoop.fs.FileStatus)24 Path (org.apache.hadoop.fs.Path)24 Set (java.util.Set)22 HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)22 FileSlice (org.apache.hudi.common.model.FileSlice)21 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)21 Pair (org.apache.hudi.common.util.collection.Pair)21 FSUtils (org.apache.hudi.common.fs.FSUtils)20