Search in sources :

Example 1 with SmallFile

Use of org.apache.hudi.table.action.commit.SmallFile in the project hudi by apache.

The method smallFilesProfile of the class DeltaWriteProfile.

@Override
protected List<SmallFile> smallFilesProfile(String partitionPath) {
    // Collect small-file candidates for this partition only.
    List<SmallFile> smallFiles = new ArrayList<>();
    // Build the timeline here because this class (and its member variables)
    // might not have been fully initialized yet.
    HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
    if (commitTimeline.empty()) {
        // Nothing committed yet -> no files on storage to append to.
        return smallFiles;
    }
    HoodieInstant latestCommit = commitTimeline.lastInstant().get();
    // Gather every latest file slice in the partition that qualifies as small.
    // Since log files can be indexed here, inserts may also be routed to file
    // groups that are under pending compaction.
    List<FileSlice> smallSlices = fsView
        .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommit.getTimestamp(), false)
        .filter(this::isSmallFile)
        .collect(Collectors.toList());
    // Translate each qualifying slice into a SmallFile descriptor.
    for (FileSlice slice : smallSlices) {
        SmallFile smallFile = new SmallFile();
        if (slice.getBaseFile().isPresent()) {
            // TODO : Move logic of file name, file id, base commit time handling inside file slice
            String baseFileName = slice.getBaseFile().get().getFileName();
            smallFile.location = new HoodieRecordLocation(
                FSUtils.getCommitTime(baseFileName), FSUtils.getFileId(baseFileName));
            smallFile.sizeBytes = getTotalFileSize(slice);
            smallFiles.add(smallFile);
        } else {
            // No base file: derive the location from the first log file. If
            // (due to some error) the slice has no log file either, skip it.
            slice.getLogFiles().findFirst().ifPresent(logFile -> {
                smallFile.location = new HoodieRecordLocation(
                    FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()),
                    FSUtils.getFileIdFromLogPath(logFile.getPath()));
                smallFile.sizeBytes = getTotalFileSize(slice);
                smallFiles.add(smallFile);
            });
        }
    }
    return smallFiles;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Example 2 with SmallFile

Use of org.apache.hudi.table.action.commit.SmallFile in the project hudi by apache.

The method smallFilesProfile of the class WriteProfile.

/**
 * Returns a list of small files in the given partition path from the latest filesystem view.
 */
protected List<SmallFile> smallFilesProfile(String partitionPath) {
    // Small files are looked up for this partition path only.
    List<SmallFile> smallFiles = new ArrayList<>();
    HoodieTimeline completedCommits = metaClient.getCommitsTimeline().filterCompletedInstants();
    if (completedCommits.empty()) {
        // No completed commits yet -> nothing on storage to append to.
        return smallFiles;
    }
    HoodieInstant latestCommit = completedCommits.lastInstant().get();
    List<HoodieBaseFile> baseFiles = fsView
        .getLatestBaseFilesBeforeOrOn(partitionPath, latestCommit.getTimestamp())
        .collect(Collectors.toList());
    for (HoodieBaseFile baseFile : baseFiles) {
        long size = baseFile.getFileSize();
        // Zero-length files are treated as corrupted and filtered out.
        if (size > 0 && size < config.getParquetSmallFileLimit()) {
            String fileName = baseFile.getFileName();
            SmallFile smallFile = new SmallFile();
            smallFile.location = new HoodieRecordLocation(
                FSUtils.getCommitTime(fileName), FSUtils.getFileId(fileName));
            smallFile.sizeBytes = size;
            smallFiles.add(smallFile);
        }
    }
    return smallFiles;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Example 3 with SmallFile

Use of org.apache.hudi.table.action.commit.SmallFile in the project hudi by apache.

The method testUpdateAndInsertWithPartialSmallFiles of the class TestBucketAssigner.

/**
 * Test that only partial small files are assigned to the task.
 */
@Test
public void testUpdateAndInsertWithPartialSmallFiles() {
    // Three small files in partition "par1" with distinct sizes.
    SmallFile smallFile0 = new SmallFile();
    smallFile0.location = new HoodieRecordLocation("t0", "f0");
    smallFile0.sizeBytes = 12;
    SmallFile smallFile1 = new SmallFile();
    smallFile1.location = new HoodieRecordLocation("t0", "f1");
    // no left space to append new records to this bucket
    smallFile1.sizeBytes = 122879;
    SmallFile smallFile2 = new SmallFile();
    smallFile2.location = new HoodieRecordLocation("t0", "f2");
    smallFile2.sizeBytes = 56;
    Map<String, List<SmallFile>> partitionSmallFiles = new HashMap<>();
    partitionSmallFiles.put("par1", Arrays.asList(smallFile0, smallFile1, smallFile2));
    // Task 0 of 2: inserts should keep landing in the small file it owns ("f2").
    MockBucketAssigner assignerForTask0 = new MockBucketAssigner(0, 2, context, writeConfig, partitionSmallFiles);
    assignerForTask0.addUpdate("par1", "f0");
    BucketInfo task0Bucket = assignerForTask0.addInsert("par1");
    assertBucketEquals(task0Bucket, "par1", BucketType.UPDATE, "f2");
    assignerForTask0.addInsert("par1");
    task0Bucket = assignerForTask0.addInsert("par1");
    assertBucketEquals(task0Bucket, "par1", BucketType.UPDATE, "f2");
    assignerForTask0.addUpdate("par1", "f2");
    assignerForTask0.addInsert("par1");
    task0Bucket = assignerForTask0.addInsert("par1");
    assertBucketEquals(task0Bucket, "par1", BucketType.UPDATE, "f2");
    // Task 1 of 2: inserts should keep landing in the small file it owns ("f0").
    MockBucketAssigner assignerForTask1 = new MockBucketAssigner(1, 2, context, writeConfig, partitionSmallFiles);
    assignerForTask1.addUpdate("par1", "f0");
    BucketInfo task1Bucket = assignerForTask1.addInsert("par1");
    assertBucketEquals(task1Bucket, "par1", BucketType.UPDATE, "f0");
    assignerForTask1.addInsert("par1");
    task1Bucket = assignerForTask1.addInsert("par1");
    assertBucketEquals(task1Bucket, "par1", BucketType.UPDATE, "f0");
    assignerForTask1.addUpdate("par1", "f2");
    assignerForTask1.addInsert("par1");
    task1Bucket = assignerForTask1.addInsert("par1");
    assertBucketEquals(task1Bucket, "par1", BucketType.UPDATE, "f0");
}
Also used : HashMap(java.util.HashMap) SmallFile(org.apache.hudi.table.action.commit.SmallFile) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) List(java.util.List) BucketInfo(org.apache.hudi.table.action.commit.BucketInfo) Test(org.junit.jupiter.api.Test)

Example 4 with SmallFile

Use of org.apache.hudi.table.action.commit.SmallFile in the project hudi by apache.

The method testInsertWithSmallFiles of the class TestBucketAssigner.

@Test
public void testInsertWithSmallFiles() {
    // Two small files in "par1" and one in "par2"; "par3" has none.
    SmallFile smallFile0 = new SmallFile();
    smallFile0.location = new HoodieRecordLocation("t0", "f0");
    smallFile0.sizeBytes = 12;
    SmallFile smallFile1 = new SmallFile();
    smallFile1.location = new HoodieRecordLocation("t0", "f1");
    // no left space to append new records to this bucket
    smallFile1.sizeBytes = 122879;
    SmallFile smallFile2 = new SmallFile();
    smallFile2.location = new HoodieRecordLocation("t0", "f2");
    smallFile2.sizeBytes = 56;
    Map<String, List<SmallFile>> partitionSmallFiles = new HashMap<>();
    partitionSmallFiles.put("par1", Arrays.asList(smallFile0, smallFile1));
    partitionSmallFiles.put("par2", Collections.singletonList(smallFile2));
    MockBucketAssigner assigner = new MockBucketAssigner(context, writeConfig, partitionSmallFiles);
    // Inserts into "par1" repeatedly target the small file "f0" (UPDATE bucket).
    BucketInfo bucket = assigner.addInsert("par1");
    assertBucketEquals(bucket, "par1", BucketType.UPDATE, "f0");
    assigner.addInsert("par1");
    bucket = assigner.addInsert("par1");
    assertBucketEquals(bucket, "par1", BucketType.UPDATE, "f0");
    // Inserts into "par2" target its only small file "f2".
    assigner.addInsert("par2");
    bucket = assigner.addInsert("par2");
    assertBucketEquals(bucket, "par2", BucketType.UPDATE, "f2");
    // "par3" has no small files, so inserts open a fresh INSERT bucket.
    bucket = assigner.addInsert("par3");
    assertBucketEquals(bucket, "par3", BucketType.INSERT);
    bucket = assigner.addInsert("par3");
    assertBucketEquals(bucket, "par3", BucketType.INSERT);
}
Also used : HashMap(java.util.HashMap) SmallFile(org.apache.hudi.table.action.commit.SmallFile) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) List(java.util.List) BucketInfo(org.apache.hudi.table.action.commit.BucketInfo) Test(org.junit.jupiter.api.Test)

Example 5 with SmallFile

Use of org.apache.hudi.table.action.commit.SmallFile in the project hudi by apache.

The method getSmallFiles of the class SparkUpsertDeltaCommitPartitioner.

/**
 * Returns the small file slices in the given partition that new inserts can be
 * appended to, or an empty list when there are no completed commits yet.
 *
 * @param partitionPath the partition to scan for small files
 * @return the small files eligible for appending inserts
 */
@Override
protected List<SmallFile> getSmallFiles(String partitionPath) {
    // Init here since this class (and member variables) might not have been initialized
    HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline();
    if (commitTimeline.empty()) {
        return Collections.emptyList();
    }
    HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
    // Find out all eligible small file slices, looking for
    // smallest file in the partition to append to
    List<FileSlice> smallFileSlicesCandidates = getSmallFileCandidates(partitionPath, latestCommitTime);
    List<SmallFile> smallFileLocations = new ArrayList<>();
    // Create SmallFiles from the eligible file slices
    for (FileSlice smallFileSlice : smallFileSlicesCandidates) {
        SmallFile sf = new SmallFile();
        if (smallFileSlice.getBaseFile().isPresent()) {
            // TODO : Move logic of file name, file id, base commit time handling inside file slice
            String filename = smallFileSlice.getBaseFile().get().getFileName();
            sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
            sf.sizeBytes = getTotalFileSize(smallFileSlice);
            smallFileLocations.add(sf);
        } else {
            // FIX: the previous code called getLogFiles().findFirst().get()
            // unconditionally, which throws NoSuchElementException for a
            // (corrupt) slice that has neither a base file nor log files.
            // Skip such slices instead, matching DeltaWriteProfile#smallFilesProfile.
            smallFileSlice.getLogFiles().findFirst().ifPresent(logFile -> {
                sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()),
                    FSUtils.getFileIdFromLogPath(logFile.getPath()));
                sf.sizeBytes = getTotalFileSize(smallFileSlice);
                smallFileLocations.add(sf);
            });
        }
    }
    return smallFileLocations;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)

Aggregations

SmallFile (org.apache.hudi.table.action.commit.SmallFile)9 HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation)8 Test (org.junit.jupiter.api.Test)6 HashMap (java.util.HashMap)4 List (java.util.List)4 BucketInfo (org.apache.hudi.table.action.commit.BucketInfo)4 ArrayList (java.util.ArrayList)3 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)3 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)3 FileSlice (org.apache.hudi.common.model.FileSlice)2 HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)1 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)1 WriteProfile (org.apache.hudi.sink.partitioner.profile.WriteProfile)1