
Example 36 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class JavaUpsertPartitioner, method getSmallFiles.

/**
 * Returns a list of small files in the given partition path.
 */
protected List<SmallFile> getSmallFiles(String partitionPath) {
    // smallFiles only for partitionPath
    List<SmallFile> smallFileLocations = new ArrayList<>();
    HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants();
    if (!commitTimeline.empty()) {
        // if we have some commits
        HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
        List<HoodieBaseFile> allFiles = table.getBaseFileOnlyView().getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
        for (HoodieBaseFile file : allFiles) {
            if (file.getFileSize() < config.getParquetSmallFileLimit()) {
                String filename = file.getFileName();
                SmallFile sf = new SmallFile();
                sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
                sf.sizeBytes = file.getFileSize();
                smallFileLocations.add(sf);
            }
        }
    }
    return smallFileLocations;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
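
As a companion to the method above, here is a minimal sketch (not taken from the Hudi codebase) of the same small-file bookkeeping driven from a single base file's name and size. The threshold constant is hypothetical, and the import paths for FSUtils and SmallFile are assumed from the 0.x package layout.

import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.table.action.commit.SmallFile;

public class SmallFileSketch {

    // Hypothetical threshold; real writers read it from the write config (parquet small file limit).
    private static final long SMALL_FILE_LIMIT_BYTES = 100 * 1024 * 1024L;

    /**
     * Mirrors the loop body of getSmallFiles above: a base file below the limit becomes a
     * SmallFile whose location records the commit time and file id parsed from the file name.
     * Returns null for files that are already large enough.
     */
    public static SmallFile toSmallFile(String baseFileName, long fileSizeBytes) {
        if (fileSizeBytes >= SMALL_FILE_LIMIT_BYTES) {
            return null;
        }
        SmallFile sf = new SmallFile();
        sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(baseFileName), FSUtils.getFileId(baseFileName));
        sf.sizeBytes = fileSizeBytes;
        return sf;
    }
}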

Example 37 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class JavaUpsertPartitioner, method assignInserts.

private void assignInserts(WorkloadProfile profile, HoodieEngineContext context) {
    // for new inserts, compute buckets depending on how many records we have for each partition
    Set<String> partitionPaths = profile.getPartitionPaths();
    long averageRecordSize = averageBytesPerRecord(table.getMetaClient().getActiveTimeline().getCommitTimeline().filterCompletedInstants(), config);
    LOG.info("AvgRecordSize => " + averageRecordSize);
    Map<String, List<SmallFile>> partitionSmallFilesMap = getSmallFilesForPartitions(new ArrayList<String>(partitionPaths), context);
    for (String partitionPath : partitionPaths) {
        WorkloadStat pStat = profile.getWorkloadStat(partitionPath);
        WorkloadStat outputWorkloadStats = profile.getOutputPartitionPathStatMap().getOrDefault(partitionPath, new WorkloadStat());
        if (pStat.getNumInserts() > 0) {
            List<SmallFile> smallFiles = partitionSmallFilesMap.getOrDefault(partitionPath, new ArrayList<>());
            this.smallFiles.addAll(smallFiles);
            LOG.info("For partitionPath : " + partitionPath + " Small Files => " + smallFiles);
            long totalUnassignedInserts = pStat.getNumInserts();
            List<Integer> bucketNumbers = new ArrayList<>();
            List<Long> recordsPerBucket = new ArrayList<>();
            // first try packing this into one of the smallFiles
            for (SmallFile smallFile : smallFiles) {
                long recordsToAppend = Math.min((config.getParquetMaxFileSize() - smallFile.sizeBytes) / averageRecordSize, totalUnassignedInserts);
                if (recordsToAppend > 0) {
                    // create a new bucket or re-use an existing bucket
                    int bucket;
                    if (updateLocationToBucket.containsKey(smallFile.location.getFileId())) {
                        bucket = updateLocationToBucket.get(smallFile.location.getFileId());
                        LOG.info("Assigning " + recordsToAppend + " inserts to existing update bucket " + bucket);
                    } else {
                        bucket = addUpdateBucket(partitionPath, smallFile.location.getFileId());
                        LOG.info("Assigning " + recordsToAppend + " inserts to new update bucket " + bucket);
                    }
                    if (profile.hasOutputWorkLoadStats()) {
                        outputWorkloadStats.addInserts(smallFile.location, recordsToAppend);
                    }
                    bucketNumbers.add(bucket);
                    recordsPerBucket.add(recordsToAppend);
                    totalUnassignedInserts -= recordsToAppend;
                }
            }
            // if we have anything more, create new insert buckets, like normal
            if (totalUnassignedInserts > 0) {
                long insertRecordsPerBucket = config.getCopyOnWriteInsertSplitSize();
                if (config.shouldAutoTuneInsertSplits()) {
                    insertRecordsPerBucket = config.getParquetMaxFileSize() / averageRecordSize;
                }
                int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
                LOG.info("After small file assignment: unassignedInserts => " + totalUnassignedInserts + ", totalInsertBuckets => " + insertBuckets + ", recordsPerBucket => " + insertRecordsPerBucket);
                for (int b = 0; b < insertBuckets; b++) {
                    bucketNumbers.add(totalBuckets);
                    if (b < insertBuckets - 1) {
                        recordsPerBucket.add(insertRecordsPerBucket);
                    } else {
                        recordsPerBucket.add(totalUnassignedInserts - (insertBuckets - 1) * insertRecordsPerBucket);
                    }
                    BucketInfo bucketInfo = new BucketInfo(BucketType.INSERT, FSUtils.createNewFileIdPfx(), partitionPath);
                    bucketInfoMap.put(totalBuckets, bucketInfo);
                    if (profile.hasOutputWorkLoadStats()) {
                        outputWorkloadStats.addInserts(new HoodieRecordLocation(HoodieWriteStat.NULL_COMMIT, bucketInfo.getFileIdPrefix()), recordsPerBucket.get(recordsPerBucket.size() - 1));
                    }
                    totalBuckets++;
                }
            }
            // Go over all such buckets, and assign weights as per amount of incoming inserts.
            List<InsertBucketCumulativeWeightPair> insertBuckets = new ArrayList<>();
            double currentCumulativeWeight = 0;
            for (int i = 0; i < bucketNumbers.size(); i++) {
                InsertBucket bkt = new InsertBucket();
                bkt.bucketNumber = bucketNumbers.get(i);
                bkt.weight = (1.0 * recordsPerBucket.get(i)) / pStat.getNumInserts();
                currentCumulativeWeight += bkt.weight;
                insertBuckets.add(new InsertBucketCumulativeWeightPair(bkt, currentCumulativeWeight));
            }
            LOG.info("Total insert buckets for partition path " + partitionPath + " => " + insertBuckets);
            partitionPathToInsertBucketInfos.put(partitionPath, insertBuckets);
        }
        if (profile.hasOutputWorkLoadStats()) {
            profile.updateOutputPartitionPathStatMap(partitionPath, outputWorkloadStats);
        }
    }
}
Also used : ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) WorkloadStat(org.apache.hudi.table.WorkloadStat) ArrayList(java.util.ArrayList) List(java.util.List)
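
The sizing decisions above are plain arithmetic over byte budgets. The following self-contained sketch (hypothetical numbers, no Hudi dependencies) walks through the same two steps: pack inserts into a small file's free space, then spread the remainder over equally sized new buckets, with whatever is left going to the last bucket.

public class BucketSizingSketch {

    public static void main(String[] args) {
        // Hypothetical inputs, chosen only to illustrate the arithmetic in assignInserts.
        long parquetMaxFileSize = 120 * 1024 * 1024L; // 120 MB target base file size
        long averageRecordSize = 1024L;               // ~1 KB per record, estimated from past commits
        long smallFileSizeBytes = 90 * 1024 * 1024L;  // an existing 90 MB small file
        long totalUnassignedInserts = 50_000L;

        // Step 1: fill the small file first -> free space divided by the average record size.
        long recordsToAppend = Math.min((parquetMaxFileSize - smallFileSizeBytes) / averageRecordSize,
                totalUnassignedInserts);
        totalUnassignedInserts -= recordsToAppend;
        System.out.println("appended to small file: " + recordsToAppend); // 30720

        // Step 2: spread what is left over new insert buckets; the last bucket takes the remainder.
        long insertRecordsPerBucket = parquetMaxFileSize / averageRecordSize; // auto-tuned split size
        int insertBuckets = (int) Math.ceil((1.0 * totalUnassignedInserts) / insertRecordsPerBucket);
        long lastBucketRecords = totalUnassignedInserts - (insertBuckets - 1) * insertRecordsPerBucket;
        System.out.println("new insert buckets: " + insertBuckets
                + ", records in last bucket: " + lastBucketRecords); // 1 bucket, 19280 records
    }
}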

Example 38 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class TestBucketAssigner, method testSmallFilesOfThisTask.

/**
 * Test that the file ids generated by a task are finally shuffled back to that same task.
 */
@Test
void testSmallFilesOfThisTask() {
    MockBucketAssigner mockBucketAssigner1 = new MockBucketAssigner(context, writeConfig);
    String fileId1 = mockBucketAssigner1.createFileIdOfThisTask();
    SmallFile smallFile1 = new SmallFile();
    smallFile1.location = new HoodieRecordLocation("t0", fileId1);
    smallFile1.sizeBytes = 123;
    List<SmallFile> smallFiles1 = mockBucketAssigner1.smallFilesOfThisTask(Collections.singletonList(smallFile1));
    assertThat(smallFiles1.size(), is(1));
    // modify the parallelism and test again
    MockBucketAssigner mockBucketAssigner2 = new MockBucketAssigner(123, 200, context, writeConfig, Collections.emptyMap());
    String fileId2 = mockBucketAssigner2.createFileIdOfThisTask();
    SmallFile smallFile2 = new SmallFile();
    smallFile2.location = new HoodieRecordLocation("t0", fileId2);
    smallFile2.sizeBytes = 123;
    String fileId3 = mockBucketAssigner2.createFileIdOfThisTask();
    SmallFile smallFile3 = new SmallFile();
    smallFile3.location = new HoodieRecordLocation("t0", fileId3);
    smallFile3.sizeBytes = 456;
    List<SmallFile> smallFiles2 = mockBucketAssigner2.smallFilesOfThisTask(Arrays.asList(smallFile2, smallFile3));
    assertThat(smallFiles2.size(), is(2));
}
Also used : SmallFile(org.apache.hudi.table.action.commit.SmallFile) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Test(org.junit.jupiter.api.Test)
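
A simplified illustration of the property this test exercises: a small file belongs to a task only if its file id hashes back to that task's index. This is an assumption made for illustration; the real Flink BucketAssigner may use a different partitioning scheme, and the helper below is not part of Hudi.

import java.util.List;
import java.util.stream.Collectors;

public class SmallFileFilterSketch {

    // Hypothetical stand-in for smallFilesOfThisTask: keep only file ids that hash to this task.
    static List<String> smallFileIdsOfTask(List<String> fileIds, int taskId, int parallelism) {
        return fileIds.stream()
                .filter(id -> Math.floorMod(id.hashCode(), parallelism) == taskId)
                .collect(Collectors.toList());
    }
}

Under such a scheme, ids created by a given (taskId, parallelism) pair and then filtered with the same pair are always retained, which is the invariant the assertions above check.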

Example 39 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class TestBucketAssigner, method testUpdateAndInsertWithSmallFiles.

@Test
public void testUpdateAndInsertWithSmallFiles() {
    SmallFile f0 = new SmallFile();
    f0.location = new HoodieRecordLocation("t0", "f0");
    f0.sizeBytes = 12;
    SmallFile f1 = new SmallFile();
    f1.location = new HoodieRecordLocation("t0", "f1");
    // no left space to append new records to this bucket
    f1.sizeBytes = 122879;
    SmallFile f2 = new SmallFile();
    f2.location = new HoodieRecordLocation("t0", "f2");
    f2.sizeBytes = 56;
    Map<String, List<SmallFile>> smallFilesMap = new HashMap<>();
    smallFilesMap.put("par1", Arrays.asList(f0, f1));
    smallFilesMap.put("par2", Collections.singletonList(f2));
    MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(context, writeConfig, smallFilesMap);
    mockBucketAssigner.addUpdate("par1", "f0");
    BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner.addInsert("par1");
    bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner.addUpdate("par1", "f2");
    mockBucketAssigner.addInsert("par1");
    bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner.addUpdate("par2", "f0");
    mockBucketAssigner.addInsert("par2");
    bucketInfo = mockBucketAssigner.addInsert("par2");
    assertBucketEquals(bucketInfo, "par2", BucketType.UPDATE, "f2");
}
Also used : HashMap(java.util.HashMap) SmallFile(org.apache.hudi.table.action.commit.SmallFile) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) List(java.util.List) BucketInfo(org.apache.hudi.table.action.commit.BucketInfo) Test(org.junit.jupiter.api.Test)
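
The assertBucketEquals helper is not shown on this page. Below is a hypothetical reimplementation, assuming BucketInfo exposes getters for the three fields visible in these examples (partition path, bucket type, file id prefix) and that BucketType lives alongside BucketInfo; the real test helper may differ.

import static org.junit.jupiter.api.Assertions.assertEquals;

import org.apache.hudi.table.action.commit.BucketInfo;
import org.apache.hudi.table.action.commit.BucketType;

public class BucketAssertions {

    // For UPDATE buckets the expected file id is known, so all three fields are compared.
    static void assertBucketEquals(BucketInfo bucketInfo, String partitionPath, BucketType bucketType, String fileId) {
        assertEquals(partitionPath, bucketInfo.getPartitionPath());
        assertEquals(bucketType, bucketInfo.getBucketType());
        assertEquals(fileId, bucketInfo.getFileIdPrefix());
    }

    // INSERT buckets get a freshly generated file id prefix, so only path and type are compared.
    static void assertBucketEquals(BucketInfo bucketInfo, String partitionPath, BucketType bucketType) {
        assertEquals(partitionPath, bucketInfo.getPartitionPath());
        assertEquals(bucketType, bucketInfo.getBucketType());
    }
}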

Example 40 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

The class TestBucketAssigner, method testInsertWithPartialSmallFiles.

/**
 * Test that only a subset of the small files is assigned to each task.
 */
@Test
public void testInsertWithPartialSmallFiles() {
    SmallFile f0 = new SmallFile();
    f0.location = new HoodieRecordLocation("t0", "f0");
    f0.sizeBytes = 12;
    SmallFile f1 = new SmallFile();
    f1.location = new HoodieRecordLocation("t0", "f1");
    // no left space to append new records to this bucket
    f1.sizeBytes = 122879;
    SmallFile f2 = new SmallFile();
    f2.location = new HoodieRecordLocation("t0", "f2");
    f2.sizeBytes = 56;
    Map<String, List<SmallFile>> smallFilesMap = new HashMap<>();
    smallFilesMap.put("par1", Arrays.asList(f0, f1, f2));
    MockBucketAssigner mockBucketAssigner = new MockBucketAssigner(0, 2, context, writeConfig, smallFilesMap);
    BucketInfo bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2");
    mockBucketAssigner.addInsert("par1");
    bucketInfo = mockBucketAssigner.addInsert("par1");
    assertBucketEquals(bucketInfo, "par1", BucketType.UPDATE, "f2");
    bucketInfo = mockBucketAssigner.addInsert("par3");
    assertBucketEquals(bucketInfo, "par3", BucketType.INSERT);
    bucketInfo = mockBucketAssigner.addInsert("par3");
    assertBucketEquals(bucketInfo, "par3", BucketType.INSERT);
    MockBucketAssigner mockBucketAssigner2 = new MockBucketAssigner(1, 2, context, writeConfig, smallFilesMap);
    BucketInfo bucketInfo2 = mockBucketAssigner2.addInsert("par1");
    assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0");
    mockBucketAssigner2.addInsert("par1");
    bucketInfo2 = mockBucketAssigner2.addInsert("par1");
    assertBucketEquals(bucketInfo2, "par1", BucketType.UPDATE, "f0");
    bucketInfo2 = mockBucketAssigner2.addInsert("par3");
    assertBucketEquals(bucketInfo2, "par3", BucketType.INSERT);
    bucketInfo2 = mockBucketAssigner2.addInsert("par3");
    assertBucketEquals(bucketInfo2, "par3", BucketType.INSERT);
}
Also used : HashMap(java.util.HashMap) SmallFile(org.apache.hudi.table.action.commit.SmallFile) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) List(java.util.List) BucketInfo(org.apache.hudi.table.action.commit.BucketInfo) Test(org.junit.jupiter.api.Test)
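
A note on why f1 (122,879 bytes) never receives inserts in either test: if the configured maximum base file size sits just above it, the free space divided by the average record size truncates to zero. The two constants below are hypothetical and only illustrate that rounding; the test's actual write config is not shown here.

public class FreeSpaceSketch {

    public static void main(String[] args) {
        long maxFileSize = 122_880L;  // hypothetical cap, one byte above f1's size
        long avgRecordSize = 100L;    // hypothetical average record size
        long f1SizeBytes = 122_879L;

        // Integer division: 1 byte of free space / 100 bytes per record = 0 records,
        // so the assigner skips f1 and falls back to another small file or a new insert bucket.
        long recordsToAppend = (maxFileSize - f1SizeBytes) / avgRecordSize;
        System.out.println("records that fit into f1: " + recordsToAppend); // prints 0
    }
}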

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation): 43
ArrayList (java.util.ArrayList): 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 17
HashMap (java.util.HashMap): 16
List (java.util.List): 16
HoodieKey (org.apache.hudi.common.model.HoodieKey): 16
Map (java.util.Map): 13
Pair (org.apache.hudi.common.util.collection.Pair): 12
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 9
Option (org.apache.hudi.common.util.Option): 9
IOException (java.io.IOException): 8
WorkloadStat (org.apache.hudi.table.WorkloadStat): 8
SmallFile (org.apache.hudi.table.action.commit.SmallFile): 8
Tuple2 (scala.Tuple2): 8
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 7
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 7
HoodieTable (org.apache.hudi.table.HoodieTable): 7
LogManager (org.apache.log4j.LogManager): 7
Logger (org.apache.log4j.Logger): 7
Collectors (java.util.stream.Collectors): 6