Search in sources :

Example 1 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

From the class HoodieFlinkWriteClient, the method getOrCreateWriteHandle:

/**
 * Get or create a new write handle in order to reuse the file handles.
 *
 * @param record      The first record in the bucket
 * @param config      Write config
 * @param instantTime The instant time
 * @param table       The table
 * @param recordItr   Record iterator
 * @return Existing write handle or create a new one
 */
private HoodieWriteHandle<?, ?, ?, ?> getOrCreateWriteHandle(
        HoodieRecord<T> record, HoodieWriteConfig config, String instantTime,
        HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table,
        Iterator<HoodieRecord<T>> recordItr) {
    final HoodieRecordLocation loc = record.getCurrentLocation();
    final String fileID = loc.getFileId();
    final String partitionPath = record.getPartitionPath();
    final boolean insertClustering = config.allowDuplicateInserts();
    if (bucketToHandles.containsKey(fileID)) {
        MiniBatchHandle lastHandle = (MiniBatchHandle) bucketToHandles.get(fileID);
        if (lastHandle.shouldReplace()) {
            HoodieWriteHandle<?, ?, ?, ?> writeHandle = insertClustering
                    ? new FlinkConcatAndReplaceHandle<>(config, instantTime, table, recordItr,
                            partitionPath, fileID, table.getTaskContextSupplier(), lastHandle.getWritePath())
                    : new FlinkMergeAndReplaceHandle<>(config, instantTime, table, recordItr,
                            partitionPath, fileID, table.getTaskContextSupplier(), lastHandle.getWritePath());
            // override with new replace handle
            this.bucketToHandles.put(fileID, writeHandle);
            return writeHandle;
        }
    }
    final boolean isDelta = table.getMetaClient().getTableType().equals(HoodieTableType.MERGE_ON_READ);
    final HoodieWriteHandle<?, ?, ?, ?> writeHandle;
    if (isDelta) {
        writeHandle = new FlinkAppendHandle<>(config, instantTime, table, partitionPath, fileID, recordItr, table.getTaskContextSupplier());
    } else if (loc.getInstantTime().equals("I")) {
        writeHandle = new FlinkCreateHandle<>(config, instantTime, table, partitionPath, fileID, table.getTaskContextSupplier());
    } else {
        writeHandle = insertClustering
                ? new FlinkConcatHandle<>(config, instantTime, table, recordItr,
                        partitionPath, fileID, table.getTaskContextSupplier())
                : new FlinkMergeHandle<>(config, instantTime, table, recordItr,
                        partitionPath, fileID, table.getTaskContextSupplier());
    }
    this.bucketToHandles.put(fileID, writeHandle);
    return writeHandle;
}
Also used : HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) FlinkCreateHandle(org.apache.hudi.io.FlinkCreateHandle) MiniBatchHandle(org.apache.hudi.io.MiniBatchHandle)
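For context, the handle choice above is driven entirely by the record's current location: for copy-on-write tables an instant time of "I" (see the branch above) marks a fresh insert bucket and yields a FlinkCreateHandle, while a real commit time yields a merge-style handle; merge-on-read tables always take the FlinkAppendHandle path. Below is a minimal, illustrative sketch (not taken from the Hudi sources) of how a caller could tag a record before handing it to the write client; the fileId and commitTime parameters are hypothetical.

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;

final class RecordLocationTaggingSketch {

    /**
     * Tags a record with the location that getOrCreateWriteHandle inspects.
     * For copy-on-write tables, the "I" marker routes the bucket to a create
     * handle; any existing commit time routes it to a merge handle.
     */
    static void tagLocation(HoodieRecord<?> record, boolean isInsertBucket, String commitTime, String fileId) {
        String instantTime = isInsertBucket ? "I" : commitTime;
        record.unseal();
        record.setCurrentLocation(new HoodieRecordLocation(instantTime, fileId));
        record.seal();
    }
}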

Example 2 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

From the class TestUpsertPartitioner, the method getUpsertPartitioner:

private UpsertPartitioner getUpsertPartitioner(int smallFileSize, int numInserts, int numUpdates, int fileSize, String testPartitionPath, boolean autoSplitInserts) throws Exception {
    HoodieWriteConfig config = makeHoodieClientConfigBuilder()
            .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                    .compactionSmallFileSize(smallFileSize)
                    .insertSplitSize(100)
                    .autoTuneInsertSplits(autoSplitInserts)
                    .build())
            .withStorageConfig(HoodieStorageConfig.newBuilder()
                    .hfileMaxFileSize(1000 * 1024)
                    .parquetMaxFileSize(1000 * 1024)
                    .orcMaxFileSize(1000 * 1024)
                    .build())
            .build();
    FileCreateUtils.createCommit(basePath, "001");
    FileCreateUtils.createBaseFile(basePath, testPartitionPath, "001", "file1", fileSize);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieSparkCopyOnWriteTable table = (HoodieSparkCopyOnWriteTable) HoodieSparkTable.create(config, context, metaClient);
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] { testPartitionPath });
    List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
    List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
    for (HoodieRecord updateRec : updateRecords) {
        updateRec.unseal();
        updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
        updateRec.seal();
    }
    List<HoodieRecord> records = new ArrayList<>();
    records.addAll(insertRecords);
    records.addAll(updateRecords);
    WorkloadProfile profile = new WorkloadProfile(buildProfile(jsc.parallelize(records)));
    UpsertPartitioner partitioner = new UpsertPartitioner(profile, context, table, config);
    assertEquals(0,
            partitioner.getPartition(new Tuple2<>(
                    updateRecords.get(0).getKey(),
                    Option.ofNullable(updateRecords.get(0).getCurrentLocation()))),
            "Update record should have gone to the 1 update partition");
    return partitioner;
}
Also used : WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieSparkCopyOnWriteTable(org.apache.hudi.table.HoodieSparkCopyOnWriteTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator)
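As a hedged follow-up (not part of the test above), the returned partitioner can also be probed for an insert record, which carries no current location. The bucket indices depend entirely on the workload profile and small-file configuration, so nothing is asserted; the import path for UpsertPartitioner is assumed from the surrounding test package.

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.action.commit.UpsertPartitioner;

import scala.Tuple2;

final class PartitionerProbeSketch {

    // Queries the partitioner for one update record (location already set via
    // setCurrentLocation) and one insert record (no location), and prints the
    // resulting bucket indices without asserting specific values.
    static void probe(UpsertPartitioner partitioner, HoodieRecord updateRec, HoodieRecord insertRec) {
        int updateBucket = partitioner.getPartition(
                new Tuple2<>(updateRec.getKey(), Option.ofNullable(updateRec.getCurrentLocation())));
        int insertBucket = partitioner.getPartition(
                new Tuple2<>(insertRec.getKey(), Option.empty()));
        System.out.println("update -> bucket " + updateBucket + ", insert -> bucket " + insertBucket);
    }
}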

Example 3 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

From the class HoodieClientTestHarness, the method buildProfile:

public static Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(JavaRDD<HoodieRecord> inputRecordsRDD) {
    HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
    WorkloadStat globalStat = new WorkloadStat();
    // group the records by partitionPath + currentLocation combination, count the number of
    // records in each partition
    Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecordsRDD
            .mapToPair(record -> new Tuple2<>(
                    new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())),
                    record))
            .countByKey();
    // count the number of both inserts and updates in each partition, update the counts to workLoadStats
    for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
        String partitionPath = e.getKey()._1();
        Long count = e.getValue();
        Option<HoodieRecordLocation> locOption = e.getKey()._2();
        if (!partitionPathStatMap.containsKey(partitionPath)) {
            partitionPathStatMap.put(partitionPath, new WorkloadStat());
        }
        if (locOption.isPresent()) {
            // update
            partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
            globalStat.addUpdates(locOption.get(), count);
        } else {
            // insert
            partitionPathStatMap.get(partitionPath).addInserts(count);
            globalStat.addInserts(count);
        }
    }
    return Pair.of(partitionPathStatMap, globalStat);
}
Also used : Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) Map(java.util.Map) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Option(org.apache.hudi.common.util.Option) Tuple2(scala.Tuple2) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) JavaRDD(org.apache.spark.api.java.JavaRDD)
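Example 2 above shows the typical consumer of this helper: the returned Pair is handed straight to a WorkloadProfile. A minimal sketch of that call chain, written as another static helper next to buildProfile so the method and its imports are already in scope:

// Mirrors Example 2's usage: new WorkloadProfile(buildProfile(jsc.parallelize(records))).
// Sketched as an additional static helper in the same test harness class.
public static WorkloadProfile toWorkloadProfile(JavaRDD<HoodieRecord> inputRecordsRDD) {
    Pair<HashMap<String, WorkloadStat>, WorkloadStat> stats = buildProfile(inputRecordsRDD);
    return new WorkloadProfile(stats);
}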

Example 4 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

From the class TestHoodieKeyLocationFetchHandle, the method testFetchHandle:

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testFetchHandle(boolean populateMetaFields) throws Exception {
    metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, populateMetaFields ? new Properties() : getPropertiesForKeyGen());
    config = getConfigBuilder().withProperties(getPropertiesForKeyGen()).withIndexConfig(HoodieIndexConfig.newBuilder().build()).build();
    List<HoodieRecord> records = dataGen.generateInserts(makeNewCommitTime(), 100);
    Map<String, List<HoodieRecord>> partitionRecordsMap = recordsToPartitionRecordsMap(records);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(hoodieTable, AVRO_SCHEMA_WITH_METADATA_FIELDS);
    Map<Tuple2<String, String>, List<Tuple2<HoodieKey, HoodieRecordLocation>>> expectedList = writeToParquetAndGetExpectedRecordLocations(partitionRecordsMap, testTable);
    List<Tuple2<String, HoodieBaseFile>> partitionPathFileIdPairs = loadAllFilesForPartitions(new ArrayList<>(partitionRecordsMap.keySet()), context, hoodieTable);
    BaseKeyGenerator keyGenerator = (BaseKeyGenerator) HoodieSparkKeyGeneratorFactory.createKeyGenerator(new TypedProperties(getPropertiesForKeyGen()));
    for (Tuple2<String, HoodieBaseFile> entry : partitionPathFileIdPairs) {
        HoodieKeyLocationFetchHandle fetcherHandle = new HoodieKeyLocationFetchHandle(
                config, hoodieTable, Pair.of(entry._1, entry._2),
                populateMetaFields ? Option.empty() : Option.of(keyGenerator));
        Iterator<Pair<HoodieKey, HoodieRecordLocation>> result = fetcherHandle.locations().iterator();
        List<Tuple2<HoodieKey, HoodieRecordLocation>> actualList = new ArrayList<>();
        result.forEachRemaining(x -> actualList.add(new Tuple2<>(x.getLeft(), x.getRight())));
        assertEquals(expectedList.get(new Tuple2<>(entry._1, entry._2.getFileId())), actualList);
    }
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Properties(java.util.Properties) TypedProperties(org.apache.hudi.common.config.TypedProperties) TypedProperties(org.apache.hudi.common.config.TypedProperties) Tuple2(scala.Tuple2) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieKey(org.apache.hudi.common.model.HoodieKey) ArrayList(java.util.ArrayList) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Pair(org.apache.hudi.common.util.collection.Pair) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
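A small illustrative addition (not part of the test above): the (key, location) pairs streamed out by the fetch handle are exactly what an index-lookup step would collect into a map keyed by HoodieKey. A minimal sketch:

import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;

import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.collection.Pair;

final class KeyLocationMapSketch {

    // Drains an iterator such as fetcherHandle.locations().iterator() into a plain
    // lookup map; keys are assumed to be unique within the scanned base file.
    static Map<HoodieKey, HoodieRecordLocation> toMap(Iterator<Pair<HoodieKey, HoodieRecordLocation>> locations) {
        Map<HoodieKey, HoodieRecordLocation> byKey = new HashMap<>();
        locations.forEachRemaining(pair -> byKey.put(pair.getLeft(), pair.getRight()));
        return byKey;
    }
}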

Example 5 with HoodieRecordLocation

Use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

From the class DeltaWriteProfile, the method smallFilesProfile:

@Override
protected List<SmallFile> smallFilesProfile(String partitionPath) {
    // smallFiles only for partitionPath
    List<SmallFile> smallFileLocations = new ArrayList<>();
    // Init here since this class (and member variables) might not have been initialized
    HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
    // Find out all eligible small file slices
    if (!commitTimeline.empty()) {
        HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
        // find the smallest file in partition and append to it
        List<FileSlice> allSmallFileSlices = new ArrayList<>();
        // If we can index log files, we can add more inserts to log files for fileIds including those under
        // pending compaction.
        List<FileSlice> allFileSlices = fsView.getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp(), false).collect(Collectors.toList());
        for (FileSlice fileSlice : allFileSlices) {
            if (isSmallFile(fileSlice)) {
                allSmallFileSlices.add(fileSlice);
            }
        }
        // Create SmallFiles from the eligible file slices
        for (FileSlice smallFileSlice : allSmallFileSlices) {
            SmallFile sf = new SmallFile();
            if (smallFileSlice.getBaseFile().isPresent()) {
                // TODO : Move logic of file name, file id, base commit time handling inside file slice
                String filename = smallFileSlice.getBaseFile().get().getFileName();
                sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
                sf.sizeBytes = getTotalFileSize(smallFileSlice);
                smallFileLocations.add(sf);
            } else {
                smallFileSlice.getLogFiles().findFirst().ifPresent(logFile -> {
                    // guard against the abnormal case where the file slice has no log file at all
                    sf.location = new HoodieRecordLocation(FSUtils.getBaseCommitTimeFromLogPath(logFile.getPath()), FSUtils.getFileIdFromLogPath(logFile.getPath()));
                    sf.sizeBytes = getTotalFileSize(smallFileSlice);
                    smallFileLocations.add(sf);
                });
            }
        }
    }
    return smallFileLocations;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) SmallFile(org.apache.hudi.table.action.commit.SmallFile) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
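As a hedged illustration of how these results are consumed downstream (the real bin-packing lives in the write profile and partitioner, not here), one might pick the smallest candidate and read the file id that new inserts would be appended to:

import java.util.Comparator;
import java.util.List;
import java.util.Optional;

import org.apache.hudi.table.action.commit.SmallFile;

final class SmallFilePickSketch {

    // Chooses the smallest file returned by smallFilesProfile(partitionPath) and
    // returns its file id; empty when the partition has no small-file candidates.
    static Optional<String> pickTargetFileId(List<SmallFile> smallFiles) {
        return smallFiles.stream()
                .min(Comparator.comparingLong((SmallFile sf) -> sf.sizeBytes))
                .map(sf -> sf.location.getFileId());
    }
}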

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation): 43
ArrayList (java.util.ArrayList): 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 17
HashMap (java.util.HashMap): 16
List (java.util.List): 16
HoodieKey (org.apache.hudi.common.model.HoodieKey): 16
Map (java.util.Map): 13
Pair (org.apache.hudi.common.util.collection.Pair): 12
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 9
Option (org.apache.hudi.common.util.Option): 9
IOException (java.io.IOException): 8
WorkloadStat (org.apache.hudi.table.WorkloadStat): 8
SmallFile (org.apache.hudi.table.action.commit.SmallFile): 8
Tuple2 (scala.Tuple2): 8
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 7
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 7
HoodieTable (org.apache.hudi.table.HoodieTable): 7
LogManager (org.apache.log4j.LogManager): 7
Logger (org.apache.log4j.Logger): 7
Collectors (java.util.stream.Collectors): 6