
Example 11 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class HoodieMergeHandle method init.

/**
 * Load the new incoming records in a map and return partitionPath.
 */
protected void init(String fileId, Iterator<HoodieRecord<T>> newRecordsItr) {
    initializeIncomingRecordsMap();
    while (newRecordsItr.hasNext()) {
        HoodieRecord<T> record = newRecordsItr.next();
        // update the new location of the record, so we know where to find it next
        if (needsUpdateLocation()) {
            record.unseal();
            record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
            record.seal();
        }
        // NOTE: Once Records are added to map (spillable-map), DO NOT change it as they won't persist
        keyToNewRecords.put(record.getRecordKey(), record);
    }
    LOG.info("Number of entries in MemoryBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getInMemoryMapNumEntries()
        + ", Total size in bytes of MemoryBasedMap => " + ((ExternalSpillableMap) keyToNewRecords).getCurrentInMemoryMapSize()
        + ", Number of entries in BitCaskDiskMap => " + ((ExternalSpillableMap) keyToNewRecords).getDiskBasedMapNumEntries()
        + ", Size of file spilled to disk => " + ((ExternalSpillableMap) keyToNewRecords).getSizeOfFileOnDiskInBytes());
}
Also used : ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
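
The tagging pattern used above can be exercised in isolation. The following is a minimal sketch, assuming a HoodieRecord instance plus a target instant time and file id are already in hand; the class and method names are illustrative only, not part of HoodieMergeHandle.

import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;

public class TagNewLocationSketch {
    // Assign the (instantTime, fileId) location that the record is about to be written to.
    public static void tagNewLocation(HoodieRecord<?> record, String instantTime, String fileId) {
        // Records are sealed against mutation by default; unseal before changing the location.
        record.unseal();
        record.setNewLocation(new HoodieRecordLocation(instantTime, fileId));
        // Re-seal so downstream stages cannot modify the record accidentally.
        record.seal();
    }
}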

Example 12 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class ListBasedHoodieBloomIndexHelper method findMatchingFilesForRecordKeys.

@Override
public HoodiePairData<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, HoodiePairData<String, String> partitionRecordKeyPairs, HoodieData<Pair<String, HoodieKey>> fileComparisonPairs, Map<String, List<BloomIndexFileInfo>> partitionToFileInfo, Map<String, Long> recordsPerPartition) {
    List<Pair<String, HoodieKey>> fileComparisonPairList = HoodieList.getList(fileComparisonPairs).stream().sorted(Comparator.comparing(Pair::getLeft)).collect(toList());
    List<HoodieKeyLookupResult> keyLookupResults = new ArrayList<>();
    Iterator<List<HoodieKeyLookupResult>> iterator = new HoodieBaseBloomIndexCheckFunction(hoodieTable, config).apply(fileComparisonPairList.iterator());
    while (iterator.hasNext()) {
        keyLookupResults.addAll(iterator.next());
    }
    keyLookupResults = keyLookupResults.stream().filter(lr -> lr.getMatchingRecordKeys().size() > 0).collect(toList());
    return context.parallelize(keyLookupResults).flatMap(lookupResult -> lookupResult.getMatchingRecordKeys().stream().map(recordKey -> new ImmutablePair<>(lookupResult, recordKey)).iterator()).mapToPair(pair -> {
        HoodieKeyLookupResult lookupResult = pair.getLeft();
        String recordKey = pair.getRight();
        return new ImmutablePair<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()));
    });
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodiePairData(org.apache.hudi.common.data.HoodiePairData) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieKeyLookupResult(org.apache.hudi.io.HoodieKeyLookupResult) ArrayList(java.util.ArrayList) HoodieList(org.apache.hudi.common.data.HoodieList) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Map(java.util.Map) HoodieKey(org.apache.hudi.common.model.HoodieKey) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair)
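
The flattening step at the end of findMatchingFilesForRecordKeys can be restated without the HoodieData/HoodiePairData abstractions. This is a sketch over plain java.util collections, assuming the key lookup results have already been computed; the class and method names here are made up for illustration.

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.io.HoodieKeyLookupResult;

public class KeyToLocationSketch {
    // For every matching record key in a lookup result, emit HoodieKey -> HoodieRecordLocation.
    public static Map<HoodieKey, HoodieRecordLocation> flatten(List<HoodieKeyLookupResult> keyLookupResults) {
        Map<HoodieKey, HoodieRecordLocation> keyToLocation = new HashMap<>();
        for (HoodieKeyLookupResult lookupResult : keyLookupResults) {
            for (String recordKey : lookupResult.getMatchingRecordKeys()) {
                keyToLocation.put(
                        new HoodieKey(recordKey, lookupResult.getPartitionPath()),
                        new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()));
            }
        }
        return keyToLocation;
    }
}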

Example 13 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class HoodieInMemoryHashIndex method updateLocation.

@Override
public HoodieData<WriteStatus> updateLocation(HoodieData<WriteStatus> writeStatuses, HoodieEngineContext context, HoodieTable hoodieTable) {
    return writeStatuses.map(writeStatus -> {
        for (HoodieRecord record : writeStatus.getWrittenRecords()) {
            if (!writeStatus.isErrored(record.getKey())) {
                HoodieKey key = record.getKey();
                Option<HoodieRecordLocation> newLocation = record.getNewLocation();
                if (newLocation.isPresent()) {
                    recordLocationMap.put(key, newLocation.get());
                } else {
                    // Delete existing index for a deleted record
                    recordLocationMap.remove(key);
                }
            }
        }
        return writeStatus;
    });
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
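
The per-record rule above can also be read as a standalone operation on the index map. A minimal sketch follows, in which a ConcurrentMap<HoodieKey, HoodieRecordLocation> stands in for the state held by HoodieInMemoryHashIndex; the class and method names are illustrative.

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.Option;

public class InMemoryIndexUpdateSketch {
    private final ConcurrentMap<HoodieKey, HoodieRecordLocation> recordLocationMap = new ConcurrentHashMap<>();

    // Apply the outcome of one successfully written record to the index.
    void applyUpdate(HoodieKey key, Option<HoodieRecordLocation> newLocation) {
        if (newLocation.isPresent()) {
            // The record was written: remember where it now lives.
            recordLocationMap.put(key, newLocation.get());
        } else {
            // No new location means the record was deleted: drop its index entry.
            recordLocationMap.remove(key);
        }
    }
}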

Example 14 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class HoodieFlinkWriteableTestTable method appendRecordsToLogFile.

private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
    String partitionPath = groupedRecords.get(0).getPartitionPath();
    HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
    try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(basePath, partitionPath)).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(location.getFileId()).overBaseCommit(location.getInstantTime()).withFs(fs).build()) {
        Map<HeaderMetadataType, String> header = new java.util.HashMap<>();
        header.put(HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
        header.put(HeaderMetadataType.SCHEMA, schema.toString());
        logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
            try {
                GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
                HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
                return (IndexedRecord) val;
            } catch (IOException e) {
                LOG.warn("Failed to convert record " + r.toString(), e);
                return null;
            }
        }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
        return Pair.of(partitionPath, logWriter.getLogFile());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
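
appendRecordsToLogFile assumes every record in groupedRecords shares one partition path and one current location. A caller could build such groups with a plain stream groupingBy, as in the sketch below; the helper name and grouping key are chosen here for illustration and are not part of the test table API, and each record is assumed to already carry a non-null current location.

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.util.collection.Pair;

public class GroupByLocationSketch {
    // Group records so that each (partitionPath, currentLocation) pair maps to one log file append.
    public static Map<Pair<String, HoodieRecordLocation>, List<HoodieRecord>> groupByFileSlice(List<HoodieRecord> records) {
        return records.stream().collect(
                Collectors.groupingBy(r -> Pair.of(r.getPartitionPath(), r.getCurrentLocation())));
    }
}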

Example 15 with HoodieRecordLocation

use of org.apache.hudi.common.model.HoodieRecordLocation in project hudi by apache.

the class BaseJavaCommitActionExecutor method buildProfile.

protected Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(List<HoodieRecord<T>> inputRecords) {
    HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
    WorkloadStat globalStat = new WorkloadStat();
    Map<Pair<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecords.stream().map(record -> Pair.of(Pair.of(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record)).collect(Collectors.groupingBy(Pair::getLeft, Collectors.counting()));
    for (Map.Entry<Pair<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
        String partitionPath = e.getKey().getLeft();
        Long count = e.getValue();
        Option<HoodieRecordLocation> locOption = e.getKey().getRight();
        if (!partitionPathStatMap.containsKey(partitionPath)) {
            partitionPathStatMap.put(partitionPath, new WorkloadStat());
        }
        if (locOption.isPresent()) {
            // update
            partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
            globalStat.addUpdates(locOption.get(), count);
        } else {
            // insert
            partitionPathStatMap.get(partitionPath).addInserts(count);
            globalStat.addInserts(count);
        }
    }
    return Pair.of(partitionPathStatMap, globalStat);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) CreateHandleFactory(org.apache.hudi.io.CreateHandleFactory) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) CommitUtils(org.apache.hudi.common.util.CommitUtils) HoodieList(org.apache.hudi.common.data.HoodieList) LinkedHashMap(java.util.LinkedHashMap) JavaLazyInsertIterable(org.apache.hudi.execution.JavaLazyInsertIterable) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Duration(java.time.Duration) Map(java.util.Map) HoodieSortedMergeHandle(org.apache.hudi.io.HoodieSortedMergeHandle) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) Path(org.apache.hadoop.fs.Path) HoodieConcatHandle(org.apache.hudi.io.HoodieConcatHandle) WorkloadStat(org.apache.hudi.table.WorkloadStat) LinkedList(java.util.LinkedList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieKey(org.apache.hudi.common.model.HoodieKey) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair)
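
The workload profile boils down to one classification rule: a record whose current location is present counts as an update against that file, and a record without one counts as an insert. A hedged sketch of that rule follows; the class and method names are illustrative, not part of BaseJavaCommitActionExecutor.

import java.util.List;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

public class UpsertClassifierSketch {
    // Returns Pair.of(updateCount, insertCount) for the given records.
    public static Pair<Long, Long> countUpdatesAndInserts(List<HoodieRecord> records) {
        long updates = records.stream()
                // A non-null current location means the index tagged the record to an existing file.
                .filter(r -> Option.ofNullable(r.getCurrentLocation()).isPresent())
                .count();
        return Pair.of(updates, (long) records.size() - updates);
    }
}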

Aggregations

HoodieRecordLocation (org.apache.hudi.common.model.HoodieRecordLocation) 43
ArrayList (java.util.ArrayList) 18
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 17
HashMap (java.util.HashMap) 16
List (java.util.List) 16
HoodieKey (org.apache.hudi.common.model.HoodieKey) 16
Map (java.util.Map) 13
Pair (org.apache.hudi.common.util.collection.Pair) 12
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 9
Option (org.apache.hudi.common.util.Option) 9
IOException (java.io.IOException) 8
WorkloadStat (org.apache.hudi.table.WorkloadStat) 8
SmallFile (org.apache.hudi.table.action.commit.SmallFile) 8
Tuple2 (scala.Tuple2) 8
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload) 7
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 7
HoodieTable (org.apache.hudi.table.HoodieTable) 7
LogManager (org.apache.log4j.LogManager) 7
Logger (org.apache.log4j.Logger) 7
Collectors (java.util.stream.Collectors) 6