Search in sources :

Example 1 with PartitionIndexID

use of org.apache.hudi.common.util.hash.PartitionIndexID in project hudi by apache.

the class HoodieMetadataPayload method getColumnStatsIndexKey.

/**
 * Build the column stats index key for the file and column described by the given
 * column range metadata.
 *
 * @param partitionName       - Name of the partition the file belongs to
 * @param columnRangeMetadata - Column range metadata supplying the file path and column name
 * @return Column stats index key, as produced by the ID-based overload of this method
 */
public static String getColumnStatsIndexKey(String partitionName, HoodieColumnRangeMetadata<Comparable> columnRangeMetadata) {
    // Only the last component of the file path (the file name) goes into the file ID.
    return getColumnStatsIndexKey(
        new PartitionIndexID(partitionName),
        new FileIndexID(new Path(columnRangeMetadata.getFilePath()).getName()),
        new ColumnIndexID(columnRangeMetadata.getColumnName()));
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionIndexID(org.apache.hudi.common.util.hash.PartitionIndexID) ColumnIndexID(org.apache.hudi.common.util.hash.ColumnIndexID) FileIndexID(org.apache.hudi.common.util.hash.FileIndexID)

Example 2 with PartitionIndexID

use of org.apache.hudi.common.util.hash.PartitionIndexID in project hudi by apache.

the class HoodieMetadataPayload method createBloomFilterMetadataRecord.

/**
 * Create the metadata record that carries the bloom filter of a single base file.
 *
 * @param partitionName   - Partition name
 * @param baseFileName    - Name of the base file whose bloom filter is to be persisted; must be
 *                          a bare file name without any path separator
 * @param timestamp       - Instant timestamp responsible for this record
 * @param bloomFilterType - Type of the bloom filter, stored alongside the filter bytes
 * @param bloomFilter     - Bloom filter for the file
 * @param isDeleted       - Whether the bloom filter is no longer valid
 * @return Record whose key is the Base64 partition ID followed by the Base64 file ID, and whose
 *         payload wraps the bloom filter metadata
 */
public static HoodieRecord<HoodieMetadataPayload> createBloomFilterMetadataRecord(final String partitionName, final String baseFileName, final String timestamp, final String bloomFilterType, final ByteBuffer bloomFilter, final boolean isDeleted) {
    // Reject anything that contains a path separator or is not recognised as a base file.
    final boolean isPlainBaseFileName = !baseFileName.contains(Path.SEPARATOR) && FSUtils.isBaseFile(new Path(baseFileName));
    checkArgument(isPlainBaseFileName, "Invalid base file '" + baseFileName + "' for MetaIndexBloomFilter!");

    // Record key = <Base64 partition ID><Base64 file ID>.
    final String encodedPartitionId = new PartitionIndexID(partitionName).asBase64EncodedString();
    final String encodedFileId = new FileIndexID(baseFileName).asBase64EncodedString();
    final HoodieKey recordKey = new HoodieKey(encodedPartitionId.concat(encodedFileId), MetadataPartitionType.BLOOM_FILTERS.getPartitionPath());

    final HoodieMetadataBloomFilter filterMetadata = new HoodieMetadataBloomFilter(bloomFilterType, timestamp, bloomFilter, isDeleted);
    return new HoodieAvroRecord<>(recordKey, new HoodieMetadataPayload(recordKey.getRecordKey(), filterMetadata));
}
Also used : Path(org.apache.hadoop.fs.Path) PartitionIndexID(org.apache.hudi.common.util.hash.PartitionIndexID) HoodieMetadataBloomFilter(org.apache.hudi.avro.model.HoodieMetadataBloomFilter) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) FileIndexID(org.apache.hudi.common.util.hash.FileIndexID)

Example 3 with PartitionIndexID

use of org.apache.hudi.common.util.hash.PartitionIndexID in project hudi by apache.

the class BaseTableMetadata method getColumnStats.

/**
 * Fetch the stats of one column for a set of files from the column stats partition.
 *
 * @param partitionNameFileNameList - Pairs of (partition name, file name) to look up
 * @param columnName                - Column whose stats are requested
 * @return Map from each (partition name, file name) pair to its column stats; pairs without a
 *         record, or whose stats are flagged as deleted, are left out. Empty when the column
 *         stats index is disabled.
 */
@Override
public Map<Pair<String, String>, HoodieMetadataColumnStats> getColumnStats(final List<Pair<String, String>> partitionNameFileNameList, final String columnName) throws HoodieMetadataException {
    if (!isColumnStatsIndexEnabled) {
        LOG.error("Metadata column stats index is disabled!");
        return Collections.emptyMap();
    }

    // Index key -> originating (partition, file) pair, so lookup results can be mapped back.
    final Map<String, Pair<String, String>> keyToPartitionFile = new HashMap<>();
    // The TreeSet de-duplicates the keys and hands them to the lookup in sorted order.
    final TreeSet<String> orderedKeys = new TreeSet<>();
    final ColumnIndexID columnId = new ColumnIndexID(columnName);
    for (Pair<String, String> partitionFile : partitionNameFileNameList) {
        final PartitionIndexID partitionId = new PartitionIndexID(partitionFile.getLeft());
        final FileIndexID fileId = new FileIndexID(partitionFile.getRight());
        final String indexKey = HoodieMetadataPayload.getColumnStatsIndexKey(partitionId, fileId, columnId);
        orderedKeys.add(indexKey);
        keyToPartitionFile.put(indexKey, partitionFile);
    }

    final List<String> lookupKeys = new ArrayList<>(orderedKeys);
    final HoodieTimer lookupTimer = new HoodieTimer().startTimer();
    final List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> lookupResults = getRecordsByKeys(lookupKeys, MetadataPartitionType.COLUMN_STATS.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, lookupTimer.endTimer()));

    final Map<Pair<String, String>, HoodieMetadataColumnStats> statsByPartitionFile = new HashMap<>();
    for (final Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> lookup : lookupResults) {
        final Option<HoodieRecord<HoodieMetadataPayload>> maybeRecord = lookup.getRight();
        if (!maybeRecord.isPresent()) {
            // No record under this key: nothing to report for the file.
            continue;
        }
        final Option<HoodieMetadataColumnStats> maybeStats = maybeRecord.get().getData().getColumnStatMetadata();
        if (!maybeStats.isPresent()) {
            LOG.error("Meta index column stats missing for: " + lookup.getLeft());
            continue;
        }
        if (maybeStats.get().getIsDeleted()) {
            // Stats flagged as deleted are not returned.
            continue;
        }
        // Every returned key must be one that was requested, and each file is reported once.
        ValidationUtils.checkState(keyToPartitionFile.containsKey(lookup.getLeft()));
        final Pair<String, String> requestedPair = keyToPartitionFile.get(lookup.getLeft());
        ValidationUtils.checkState(!statsByPartitionFile.containsKey(requestedPair));
        statsByPartitionFile.put(requestedPair, maybeStats.get());
    }
    return statsByPartitionFile;
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) PartitionIndexID(org.apache.hudi.common.util.hash.PartitionIndexID) TreeSet(java.util.TreeSet) ColumnIndexID(org.apache.hudi.common.util.hash.ColumnIndexID) Option(org.apache.hudi.common.util.Option) FileIndexID(org.apache.hudi.common.util.hash.FileIndexID) Pair(org.apache.hudi.common.util.collection.Pair)

Example 4 with PartitionIndexID

use of org.apache.hudi.common.util.hash.PartitionIndexID in project hudi by apache.

the class BaseTableMetadata method getBloomFilters.

/**
 * Fetch the bloom filters of a set of files from the bloom filter partition.
 *
 * @param partitionNameFileNameList - Pairs of (partition name, file name) to look up
 * @return Map from each (partition name, file name) pair to its bloom filter; pairs without a
 *         record, or whose bloom filter is flagged as deleted, are left out. Empty when the
 *         bloom filter index is disabled or the input list is empty.
 */
@Override
public Map<Pair<String, String>, BloomFilter> getBloomFilters(final List<Pair<String, String>> partitionNameFileNameList) throws HoodieMetadataException {
    if (!isBloomFilterIndexEnabled) {
        LOG.error("Metadata bloom filter index is disabled!");
        return Collections.emptyMap();
    }
    if (partitionNameFileNameList.isEmpty()) {
        return Collections.emptyMap();
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    // The TreeSet de-duplicates the keys and hands them to the lookup in sorted order.
    Set<String> partitionIDFileIDSortedStrings = new TreeSet<>();
    // Index key -> originating (partition, file) pair, so lookup results can be mapped back.
    Map<String, Pair<String, String>> fileToKeyMap = new HashMap<>();
    partitionNameFileNameList.forEach(partitionNameFileNamePair -> {
        final String bloomFilterIndexKey = HoodieMetadataPayload.getBloomFilterIndexKey(new PartitionIndexID(partitionNameFileNamePair.getLeft()), new FileIndexID(partitionNameFileNamePair.getRight()));
        partitionIDFileIDSortedStrings.add(bloomFilterIndexKey);
        fileToKeyMap.put(bloomFilterIndexKey, partitionNameFileNamePair);
    });
    List<String> partitionIDFileIDStrings = new ArrayList<>(partitionIDFileIDSortedStrings);
    List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> hoodieRecordList = getRecordsByKeys(partitionIDFileIDStrings, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath());
    // The reported time is averaged per key; the key list is non-empty because the input list is.
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_BLOOM_FILTERS_METADATA_STR, (timer.endTimer() / partitionIDFileIDStrings.size())));
    Map<Pair<String, String>, BloomFilter> partitionFileToBloomFilterMap = new HashMap<>();
    for (final Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : hoodieRecordList) {
        if (entry.getRight().isPresent()) {
            final Option<HoodieMetadataBloomFilter> bloomFilterMetadata = entry.getRight().get().getData().getBloomFilterMetadata();
            if (bloomFilterMetadata.isPresent()) {
                if (!bloomFilterMetadata.get().getIsDeleted()) {
                    ValidationUtils.checkState(fileToKeyMap.containsKey(entry.getLeft()));
                    // Charset.decode(ByteBuffer) advances the position of the buffer it reads.
                    // Decode from a duplicate (shared content, independent position) so the
                    // buffer held by the metadata record is left untouched and can be read again.
                    final ByteBuffer bloomFilterByteBuffer = bloomFilterMetadata.get().getBloomFilter().duplicate();
                    final String bloomFilterType = bloomFilterMetadata.get().getType();
                    final BloomFilter bloomFilter = BloomFilterFactory.fromString(StandardCharsets.UTF_8.decode(bloomFilterByteBuffer).toString(), bloomFilterType);
                    partitionFileToBloomFilterMap.put(fileToKeyMap.get(entry.getLeft()), bloomFilter);
                }
            } else {
                LOG.error("Meta index bloom filter missing for: " + fileToKeyMap.get(entry.getLeft()));
            }
        }
    }
    return partitionFileToBloomFilterMap;
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ByteBuffer(java.nio.ByteBuffer) HoodieMetadataBloomFilter(org.apache.hudi.avro.model.HoodieMetadataBloomFilter) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) PartitionIndexID(org.apache.hudi.common.util.hash.PartitionIndexID) TreeSet(java.util.TreeSet) Option(org.apache.hudi.common.util.Option) FileIndexID(org.apache.hudi.common.util.hash.FileIndexID) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

FileIndexID (org.apache.hudi.common.util.hash.FileIndexID): 4, PartitionIndexID (org.apache.hudi.common.util.hash.PartitionIndexID): 4, ArrayList (java.util.ArrayList): 2, HashMap (java.util.HashMap): 2, TreeSet (java.util.TreeSet): 2, Path (org.apache.hadoop.fs.Path): 2, HoodieMetadataBloomFilter (org.apache.hudi.avro.model.HoodieMetadataBloomFilter): 2, HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2, HoodieTimer (org.apache.hudi.common.util.HoodieTimer): 2, Option (org.apache.hudi.common.util.Option): 2, Pair (org.apache.hudi.common.util.collection.Pair): 2, ColumnIndexID (org.apache.hudi.common.util.hash.ColumnIndexID): 2, ByteBuffer (java.nio.ByteBuffer): 1, HoodieMetadataColumnStats (org.apache.hudi.avro.model.HoodieMetadataColumnStats): 1, BloomFilter (org.apache.hudi.common.bloom.BloomFilter): 1, HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 1, HoodieKey (org.apache.hudi.common.model.HoodieKey): 1