Search in sources :

Example 1 with HoodieMetadataColumnStats

use of org.apache.hudi.avro.model.HoodieMetadataColumnStats in project hudi by apache.

The method loadColumnRangesFromMetaIndex of the class HoodieBloomIndex.

/**
 * Builds the record-key range information (as {@link BloomIndexFileInfo}) for the latest base files
 * of the given partitions, using the column stats served by the table's metadata table.
 *
 * @param partitions  partitions whose latest base files should be looked up
 * @param context     engine context used to run the per-partition lookups
 * @param hoodieTable table supplying the base files, the record key field and the metadata table
 * @return pairs of partition name and the min/max key info of one base file in that partition;
 *         partitions without base files contribute nothing
 * @throws HoodieMetadataException if the metadata lookup reports a {@code MetadataNotFoundException}
 */
protected List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromMetaIndex(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
    context.setJobStatus(this.getClass().getName(), "Load meta index key ranges for file slices");
    // The column whose stats are fetched is the table's record key field.
    final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
    // One task per partition, but never a parallelism of zero.
    final int parallelism = Math.max(partitions.size(), 1);
    return context.flatMap(partitions, partitionName -> {
        // (partition, base file name) pairs for the latest base files, in sorted order.
        List<Pair<String, String>> sortedPartitionFilePairs = HoodieIndexUtils.getLatestBaseFilesForPartition(partitionName, hoodieTable)
            .stream()
            .map(baseFile -> Pair.of(partitionName, baseFile.getFileName()))
            .sorted()
            .collect(toList());
        if (sortedPartitionFilePairs.isEmpty()) {
            return Stream.empty();
        }
        try {
            Map<Pair<String, String>, HoodieMetadataColumnStats> statsByFile = hoodieTable.getMetadataTable().getColumnStats(sortedPartitionFilePairs, keyField);
            // Convert eagerly so the conversion runs inside this try block, then hand back a stream.
            List<Pair<String, BloomIndexFileInfo>> rangeInfos = statsByFile.entrySet().stream()
                .map(statsEntry -> Pair.of(
                    statsEntry.getKey().getLeft(),
                    new BloomIndexFileInfo(FSUtils.getFileId(statsEntry.getKey().getRight()), statsEntry.getValue().getMinValue(), statsEntry.getValue().getMaxValue())))
                .collect(toList());
            return rangeInfos.stream();
        } catch (MetadataNotFoundException me) {
            throw new HoodieMetadataException("Unable to find column range metadata for partition:" + partitionName, me);
        }
    }, parallelism);
}
Also used : HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) ArrayList(java.util.ArrayList) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) Map(java.util.Map) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Example 2 with HoodieMetadataColumnStats

use of org.apache.hudi.avro.model.HoodieMetadataColumnStats in project hudi by apache.

The method getColumnStats of the class BaseTableMetadata.

/**
 * Fetches the column stats of a single column for the given (partition name, file name) pairs
 * from the column stats partition of the metadata table.
 *
 * @param partitionNameFileNameList pairs of partition name and file name to look up
 * @param columnName                name of the column whose stats are requested
 * @return map from (partition name, file name) pair to its column stats; pairs for which no
 *         record was returned, whose record carries no column stats, or whose stats are marked
 *         deleted are absent. Empty when the column stats index is disabled.
 * @throws HoodieMetadataException declared by the overridden method
 */
@Override
public Map<Pair<String, String>, HoodieMetadataColumnStats> getColumnStats(final List<Pair<String, String>> partitionNameFileNameList, final String columnName) throws HoodieMetadataException {
    if (!isColumnStatsIndexEnabled) {
        LOG.error("Metadata column stats index is disabled!");
        return Collections.emptyMap();
    }

    // Build one index key per requested file and remember which file each key stands for.
    final Map<String, Pair<String, String>> fileByIndexKey = new HashMap<>();
    final TreeSet<String> orderedIndexKeys = new TreeSet<>();
    final ColumnIndexID columnIndexID = new ColumnIndexID(columnName);
    for (Pair<String, String> partitionAndFile : partitionNameFileNameList) {
        final PartitionIndexID partitionIndexID = new PartitionIndexID(partitionAndFile.getLeft());
        final FileIndexID fileIndexID = new FileIndexID(partitionAndFile.getRight());
        final String indexKey = HoodieMetadataPayload.getColumnStatsIndexKey(partitionIndexID, fileIndexID, columnIndexID);
        orderedIndexKeys.add(indexKey);
        fileByIndexKey.put(indexKey, partitionAndFile);
    }
    // The lookup receives the keys in sorted order.
    final List<String> lookupKeys = new ArrayList<>(orderedIndexKeys);

    // Only the metadata lookup itself is timed.
    final HoodieTimer timer = new HoodieTimer().startTimer();
    final List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> lookedUpRecords =
        getRecordsByKeys(lookupKeys, MetadataPartitionType.COLUMN_STATS.getPartitionPath());
    metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, timer.endTimer()));

    final Map<Pair<String, String>, HoodieMetadataColumnStats> statsByFile = new HashMap<>();
    for (final Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> keyAndRecord : lookedUpRecords) {
        final String indexKey = keyAndRecord.getLeft();
        final Option<HoodieRecord<HoodieMetadataPayload>> maybeRecord = keyAndRecord.getRight();
        if (!maybeRecord.isPresent()) {
            // No record came back for this key; nothing to report for the file.
            continue;
        }
        final Option<HoodieMetadataColumnStats> maybeStats = maybeRecord.get().getData().getColumnStatMetadata();
        if (!maybeStats.isPresent()) {
            LOG.error("Meta index column stats missing for: " + indexKey);
            continue;
        }
        final HoodieMetadataColumnStats stats = maybeStats.get();
        if (stats.getIsDeleted()) {
            // Stats flagged as deleted are left out of the result.
            continue;
        }
        // Every returned key must be one we asked for, and each file may appear only once.
        ValidationUtils.checkState(fileByIndexKey.containsKey(indexKey));
        final Pair<String, String> partitionAndFile = fileByIndexKey.get(indexKey);
        ValidationUtils.checkState(!statsByFile.containsKey(partitionAndFile));
        statsByFile.put(partitionAndFile, stats);
    }
    return statsByFile;
}
Also used : HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) PartitionIndexID(org.apache.hudi.common.util.hash.PartitionIndexID) TreeSet(java.util.TreeSet) ColumnIndexID(org.apache.hudi.common.util.hash.ColumnIndexID) Option(org.apache.hudi.common.util.Option) FileIndexID(org.apache.hudi.common.util.hash.FileIndexID) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

ArrayList (java.util.ArrayList)2 HoodieMetadataColumnStats (org.apache.hudi.avro.model.HoodieMetadataColumnStats)2 Pair (org.apache.hudi.common.util.collection.Pair)2 HashMap (java.util.HashMap)1 Map (java.util.Map)1 TreeSet (java.util.TreeSet)1 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)1 HoodieTimer (org.apache.hudi.common.util.HoodieTimer)1 Option (org.apache.hudi.common.util.Option)1 ImmutablePair (org.apache.hudi.common.util.collection.ImmutablePair)1 ColumnIndexID (org.apache.hudi.common.util.hash.ColumnIndexID)1 FileIndexID (org.apache.hudi.common.util.hash.FileIndexID)1 PartitionIndexID (org.apache.hudi.common.util.hash.PartitionIndexID)1 HoodieMetadataException (org.apache.hudi.exception.HoodieMetadataException)1 MetadataNotFoundException (org.apache.hudi.exception.MetadataNotFoundException)1