Search in sources:

Example 1 with MetadataNotFoundException

use of org.apache.hudi.exception.MetadataNotFoundException in project hudi by apache.

the class OrcUtils method readFooter.

/**
 * Reads the requested user-metadata entries from the footer of an ORC file.
 *
 * @param conf        configuration used to build the ORC reader options
 * @param required    when {@code true}, a requested key missing from the footer is an error;
 *                    when {@code false}, missing keys are silently left out of the result
 * @param orcFilePath path of the ORC file whose footer is read
 * @param footerNames footer keys to look up
 * @return map from each requested key present in the footer to its value decoded as UTF-8
 * @throws MetadataNotFoundException if {@code required} is set and a requested key is absent
 * @throws HoodieIOException         if reading the file raises an {@link IOException}
 */
@Override
public Map<String, String> readFooter(Configuration conf, boolean required, Path orcFilePath, String... footerNames) {
    try (Reader orcReader = OrcFile.createReader(orcFilePath, OrcFile.readerOptions(conf))) {
        // Flatten the footer's user metadata into a name -> UTF-8 value lookup table.
        List<UserMetadataItem> footerItems = orcReader.getFileTail().getFooter().getMetadataList();
        Map<String, String> userMetadata = footerItems.stream()
            .collect(Collectors.toMap(UserMetadataItem::getName, item -> item.getValue().toStringUtf8()));

        Map<String, String> requestedValues = new HashMap<>();
        for (String key : footerNames) {
            if (!userMetadata.containsKey(key)) {
                if (required) {
                    throw new MetadataNotFoundException("Could not find index in ORC footer. Looked for key " + key + " in " + orcFilePath);
                }
                // Optional key that is not present: nothing to record.
                continue;
            }
            requestedValues.put(key, userMetadata.get(key));
        }
        return requestedValues;
    } catch (IOException ioe) {
        throw new HoodieIOException("Unable to read footer for ORC file:" + orcFilePath, ioe);
    }
}
Also used : HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) VectorizedRowBatch(org.apache.orc.storage.ql.exec.vector.VectorizedRowBatch) HoodieException(org.apache.hudi.exception.HoodieException) HashMap(java.util.HashMap) ByteBuffer(java.nio.ByteBuffer) OrcFile(org.apache.orc.OrcFile) ArrayList(java.util.ArrayList) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) HashSet(java.util.HashSet) Reader(org.apache.orc.Reader) Options(org.apache.orc.Reader.Options) Map(java.util.Map) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) RecordReader(org.apache.orc.RecordReader) Set(java.util.Set) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) BytesColumnVector(org.apache.orc.storage.ql.exec.vector.BytesColumnVector) Collectors(java.util.stream.Collectors) List(java.util.List) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) UserMetadataItem(org.apache.orc.OrcProto.UserMetadataItem) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HashMap(java.util.HashMap) UserMetadataItem(org.apache.orc.OrcProto.UserMetadataItem) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)

Example 2 with MetadataNotFoundException

use of org.apache.hudi.exception.MetadataNotFoundException in project hudi by apache.

the class HoodieBloomIndex method loadColumnRangesFromMetaIndex.

/**
 * Builds, for every latest base file in the given partitions, a {@link BloomIndexFileInfo}
 * carrying the min/max values of the record key column as reported by the table's
 * metadata table column stats.
 *
 * @param partitions  - List of partitions for which column stats need to be loaded
 * @param context     - Engine context used to run the per-partition lookups
 * @param hoodieTable - Hoodie table
 * @return List of partition and file column range info pairs
 * @throws HoodieMetadataException if the column stats lookup raises a {@link MetadataNotFoundException}
 */
protected List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromMetaIndex(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
    // also obtain file ranges, if range pruning is enabled
    context.setJobStatus(this.getClass().getName(), "Load meta index key ranges for file slices");
    final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
    // One task per partition, but never less than one.
    final int parallelism = Math.max(partitions.size(), 1);

    return context.flatMap(partitions, partitionName -> {
        // Sorted (partition, base file name) pairs for the latest base files of this partition.
        List<Pair<String, String>> filesInPartition = HoodieIndexUtils.getLatestBaseFilesForPartition(partitionName, hoodieTable)
            .stream()
            .map(baseFile -> Pair.of(partitionName, baseFile.getFileName()))
            .sorted()
            .collect(toList());

        if (!filesInPartition.isEmpty()) {
            try {
                Map<Pair<String, String>, HoodieMetadataColumnStats> columnStatsByFile =
                    hoodieTable.getMetadataTable().getColumnStats(filesInPartition, keyField);

                List<Pair<String, BloomIndexFileInfo>> keyRanges = new ArrayList<>();
                for (Map.Entry<Pair<String, String>, HoodieMetadataColumnStats> statsEntry : columnStatsByFile.entrySet()) {
                    final Pair<String, String> partitionAndFile = statsEntry.getKey();
                    final HoodieMetadataColumnStats stats = statsEntry.getValue();
                    final String fileId = FSUtils.getFileId(partitionAndFile.getRight());
                    keyRanges.add(Pair.of(partitionAndFile.getLeft(), new BloomIndexFileInfo(fileId, stats.getMinValue(), stats.getMaxValue())));
                }
                return keyRanges.stream();
            } catch (MetadataNotFoundException me) {
                throw new HoodieMetadataException("Unable to find column range metadata for partition:" + partitionName, me);
            }
        }

        // No base files in this partition: it contributes nothing to the result.
        return Stream.empty();
    }, parallelism);
}
Also used : HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) ArrayList(java.util.ArrayList) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) Map(java.util.Map) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)

Example 3 with MetadataNotFoundException

use of org.apache.hudi.exception.MetadataNotFoundException in project hudi by apache.

the class HoodieBloomIndex method loadColumnRangesFromFiles.

/**
 * Builds a {@link BloomIndexFileInfo} for the latest base file of every file group in the
 * given partitions, using the min/max keys obtained through a {@link HoodieRangeInfoHandle}.
 *
 * <p>When the handle raises a {@link MetadataNotFoundException} for a file, a warning is
 * logged and that file is still returned, with a {@link BloomIndexFileInfo} that carries
 * only the file id.
 *
 * @param partitions  partitions whose latest base files are inspected
 * @param context     engine context used to run the per-file lookups
 * @param hoodieTable Hoodie table
 * @return list of (partition path, file info) pairs, one per latest base file
 */
List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromFiles(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
    // Obtain the latest data files from all the partitions, reduced to (partition path, file id).
    List<Pair<String, String>> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable)
        .stream()
        .map(partitionAndBaseFile -> Pair.of(partitionAndBaseFile.getKey(), partitionAndBaseFile.getValue().getFileId()))
        .collect(toList());

    context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)");
    // One task per file, but never less than one.
    final int parallelism = Math.max(partitionPathFileIDList.size(), 1);

    return context.map(partitionPathFileIDList, partitionAndFileId -> {
        final String partitionPath = partitionAndFileId.getKey();
        final String fileId = partitionAndFileId.getValue();
        try {
            String[] keyRange = new HoodieRangeInfoHandle(config, hoodieTable, partitionAndFileId).getMinMaxKeys();
            return Pair.of(partitionPath, new BloomIndexFileInfo(fileId, keyRange[0], keyRange[1]));
        } catch (MetadataNotFoundException me) {
            // Best effort: fall back to a file info without a key range.
            LOG.warn("Unable to find range metadata in file :" + partitionAndFileId);
            return Pair.of(partitionPath, new BloomIndexFileInfo(fileId));
        }
    }, parallelism);
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieIndexUtils.getLatestBaseFilesForAllPartitions(org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieConfig(org.apache.hudi.common.config.HoodieConfig) HoodieRangeInfoHandle(org.apache.hudi.io.HoodieRangeInfoHandle) Map(java.util.Map) Collectors.mapping(java.util.stream.Collectors.mapping) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) HoodiePairData(org.apache.hudi.common.data.HoodiePairData) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Stream(java.util.stream.Stream) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair) 
HoodieRangeInfoHandle(org.apache.hudi.io.HoodieRangeInfoHandle)

Example 4 with MetadataNotFoundException

use of org.apache.hudi.exception.MetadataNotFoundException in project hudi by apache.

the class ParquetUtils method readFooter.

/**
 * Reads the requested key/value metadata entries from the footer of a Parquet file.
 *
 * @param configuration   configuration passed through to {@code readMetadata}
 * @param required        when {@code true}, a requested key missing from the footer is an error;
 *                        when {@code false}, missing keys are silently left out of the result
 * @param parquetFilePath path of the Parquet file whose footer is read
 * @param footerNames     footer keys to look up
 * @return map from each requested key present in the footer to its value
 * @throws MetadataNotFoundException if {@code required} is set and a requested key is absent
 */
@Override
public Map<String, String> readFooter(Configuration configuration, boolean required, Path parquetFilePath, String... footerNames) {
    Map<String, String> requestedValues = new HashMap<>();
    Map<String, String> keyValueMetadata = readMetadata(configuration, parquetFilePath)
        .getFileMetaData()
        .getKeyValueMetaData();

    for (String key : footerNames) {
        if (!keyValueMetadata.containsKey(key)) {
            if (required) {
                throw new MetadataNotFoundException("Could not find index in Parquet footer. Looked for key " + key + " in " + parquetFilePath);
            }
            // Optional key that is not present: nothing to record.
            continue;
        }
        requestedValues.put(key, keyValueMetadata.get(key));
    }
    return requestedValues;
}
Also used : MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) HashMap(java.util.HashMap) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata)

Aggregations

MetadataNotFoundException (org.apache.hudi.exception.MetadataNotFoundException)4 ArrayList (java.util.ArrayList)3 Map (java.util.Map)3 HashMap (java.util.HashMap)2 List (java.util.List)2 Collectors (java.util.stream.Collectors)2 HoodieMetadataColumnStats (org.apache.hudi.avro.model.HoodieMetadataColumnStats)2 FSUtils (org.apache.hudi.common.fs.FSUtils)2 HoodieKey (org.apache.hudi.common.model.HoodieKey)2 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)2 ImmutablePair (org.apache.hudi.common.util.collection.ImmutablePair)2 Pair (org.apache.hudi.common.util.collection.Pair)2 HoodieMetadataException (org.apache.hudi.exception.HoodieMetadataException)2 IOException (java.io.IOException)1 ByteBuffer (java.nio.ByteBuffer)1 Collections (java.util.Collections)1 HashSet (java.util.HashSet)1 Set (java.util.Set)1 Collectors.groupingBy (java.util.stream.Collectors.groupingBy)1 Collectors.mapping (java.util.stream.Collectors.mapping)1