
Example 1 with HoodieFileReader

Use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

From the class HoodieMergeHandle, method performMergeDataValidationCheck.

public void performMergeDataValidationCheck(WriteStatus writeStatus) {
    if (!config.isMergeDataValidationCheckEnabled()) {
        return;
    }
    long oldNumWrites = 0;
    try {
        HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), oldFilePath);
        oldNumWrites = reader.getTotalRecords();
    } catch (IOException e) {
        throw new HoodieUpsertException("Failed to check for merge data validation", e);
    }
    if ((writeStatus.getStat().getNumWrites() + writeStatus.getStat().getNumDeletes()) < oldNumWrites) {
        throw new HoodieCorruptedDataException(String.format(
            "Record write count decreased for file: %s, Partition Path: %s (%s:%d + %d < %s:%d)",
            writeStatus.getFileId(), writeStatus.getPartitionPath(),
            instantTime, writeStatus.getStat().getNumWrites(), writeStatus.getStat().getNumDeletes(),
            FSUtils.getCommitTime(oldFilePath.toString()), oldNumWrites));
    }
}
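
The reader opened above is never explicitly closed. As a minimal sketch (not the project's code), the same check could release it with try-with-resources, since HoodieFileReader is used that way in Examples 4 and 5; the surrounding fields (hoodieTable, oldFilePath) are reused from the method above.

    // Sketch only: same record-count check, but the reader is closed deterministically.
    long oldNumWrites = 0;
    try (HoodieFileReader reader =
             HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), oldFilePath)) {
        oldNumWrites = reader.getTotalRecords();
    } catch (IOException e) {
        throw new HoodieUpsertException("Failed to check for merge data validation", e);
    }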
Also used : HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieCorruptedDataException(org.apache.hudi.exception.HoodieCorruptedDataException)

Example 2 with HoodieFileReader

Use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

From the class MultipleSparkJobExecutionStrategy, method readRecordsForGroupWithLogs.

/**
 * Read records from base files, apply updates from the log files, and convert to an RDD.
 */
private HoodieData<HoodieRecord<T>> readRecordsForGroupWithLogs(JavaSparkContext jsc, List<ClusteringOperation> clusteringOps, String instantTime) {
    HoodieWriteConfig config = getWriteConfig();
    HoodieTable table = getHoodieTable();
    return HoodieJavaRDD.of(jsc.parallelize(clusteringOps, clusteringOps.size()).mapPartitions(clusteringOpsPartition -> {
        List<Iterator<HoodieRecord<T>>> recordIterators = new ArrayList<>();
        clusteringOpsPartition.forEachRemaining(clusteringOp -> {
            long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(new SparkTaskContextSupplier(), config);
            LOG.info("MaxMemoryPerCompaction run as part of clustering => " + maxMemoryPerCompaction);
            try {
                Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));
                HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
                    .withFileSystem(table.getMetaClient().getFs())
                    .withBasePath(table.getMetaClient().getBasePath())
                    .withLogFilePaths(clusteringOp.getDeltaFilePaths())
                    .withReaderSchema(readerSchema)
                    .withLatestInstantTime(instantTime)
                    .withMaxMemorySizeInBytes(maxMemoryPerCompaction)
                    .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled())
                    .withReverseReader(config.getCompactionReverseLogReadEnabled())
                    .withBufferSize(config.getMaxDFSStreamBufferSize())
                    .withSpillableMapBasePath(config.getSpillableMapBasePath())
                    .withPartition(clusteringOp.getPartitionPath())
                    .build();
                Option<HoodieFileReader> baseFileReader = StringUtils.isNullOrEmpty(clusteringOp.getDataFilePath())
                    ? Option.empty()
                    : Option.of(HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), new Path(clusteringOp.getDataFilePath())));
                HoodieTableConfig tableConfig = table.getMetaClient().getTableConfig();
                recordIterators.add(getFileSliceReader(baseFileReader, scanner, readerSchema,
                    tableConfig.getPayloadClass(),
                    tableConfig.getPreCombineField(),
                    tableConfig.populateMetaFields()
                        ? Option.empty()
                        : Option.of(Pair.of(tableConfig.getRecordKeyFieldProp(), tableConfig.getPartitionFieldProp()))));
            } catch (IOException e) {
                throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e);
            }
        });
        return new ConcatenatingIterator<>(recordIterators);
    }));
}
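
For reference, a hedged sketch of how the concatenated iterator built inside mapPartitions presents the merged records; it reuses the recordIterators list and type parameter T from the example above, and the record handling shown is illustrative only.

    // Illustrative only: ConcatenatingIterator chains the per-ClusteringOperation iterators
    // so downstream code sees a single stream of merged records for the file group.
    Iterator<HoodieRecord<T>> allRecords = new ConcatenatingIterator<>(recordIterators);
    while (allRecords.hasNext()) {
        HoodieRecord<T> record = allRecords.next();
        // e.g. hand the record to the bulk-insert writer for the clustering target file group
    }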
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) RewriteAvroPayload(org.apache.hudi.common.model.RewriteAvroPayload) ConcatenatingIterator(org.apache.hudi.client.utils.ConcatenatingIterator) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) SparkTaskContextSupplier(org.apache.hudi.client.SparkTaskContextSupplier) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Logger(org.apache.log4j.Logger) RDDSpatialCurveSortPartitioner(org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) PLAN_STRATEGY_SORT_COLUMNS(org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) ClusteringOperation(org.apache.hudi.common.model.ClusteringOperation) Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) KeyGenUtils(org.apache.hudi.keygen.KeyGenUtils) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) CompletableFuture(java.util.concurrent.CompletableFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieFileSliceReader.getFileSliceReader(org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) ClusteringExecutionStrategy(org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy) IndexedRecord(org.apache.avro.generic.IndexedRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) RDDCustomColumnsSortPartitioner(org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) FutureUtils(org.apache.hudi.common.util.FutureUtils) HoodieClusteringException(org.apache.hudi.exception.HoodieClusteringException) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) IOUtils(org.apache.hudi.io.IOUtils) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair) Path(org.apache.hadoop.fs.Path) 

Example 3 with HoodieFileReader

Use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

From the class DFSHoodieDatasetInputReader, method readColumnarOrLogFiles.

private Iterator<IndexedRecord> readColumnarOrLogFiles(FileSlice fileSlice) throws IOException {
    if (fileSlice.getBaseFile().isPresent()) {
        // Read the base files using the latest writer schema.
        Schema schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
        HoodieFileReader reader = HoodieFileReaderFactory.getFileReader(metaClient.getHadoopConf(), new Path(fileSlice.getBaseFile().get().getPath()));
        return reader.getRecordIterator(schema);
    } else {
        // If there is no data file, fall back to reading log files
        HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
            .withFileSystem(metaClient.getFs())
            .withBasePath(metaClient.getBasePath())
            .withLogFilePaths(fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()))
            .withReaderSchema(new Schema.Parser().parse(schemaStr))
            .withLatestInstantTime(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant().get().getTimestamp())
            .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
            .withReadBlocksLazily(true)
            .withReverseReader(false)
            .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
            .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
            .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
            .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
            .build();
        // Read Avro records from the log files through the merged scanner
        Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
        Schema schema = new Schema.Parser().parse(schemaStr);
        return StreamSupport.stream(iterable.spliterator(), false).map(e -> {
            try {
                return (IndexedRecord) e.getData().getInsertValue(schema).get();
            } catch (IOException io) {
                throw new UncheckedIOException(io);
            }
        }).iterator();
    }
}
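
A hedged sketch of consuming the iterator returned by readColumnarOrLogFiles. The cast to GenericRecord and the metadata field name are assumptions: base-file records are read with the metadata fields added by addMetadataFields, so "_hoodie_record_key" should be present there, while log records depend on the supplied schema.

    // Illustrative only; call sits in a method that may throw IOException.
    Iterator<IndexedRecord> records = readColumnarOrLogFiles(fileSlice);
    while (records.hasNext()) {
        GenericRecord record = (GenericRecord) records.next(); // assumed Avro GenericRecord
        Object recordKey = record.get("_hoodie_record_key");   // Hudi metadata field, if present
    }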
Also used : Path(org.apache.hadoop.fs.Path) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSlice(org.apache.hudi.common.model.FileSlice) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) Entry.comparingByValue(java.util.Map.Entry.comparingByValue) LinkedHashMap(java.util.LinkedHashMap) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Collectors.toMap(java.util.stream.Collectors.toMap) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) StreamSupport(java.util.stream.StreamSupport) NoSuchElementException(java.util.NoSuchElementException) IndexedRecord(org.apache.avro.generic.IndexedRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) IOException(java.io.IOException) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) UncheckedIOException(java.io.UncheckedIOException) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) UncheckedIOException(java.io.UncheckedIOException) IOException(java.io.IOException) UncheckedIOException(java.io.UncheckedIOException) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload)

Example 4 with HoodieFileReader

Use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

From the class HoodieTableMetadataUtil, method convertFilesToBloomFilterRecords.

/**
 * Convert metadata about added and deleted files into bloom filter index records.
 */
public static HoodieData<HoodieRecord> convertFilesToBloomFilterRecords(HoodieEngineContext engineContext, Map<String, List<String>> partitionToDeletedFiles, Map<String, Map<String, Long>> partitionToAppendedFiles, MetadataRecordsGenerationParams recordsGenerationParams, String instantTime) {
    HoodieData<HoodieRecord> allRecordsRDD = engineContext.emptyHoodieData();
    List<Pair<String, List<String>>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())).collect(Collectors.toList());
    int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
    HoodieData<Pair<String, List<String>>> partitionToDeletedFilesRDD = engineContext.parallelize(partitionToDeletedFilesList, parallelism);
    HoodieData<HoodieRecord> deletedFilesRecordsRDD = partitionToDeletedFilesRDD.flatMap(partitionToDeletedFilesPair -> {
        final String partitionName = partitionToDeletedFilesPair.getLeft();
        final List<String> deletedFileList = partitionToDeletedFilesPair.getRight();
        return deletedFileList.stream().flatMap(deletedFile -> {
            if (!FSUtils.isBaseFile(new Path(deletedFile))) {
                return Stream.empty();
            }
            final String partition = getPartition(partitionName);
            return Stream.<HoodieRecord>of(HoodieMetadataPayload.createBloomFilterMetadataRecord(partition, deletedFile, instantTime, StringUtils.EMPTY_STRING, ByteBuffer.allocate(0), true));
        }).iterator();
    });
    allRecordsRDD = allRecordsRDD.union(deletedFilesRecordsRDD);
    List<Pair<String, Map<String, Long>>> partitionToAppendedFilesList = partitionToAppendedFiles.entrySet().stream().map(entry -> Pair.of(entry.getKey(), entry.getValue())).collect(Collectors.toList());
    parallelism = Math.max(Math.min(partitionToAppendedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
    HoodieData<Pair<String, Map<String, Long>>> partitionToAppendedFilesRDD = engineContext.parallelize(partitionToAppendedFilesList, parallelism);
    HoodieData<HoodieRecord> appendedFilesRecordsRDD = partitionToAppendedFilesRDD.flatMap(partitionToAppendedFilesPair -> {
        final String partitionName = partitionToAppendedFilesPair.getLeft();
        final Map<String, Long> appendedFileMap = partitionToAppendedFilesPair.getRight();
        final String partition = getPartition(partitionName);
        return appendedFileMap.entrySet().stream().flatMap(appendedFileLengthPairEntry -> {
            final String appendedFile = appendedFileLengthPairEntry.getKey();
            if (!FSUtils.isBaseFile(new Path(appendedFile))) {
                return Stream.empty();
            }
            final String pathWithPartition = partitionName + "/" + appendedFile;
            final Path appendedFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition);
            try (HoodieFileReader<IndexedRecord> fileReader = HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), appendedFilePath)) {
                final BloomFilter fileBloomFilter = fileReader.readBloomFilter();
                if (fileBloomFilter == null) {
                    LOG.error("Failed to read bloom filter for " + appendedFilePath);
                    return Stream.empty();
                }
                ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes());
                HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord(partition, appendedFile, instantTime, recordsGenerationParams.getBloomFilterType(), bloomByteBuffer, false);
                return Stream.of(record);
            } catch (IOException e) {
                LOG.error("Failed to get bloom filter for file: " + appendedFilePath);
            }
            return Stream.empty();
        }).iterator();
    });
    allRecordsRDD = allRecordsRDD.union(appendedFilesRecordsRDD);
    return allRecordsRDD;
}
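
The method above serializes bloom filters read from base files into metadata records. As a complement, a hedged sketch of the read side: open a base file, read its bloom filter, and probe it for a key. hadoopConf, baseFilePath and candidateKey are hypothetical, and mightContain is assumed from the org.apache.hudi.common.bloom.BloomFilter interface rather than taken from this class.

    try (HoodieFileReader<IndexedRecord> fileReader =
             HoodieFileReaderFactory.getFileReader(hadoopConf, baseFilePath)) { // hypothetical inputs
        BloomFilter bloomFilter = fileReader.readBloomFilter();
        if (bloomFilter != null && bloomFilter.mightContain(candidateKey)) {
            // The key may exist in this file; a definite answer still requires reading the file.
        }
    } catch (IOException e) {
        LOG.error("Failed to read bloom filter for " + baseFilePath, e);
    }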
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BiFunction(java.util.function.BiFunction) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) ByteBuffer(java.nio.ByteBuffer) MAX(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MAX) Logger(org.apache.log4j.Logger) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Collectors(java.util.stream.Collectors) TOTAL_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_SIZE) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) VALUE_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.VALUE_COUNT) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) TOTAL_UNCOMPRESSED_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_UNCOMPRESSED_SIZE) EMPTY_PARTITION_NAME(org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) NULL_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.NULL_COUNT) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) LinkedList(java.util.LinkedList) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString) GenericRecord(org.apache.avro.generic.GenericRecord) MIN(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MIN) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) NON_PARTITIONED_NAME(org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) 
COLUMN_RANGE_MERGE_FUNCTION(org.apache.hudi.common.model.HoodieColumnRangeMetadata.COLUMN_RANGE_MERGE_FUNCTION) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)

Example 5 with HoodieFileReader

Use of org.apache.hudi.io.storage.HoodieFileReader in project hudi by apache.

From the class HoodieTableMetadataUtil, method convertMetadataToBloomFilterRecords.

/**
 * Convert commit action metadata to bloom filter records.
 *
 * @param context                 - Engine context to use
 * @param commitMetadata          - Commit action metadata
 * @param instantTime             - Action instant time
 * @param recordsGenerationParams - Parameters for bloom filter record generation
 * @return HoodieData of metadata table records
 */
public static HoodieData<HoodieRecord> convertMetadataToBloomFilterRecords(HoodieEngineContext context, HoodieCommitMetadata commitMetadata, String instantTime, MetadataRecordsGenerationParams recordsGenerationParams) {
    final List<HoodieWriteStat> allWriteStats = commitMetadata.getPartitionToWriteStats().values().stream().flatMap(entry -> entry.stream()).collect(Collectors.toList());
    if (allWriteStats.isEmpty()) {
        return context.emptyHoodieData();
    }
    final int parallelism = Math.max(Math.min(allWriteStats.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
    HoodieData<HoodieWriteStat> allWriteStatsRDD = context.parallelize(allWriteStats, parallelism);
    return allWriteStatsRDD.flatMap(hoodieWriteStat -> {
        final String partition = hoodieWriteStat.getPartitionPath();
        // For bloom filter index, delta writes do not change the base file bloom filter entries
        if (hoodieWriteStat instanceof HoodieDeltaWriteStat) {
            return Collections.emptyListIterator();
        }
        String pathWithPartition = hoodieWriteStat.getPath();
        if (pathWithPartition == null) {
            // Empty partition
            LOG.error("Failed to find path in write stat to update metadata table " + hoodieWriteStat);
            return Collections.emptyListIterator();
        }
        int offset = partition.equals(NON_PARTITIONED_NAME) ? (pathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
        final String fileName = pathWithPartition.substring(offset);
        if (!FSUtils.isBaseFile(new Path(fileName))) {
            return Collections.emptyListIterator();
        }
        final Path writeFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition);
        try (HoodieFileReader<IndexedRecord> fileReader = HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), writeFilePath)) {
            try {
                final BloomFilter fileBloomFilter = fileReader.readBloomFilter();
                if (fileBloomFilter == null) {
                    LOG.error("Failed to read bloom filter for " + writeFilePath);
                    return Collections.emptyListIterator();
                }
                ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes());
                HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord(partition, fileName, instantTime, recordsGenerationParams.getBloomFilterType(), bloomByteBuffer, false);
                return Collections.singletonList(record).iterator();
            } catch (Exception e) {
                LOG.error("Failed to read bloom filter for " + writeFilePath);
                return Collections.emptyListIterator();
            } finally {
                // Redundant with the enclosing try-with-resources, but kept from the original snippet;
                // close() is assumed to be idempotent.
                fileReader.close();
            }
        } catch (IOException e) {
            LOG.error("Failed to get bloom filter for file: " + writeFilePath + ", write stat: " + hoodieWriteStat);
        }
        return Collections.emptyListIterator();
    });
}
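
To make the path handling above concrete, a small illustrative example of the offset computation (all values are made up):

    // Partitioned case: strip "<partition>/" to recover the base file name.
    String partition = "2021/03/01";                                   // hypothetical partition path
    String pathWithPartition = "2021/03/01/abc123_1-0-1_001.parquet";  // hypothetical write stat path
    int offset = partition.length() + 1;
    String fileName = pathWithPartition.substring(offset);             // "abc123_1-0-1_001.parquet"
    // Non-partitioned case: partition equals NON_PARTITIONED_NAME, so only a leading "/" (if any) is stripped.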
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BiFunction(java.util.function.BiFunction) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) ByteBuffer(java.nio.ByteBuffer) MAX(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MAX) Logger(org.apache.log4j.Logger) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Collectors(java.util.stream.Collectors) TOTAL_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_SIZE) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) VALUE_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.VALUE_COUNT) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) TOTAL_UNCOMPRESSED_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_UNCOMPRESSED_SIZE) EMPTY_PARTITION_NAME(org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) NULL_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.NULL_COUNT) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) LinkedList(java.util.LinkedList) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString) GenericRecord(org.apache.avro.generic.GenericRecord) MIN(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MIN) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) NON_PARTITIONED_NAME(org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) 
COLUMN_RANGE_MERGE_FUNCTION(org.apache.hudi.common.model.HoodieColumnRangeMetadata.COLUMN_RANGE_MERGE_FUNCTION) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

HoodieFileReader (org.apache.hudi.io.storage.HoodieFileReader): 15
IOException (java.io.IOException): 13
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 10
Schema (org.apache.avro.Schema): 9
GenericRecord (org.apache.avro.generic.GenericRecord): 8
Option (org.apache.hudi.common.util.Option): 8
HoodieFileReaderFactory (org.apache.hudi.io.storage.HoodieFileReaderFactory): 8
ArrayList (java.util.ArrayList): 7
List (java.util.List): 7
Path (org.apache.hadoop.fs.Path): 7
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 7
Iterator (java.util.Iterator): 6
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 6
Map (java.util.Map): 5
Collectors (java.util.stream.Collectors): 5
IndexedRecord (org.apache.avro.generic.IndexedRecord): 5
Pair (org.apache.hudi.common.util.collection.Pair): 5
HoodieException (org.apache.hudi.exception.HoodieException): 5
Stream (java.util.stream.Stream): 4
HoodieData (org.apache.hudi.common.data.HoodieData): 4
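
Across the examples above the same access pattern recurs: obtain a HoodieFileReader from HoodieFileReaderFactory, read with an Avro reader schema, and close the reader. A minimal self-contained sketch of that pattern follows; the file path and schema source are hypothetical, and the generic signature of getFileReader is assumed to match the usages shown above.

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.io.storage.HoodieFileReader;
import org.apache.hudi.io.storage.HoodieFileReaderFactory;

public class HoodieFileReaderUsageSketch {

    public static void main(String[] args) throws IOException {
        // Hypothetical inputs: a Hudi base file and the table's Avro schema as a JSON string.
        Path baseFilePath = new Path("/tmp/hudi_table/2021/03/01/some-base-file.parquet");
        String schemaJson = args[0];

        Configuration hadoopConf = new Configuration();
        Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaJson));

        try (HoodieFileReader<GenericRecord> reader =
                 HoodieFileReaderFactory.getFileReader(hadoopConf, baseFilePath)) {
            System.out.println("Total records: " + reader.getTotalRecords());
            // Iterate the records with the metadata-augmented schema and print their record keys.
            reader.getRecordIterator(readerSchema).forEachRemaining(record ->
                System.out.println(record.get("_hoodie_record_key")));
        }
    }
}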