Example 1 with HoodieDeltaWriteStat

Use of org.apache.hudi.common.model.HoodieDeltaWriteStat in project hudi by apache.

From the class HoodieAppendHandle, method processAppendResult:

private void processAppendResult(AppendResult result, List<IndexedRecord> recordList) {
    HoodieDeltaWriteStat stat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
    if (stat.getPath() == null) {
        // first time writing to this log block.
        updateWriteStatus(stat, result);
    } else if (stat.getPath().endsWith(result.logFile().getFileName())) {
        // append/continued writing to the same log file
        stat.setLogOffset(Math.min(stat.getLogOffset(), result.offset()));
        stat.setFileSizeInBytes(stat.getFileSizeInBytes() + result.size());
        accumulateWriteCounts(stat, result);
        accumulateRuntimeStats(stat);
    } else {
        // written to a newer log file, due to rollover/otherwise.
        initNewStatus();
        stat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
        updateWriteStatus(stat, result);
    }
    if (config.isMetadataIndexColumnStatsForAllColumnsEnabled()) {
        Map<String, HoodieColumnRangeMetadata<Comparable>> columnRangeMap = stat.getRecordsStats().isPresent() ? stat.getRecordsStats().get().getStats() : new HashMap<>();
        final String filePath = stat.getPath();
        // initialize map of column name to map of stats name to stats value
        Map<String, Map<String, Object>> columnToStats = new HashMap<>();
        writeSchemaWithMetaFields.getFields().forEach(field -> columnToStats.putIfAbsent(field.name(), new HashMap<>()));
        // aggregate stats for every column in a single pass per record, iterating all records to build column stats for all fields.
        recordList.forEach(record -> aggregateColumnStats(record, writeSchemaWithMetaFields, columnToStats, config.isConsistentLogicalTimestampEnabled()));
        writeSchemaWithMetaFields.getFields().forEach(field -> accumulateColumnRanges(field, filePath, columnRangeMap, columnToStats));
        stat.setRecordsStats(new HoodieDeltaWriteStat.RecordsStats<>(columnRangeMap));
    }
    resetWriteCounts();
    assert stat.getRuntimeStats() != null;
    LOG.info(String.format("AppendHandle for partitionPath %s filePath %s, took %d ms.", partitionPath, stat.getPath(), stat.getRuntimeStats().getTotalUpsertTime()));
    timer.startTimer();
}
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HashMap(java.util.HashMap) Map(java.util.Map)
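
The three branches above key off stat.getPath(). Below is a minimal standalone sketch of the same decision, using only the HoodieDeltaWriteStat accessors that appear in these examples; applyAppend and its parameters are hypothetical stand-ins for the handle and its AppendResult.

// Hypothetical sketch, not Hudi API: how a delta write stat evolves across appends.
private static void applyAppend(HoodieDeltaWriteStat stat, String logFileName, long offset, long size) {
    if (stat.getPath() == null) {
        // First append for this handle: seed the stat from the result.
        stat.setPath(logFileName);
        stat.setLogOffset(offset);
        stat.setFileSizeInBytes(size);
    } else if (stat.getPath().endsWith(logFileName)) {
        // Continued writes to the same log file: keep the earliest offset and
        // accumulate the bytes written so far.
        stat.setLogOffset(Math.min(stat.getLogOffset(), offset));
        stat.setFileSizeInBytes(stat.getFileSizeInBytes() + size);
    } else {
        // Rollover to a new log file: the real handle swaps in a fresh stat via
        // initNewStatus() (see Example 3) and then seeds it as in the first case.
    }
}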

Example 2 with HoodieDeltaWriteStat

Use of org.apache.hudi.common.model.HoodieDeltaWriteStat in project hudi by apache.

From the class HoodieAppendHandle, method init:

private void init(HoodieRecord record) {
    if (doInit) {
        // extract some information from the first record
        SliceView rtView = hoodieTable.getSliceView();
        Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId);
        // Set the base commit time as the current instantTime for new inserts into log files
        String baseInstantTime;
        String baseFile = "";
        List<String> logFiles = new ArrayList<>();
        if (fileSlice.isPresent()) {
            baseInstantTime = fileSlice.get().getBaseInstantTime();
            baseFile = fileSlice.get().getBaseFile().map(BaseFile::getFileName).orElse("");
            logFiles = fileSlice.get().getLogFiles().map(HoodieLogFile::getFileName).collect(Collectors.toList());
        } else {
            baseInstantTime = instantTime;
            // This means there is no base data file, start appending to a new log file
            fileSlice = Option.of(new FileSlice(partitionPath, baseInstantTime, this.fileId));
            LOG.info("New AppendHandle for partition :" + partitionPath);
        }
        // Prepare the first write status
        writeStatus.setStat(new HoodieDeltaWriteStat());
        writeStatus.setFileId(fileId);
        writeStatus.setPartitionPath(partitionPath);
        averageRecordSize = sizeEstimator.sizeEstimate(record);
        HoodieDeltaWriteStat deltaWriteStat = (HoodieDeltaWriteStat) writeStatus.getStat();
        deltaWriteStat.setPrevCommit(baseInstantTime);
        deltaWriteStat.setPartitionPath(partitionPath);
        deltaWriteStat.setFileId(fileId);
        deltaWriteStat.setBaseFile(baseFile);
        deltaWriteStat.setLogFiles(logFiles);
        try {
            // Save hoodie partition meta in the partition path
            HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, baseInstantTime, new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
            partitionMetadata.trySave(getPartitionId());
            // Since the actual log file written to can be different based on when rollover happens, we use the
            // base file to denote some log appends happened on a slice. writeToken will still fence concurrent
            // writers.
            // https://issues.apache.org/jira/browse/HUDI-1517
            createMarkerFile(partitionPath, FSUtils.makeDataFileName(baseInstantTime, writeToken, fileId, hoodieTable.getBaseFileExtension()));
            this.writer = createLogWriter(fileSlice, baseInstantTime);
        } catch (Exception e) {
            LOG.error("Error in update task at commit " + instantTime, e);
            writeStatus.setGlobalError(e);
            throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + "/" + partitionPath, e);
        }
        doInit = false;
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) HoodieException(org.apache.hudi.exception.HoodieException) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HoodieAppendException(org.apache.hudi.exception.HoodieAppendException) IOException(java.io.IOException) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
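
The init path hinges on whether a latest FileSlice already exists for the (partitionPath, fileId) pair. A hedged sketch of just that derivation, assuming the FileSlice, Option, BaseFile and HoodieLogFile types used above (seedStat itself is hypothetical):

// Hypothetical sketch: seed a HoodieDeltaWriteStat from the latest file slice,
// falling back to the current instant when no slice exists yet.
private static HoodieDeltaWriteStat seedStat(Option<FileSlice> fileSlice, String instantTime, String partitionPath, String fileId) {
    HoodieDeltaWriteStat stat = new HoodieDeltaWriteStat();
    stat.setPartitionPath(partitionPath);
    stat.setFileId(fileId);
    if (fileSlice.isPresent()) {
        stat.setPrevCommit(fileSlice.get().getBaseInstantTime());
        stat.setBaseFile(fileSlice.get().getBaseFile().map(BaseFile::getFileName).orElse(""));
        stat.setLogFiles(fileSlice.get().getLogFiles().map(HoodieLogFile::getFileName).collect(Collectors.toList()));
    } else {
        // No base data file yet: the current instant becomes the base commit and
        // appends will start a brand-new log file.
        stat.setPrevCommit(instantTime);
        stat.setBaseFile("");
        stat.setLogFiles(new ArrayList<>());
    }
    return stat;
}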

Example 3 with HoodieDeltaWriteStat

Use of org.apache.hudi.common.model.HoodieDeltaWriteStat in project hudi by apache.

From the class HoodieAppendHandle, method initNewStatus:

private void initNewStatus() {
    HoodieDeltaWriteStat prevStat = (HoodieDeltaWriteStat) this.writeStatus.getStat();
    // Make a new write status and copy basic fields over.
    HoodieDeltaWriteStat stat = new HoodieDeltaWriteStat();
    stat.setFileId(fileId);
    stat.setPartitionPath(partitionPath);
    stat.setPrevCommit(prevStat.getPrevCommit());
    stat.setBaseFile(prevStat.getBaseFile());
    stat.setLogFiles(new ArrayList<>(prevStat.getLogFiles()));
    this.writeStatus = (WriteStatus) ReflectionUtils.loadClass(config.getWriteStatusClassName(), !hoodieTable.getIndex().isImplicitWithStorage(), config.getWriteStatusFailureFraction());
    this.writeStatus.setFileId(fileId);
    this.writeStatus.setPartitionPath(partitionPath);
    this.writeStatus.setStat(stat);
}
Also used : HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat)
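
On rollover, the new stat keeps the slice-level identity but none of the per-file counters. A sketch of that invariant as assertions; getter names are assumed to mirror the setters used above, following the usual bean convention:

// Hypothetical check: fields that survive a rollover versus fields that reset.
private static void checkRollover(HoodieDeltaWriteStat prev, HoodieDeltaWriteStat next) {
    assert next.getFileId().equals(prev.getFileId());
    assert next.getPartitionPath().equals(prev.getPartitionPath());
    assert next.getPrevCommit().equals(prev.getPrevCommit());
    assert next.getBaseFile().equals(prev.getBaseFile());
    assert next.getLogFiles().containsAll(prev.getLogFiles());
    // Per-file state starts fresh: the new log file's path and sizes are only
    // set on the next append (see processAppendResult in Example 1).
    assert next.getPath() == null;
}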

Example 4 with HoodieDeltaWriteStat

Use of org.apache.hudi.common.model.HoodieDeltaWriteStat in project hudi by apache.

From the class HoodieTableMetadataUtil, method convertMetadataToBloomFilterRecords:

/**
 * Convert commit action metadata to bloom filter records.
 *
 * @param context                 - Engine context to use
 * @param commitMetadata          - Commit action metadata
 * @param instantTime             - Action instant time
 * @param recordsGenerationParams - Parameters for bloom filter record generation
 * @return HoodieData of metadata table records
 */
public static HoodieData<HoodieRecord> convertMetadataToBloomFilterRecords(HoodieEngineContext context, HoodieCommitMetadata commitMetadata, String instantTime, MetadataRecordsGenerationParams recordsGenerationParams) {
    final List<HoodieWriteStat> allWriteStats = commitMetadata.getPartitionToWriteStats().values().stream().flatMap(entry -> entry.stream()).collect(Collectors.toList());
    if (allWriteStats.isEmpty()) {
        return context.emptyHoodieData();
    }
    final int parallelism = Math.max(Math.min(allWriteStats.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
    HoodieData<HoodieWriteStat> allWriteStatsRDD = context.parallelize(allWriteStats, parallelism);
    return allWriteStatsRDD.flatMap(hoodieWriteStat -> {
        final String partition = hoodieWriteStat.getPartitionPath();
        // For bloom filter index, delta writes do not change the base file bloom filter entries
        if (hoodieWriteStat instanceof HoodieDeltaWriteStat) {
            return Collections.emptyListIterator();
        }
        String pathWithPartition = hoodieWriteStat.getPath();
        if (pathWithPartition == null) {
            // Write stat carries no path (e.g. an empty commit); nothing to index
            LOG.error("Failed to find path in write stat to update metadata table " + hoodieWriteStat);
            return Collections.emptyListIterator();
        }
        int offset = partition.equals(NON_PARTITIONED_NAME) ? (pathWithPartition.startsWith("/") ? 1 : 0) : partition.length() + 1;
        final String fileName = pathWithPartition.substring(offset);
        if (!FSUtils.isBaseFile(new Path(fileName))) {
            return Collections.emptyListIterator();
        }
        final Path writeFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition);
        try (HoodieFileReader<IndexedRecord> fileReader = HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), writeFilePath)) {
            try {
                final BloomFilter fileBloomFilter = fileReader.readBloomFilter();
                if (fileBloomFilter == null) {
                    LOG.error("Failed to read bloom filter for " + writeFilePath);
                    return Collections.emptyListIterator();
                }
                ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes());
                HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord(partition, fileName, instantTime, recordsGenerationParams.getBloomFilterType(), bloomByteBuffer, false);
                return Collections.singletonList(record).iterator();
            } catch (Exception e) {
                // The file reader is closed by try-with-resources; no explicit close is needed.
                LOG.error("Failed to read bloom filter for " + writeFilePath, e);
                return Collections.emptyListIterator();
            }
        } catch (IOException e) {
            LOG.error("Failed to get bloom filter for file: " + writeFilePath + ", write stat: " + hoodieWriteStat);
        }
        return Collections.emptyListIterator();
    });
}
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BiFunction(java.util.function.BiFunction) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) ByteBuffer(java.nio.ByteBuffer) MAX(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MAX) Logger(org.apache.log4j.Logger) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Collectors(java.util.stream.Collectors) TOTAL_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_SIZE) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) VALUE_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.VALUE_COUNT) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) TOTAL_UNCOMPRESSED_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_UNCOMPRESSED_SIZE) EMPTY_PARTITION_NAME(org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) NULL_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.NULL_COUNT) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) LinkedList(java.util.LinkedList) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString) GenericRecord(org.apache.avro.generic.GenericRecord) MIN(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MIN) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) NON_PARTITIONED_NAME(org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) COLUMN_RANGE_MERGE_FUNCTION(org.apache.hudi.common.model.HoodieColumnRangeMetadata.COLUMN_RANGE_MERGE_FUNCTION) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
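
The offset arithmetic in the middle of the method strips the partition prefix from the stored path. The same computation as a standalone sketch (the helper is hypothetical; nonPartitionedName stands in for the NON_PARTITIONED_NAME constant imported above):

// Hypothetical helper: recover the bare file name from a write-stat path that
// may or may not carry a partition prefix.
private static String fileNameFrom(String pathWithPartition, String partition, String nonPartitionedName) {
    int offset;
    if (partition.equals(nonPartitionedName)) {
        // Non-partitioned table: the path is just the file name, possibly with a leading slash.
        offset = pathWithPartition.startsWith("/") ? 1 : 0;
    } else {
        // Partitioned table: skip the "<partition>/" prefix.
        offset = partition.length() + 1;
    }
    return pathWithPartition.substring(offset);
}

For a partitioned table, fileNameFrom("2021/03/01/f1.parquet", "2021/03/01", nonPartitionedName) yields "f1.parquet"; for a non-partitioned one, only a leading slash is dropped if present.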

Example 5 with HoodieDeltaWriteStat

Use of org.apache.hudi.common.model.HoodieDeltaWriteStat in project hudi by apache.

From the class HiveTestUtil, method createLogFiles:

private static HoodieCommitMetadata createLogFiles(Map<String, List<HoodieWriteStat>> partitionWriteStats, boolean isLogSchemaSimple, boolean useSchemaFromCommitMetadata) throws InterruptedException, IOException, URISyntaxException {
    HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
    for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
        String partitionPath = wEntry.getKey();
        for (HoodieWriteStat wStat : wEntry.getValue()) {
            Path path = new Path(wStat.getPath());
            HoodieBaseFile dataFile = new HoodieBaseFile(fileSystem.getFileStatus(path));
            HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple);
            HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
            writeStat.setFileId(dataFile.getFileId());
            writeStat.setPath(logFile.getPath().toString());
            commitMetadata.addWriteStat(partitionPath, writeStat);
        }
    }
    addSchemaToCommitMetadata(commitMetadata, isLogSchemaSimple, useSchemaFromCommitMetadata);
    return commitMetadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Path(org.apache.hadoop.fs.Path) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) List(java.util.List) ArrayList(java.util.ArrayList) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
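
Once added via addWriteStat, the delta stats can be read back from the commit metadata the same way Example 4 consumes them. A minimal hedged sketch using only getPartitionToWriteStats() from the examples above:

// Walk a commit's write stats and pick out the log-file (delta) entries.
private static void printDeltaStats(HoodieCommitMetadata commitMetadata) {
    for (Map.Entry<String, List<HoodieWriteStat>> entry : commitMetadata.getPartitionToWriteStats().entrySet()) {
        for (HoodieWriteStat stat : entry.getValue()) {
            if (stat instanceof HoodieDeltaWriteStat) {
                // Delta stats point at log files rather than base files.
                System.out.println(entry.getKey() + " -> " + stat.getPath());
            }
        }
    }
}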

Aggregations

HoodieDeltaWriteStat (org.apache.hudi.common.model.HoodieDeltaWriteStat): 5 uses
ArrayList (java.util.ArrayList): 3 uses
Path (org.apache.hadoop.fs.Path): 3 uses
IOException (java.io.IOException): 2 uses
HashMap (java.util.HashMap): 2 uses
List (java.util.List): 2 uses
Map (java.util.Map): 2 uses
FileSlice (org.apache.hudi.common.model.FileSlice): 2 uses
HoodieColumnRangeMetadata (org.apache.hudi.common.model.HoodieColumnRangeMetadata): 2 uses
HoodieException (org.apache.hudi.exception.HoodieException): 2 uses
ByteBuffer (java.nio.ByteBuffer): 1 use
Arrays (java.util.Arrays): 1 use
Collections (java.util.Collections): 1 use
Comparator (java.util.Comparator): 1 use
LinkedList (java.util.LinkedList): 1 use
Objects (java.util.Objects): 1 use
BiFunction (java.util.function.BiFunction): 1 use
Collectors (java.util.stream.Collectors): 1 use
Stream (java.util.stream.Stream): 1 use
Nonnull (javax.annotation.Nonnull): 1 use