
Example 1 with SliceView

Use of org.apache.hudi.common.table.view.TableFileSystemView.SliceView in project hudi by apache.

From class HoodieAppendHandle, method init:

private void init(HoodieRecord record) {
    if (doInit) {
        // extract some information from the first record
        SliceView rtView = hoodieTable.getSliceView();
        Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId);
        // Set the base commit time as the current instantTime for new inserts into log files
        String baseInstantTime;
        String baseFile = "";
        List<String> logFiles = new ArrayList<>();
        if (fileSlice.isPresent()) {
            baseInstantTime = fileSlice.get().getBaseInstantTime();
            baseFile = fileSlice.get().getBaseFile().map(BaseFile::getFileName).orElse("");
            logFiles = fileSlice.get().getLogFiles().map(HoodieLogFile::getFileName).collect(Collectors.toList());
        } else {
            baseInstantTime = instantTime;
            // This means there is no base data file, start appending to a new log file
            fileSlice = Option.of(new FileSlice(partitionPath, baseInstantTime, this.fileId));
            LOG.info("New AppendHandle for partition :" + partitionPath);
        }
        // Prepare the first write status
        writeStatus.setStat(new HoodieDeltaWriteStat());
        writeStatus.setFileId(fileId);
        writeStatus.setPartitionPath(partitionPath);
        averageRecordSize = sizeEstimator.sizeEstimate(record);
        HoodieDeltaWriteStat deltaWriteStat = (HoodieDeltaWriteStat) writeStatus.getStat();
        deltaWriteStat.setPrevCommit(baseInstantTime);
        deltaWriteStat.setPartitionPath(partitionPath);
        deltaWriteStat.setFileId(fileId);
        deltaWriteStat.setBaseFile(baseFile);
        deltaWriteStat.setLogFiles(logFiles);
        try {
            // Save hoodie partition meta in the partition path
            HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, baseInstantTime, new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
            partitionMetadata.trySave(getPartitionId());
            // Since the actual log file written to can be different based on when rollover happens, we use the
            // base file to denote some log appends happened on a slice. writeToken will still fence concurrent
            // writers.
            // https://issues.apache.org/jira/browse/HUDI-1517
            createMarkerFile(partitionPath, FSUtils.makeDataFileName(baseInstantTime, writeToken, fileId, hoodieTable.getBaseFileExtension()));
            this.writer = createLogWriter(fileSlice, baseInstantTime);
        } catch (Exception e) {
            LOG.error("Error in update task at commit " + instantTime, e);
            writeStatus.setGlobalError(e);
            throw new HoodieUpsertException("Failed to initialize HoodieAppendHandle for FileId: " + fileId + " on commit " + instantTime + " on HDFS path " + hoodieTable.getMetaClient().getBasePath() + "/" + partitionPath, e);
        }
        doInit = false;
    }
}
Also used:
Path (org.apache.hadoop.fs.Path)
FileSlice (org.apache.hudi.common.model.FileSlice)
ArrayList (java.util.ArrayList)
HoodiePartitionMetadata (org.apache.hudi.common.model.HoodiePartitionMetadata)
HoodieException (org.apache.hudi.exception.HoodieException)
HoodieUpsertException (org.apache.hudi.exception.HoodieUpsertException)
HoodieAppendException (org.apache.hudi.exception.HoodieAppendException)
IOException (java.io.IOException)
SliceView (org.apache.hudi.common.table.view.TableFileSystemView.SliceView)
BaseFile (org.apache.hudi.common.model.BaseFile)
HoodieDeltaWriteStat (org.apache.hudi.common.model.HoodieDeltaWriteStat)
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)
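
The pivotal step in this example is the getLatestFileSlice lookup, which decides whether the append continues an existing file slice or starts a fresh one. Below is a minimal, hypothetical sketch of just that decision, assuming an already-initialized HoodieTable; the class name, method name, and parameters are illustrative, not part of Hudi:

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.table.view.TableFileSystemView.SliceView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.HoodieTable;

public class SliceLookupSketch {
    /**
     * Returns the instant time a log append should be keyed to: the base instant
     * of the latest file slice when one exists, otherwise the current instant
     * (meaning a brand-new slice with no base file is being started).
     */
    static String resolveBaseInstant(HoodieTable<?, ?, ?, ?> hoodieTable,
                                     String partitionPath, String fileId,
                                     String currentInstantTime) {
        SliceView rtView = hoodieTable.getSliceView();
        Option<FileSlice> fileSlice = rtView.getLatestFileSlice(partitionPath, fileId);
        return fileSlice.isPresent()
                ? fileSlice.get().getBaseInstantTime()
                : currentInstantTime;
    }
}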

Example 2 with SliceView

Use of org.apache.hudi.common.table.view.TableFileSystemView.SliceView in project hudi by apache.

From class HoodieCompactor, method generateCompactionPlan:

/**
 * Generate a new compaction plan for scheduling.
 *
 * @param context                               HoodieEngineContext
 * @param hoodieTable                           Hoodie Table
 * @param config                                Hoodie Write Configuration
 * @param compactionCommitTime                  scheduled compaction commit time
 * @param fgIdsInPendingCompactionAndClustering file group ids for which a compaction or clustering is already pending
 * @return the generated compaction plan, or {@code null} if no partition has anything to compact
 * @throws IOException if an I/O error occurs while generating the plan
 */
HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, HoodieTable<T, I, K, O> hoodieTable, HoodieWriteConfig config, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactionAndClustering) throws IOException {
    // Accumulator to keep track of total log files for a table
    HoodieAccumulator totalLogFiles = context.newAccumulator();
    // Accumulator to keep track of total log file slices for a table
    HoodieAccumulator totalFileSlices = context.newAccumulator();
    ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient().getTableType().name());
    // TODO : check if maxMemory is not greater than JVM or executor memory
    // TODO - rollback any compactions in flight
    HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
    LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
    List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath());
    // filter the partition paths if needed to reduce list status
    partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
    if (partitionPaths.isEmpty()) {
        // In case no partitions could be picked, return no compaction plan
        return null;
    }
    SliceView fileSystemView = hoodieTable.getSliceView();
    LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
    context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact");
    List<HoodieCompactionOperation> operations = context.flatMap(partitionPaths, partitionPath -> fileSystemView.getLatestFileSlices(partitionPath).filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId())).map(s -> {
        List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(toList());
        totalLogFiles.add(logFiles.size());
        totalFileSlices.add(1L);
        // Avro generated classes are not inheriting Serializable. Using CompactionOperation POJO
        // for Map operations and collecting them finally in Avro generated classes for storing
        // into meta files.
        Option<HoodieBaseFile> dataFile = s.getBaseFile();
        return new CompactionOperation(dataFile, partitionPath, logFiles, config.getCompactionStrategy().captureMetrics(config, s));
    }).filter(c -> !c.getDeltaFileNames().isEmpty()), partitionPaths.size()).stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
    LOG.info("Total of " + operations.size() + " compactions are retrieved");
    LOG.info("Total number of latest files slices " + totalFileSlices.value());
    LOG.info("Total number of log files " + totalLogFiles.value());
    LOG.info("Total number of file slices " + totalFileSlices.value());
    // Filter the compactions with the passed in filter. This lets us choose most effective
    // compactions only
    HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations, CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
    ValidationUtils.checkArgument(compactionPlan.getOperations().stream().noneMatch(op -> fgIdsInPendingCompactionAndClustering.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))), "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. " + "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactionAndClustering + ", Selected workload :" + compactionPlan);
    if (compactionPlan.getOperations().isEmpty()) {
        LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
    }
    return compactionPlan;
}
Also used:
HoodieTable (org.apache.hudi.table.HoodieTable)
HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils)
FileSystem (org.apache.hadoop.fs.FileSystem)
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)
CollectionUtils (org.apache.hudi.common.util.CollectionUtils)
Option (org.apache.hudi.common.util.Option)
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext)
ArrayList (java.util.ArrayList)
Logger (org.apache.log4j.Logger)
HoodieTableType (org.apache.hudi.common.model.HoodieTableType)
HoodieAccumulator (org.apache.hudi.common.data.HoodieAccumulator)
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)
RuntimeStats (org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats)
Path (org.apache.hadoop.fs.Path)
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)
StreamSupport (java.util.stream.StreamSupport)
HoodieFileGroupId (org.apache.hudi.common.model.HoodieFileGroupId)
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline)
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)
ValidationUtils (org.apache.hudi.common.util.ValidationUtils)
HoodieData (org.apache.hudi.common.data.HoodieData)
TableSchemaResolver (org.apache.hudi.common.table.TableSchemaResolver)
HoodieMergedLogRecordScanner (org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner)
Schema (org.apache.avro.Schema)
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)
Iterator (java.util.Iterator)
TaskContextSupplier (org.apache.hudi.common.engine.TaskContextSupplier)
Collection (java.util.Collection)
Set (java.util.Set)
IOException (java.io.IOException)
CompactionStrategy (org.apache.hudi.table.action.compact.strategy.CompactionStrategy)
Serializable (java.io.Serializable)
CompactionOperation (org.apache.hudi.common.model.CompactionOperation)
HoodieCompactionOperation (org.apache.hudi.avro.model.HoodieCompactionOperation)
WriteStatus (org.apache.hudi.client.WriteStatus)
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload)
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)
HoodieCompactionHandler (org.apache.hudi.table.HoodieCompactionHandler)
List (java.util.List)
Collectors.toList (java.util.stream.Collectors.toList)
HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan)
SliceView (org.apache.hudi.common.table.view.TableFileSystemView.SliceView)
IOUtils (org.apache.hudi.io.IOUtils)
LogManager (org.apache.log4j.LogManager)
FSUtils (org.apache.hudi.common.fs.FSUtils)
CompactionUtils (org.apache.hudi.common.util.CompactionUtils)
Pair (org.apache.hudi.common.util.collection.Pair)
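
The heart of this example is the per-partition scan: getLatestFileSlices streams the newest slice of every file group, slices already pending compaction or clustering are dropped, and only slices that actually carry log files survive as candidates. A condensed, hypothetical sketch of that selection follows; the class and method names are illustrative, and the direct log-file check stands in for the getDeltaFileNames() filter used above:

import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieFileGroupId;
import org.apache.hudi.common.table.view.TableFileSystemView.SliceView;

public class CompactionCandidateSketch {
    /**
     * Returns the latest file slices in a partition that are eligible for
     * compaction: not already part of a pending compaction or clustering
     * plan, and carrying at least one log file to merge into the base file.
     */
    static List<FileSlice> eligibleSlices(SliceView fileSystemView,
                                          String partitionPath,
                                          Set<HoodieFileGroupId> pendingFgIds) {
        return fileSystemView.getLatestFileSlices(partitionPath)
                .filter(slice -> !pendingFgIds.contains(slice.getFileGroupId()))
                .filter(slice -> slice.getLogFiles().findAny().isPresent())
                .collect(Collectors.toList());
    }
}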

Aggregations

IOException (java.io.IOException) ×2
ArrayList (java.util.ArrayList) ×2
Path (org.apache.hadoop.fs.Path) ×2
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) ×2
SliceView (org.apache.hudi.common.table.view.TableFileSystemView.SliceView) ×2
Serializable (java.io.Serializable) ×1
Collection (java.util.Collection) ×1
Iterator (java.util.Iterator) ×1
List (java.util.List) ×1
Set (java.util.Set) ×1
Collectors.toList (java.util.stream.Collectors.toList) ×1
StreamSupport (java.util.stream.StreamSupport) ×1
Schema (org.apache.avro.Schema) ×1
FileSystem (org.apache.hadoop.fs.FileSystem) ×1
HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils) ×1
HoodieCompactionOperation (org.apache.hudi.avro.model.HoodieCompactionOperation) ×1
HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan) ×1
WriteStatus (org.apache.hudi.client.WriteStatus) ×1
HoodieAccumulator (org.apache.hudi.common.data.HoodieAccumulator) ×1
HoodieData (org.apache.hudi.common.data.HoodieData) ×1