
Example 1 with ClosableIterator

use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.

the class HoodieArchivedTimeline method loadInstants.

/**
 * Reads the selected instants. Do NOT use this directly; use one of the helper methods above.
 * If loadInstantDetails is set to true, this also updates the 'readCommits' map with commit details.
 * If a filter is specified, only the filtered instants are loaded.
 * If a commitsFilter is specified, only the filtered records are loaded.
 */
private List<HoodieInstant> loadInstants(TimeRangeFilter filter, boolean loadInstantDetails, Function<GenericRecord, Boolean> commitsFilter) {
    try {
        // List all files
        FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
        // Sort files by version suffix in reverse (implies reverse chronological order)
        Arrays.sort(fsStatuses, new ArchiveFileVersionComparator());
        Set<HoodieInstant> instantsInRange = new HashSet<>();
        for (FileStatus fs : fsStatuses) {
            // Read the archived file
            try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(), new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) {
                int instantsInPreviousFile = instantsInRange.size();
                // Read the avro blocks
                while (reader.hasNext()) {
                    HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
                    // Parse the records in the block and filter them by commitsFilter and the time range (e.g., startTime/endTime of the records in the block)
                    try (ClosableIterator<IndexedRecord> itr = blk.getRecordItr()) {
                        StreamSupport.stream(Spliterators.spliteratorUnknownSize(itr, Spliterator.IMMUTABLE), true)
                                .filter(r -> commitsFilter.apply((GenericRecord) r))
                                .map(r -> readCommit((GenericRecord) r, loadInstantDetails))
                                .filter(c -> filter == null || filter.isInRange(c))
                                .forEach(instantsInRange::add);
                    }
                }
                if (filter != null) {
                    int instantsInCurrentFile = instantsInRange.size() - instantsInPreviousFile;
                    if (instantsInPreviousFile > 0 && instantsInCurrentFile == 0) {
                        // This signals we crossed lower bound of desired time window.
                        break;
                    }
                }
            } catch (Exception originalException) {
                // A concurrent archive-file merge can leave a half-written merged file behind; ignore the read failure only in that case.
                try {
                    Path planPath = new Path(metaClient.getArchivePath(), MERGE_ARCHIVE_PLAN_NAME);
                    HoodieWrapperFileSystem fileSystem = metaClient.getFs();
                    if (fileSystem.exists(planPath)) {
                        HoodieMergeArchiveFilePlan plan = TimelineMetadataUtils.deserializeAvroMetadata(FileIOUtils.readDataFromPath(fileSystem, planPath).get(), HoodieMergeArchiveFilePlan.class);
                        String mergedArchiveFileName = plan.getMergedArchiveFileName();
                        if (!StringUtils.isNullOrEmpty(mergedArchiveFileName) && fs.getPath().getName().equalsIgnoreCase(mergedArchiveFileName)) {
                            LOG.warn("Catch exception because of reading uncompleted merging archive file " + mergedArchiveFileName + ". Ignore it here.");
                            continue;
                        }
                    }
                    throw originalException;
                } catch (Exception e) {
                // e.g., both the archive file and the merge plan may be corrupted; rethrow the original exception.
                    throw originalException;
                }
            }
        }
        ArrayList<HoodieInstant> result = new ArrayList<>(instantsInRange);
        Collections.sort(result);
        return result;
    } catch (IOException e) {
        throw new HoodieIOException("Could not load archived commit timeline from path " + metaClient.getArchivePath(), e);
    }
}
Also used : HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieArchivedMetaEntry(org.apache.hudi.avro.model.HoodieArchivedMetaEntry) Spliterators(java.util.Spliterators) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) Matcher(java.util.regex.Matcher) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) StreamSupport(java.util.stream.StreamSupport) Nonnull(javax.annotation.Nonnull) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) IndexedRecord(org.apache.avro.generic.IndexedRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Set(java.util.Set) IOException(java.io.IOException) HoodieMergeArchiveFilePlan(org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan) StandardCharsets(java.nio.charset.StandardCharsets) Serializable(java.io.Serializable) List(java.util.List) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) Collections(java.util.Collections) Spliterator(java.util.Spliterator)
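
The examples on this page use ClosableIterator both as a plain java.util.Iterator (it is handed to Spliterators.spliteratorUnknownSize above) and as a try-with-resources resource, and Example 5 closes instances via a ClosableIterator::close method reference inside forEach. A minimal sketch of an interface with that shape, assuming the Hudi definition is essentially an Iterator that is also AutoCloseable with an unchecked close():

import java.util.Iterator;

// Minimal sketch, not the verbatim Hudi source: an iterator that also owns
// resources, so it can back a Stream/Spliterator and still participate in
// try-with-resources.
public interface ClosableIterator<R> extends Iterator<R>, AutoCloseable {

    // Narrowed from AutoCloseable#close(): implementations release their
    // underlying readers or file handles without a checked exception, which
    // is what allows the ClosableIterator::close method reference in forEach.
    @Override
    void close();
}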

Example 2 with ClosableIterator

use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.

the class HoodieHFileDataBlock method lookupRecords.

// TODO abstract this w/in HoodieDataBlock
@Override
protected ClosableIterator<IndexedRecord> lookupRecords(List<String> keys) throws IOException {
    HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get();
    // NOTE: It's important to extend Hadoop configuration here to make sure configuration
    // is appropriately carried over
    Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf());
    inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());
    Path inlinePath = InLineFSUtils.getInlineFilePath(blockContentLoc.getLogFile().getPath(), blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(), blockContentLoc.getContentPositionInLogFile(), blockContentLoc.getBlockSize());
    // HFile read will be efficient if keys are sorted, since on storage, records are sorted by key. This will avoid unnecessary seeks.
    Collections.sort(keys);
    final HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(inlineConf, inlinePath, new CacheConfig(inlineConf), inlinePath.getFileSystem(inlineConf));
    // Get writer's schema from the header
    final ClosableIterator<IndexedRecord> recordIterator = reader.getRecordIterator(keys, readerSchema);
    return new ClosableIterator<IndexedRecord>() {

        @Override
        public boolean hasNext() {
            return recordIterator.hasNext();
        }

        @Override
        public IndexedRecord next() {
            return recordIterator.next();
        }

        @Override
        public void close() {
            recordIterator.close();
            reader.close();
        }
    };
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) IndexedRecord(org.apache.avro.generic.IndexedRecord) InLineFileSystem(org.apache.hudi.common.fs.inline.InLineFileSystem) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) HoodieHFileReader(org.apache.hudi.io.storage.HoodieHFileReader) CacheConfig(org.apache.hadoop.hbase.io.hfile.CacheConfig)
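
For context, a hedged sketch of how the iterator returned by lookupRecords might be consumed; the caller below is hypothetical (in Hudi the method is protected and invoked from the data-block reading path), and the key list is illustrative. A mutable list is passed because lookupRecords sorts the keys in place, and closing the outer iterator is what also closes the underlying HoodieHFileReader:

// Hypothetical caller; 'block' and the keys are illustrative, not from the source.
// Assumes the enclosing method declares 'throws IOException'.
List<String> keys = new ArrayList<>(Arrays.asList("key-0001", "key-0002"));
try (ClosableIterator<IndexedRecord> records = block.lookupRecords(keys)) {
    while (records.hasNext()) {
        IndexedRecord record = records.next();
        // process the looked-up record ...
    }
}
// The implicit close() above closes both the delegating iterator and the
// HoodieHFileReader it wraps, so no HFile handle is leaked.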

Example 3 with ClosableIterator

use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.

the class BootstrapOperator method loadRecords.

/**
 * Loads all the indices of the given partition path into the backup state.
 *
 * @param partitionPath The partition path
 */
@SuppressWarnings("unchecked")
protected void loadRecords(String partitionPath) throws Exception {
    long start = System.currentTimeMillis();
    final int parallelism = getRuntimeContext().getNumberOfParallelSubtasks();
    final int maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks();
    final int taskID = getRuntimeContext().getIndexOfThisSubtask();
    HoodieTimeline commitsTimeline = this.hoodieTable.getMetaClient().getCommitsTimeline();
    if (!StringUtils.isNullOrEmpty(lastInstantTime)) {
        commitsTimeline = commitsTimeline.findInstantsAfter(lastInstantTime);
    }
    Option<HoodieInstant> latestCommitTime = commitsTimeline.filterCompletedInstants().lastInstant();
    if (latestCommitTime.isPresent()) {
        BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat());
        Schema schema = new TableSchemaResolver(this.hoodieTable.getMetaClient()).getTableAvroSchema();
        List<FileSlice> fileSlices = this.hoodieTable.getSliceView().getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp(), true).collect(toList());
        for (FileSlice fileSlice : fileSlices) {
            if (!shouldLoadFile(fileSlice.getFileId(), maxParallelism, parallelism, taskID)) {
                continue;
            }
            LOG.info("Load records from {}.", fileSlice);
            // load parquet records
            fileSlice.getBaseFile().ifPresent(baseFile -> {
                // filter out corrupted files
                if (!isValidFile(baseFile.getFileStatus())) {
                    return;
                }
                try (ClosableIterator<HoodieKey> iterator = fileUtils.getHoodieKeyIterator(this.hadoopConf, new Path(baseFile.getPath()))) {
                    iterator.forEachRemaining(hoodieKey -> {
                        output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice))));
                    });
                }
            });
            // load avro log records
            List<String> logPaths = fileSlice.getLogFiles().filter(logFile -> isValidFile(logFile.getFileStatus())).map(logFile -> logFile.getPath().toString()).collect(toList());
            HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(logPaths, schema, latestCommitTime.get().getTimestamp(), writeConfig, hadoopConf);
            try {
                for (String recordKey : scanner.getRecords().keySet()) {
                    output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(new HoodieKey(recordKey, partitionPath), fileSlice))));
                }
            } catch (Exception e) {
                throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e);
            } finally {
                scanner.close();
            }
        }
    }
    long cost = System.currentTimeMillis() - start;
    LOG.info("Task [{}}:{}}] finish loading the index under partition {} and sending them to downstream, time cost: {} milliseconds.", this.getClass().getSimpleName(), taskID, partitionPath, cost);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) BaseFileUtils(org.apache.hudi.common.util.BaseFileUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) BootstrapAggFunction(org.apache.hudi.sink.bootstrap.aggregate.BootstrapAggFunction) CkpMetadata(org.apache.hudi.sink.meta.CkpMetadata) ListState(org.apache.flink.api.common.state.ListState) StringUtils(org.apache.hudi.common.util.StringUtils) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) FlinkTables(org.apache.hudi.util.FlinkTables) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) StreamerUtil(org.apache.hudi.util.StreamerUtil) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Types(org.apache.flink.api.common.typeinfo.Types) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) StateSnapshotContext(org.apache.flink.runtime.state.StateSnapshotContext) HoodieRecordGlobalLocation(org.apache.hudi.common.model.HoodieRecordGlobalLocation) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) Logger(org.slf4j.Logger) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Configuration(org.apache.flink.configuration.Configuration) StreamerUtil.isValidFile(org.apache.hudi.util.StreamerUtil.isValidFile) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) AbstractStreamOperator(org.apache.flink.streaming.api.operators.AbstractStreamOperator) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) GlobalAggregateManager(org.apache.flink.runtime.taskexecutor.GlobalAggregateManager) HoodieKey(org.apache.hudi.common.model.HoodieKey) Pattern(java.util.regex.Pattern) OneInputStreamOperator(org.apache.flink.streaming.api.operators.OneInputStreamOperator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) FlinkOptions(org.apache.hudi.configuration.FlinkOptions) FormatUtils(org.apache.hudi.table.format.FormatUtils) KeyGroupRangeAssignment(org.apache.flink.runtime.state.KeyGroupRangeAssignment) StateInitializationContext(org.apache.flink.runtime.state.StateInitializationContext)

Example 4 with ClosableIterator

use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.

the class HoodieHFileReader method getRecordIterator.

public ClosableIterator<R> getRecordIterator(List<String> keys, Schema schema) throws IOException {
    this.schema = schema;
    reader.loadFileInfo();
    Iterator<String> iterator = keys.iterator();
    return new ClosableIterator<R>() {

        private R next;

        @Override
        public void close() {
        }

        @Override
        public boolean hasNext() {
            try {
                while (iterator.hasNext()) {
                    Option<R> value = getRecordByKey(iterator.next(), schema);
                    if (value.isPresent()) {
                        next = value.get();
                        return true;
                    }
                }
                return false;
            } catch (IOException e) {
                throw new HoodieIOException("unable to read next record from hfile ", e);
            }
        }

        @Override
        public R next() {
            return next;
        }
    };
}
Also used : HoodieIOException(org.apache.hudi.exception.HoodieIOException) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) IOException(java.io.IOException)
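
Note the design choice here: the key lookup happens inside hasNext(), which caches the record it finds, and next() simply returns that cached value. A short sketch of the consumption pattern this implies; reader, keys, and schema are assumed to be set up elsewhere, and the record type is assumed to be IndexedRecord as in Example 2:

// Sketch only: always pair next() with a preceding hasNext(), because the
// lookup happens in hasNext() and next() only returns the cached record.
try (ClosableIterator<IndexedRecord> it = reader.getRecordIterator(keys, schema)) {
    while (it.hasNext()) {
        IndexedRecord rec = it.next();
        // handle the record found for this key ...
    }
}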

Example 5 with ClosableIterator

use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.

the class BitCaskDiskMap method close.

@Override
public void close() {
    valueMetadataMap.clear();
    try {
        if (writeOnlyFileHandle != null) {
            writeOnlyFileHandle.flush();
            fileOutputStream.getChannel().force(false);
            writeOnlyFileHandle.close();
        }
        while (!openedAccessFiles.isEmpty()) {
            BufferedRandomAccessFile file = openedAccessFiles.poll();
            if (null != file) {
                try {
                    file.close();
                } catch (IOException ioe) {
                    // ignore failures while closing individual read handles
                }
            }
        }
        writeOnlyFile.delete();
        this.iterators.forEach(ClosableIterator::close);
    } catch (Exception e) {
        // delete the file for any sort of exception
        writeOnlyFile.delete();
    } finally {
        super.close();
    }
}
Also used : BufferedRandomAccessFile(org.apache.hudi.common.util.BufferedRandomAccessFile) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieException(org.apache.hudi.exception.HoodieException) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException)
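
close() above relies on every ClosableIterator handed out by the map being tracked in this.iterators. A hedged sketch of that registration side; the factory method below is a hypothetical name, not an actual BitCaskDiskMap member, and the value type is illustrative:

// Illustrative sketch of the tracking pattern close() relies on; the field
// mirrors this.iterators above, while openLazyFileIterator() is hypothetical.
private final List<ClosableIterator<byte[]>> iterators = new ArrayList<>();

public ClosableIterator<byte[]> iterator() {
    ClosableIterator<byte[]> it = openLazyFileIterator(); // read a view over the on-disk file
    iterators.add(it); // remember it so close() can release any iterator still open
    return it;
}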

Aggregations

ClosableIterator (org.apache.hudi.common.util.ClosableIterator) 10
IOException (java.io.IOException) 7
IndexedRecord (org.apache.avro.generic.IndexedRecord) 7
Schema (org.apache.avro.Schema) 5
Path (org.apache.hadoop.fs.Path) 5
List (java.util.List) 4
GenericRecord (org.apache.avro.generic.GenericRecord) 4
Option (org.apache.hudi.common.util.Option) 4
HoodieException (org.apache.hudi.exception.HoodieException) 4
ArrayList (java.util.ArrayList) 3
HashMap (java.util.HashMap) 3
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 3
Collections (java.util.Collections) 2
Comparator (java.util.Comparator) 2
Map (java.util.Map) 2
Pattern (java.util.regex.Pattern) 2
Collectors (java.util.stream.Collectors) 2
GenericRecordBuilder (org.apache.avro.generic.GenericRecordBuilder) 2
GenericRowData (org.apache.flink.table.data.GenericRowData) 2
RowData (org.apache.flink.table.data.RowData) 2