Use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.
Class HoodieArchivedTimeline, method loadInstants.
/**
 * Reads the selected instants. Do NOT use this directly; use one of the helper methods above.
 * If loadInstantDetails is set to true, this also updates the 'readCommits' map with commit details.
 * If a filter is specified, only the filtered instants are loaded.
 * If a commitsFilter is specified, only the filtered records are loaded.
 */
private List<HoodieInstant> loadInstants(TimeRangeFilter filter, boolean loadInstantDetails,
                                         Function<GenericRecord, Boolean> commitsFilter) {
  try {
    // List all archive files
    FileStatus[] fsStatuses = metaClient.getFs().globStatus(
        new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
    // Sort files by version suffix in reverse (implies reverse chronological order)
    Arrays.sort(fsStatuses, new ArchiveFileVersionComparator());
    Set<HoodieInstant> instantsInRange = new HashSet<>();
    for (FileStatus fs : fsStatuses) {
      // Read the archived file
      try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(metaClient.getFs(),
          new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema())) {
        int instantsInPreviousFile = instantsInRange.size();
        // Read the avro blocks
        while (reader.hasNext()) {
          HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
          // Parse the records in the block; this could be skipped if the block carried
          // metadata such as the startTime/endTime of the records it contains
          try (ClosableIterator<IndexedRecord> itr = blk.getRecordItr()) {
            StreamSupport.stream(Spliterators.spliteratorUnknownSize(itr, Spliterator.IMMUTABLE), true)
                .filter(r -> commitsFilter.apply((GenericRecord) r))
                .map(r -> readCommit((GenericRecord) r, loadInstantDetails))
                .filter(c -> filter == null || filter.isInRange(c))
                .forEach(instantsInRange::add);
          }
        }
        if (filter != null) {
          int instantsInCurrentFile = instantsInRange.size() - instantsInPreviousFile;
          if (instantsInPreviousFile > 0 && instantsInCurrentFile == 0) {
            // This signals we crossed the lower bound of the desired time window.
            break;
          }
        }
      } catch (Exception originalException) {
        // Merging small archive files may leave an uncompleted archive file behind;
        // that kind of exception needs to be ignored here.
        try {
          Path planPath = new Path(metaClient.getArchivePath(), MERGE_ARCHIVE_PLAN_NAME);
          HoodieWrapperFileSystem fileSystem = metaClient.getFs();
          if (fileSystem.exists(planPath)) {
            HoodieMergeArchiveFilePlan plan = TimelineMetadataUtils.deserializeAvroMetadata(
                FileIOUtils.readDataFromPath(fileSystem, planPath).get(), HoodieMergeArchiveFilePlan.class);
            String mergedArchiveFileName = plan.getMergedArchiveFileName();
            if (!StringUtils.isNullOrEmpty(mergedArchiveFileName)
                && fs.getPath().getName().equalsIgnoreCase(mergedArchiveFileName)) {
              LOG.warn("Caught exception while reading uncompleted merging archive file "
                  + mergedArchiveFileName + ". Ignoring it here.");
              continue;
            }
          }
          throw originalException;
        } catch (Exception e) {
          // For example, a corrupted archive file and a corrupted plan may both exist.
          throw originalException;
        }
      }
    }
    ArrayList<HoodieInstant> result = new ArrayList<>(instantsInRange);
    Collections.sort(result);
    return result;
  } catch (IOException e) {
    throw new HoodieIOException("Could not load archived commit timeline from path "
        + metaClient.getArchivePath(), e);
  }
}
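The block-reading loop above pairs each ClosableIterator with try-with-resources before streaming over it. Below is a minimal standalone sketch of that pattern; it is not Hudi code, the class and method names are hypothetical, and it only assumes that ClosableIterator extends Iterator and AutoCloseable, as the try-with-resources usage above implies.

import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Consumer;
import java.util.stream.StreamSupport;
import org.apache.hudi.common.util.ClosableIterator;

final class ClosableIteratorStreams {
  // Drains the iterator into the consumer; the iterator is closed even if the stream pipeline throws.
  static <T> void forEachAndClose(ClosableIterator<T> itr, Consumer<? super T> consumer) {
    try (ClosableIterator<T> closing = itr) {
      StreamSupport.stream(Spliterators.spliteratorUnknownSize(closing, Spliterator.IMMUTABLE), false)
          .forEach(consumer);
    }
  }
}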
Use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.
Class HoodieHFileDataBlock, method lookupRecords.
// TODO abstract this w/in HoodieDataBlock
@Override
protected ClosableIterator<IndexedRecord> lookupRecords(List<String> keys) throws IOException {
  HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get();
  // NOTE: It's important to extend Hadoop configuration here to make sure configuration
  //       is appropriately carried over
  Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf());
  inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());
  Path inlinePath = InLineFSUtils.getInlineFilePath(
      blockContentLoc.getLogFile().getPath(),
      blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(),
      blockContentLoc.getContentPositionInLogFile(),
      blockContentLoc.getBlockSize());
  // HFile reads are efficient when keys are sorted, since records are stored sorted by key;
  // this avoids unnecessary seeks.
  Collections.sort(keys);
  final HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(
      inlineConf, inlinePath, new CacheConfig(inlineConf), inlinePath.getFileSystem(inlineConf));
  // Get writer's schema from the header
  final ClosableIterator<IndexedRecord> recordIterator = reader.getRecordIterator(keys, readerSchema);
  return new ClosableIterator<IndexedRecord>() {
    @Override
    public boolean hasNext() {
      return recordIterator.hasNext();
    }

    @Override
    public IndexedRecord next() {
      return recordIterator.next();
    }

    @Override
    public void close() {
      recordIterator.close();
      reader.close();
    }
  };
}
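The anonymous class above is a reusable shape: delegate hasNext()/next() to an inner iterator and release an extra resource when the iterator is closed. A hypothetical generic wrapper capturing that shape (not part of Hudi; the class name is illustrative):

import org.apache.hudi.common.util.ClosableIterator;

final class CloseCoupledIterator<T> implements ClosableIterator<T> {
  private final ClosableIterator<T> delegate;
  private final AutoCloseable extraResource;

  CloseCoupledIterator(ClosableIterator<T> delegate, AutoCloseable extraResource) {
    this.delegate = delegate;
    this.extraResource = extraResource;
  }

  @Override
  public boolean hasNext() {
    return delegate.hasNext();
  }

  @Override
  public T next() {
    return delegate.next();
  }

  @Override
  public void close() {
    delegate.close();
    try {
      extraResource.close();
    } catch (Exception e) {
      throw new RuntimeException("Failed to close coupled resource", e);
    }
  }
}

With such a wrapper, lookupRecords could return new CloseCoupledIterator<>(recordIterator, reader) instead of the inline anonymous class, assuming the HFile reader implements AutoCloseable (its close() call above only shows that it exposes close(), so that remains an assumption here).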
Use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.
Class BootstrapOperator, method loadRecords.
/**
 * Loads all the indices of the given partition path into the backup state.
 *
 * @param partitionPath The partition path
 */
@SuppressWarnings("unchecked")
protected void loadRecords(String partitionPath) throws Exception {
  long start = System.currentTimeMillis();
  final int parallelism = getRuntimeContext().getNumberOfParallelSubtasks();
  final int maxParallelism = getRuntimeContext().getMaxNumberOfParallelSubtasks();
  final int taskID = getRuntimeContext().getIndexOfThisSubtask();
  HoodieTimeline commitsTimeline = this.hoodieTable.getMetaClient().getCommitsTimeline();
  if (!StringUtils.isNullOrEmpty(lastInstantTime)) {
    commitsTimeline = commitsTimeline.findInstantsAfter(lastInstantTime);
  }
  Option<HoodieInstant> latestCommitTime = commitsTimeline.filterCompletedInstants().lastInstant();
  if (latestCommitTime.isPresent()) {
    BaseFileUtils fileUtils = BaseFileUtils.getInstance(this.hoodieTable.getBaseFileFormat());
    Schema schema = new TableSchemaResolver(this.hoodieTable.getMetaClient()).getTableAvroSchema();
    List<FileSlice> fileSlices = this.hoodieTable.getSliceView()
        .getLatestFileSlicesBeforeOrOn(partitionPath, latestCommitTime.get().getTimestamp(), true)
        .collect(toList());
    for (FileSlice fileSlice : fileSlices) {
      // only load file slices assigned to this subtask (a sketch of this check follows the method)
      if (!shouldLoadFile(fileSlice.getFileId(), maxParallelism, parallelism, taskID)) {
        continue;
      }
      LOG.info("Load records from {}.", fileSlice);
      // load parquet records
      fileSlice.getBaseFile().ifPresent(baseFile -> {
        // filter out corrupted files
        if (!isValidFile(baseFile.getFileStatus())) {
          return;
        }
        try (ClosableIterator<HoodieKey> iterator =
                 fileUtils.getHoodieKeyIterator(this.hadoopConf, new Path(baseFile.getPath()))) {
          iterator.forEachRemaining(hoodieKey ->
              output.collect(new StreamRecord(new IndexRecord(generateHoodieRecord(hoodieKey, fileSlice)))));
        }
      });
      // load avro log records
      List<String> logPaths = fileSlice.getLogFiles()
          .filter(logFile -> isValidFile(logFile.getFileStatus()))
          .map(logFile -> logFile.getPath().toString())
          .collect(toList());
      HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(
          logPaths, schema, latestCommitTime.get().getTimestamp(), writeConfig, hadoopConf);
      try {
        for (String recordKey : scanner.getRecords().keySet()) {
          output.collect(new StreamRecord(
              new IndexRecord(generateHoodieRecord(new HoodieKey(recordKey, partitionPath), fileSlice))));
        }
      } catch (Exception e) {
        throw new HoodieException(String.format("Error when loading record keys from files: %s", logPaths), e);
      } finally {
        scanner.close();
      }
    }
  }
  long cost = System.currentTimeMillis() - start;
  LOG.info("Task [{}:{}] finished loading the index under partition {} and sending it downstream, time cost: {} milliseconds.",
      this.getClass().getSimpleName(), taskID, partitionPath, cost);
}
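The shouldLoadFile check near the top of the loop is not shown in this snippet; its job is to make each Flink subtask load a disjoint subset of file groups. A purely illustrative sketch of such a check, built on Flink's KeyGroupRangeAssignment (an assumption for illustration, not copied from the Hudi source):

import org.apache.flink.runtime.state.KeyGroupRangeAssignment;

final class FileAssignmentSketch {
  // Illustrative only: hash the fileId into Flink's key-group space so that every
  // file group is routed to exactly one subtask and subtasks load disjoint sets of slices.
  static boolean shouldLoadFile(String fileId, int maxParallelism, int parallelism, int taskID) {
    return KeyGroupRangeAssignment.assignKeyToParallelOperator(fileId, maxParallelism, parallelism) == taskID;
  }
}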
Use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.
Class HoodieHFileReader, method getRecordIterator.
public ClosableIterator<R> getRecordIterator(List<String> keys, Schema schema) throws IOException {
  this.schema = schema;
  reader.loadFileInfo();
  Iterator<String> iterator = keys.iterator();
  return new ClosableIterator<R>() {
    private R next;

    @Override
    public void close() {
    }

    @Override
    public boolean hasNext() {
      try {
        // Keep probing keys until one resolves to a record; cache it for next().
        while (iterator.hasNext()) {
          Option<R> value = getRecordByKey(iterator.next(), schema);
          if (value.isPresent()) {
            next = value.get();
            return true;
          }
        }
        return false;
      } catch (IOException e) {
        throw new HoodieIOException("unable to read next record from hfile", e);
      }
    }

    @Override
    public R next() {
      return next;
    }
  };
}
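Note that hasNext() above performs the actual key lookups and caches the first hit for next(), so callers should always call hasNext() before each next(). A minimal consumption sketch (the caller-side class and method are hypothetical, and HoodieHFileReader is assumed to reside in org.apache.hudi.io.storage):

import java.io.IOException;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hudi.common.util.ClosableIterator;
import org.apache.hudi.io.storage.HoodieHFileReader;

final class HFileKeyLookupExample {
  // Drains the point-lookup iterator; every next() is preceded by hasNext(), which is
  // what actually reads the matching record out of the HFile.
  static void printMatches(HoodieHFileReader<IndexedRecord> reader, List<String> sortedKeys,
                           Schema schema) throws IOException {
    try (ClosableIterator<IndexedRecord> records = reader.getRecordIterator(sortedKeys, schema)) {
      while (records.hasNext()) {
        System.out.println(records.next());
      }
    }
  }
}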
Use of org.apache.hudi.common.util.ClosableIterator in project hudi by apache.
Class BitCaskDiskMap, method close.
@Override
public void close() {
  valueMetadataMap.clear();
  try {
    if (writeOnlyFileHandle != null) {
      writeOnlyFileHandle.flush();
      fileOutputStream.getChannel().force(false);
      writeOnlyFileHandle.close();
    }
    while (!openedAccessFiles.isEmpty()) {
      BufferedRandomAccessFile file = openedAccessFiles.poll();
      if (null != file) {
        try {
          file.close();
        } catch (IOException ioe) {
          // skip exception
        }
      }
    }
    writeOnlyFile.delete();
    this.iterators.forEach(ClosableIterator::close);
  } catch (Exception e) {
    // delete the file for any sort of exception
    writeOnlyFile.delete();
  } finally {
    super.close();
  }
}
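The close() above also shows an ownership pattern around ClosableIterator: the map remembers every iterator it hands out and closes them all when the map itself is closed. A stripped-down hypothetical sketch of that bookkeeping (not Hudi code):

import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.ClosableIterator;

final class IteratorOwner<T> implements AutoCloseable {
  private final List<ClosableIterator<T>> iterators = new ArrayList<>();

  // Register an iterator so it is released together with this owner.
  ClosableIterator<T> register(ClosableIterator<T> iterator) {
    iterators.add(iterator);
    return iterator;
  }

  @Override
  public void close() {
    // Close every outstanding iterator, mirroring BitCaskDiskMap.close() above.
    iterators.forEach(ClosableIterator::close);
    iterators.clear();
  }
}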