use of org.apache.hudi.common.util.Option in project hudi by apache.
the class HoodieTableMetadataUtil method processRollbackMetadata.
/**
 * Extracts information about the deleted and appended files from the {@code HoodieRollbackMetadata}.
 * <p>
 * During a rollback, files may be deleted (COW, MOR) or rollback blocks may be appended to files (MOR only). This
 * function extracts these changed files for each partition.
 *
 * @param metadataTableTimeline    Current timeline of the Metadata Table
 * @param rollbackMetadata         {@code HoodieRollbackMetadata}
 * @param partitionToDeletedFiles  The {@code Map} to fill with files deleted per partition.
 * @param partitionToAppendedFiles The {@code Map} to fill with files appended per partition and their sizes.
 * @param lastSyncTs               Timestamp of the latest instant already synced to the Metadata Table, if any.
 */
private static void processRollbackMetadata(HoodieActiveTimeline metadataTableTimeline, HoodieRollbackMetadata rollbackMetadata,
                                            Map<String, List<String>> partitionToDeletedFiles,
                                            Map<String, Map<String, Long>> partitionToAppendedFiles,
                                            Option<String> lastSyncTs) {
  rollbackMetadata.getPartitionMetadata().values().forEach(pm -> {
    final String instantToRollback = rollbackMetadata.getCommitsRollback().get(0);
    // Has this rollback produced new files?
    boolean hasRollbackLogFiles = pm.getRollbackLogFiles() != null && !pm.getRollbackLogFiles().isEmpty();
    boolean hasNonZeroRollbackLogFiles = hasRollbackLogFiles
        && pm.getRollbackLogFiles().values().stream().mapToLong(Long::longValue).sum() > 0;

    // If the instant-to-rollback has not been synced to the metadata table yet, there is no need to update the metadata.
    // This can happen in two cases:
    //  Case 1: The Metadata Table timeline is behind the instant-to-rollback.
    boolean shouldSkip = lastSyncTs.isPresent()
        && HoodieTimeline.compareTimestamps(instantToRollback, HoodieTimeline.GREATER_THAN, lastSyncTs.get());
    if (!hasNonZeroRollbackLogFiles && shouldSkip) {
      LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, given metadata table is already synced up to %s",
          instantToRollback, lastSyncTs.get()));
      return;
    }

    //  Case 2: The instant-to-rollback was never committed to the Metadata Table. This can happen if the instant-to-rollback
    //  was a failed commit (never completed), as only completed instants are synced to the Metadata Table.
    //  But the required Metadata Table instants should not have been archived.
    HoodieInstant syncedInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, instantToRollback);
    if (metadataTableTimeline.getCommitsTimeline().isBeforeTimelineStarts(syncedInstant.getTimestamp())) {
      throw new HoodieMetadataException(String.format("The instant %s required to sync rollback of %s has been archived",
          syncedInstant, instantToRollback));
    }
    shouldSkip = !metadataTableTimeline.containsInstant(syncedInstant);
    if (!hasNonZeroRollbackLogFiles && shouldSkip) {
      LOG.info(String.format("Skipping syncing of rollbackMetadata at %s, since this instant was never committed to Metadata Table",
          instantToRollback));
      return;
    }

    final String partition = pm.getPartitionPath();
    if ((!pm.getSuccessDeleteFiles().isEmpty() || !pm.getFailedDeleteFiles().isEmpty()) && !shouldSkip) {
      if (!partitionToDeletedFiles.containsKey(partition)) {
        partitionToDeletedFiles.put(partition, new ArrayList<>());
      }
      // Extract deleted file names from the absolute paths saved in getSuccessDeleteFiles()
      List<String> deletedFiles = pm.getSuccessDeleteFiles().stream()
          .map(p -> new Path(p).getName()).collect(Collectors.toList());
      if (!pm.getFailedDeleteFiles().isEmpty()) {
        deletedFiles.addAll(pm.getFailedDeleteFiles().stream()
            .map(p -> new Path(p).getName()).collect(Collectors.toList()));
      }
      partitionToDeletedFiles.get(partition).addAll(deletedFiles);
    }

    BiFunction<Long, Long, Long> fileMergeFn = (oldSize, newSizeCopy) -> {
      // Keep the larger size, as the rollback log file could have been updated after the written log files were computed.
      return oldSize > newSizeCopy ? oldSize : newSizeCopy;
    };

    if (hasRollbackLogFiles) {
      if (!partitionToAppendedFiles.containsKey(partition)) {
        partitionToAppendedFiles.put(partition, new HashMap<>());
      }
      // Extract appended file names from the absolute paths saved in getRollbackLogFiles()
      pm.getRollbackLogFiles().forEach((path, size) ->
          partitionToAppendedFiles.get(partition).merge(new Path(path).getName(), size, fileMergeFn));
    }
  });
}
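For context, a minimal sketch of a call site, assuming the caller already holds the rollback metadata, the metadata table's active timeline, and the last synced timestamp; the wiring below is illustrative, not the exact Hudi call site.

// Illustrative only: processRollbackMetadata(...) fills the two maps, which a
// caller can then convert into metadata table update records.
Map<String, List<String>> partitionToDeletedFiles = new HashMap<>();
Map<String, Map<String, Long>> partitionToAppendedFiles = new HashMap<>();
processRollbackMetadata(metadataTableTimeline, rollbackMetadata,
    partitionToDeletedFiles, partitionToAppendedFiles, lastSyncTs);
// partitionToDeletedFiles : partition -> names of files removed by the rollback
// partitionToAppendedFiles: partition -> (log file name -> latest known size in bytes)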
use of org.apache.hudi.common.util.Option in project hudi by apache.
the class BaseTableMetadata method getColumnStats.
@Override
public Map<Pair<String, String>, HoodieMetadataColumnStats> getColumnStats(final List<Pair<String, String>> partitionNameFileNameList,
                                                                           final String columnName) throws HoodieMetadataException {
  if (!isColumnStatsIndexEnabled) {
    LOG.error("Metadata column stats index is disabled!");
    return Collections.emptyMap();
  }

  Map<String, Pair<String, String>> columnStatKeyToFileNameMap = new HashMap<>();
  TreeSet<String> sortedKeys = new TreeSet<>();
  final ColumnIndexID columnIndexID = new ColumnIndexID(columnName);
  for (Pair<String, String> partitionNameFileNamePair : partitionNameFileNameList) {
    final String columnStatsIndexKey = HoodieMetadataPayload.getColumnStatsIndexKey(
        new PartitionIndexID(partitionNameFileNamePair.getLeft()),
        new FileIndexID(partitionNameFileNamePair.getRight()),
        columnIndexID);
    sortedKeys.add(columnStatsIndexKey);
    columnStatKeyToFileNameMap.put(columnStatsIndexKey, partitionNameFileNamePair);
  }

  List<String> columnStatKeys = new ArrayList<>(sortedKeys);
  HoodieTimer timer = new HoodieTimer().startTimer();
  List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> hoodieRecordList =
      getRecordsByKeys(columnStatKeys, MetadataPartitionType.COLUMN_STATS.getPartitionPath());
  metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_COLUMN_STATS_METADATA_STR, timer.endTimer()));

  Map<Pair<String, String>, HoodieMetadataColumnStats> fileToColumnStatMap = new HashMap<>();
  for (final Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : hoodieRecordList) {
    if (entry.getRight().isPresent()) {
      final Option<HoodieMetadataColumnStats> columnStatMetadata =
          entry.getRight().get().getData().getColumnStatMetadata();
      if (columnStatMetadata.isPresent()) {
        if (!columnStatMetadata.get().getIsDeleted()) {
          ValidationUtils.checkState(columnStatKeyToFileNameMap.containsKey(entry.getLeft()));
          final Pair<String, String> partitionFileNamePair = columnStatKeyToFileNameMap.get(entry.getLeft());
          ValidationUtils.checkState(!fileToColumnStatMap.containsKey(partitionFileNamePair));
          fileToColumnStatMap.put(partitionFileNamePair, columnStatMetadata.get());
        }
      } else {
        LOG.error("Meta index column stats missing for: " + entry.getLeft());
      }
    }
  }
  return fileToColumnStatMap;
}
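A minimal usage sketch, assuming a concrete BaseTableMetadata instance named tableMetadata; the partition names, file names, and column are invented for illustration.

// Hypothetical lookup of column stats for the "price" column across two files.
List<Pair<String, String>> partitionFilePairs = Arrays.asList(
    Pair.of("2022/01/01", "fileA.parquet"),
    Pair.of("2022/01/02", "fileB.parquet"));
Map<Pair<String, String>, HoodieMetadataColumnStats> colStats =
    tableMetadata.getColumnStats(partitionFilePairs, "price");
colStats.forEach((partitionFile, stats) ->
    LOG.info("Column stats for " + partitionFile + ": nullCount=" + stats.getNullCount()));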
use of org.apache.hudi.common.util.Option in project hudi by apache.
the class BaseTableMetadata method getBloomFilters.
@Override
public Map<Pair<String, String>, BloomFilter> getBloomFilters(final List<Pair<String, String>> partitionNameFileNameList)
    throws HoodieMetadataException {
  if (!isBloomFilterIndexEnabled) {
    LOG.error("Metadata bloom filter index is disabled!");
    return Collections.emptyMap();
  }
  if (partitionNameFileNameList.isEmpty()) {
    return Collections.emptyMap();
  }

  HoodieTimer timer = new HoodieTimer().startTimer();
  Set<String> partitionIDFileIDSortedStrings = new TreeSet<>();
  Map<String, Pair<String, String>> fileToKeyMap = new HashMap<>();
  partitionNameFileNameList.forEach(partitionNameFileNamePair -> {
    final String bloomFilterIndexKey = HoodieMetadataPayload.getBloomFilterIndexKey(
        new PartitionIndexID(partitionNameFileNamePair.getLeft()),
        new FileIndexID(partitionNameFileNamePair.getRight()));
    partitionIDFileIDSortedStrings.add(bloomFilterIndexKey);
    fileToKeyMap.put(bloomFilterIndexKey, partitionNameFileNamePair);
  });

  List<String> partitionIDFileIDStrings = new ArrayList<>(partitionIDFileIDSortedStrings);
  List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> hoodieRecordList =
      getRecordsByKeys(partitionIDFileIDStrings, MetadataPartitionType.BLOOM_FILTERS.getPartitionPath());
  metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_BLOOM_FILTERS_METADATA_STR,
      (timer.endTimer() / partitionIDFileIDStrings.size())));

  Map<Pair<String, String>, BloomFilter> partitionFileToBloomFilterMap = new HashMap<>();
  for (final Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : hoodieRecordList) {
    if (entry.getRight().isPresent()) {
      final Option<HoodieMetadataBloomFilter> bloomFilterMetadata =
          entry.getRight().get().getData().getBloomFilterMetadata();
      if (bloomFilterMetadata.isPresent()) {
        if (!bloomFilterMetadata.get().getIsDeleted()) {
          ValidationUtils.checkState(fileToKeyMap.containsKey(entry.getLeft()));
          final ByteBuffer bloomFilterByteBuffer = bloomFilterMetadata.get().getBloomFilter();
          final String bloomFilterType = bloomFilterMetadata.get().getType();
          final BloomFilter bloomFilter = BloomFilterFactory.fromString(
              StandardCharsets.UTF_8.decode(bloomFilterByteBuffer).toString(), bloomFilterType);
          partitionFileToBloomFilterMap.put(fileToKeyMap.get(entry.getLeft()), bloomFilter);
        }
      } else {
        LOG.error("Meta index bloom filter missing for: " + fileToKeyMap.get(entry.getLeft()));
      }
    }
  }
  return partitionFileToBloomFilterMap;
}
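A similar sketch for the bloom filter lookup; again the names are illustrative, and a positive mightContain answer only narrows the set of candidate files.

// Hypothetical batch lookup of bloom filters, used to prune files during key lookups.
Map<Pair<String, String>, BloomFilter> filters = tableMetadata.getBloomFilters(partitionFilePairs);
BloomFilter bloomFilter = filters.get(Pair.of("2022/01/01", "fileA.parquet"));
if (bloomFilter != null && bloomFilter.mightContain("record-key-001")) {
  // The key may be in fileA.parquet; bloom filters can yield false positives,
  // so a definitive answer still requires checking the file itself.
}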
use of org.apache.hudi.common.util.Option in project hudi by apache.
the class BaseTableMetadata method fetchAllFilesInPartitionPaths.
Map<String, FileStatus[]> fetchAllFilesInPartitionPaths(List<Path> partitionPaths) throws IOException {
  Map<String, Path> partitionInfo = new HashMap<>();
  boolean foundNonPartitionedPath = false;
  for (Path partitionPath : partitionPaths) {
    String partitionName = FSUtils.getRelativePartitionPath(new Path(dataBasePath), partitionPath);
    if (partitionName.isEmpty()) {
      if (partitionInfo.size() > 1) {
        throw new HoodieMetadataException("Found mix of partitioned and non partitioned paths while fetching data from metadata table");
      }
      partitionInfo.put(NON_PARTITIONED_NAME, partitionPath);
      foundNonPartitionedPath = true;
    } else {
      if (foundNonPartitionedPath) {
        throw new HoodieMetadataException("Found mix of partitioned and non partitioned paths while fetching data from metadata table");
      }
      partitionInfo.put(partitionName, partitionPath);
    }
  }

  HoodieTimer timer = new HoodieTimer().startTimer();
  List<Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>>> partitionsFileStatus =
      getRecordsByKeys(new ArrayList<>(partitionInfo.keySet()), MetadataPartitionType.FILES.getPartitionPath());
  metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.LOOKUP_FILES_STR, timer.endTimer()));

  Map<String, FileStatus[]> result = new HashMap<>();
  for (Pair<String, Option<HoodieRecord<HoodieMetadataPayload>>> entry : partitionsFileStatus) {
    if (entry.getValue().isPresent()) {
      mayBeHandleSpuriousDeletes(entry.getValue(), entry.getKey());
      result.put(partitionInfo.get(entry.getKey()).toString(),
          entry.getValue().get().getData().getFileStatuses(hadoopConf.get(), partitionInfo.get(entry.getKey())));
    }
  }
  LOG.info("Listed files in partitions from metadata: partition list =" + Arrays.toString(partitionPaths.toArray()));
  return result;
}
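A minimal sketch of calling this listing API. Note the method is package-private, so a real caller sits inside BaseTableMetadata's package; basePath and the partition names below are assumptions for illustration.

// Illustrative listing of two partitions through the metadata table instead of
// the file system.
List<Path> partitions = Arrays.asList(
    new Path(basePath, "2022/01/01"),
    new Path(basePath, "2022/01/02"));
Map<String, FileStatus[]> filesPerPartition = tableMetadata.fetchAllFilesInPartitionPaths(partitions);
filesPerPartition.forEach((partition, statuses) ->
    LOG.info(partition + " contains " + statuses.length + " files"));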
use of org.apache.hudi.common.util.Option in project hudi by apache.
the class HoodieBackedTableMetadata method getLogRecordScanner.
public Pair<HoodieMetadataMergedLogRecordReader, Long> getLogRecordScanner(List<HoodieLogFile> logFiles, String partitionName) {
  HoodieTimer timer = new HoodieTimer().startTimer();
  List<String> sortedLogFilePaths = logFiles.stream()
      .sorted(HoodieLogFile.getLogFileComparator())
      .map(o -> o.getPath().toString())
      .collect(Collectors.toList());

  // Only those log files which have a corresponding completed instant on the dataset should be read.
  // This is because the metadata table is updated before the dataset instants are committed.
  Set<String> validInstantTimestamps = getValidInstantTimestamps();
  Option<HoodieInstant> latestMetadataInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
  String latestMetadataInstantTime = latestMetadataInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);

  // Load the schema
  Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
  HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
  HoodieMetadataMergedLogRecordReader logRecordScanner = HoodieMetadataMergedLogRecordReader.newBuilder()
      .withFileSystem(metadataMetaClient.getFs())
      .withBasePath(metadataBasePath)
      .withLogFilePaths(sortedLogFilePaths)
      .withReaderSchema(schema)
      .withLatestInstantTime(latestMetadataInstantTime)
      .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES)
      .withBufferSize(BUFFER_SIZE)
      .withSpillableMapBasePath(spillableMapDirectory)
      .withDiskMapType(commonConfig.getSpillableDiskMapType())
      .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
      .withLogBlockTimestamps(validInstantTimestamps)
      .enableFullScan(metadataConfig.enableFullScan())
      .withPartition(partitionName)
      .build();

  Long logScannerOpenMs = timer.endTimer();
  LOG.info(String.format("Opened %d metadata log files (dataset instant=%s, metadata instant=%s) in %d ms",
      sortedLogFilePaths.size(), getLatestDataInstantTime(), latestMetadataInstantTime, logScannerOpenMs));
  return Pair.of(logRecordScanner, logScannerOpenMs);
}
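A short sketch of consuming the returned pair, assuming a HoodieBackedTableMetadata instance named metadata and that logFiles was obtained from the metadata table's file slices; names are illustrative.

// Illustrative use: open the merged log record reader over the FILES partition
// and report how long the scanner took to open.
Pair<HoodieMetadataMergedLogRecordReader, Long> scannerWithOpenTime =
    metadata.getLogRecordScanner(logFiles, MetadataPartitionType.FILES.getPartitionPath());
HoodieMetadataMergedLogRecordReader scanner = scannerWithOpenTime.getLeft();
LOG.info("Metadata log scanner opened in " + scannerWithOpenTime.getRight() + " ms");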