use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class HoodieBloomIndex method loadColumnRangesFromFiles.
/**
 * Builds one (partition path, {@link BloomIndexFileInfo}) pair for every latest base file
 * found in the given partitions.
 *
 * <p>The file info carries the two values returned by
 * {@code HoodieRangeInfoHandle#getMinMaxKeys()} when they can be read; if reading them raises a
 * {@link MetadataNotFoundException}, a warning is logged and the file info is created from the
 * file id alone.
 *
 * @param partitions  partition paths whose latest base files should be inspected.
 * @param context     engine context used to run the per-file lookups.
 * @param hoodieTable table the files belong to.
 * @return list of (partition path, file info) pairs, one per latest base file.
 */
List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromFiles(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
  // Reduce every latest base file to a (partition path, file id) pair.
  List<Pair<String, String>> partitionPathFileIDList =
      getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream()
          .map(partitionAndBaseFile -> Pair.of(partitionAndBaseFile.getKey(), partitionAndBaseFile.getValue().getFileId()))
          .collect(toList());

  // One task per file, but never a parallelism of zero when there are no files.
  final int parallelism = Math.max(partitionPathFileIDList.size(), 1);

  context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)");
  return context.map(partitionPathFileIDList, partitionAndFileId -> {
    final String partitionPath = partitionAndFileId.getKey();
    final String fileId = partitionAndFileId.getValue();
    BloomIndexFileInfo fileInfo;
    try {
      String[] minMaxKeys = new HoodieRangeInfoHandle(config, hoodieTable, partitionAndFileId).getMinMaxKeys();
      fileInfo = new BloomIndexFileInfo(fileId, minMaxKeys[0], minMaxKeys[1]);
    } catch (MetadataNotFoundException me) {
      // No range metadata for this file: fall back to a file info without key range.
      LOG.warn("Unable to find range metadata in file :" + partitionAndFileId);
      fileInfo = new BloomIndexFileInfo(fileId);
    }
    return Pair.of(partitionPath, fileInfo);
  }, parallelism);
}
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class ListBasedHoodieBloomIndexHelper method findMatchingFilesForRecordKeys.
/**
 * Runs the bloom index check over the given file comparison pairs and returns, for every
 * matching record key, the {@link HoodieKey} paired with the {@link HoodieRecordLocation}
 * (base instant time and file id) reported by the lookup.
 *
 * <p>The comparison pairs are materialized as a list and sorted by their left element before
 * being handed to {@code HoodieBaseBloomIndexCheckFunction}. Lookup results without any matching
 * record key are dropped. {@code partitionRecordKeyPairs}, {@code partitionToFileInfo} and
 * {@code recordsPerPartition} are not used by this implementation.
 */
@Override
public HoodiePairData<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, HoodiePairData<String, String> partitionRecordKeyPairs, HoodieData<Pair<String, HoodieKey>> fileComparisonPairs, Map<String, List<BloomIndexFileInfo>> partitionToFileInfo, Map<String, Long> recordsPerPartition) {
  // Order the comparisons by the left element of each pair before checking them.
  List<Pair<String, HoodieKey>> sortedComparisons = HoodieList.getList(fileComparisonPairs).stream()
      .sorted(Comparator.comparing(Pair::getLeft))
      .collect(toList());

  Iterator<List<HoodieKeyLookupResult>> lookupBatches =
      new HoodieBaseBloomIndexCheckFunction(hoodieTable, config).apply(sortedComparisons.iterator());

  // Drain every batch, keeping only the lookups that actually matched some record key.
  List<HoodieKeyLookupResult> nonEmptyLookups = new ArrayList<>();
  while (lookupBatches.hasNext()) {
    for (HoodieKeyLookupResult candidate : lookupBatches.next()) {
      if (!candidate.getMatchingRecordKeys().isEmpty()) {
        nonEmptyLookups.add(candidate);
      }
    }
  }

  // Expand each lookup into one (key, location) entry per matching record key.
  return context.parallelize(nonEmptyLookups)
      .flatMap(lookup -> lookup.getMatchingRecordKeys().stream()
          .map(matchedKey -> new ImmutablePair<>(lookup, matchedKey))
          .iterator())
      .mapToPair(lookupAndKey -> {
        HoodieKeyLookupResult lookup = lookupAndKey.getLeft();
        HoodieKey hoodieKey = new HoodieKey(lookupAndKey.getRight(), lookup.getPartitionPath());
        HoodieRecordLocation location = new HoodieRecordLocation(lookup.getBaseInstantTime(), lookup.getFileId());
        return new ImmutablePair<>(hoodieKey, location);
      });
}
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class JavaDeleteHelper method execute.
/**
 * Deletes the records identified by {@code keys} as part of the commit {@code instantTime}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>optionally de-duplicates the keys (when {@code config.shouldCombineBeforeDelete()});</li>
 *   <li>wraps every key in a record with an {@link EmptyHoodieRecordPayload};</li>
 *   <li>tags the records through the table index and keeps only those whose current location
 *       is known;</li>
 *   <li>hands the remaining records to {@code deleteExecutor}, or, when none remain, saves an
 *       empty workload profile to the inflight instant and commits an empty result via
 *       {@code commitOnAutoCommit}.</li>
 * </ol>
 *
 * @param instantTime    instant time of the delete commit.
 * @param keys           keys of the records to delete.
 * @param context        engine context passed to the index lookup.
 * @param config         write config (de-duplication switch and delete shuffle parallelism).
 * @param table          table to delete from.
 * @param deleteExecutor executor that performs the delete of the tagged records.
 * @return write metadata of the delete; its write statuses are an empty list when none of the
 *         keys exist.
 * @throws HoodieUpsertException if anything fails; an existing {@code HoodieUpsertException}
 *         is rethrown unchanged, any other throwable is wrapped as the cause.
 */
@Override
public HoodieWriteMetadata<List<WriteStatus>> execute(String instantTime, List<HoodieKey> keys, HoodieEngineContext context, HoodieWriteConfig config, HoodieTable<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>> table, BaseCommitActionExecutor<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>, R> deleteExecutor) {
  try {
    final int parallelism = config.getDeleteShuffleParallelism();
    // De-dupe/merge if needed
    List<HoodieKey> dedupedKeys = config.shouldCombineBeforeDelete()
        ? deduplicateKeys(keys, table, parallelism)
        : keys;

    List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords = dedupedKeys.stream()
        .map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload()))
        .collect(Collectors.toList());

    // perform index look up to get existing location of records
    Instant beginTag = Instant.now();
    List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords =
        HoodieList.getList(table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
    Duration tagLocationDuration = Duration.between(beginTag, Instant.now());

    // filter out non existent keys/records
    List<HoodieRecord<EmptyHoodieRecordPayload>> taggedValidRecords = taggedRecords.stream()
        .filter(HoodieRecord::isCurrentLocationKnown)
        .collect(Collectors.toList());

    HoodieWriteMetadata<List<WriteStatus>> result;
    if (!taggedValidRecords.isEmpty()) {
      result = deleteExecutor.execute(taggedValidRecords);
      result.setIndexLookupDuration(tagLocationDuration);
    } else {
      // if entire set of keys are non existent
      deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
      result = new HoodieWriteMetadata<>();
      // Type-safe empty list (same immutable instance as the raw Collections.EMPTY_LIST).
      result.setWriteStatuses(Collections.emptyList());
      deleteExecutor.commitOnAutoCommit(result);
    }
    return result;
  } catch (HoodieUpsertException e) {
    // Already the exception type callers expect: propagate untouched.
    throw e;
  } catch (Throwable e) {
    throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
  }
}
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class SparkValidatorUtils method runValidators.
/**
 * Check configured pre-commit validators and run them. Note that this only works for COW tables.
 *
 * <p>When no validators are configured this only logs and returns. Otherwise the write stats are
 * filled in from the write statuses if absent, the active timeline is reloaded, and two cached
 * datasets are built for the modified partitions: the records from committed files ("before")
 * and the records from pending commits ("after"). Every validator class named in the
 * comma-separated configuration is instantiated reflectively and started through
 * {@code runValidatorAsync}; all validators are started before any of them is awaited. The
 * cached datasets are released once the validators have been joined (or a failure occurred).
 *
 * @param config        write config holding the comma-separated validator class names.
 * @param writeMetadata metadata of the write being validated.
 * @param context       engine context; a Spark context is obtained from it.
 * @param table         table being written to.
 * @param instantTime   instant time of the commit being validated.
 * @throws HoodieValidationException if at least one validator reports a failure.
 */
public static void runValidators(HoodieWriteConfig config, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata, HoodieEngineContext context, HoodieTable table, String instantTime) {
  if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
    LOG.info("no validators configured.");
    return;
  }

  if (!writeMetadata.getWriteStats().isPresent()) {
    writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collectAsList());
  }
  Set<String> partitionsModified = writeMetadata.getWriteStats().get().stream().map(writeStats -> writeStats.getPartitionPath()).collect(Collectors.toSet());
  SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context));
  // Refresh timeline to ensure validator sees the any other operations done on timeline (async operations such as other clustering/compaction/rollback)
  table.getMetaClient().reloadActiveTimeline();
  Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table).cache();
  Dataset<Row> afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime).cache();
  try {
    // Trim each entry so that a configuration such as "a.B, c.D" resolves both class names.
    Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(","))
        .map(String::trim)
        .map(validatorClass -> (SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass, new Class<?>[] { HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class }, table, context, config));

    // Streams are lazy and process one element at a time: mapping straight from
    // runValidatorAsync to join would start and await each validator in turn. Collecting the
    // futures first starts every validator before the first join.
    boolean allSuccess = validators
        .map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime))
        .collect(Collectors.toList())
        .stream()
        .map(CompletableFuture::join)
        .reduce(true, Boolean::logicalAnd);

    if (allSuccess) {
      LOG.info("All validations succeeded");
    } else {
      LOG.error("At least one pre-commit validation failed");
      throw new HoodieValidationException("At least one pre-commit validation failed");
    }
  } finally {
    // Release the cached snapshots; they are only needed while validators run.
    beforeState.unpersist();
    afterState.unpersist();
  }
}
use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.
the class FSUtils method getFileStatusAtLevel.
/**
 * Lists the file statuses found at a given depth below a root path.
 * <p>
 * E.g., given "/tmp/hoodie_table" as the rootPath, and 3 as the expected level,
 * this method gives back the {@link FileStatus} of all files under
 * "/tmp/hoodie_table/[*]/[*]/[*]/" folders.
 * <p>
 * The hierarchy is walked one level per iteration: the current set of paths is listed through
 * {@code FSUtils.parallelizeFilesProcess}, and the directories among the results become the set
 * of paths for the next level. The listing obtained at the expected level is returned as is.
 *
 * @param hoodieEngineContext {@link HoodieEngineContext} instance.
 * @param fs {@link FileSystem} instance.
 * @param rootPath Root path for the file listing.
 * @param expectLevel Expected level of directory hierarchy for files to be added.
 * @param parallelism Parallelism for the file listing.
 * @return A list of file status of files at the level.
 */
public static List<FileStatus> getFileStatusAtLevel(HoodieEngineContext hoodieEngineContext, FileSystem fs, Path rootPath, int expectLevel, int parallelism) {
  List<FileStatus> statuses = new ArrayList<>();
  List<String> pathsToList = new ArrayList<>();
  pathsToList.add(rootPath.toString());

  int level = 0;
  while (level <= expectLevel) {
    // List every path of the current level and flatten the per-path results.
    statuses = FSUtils.parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pathAndConf -> {
      Path dir = new Path(pathAndConf.getKey());
      try {
        return Arrays.stream(dir.getFileSystem(pathAndConf.getValue().get()).listStatus(dir)).collect(Collectors.toList());
      } catch (IOException e) {
        throw new HoodieIOException("Failed to list " + dir, e);
      }
    }, pathsToList)
        .values().stream()
        .flatMap(List::stream)
        .collect(Collectors.toList());

    // Descend only while the expected level has not been reached yet.
    if (level != expectLevel) {
      pathsToList = statuses.stream()
          .filter(FileStatus::isDirectory)
          .map(status -> status.getPath().toString())
          .collect(Collectors.toList());
    }
    level++;
  }
  return statuses;
}
Aggregations