
Example 6 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From class HoodieBloomIndex, the method loadColumnRangesFromFiles:

/**
 * Loads all involved files as a list of <partition, filename> pairs.
 */
List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromFiles(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
    // Obtain the latest data files from all the partitions.
    List<Pair<String, String>> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream().map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId())).collect(toList());
    context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)");
    return context.map(partitionPathFileIDList, pf -> {
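        // look up the min/max record key for this (partition, fileId) pair from the base file's footer metadata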
        try {
            HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf);
            String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys();
            return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1]));
        } catch (MetadataNotFoundException me) {
            LOG.warn("Unable to find range metadata in file: " + pf);
            return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()));
        }
    }, Math.max(partitionPathFileIDList.size(), 1));
}
Also used: ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieIndexUtils.getLatestBaseFilesForAllPartitions(org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieConfig(org.apache.hudi.common.config.HoodieConfig) HoodieRangeInfoHandle(org.apache.hudi.io.HoodieRangeInfoHandle) Map(java.util.Map) Collectors.mapping(java.util.stream.Collectors.mapping) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) HoodiePairData(org.apache.hudi.common.data.HoodiePairData) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Stream(java.util.stream.Stream) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
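
The call worth noting above is context.map(...): HoodieEngineContext abstracts the execution engine, so the same code runs as a Spark job under HoodieSparkEngineContext or as plain in-process parallelism under the local context. A minimal sketch of that primitive, assuming HoodieLocalEngineContext from hudi-common (the number-squaring workload is made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;

import java.util.Arrays;
import java.util.List;

public class EngineContextMapSketch {

    public static void main(String[] args) {
        // the local context executes engine-context primitives without Spark or Flink
        HoodieEngineContext context = new HoodieLocalEngineContext(new Configuration());
        context.setJobStatus(EngineContextMapSketch.class.getName(), "Square a handful of numbers");
        // map(input, function, parallelism) mirrors the call in loadColumnRangesFromFiles above
        List<Integer> squares = context.map(Arrays.asList(1, 2, 3, 4), i -> i * i, 2);
        System.out.println(squares);
    }
}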

Example 7 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From class ListBasedHoodieBloomIndexHelper, the method findMatchingFilesForRecordKeys:

@Override
public HoodiePairData<HoodieKey, HoodieRecordLocation> findMatchingFilesForRecordKeys(HoodieWriteConfig config, HoodieEngineContext context, HoodieTable hoodieTable, HoodiePairData<String, String> partitionRecordKeyPairs, HoodieData<Pair<String, HoodieKey>> fileComparisonPairs, Map<String, List<BloomIndexFileInfo>> partitionToFileInfo, Map<String, Long> recordsPerPartition) {
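    // Sorting by the left element (the file id) groups all candidate keys for the same base file together, so each file's bloom filter is loaded once.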
    List<Pair<String, HoodieKey>> fileComparisonPairList = HoodieList.getList(fileComparisonPairs).stream().sorted(Comparator.comparing(Pair::getLeft)).collect(toList());
    List<HoodieKeyLookupResult> keyLookupResults = new ArrayList<>();
    Iterator<List<HoodieKeyLookupResult>> iterator = new HoodieBaseBloomIndexCheckFunction(hoodieTable, config).apply(fileComparisonPairList.iterator());
    while (iterator.hasNext()) {
        keyLookupResults.addAll(iterator.next());
    }
    keyLookupResults = keyLookupResults.stream().filter(lr -> lr.getMatchingRecordKeys().size() > 0).collect(toList());
    return context.parallelize(keyLookupResults).flatMap(lookupResult -> lookupResult.getMatchingRecordKeys().stream().map(recordKey -> new ImmutablePair<>(lookupResult, recordKey)).iterator()).mapToPair(pair -> {
        HoodieKeyLookupResult lookupResult = pair.getLeft();
        String recordKey = pair.getRight();
        return new ImmutablePair<>(new HoodieKey(recordKey, lookupResult.getPartitionPath()), new HoodieRecordLocation(lookupResult.getBaseInstantTime(), lookupResult.getFileId()));
    });
}
Also used: ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodiePairData(org.apache.hudi.common.data.HoodiePairData) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieKeyLookupResult(org.apache.hudi.io.HoodieKeyLookupResult) ArrayList(java.util.ArrayList) HoodieList(org.apache.hudi.common.data.HoodieList) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Map(java.util.Map) HoodieKey(org.apache.hudi.common.model.HoodieKey) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair)
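
This helper is deliberately engine-agnostic: HoodieList wraps a plain java.util.List as HoodieData, so the same transformation API works with or without a Spark context behind it. A small sketch of that wrapping round-trip (the string workload is made up; HoodieList.of and HoodieList.getList are the calls used above):

import org.apache.hudi.common.data.HoodieData;
import org.apache.hudi.common.data.HoodieList;

import java.util.Arrays;
import java.util.List;

public class HoodieListSketch {

    public static void main(String[] args) {
        // wrap an in-memory list as HoodieData, the abstraction shared with RDD-backed data
        HoodieData<String> data = HoodieList.of(Arrays.asList("alpha", "beta"));
        // transformations compose exactly as they would on a Spark-backed HoodieData
        List<Integer> lengths = HoodieList.getList(data.map(String::length));
        System.out.println(lengths);
    }
}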

Example 8 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From class JavaDeleteHelper, the method execute:

@Override
public HoodieWriteMetadata<List<WriteStatus>> execute(String instantTime, List<HoodieKey> keys, HoodieEngineContext context, HoodieWriteConfig config, HoodieTable<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>> table, BaseCommitActionExecutor<EmptyHoodieRecordPayload, List<HoodieRecord<EmptyHoodieRecordPayload>>, List<HoodieKey>, List<WriteStatus>, R> deleteExecutor) {
    try {
        HoodieWriteMetadata<List<WriteStatus>> result = null;
        List<HoodieKey> dedupedKeys = keys;
        final int parallelism = config.getDeleteShuffleParallelism();
        if (config.shouldCombineBeforeDelete()) {
            // De-dupe/merge if needed
            dedupedKeys = deduplicateKeys(keys, table, parallelism);
        }
        List<HoodieRecord<EmptyHoodieRecordPayload>> dedupedRecords = dedupedKeys.stream().map(key -> new HoodieAvroRecord<>(key, new EmptyHoodieRecordPayload())).collect(Collectors.toList());
        Instant beginTag = Instant.now();
        // perform index look up to get existing location of records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedRecords = HoodieList.getList(table.getIndex().tagLocation(HoodieList.of(dedupedRecords), context, table));
        Duration tagLocationDuration = Duration.between(beginTag, Instant.now());
        // filter out non-existent keys/records
        List<HoodieRecord<EmptyHoodieRecordPayload>> taggedValidRecords = taggedRecords.stream().filter(HoodieRecord::isCurrentLocationKnown).collect(Collectors.toList());
        if (!taggedValidRecords.isEmpty()) {
            result = deleteExecutor.execute(taggedValidRecords);
            result.setIndexLookupDuration(tagLocationDuration);
        } else {
            // if the entire set of keys is non-existent
            deleteExecutor.saveWorkloadProfileMetadataToInflight(new WorkloadProfile(Pair.of(new HashMap<>(), new WorkloadStat())), instantTime);
            result = new HoodieWriteMetadata<>();
            result.setWriteStatuses(Collections.emptyList());
            deleteExecutor.commitOnAutoCommit(result);
        }
        return result;
    } catch (Throwable e) {
        if (e instanceof HoodieUpsertException) {
            throw (HoodieUpsertException) e;
        }
        throw new HoodieUpsertException("Failed to delete for commit time " + instantTime, e);
    }
}
Also used: HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieList(org.apache.hudi.common.data.HoodieList) HashSet(java.util.HashSet) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Duration(java.time.Duration) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieKey(org.apache.hudi.common.model.HoodieKey) WorkloadStat(org.apache.hudi.table.WorkloadStat) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair)
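
The deduplicateKeys step is referenced but not shown above. A hedged sketch of what such a helper might look like (an illustration, not the verbatim Hudi implementation):

import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.table.HoodieTable;

import java.util.List;
import java.util.stream.Collectors;

public class DeduplicateKeysSketch {

    // hypothetical helper: collapse duplicate delete keys before index tagging
    static List<HoodieKey> deduplicateKeys(List<HoodieKey> keys, HoodieTable<?, ?, ?, ?> table) {
        if (table.getIndex().isGlobal()) {
            // a global index resolves the partition itself, so dedupe on the record key alone
            return keys.stream()
                .collect(Collectors.toMap(HoodieKey::getRecordKey, k -> k, (first, second) -> first))
                .values().stream().collect(Collectors.toList());
        }
        // otherwise (recordKey, partitionPath) identifies a record; HoodieKey implements
        // equals/hashCode on both fields, so distinct() suffices
        return keys.stream().distinct().collect(Collectors.toList());
    }
}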

Example 9 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From class SparkValidatorUtils, the method runValidators:

/**
 * Checks the configured pre-commit validators and runs them. Note that this only works for COW tables.
 *
 * Throws an error if there are validation failures.
 */
public static void runValidators(HoodieWriteConfig config, HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata, HoodieEngineContext context, HoodieTable table, String instantTime) {
    if (StringUtils.isNullOrEmpty(config.getPreCommitValidators())) {
        LOG.info("No validators configured.");
    } else {
        if (!writeMetadata.getWriteStats().isPresent()) {
            writeMetadata.setWriteStats(writeMetadata.getWriteStatuses().map(WriteStatus::getStat).collectAsList());
        }
        Set<String> partitionsModified = writeMetadata.getWriteStats().get().stream().map(writeStats -> writeStats.getPartitionPath()).collect(Collectors.toSet());
        SQLContext sqlContext = new SQLContext(HoodieSparkEngineContext.getSparkContext(context));
        // Refresh the timeline so validators see any other operations done on it (async operations such as clustering/compaction/rollback)
        table.getMetaClient().reloadActiveTimeline();
        Dataset<Row> beforeState = getRecordsFromCommittedFiles(sqlContext, partitionsModified, table).cache();
        Dataset<Row> afterState = getRecordsFromPendingCommits(sqlContext, partitionsModified, writeMetadata, table, instantTime).cache();
        Stream<SparkPreCommitValidator> validators = Arrays.stream(config.getPreCommitValidators().split(",")).map(validatorClass -> {
            return ((SparkPreCommitValidator) ReflectionUtils.loadClass(validatorClass, new Class<?>[] { HoodieSparkTable.class, HoodieEngineContext.class, HoodieWriteConfig.class }, table, context, config));
        });
        boolean allSuccess = validators.map(v -> runValidatorAsync(v, writeMetadata, beforeState, afterState, instantTime)).map(CompletableFuture::join).reduce(true, Boolean::logicalAnd);
        if (allSuccess) {
            LOG.info("All validations succeeded");
        } else {
            LOG.error("At least one pre-commit validation failed");
            throw new HoodieValidationException("At least one pre-commit validation failed");
        }
    }
}
Also used: HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) Dataset(org.apache.spark.sql.Dataset) CompletableFuture(java.util.concurrent.CompletableFuture) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieValidationException(org.apache.hudi.exception.HoodieValidationException) BaseSparkCommitActionExecutor(org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) SQLContext(org.apache.spark.sql.SQLContext) Set(java.util.Set) Row(org.apache.spark.sql.Row) Collectors(java.util.stream.Collectors) WriteStatus(org.apache.hudi.client.WriteStatus) SparkPreCommitValidator(org.apache.hudi.client.validator.SparkPreCommitValidator) List(java.util.List) Stream(java.util.stream.Stream) HoodieTablePreCommitFileSystemView(org.apache.hudi.common.table.view.HoodieTablePreCommitFileSystemView) JavaConverters(scala.collection.JavaConverters) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) LogManager(org.apache.log4j.LogManager)
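
The validators themselves are resolved reflectively from a comma-separated config value, as the split(",") above shows. A hedged sketch of wiring one up (the config key and validator class name are assumptions to check against HoodiePreCommitValidatorConfig for your Hudi version):

import org.apache.hudi.config.HoodieWriteConfig;

import java.util.Properties;

public class PreCommitValidatorConfigSketch {

    public static void main(String[] args) {
        Properties props = new Properties();
        // assumed key and class name; verify against your Hudi release
        props.setProperty("hoodie.precommit.validators",
            "org.apache.hudi.client.validator.SqlQueryEqualityPreCommitValidator");
        HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
            .withPath("/tmp/hoodie_table")
            .withProps(props)
            .build();
        // runValidators(...) above splits this value on "," and loads each class reflectively
        System.out.println(config.getPreCommitValidators());
    }
}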

Example 10 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

From class FSUtils, the method getFileStatusAtLevel:

/**
 * Lists file status at a certain level in the directory hierarchy.
 * <p>
 * E.g., given "/tmp/hoodie_table" as the rootPath, and 3 as the expected level,
 * this method gives back the {@link FileStatus} of all files under
 * "/tmp/hoodie_table/[*]/[*]/[*]/" folders.
 *
 * @param hoodieEngineContext {@link HoodieEngineContext} instance.
 * @param fs                  {@link FileSystem} instance.
 * @param rootPath            Root path for the file listing.
 * @param expectLevel         Expected level of directory hierarchy for files to be added.
 * @param parallelism         Parallelism for the file listing.
 * @return A list of file status of files at the level.
 */
public static List<FileStatus> getFileStatusAtLevel(HoodieEngineContext hoodieEngineContext, FileSystem fs, Path rootPath, int expectLevel, int parallelism) {
    List<String> levelPaths = new ArrayList<>();
    List<FileStatus> result = new ArrayList<>();
    levelPaths.add(rootPath.toString());
    for (int i = 0; i <= expectLevel; i++) {
        result = FSUtils.parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pairOfSubPathAndConf -> {
            Path path = new Path(pairOfSubPathAndConf.getKey());
            try {
                FileSystem fileSystem = path.getFileSystem(pairOfSubPathAndConf.getValue().get());
                return Arrays.stream(fileSystem.listStatus(path)).collect(Collectors.toList());
            } catch (IOException e) {
                throw new HoodieIOException("Failed to list " + path, e);
            }
        }, levelPaths).values().stream().flatMap(list -> list.stream()).collect(Collectors.toList());
        if (i < expectLevel) {
            levelPaths = result.stream().filter(FileStatus::isDirectory).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList());
        }
    }
    return result;
}
Also used: ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) InvalidHoodiePathException(org.apache.hudi.exception.InvalidHoodiePathException) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) PathFilter(org.apache.hadoop.fs.PathFilter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Predicate(java.util.function.Predicate) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Entry(java.util.Map.Entry) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair)
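
A hedged usage sketch (assuming HoodieLocalEngineContext and FSUtils.getFs, both from hudi-common): listing the files exactly three directory levels below a table root would look roughly like this:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;

import java.util.List;

public class FileStatusAtLevelSketch {

    public static void main(String[] args) {
        Configuration conf = new Configuration();
        String basePath = "/tmp/hoodie_table";
        FileSystem fs = FSUtils.getFs(basePath, conf);
        // level 3 matches files under /tmp/hoodie_table/*/*/*/ per the Javadoc above
        List<FileStatus> files = FSUtils.getFileStatusAtLevel(
            new HoodieLocalEngineContext(conf), fs, new Path(basePath), 3, 10);
        files.forEach(f -> System.out.println(f.getPath()));
    }
}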

Aggregations

HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 36 usages
List (java.util.List): 29 usages
ArrayList (java.util.ArrayList): 27 usages
IOException (java.io.IOException): 25 usages
LogManager (org.apache.log4j.LogManager): 25 usages
Logger (org.apache.log4j.Logger): 25 usages
Map (java.util.Map): 23 usages
Collectors (java.util.stream.Collectors): 23 usages
Path (org.apache.hadoop.fs.Path): 23 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 23 usages
Option (org.apache.hudi.common.util.Option): 23 usages
FileSystem (org.apache.hadoop.fs.FileSystem): 21 usages
Pair (org.apache.hudi.common.util.collection.Pair): 19 usages
FSUtils (org.apache.hudi.common.fs.FSUtils): 18 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 18 usages
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 18 usages
HashMap (java.util.HashMap): 16 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16 usages
HoodieTable (org.apache.hudi.table.HoodieTable): 15 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 14 usages