
Example 21 with HoodieEngineContext

use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

the class HoodieCompactor method compact.

/**
 * Execute compaction operations and report back status.
 */
public HoodieData<WriteStatus> compact(
        HoodieEngineContext context, HoodieCompactionPlan compactionPlan,
        HoodieTable table, HoodieWriteConfig config, String compactionInstantTime,
        HoodieCompactionHandler compactionHandler) {
    if (compactionPlan == null || compactionPlan.getOperations() == null || compactionPlan.getOperations().isEmpty()) {
        return context.emptyHoodieData();
    }
    HoodieActiveTimeline timeline = table.getActiveTimeline();
    HoodieInstant instant = HoodieTimeline.getCompactionRequestedInstant(compactionInstantTime);
    // Mark instant as compaction inflight
    timeline.transitionCompactionRequestedToInflight(instant);
    table.getMetaClient().reloadActiveTimeline();
    HoodieTableMetaClient metaClient = table.getMetaClient();
    TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
    // Use the table schema as the reader schema, since the schema in the write config
    // may not be the same as the table schema.
    try {
        Schema readerSchema = schemaResolver.getTableAvroSchema(false);
        config.setSchema(readerSchema.toString());
    } catch (Exception e) {
        // If there is no commit in the table, just ignore the exception.
    }
    // Compacting is very similar to applying updates to existing files
    List<CompactionOperation> operations = compactionPlan.getOperations().stream()
        .map(CompactionOperation::convertFromAvroRecordInstance)
        .collect(toList());
    LOG.info("Compactor compacting " + operations + " files");
    context.setJobStatus(this.getClass().getSimpleName(), "Compacting file slices");
    TaskContextSupplier taskContextSupplier = table.getTaskContextSupplier();
    return context.parallelize(operations)
        .map(operation -> compact(compactionHandler, metaClient, config, operation, compactionInstantTime, taskContextSupplier))
        .flatMap(List::iterator);
}
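The compactor is normally driven by an engine-specific client. Below is a minimal sketch of a caller, assuming a Spark engine context; the names jsc, compactor, table, config, and compactionHandler are assumptions for illustration and do not appear in the snippet above.

// Hypothetical driver (assumes a Spark context and an instant with a requested compaction plan).
HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
HoodieCompactionPlan plan = CompactionUtils.getCompactionPlan(table.getMetaClient(), compactionInstantTime);
HoodieData<WriteStatus> statuses =
    compactor.compact(context, plan, table, config, compactionInstantTime, compactionHandler);
// Materialize the distributed write statuses on the driver.
List<WriteStatus> collected = statuses.collectAsList();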
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieAccumulator(org.apache.hudi.common.data.HoodieAccumulator) RuntimeStats(org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) StreamSupport(java.util.stream.StreamSupport) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) CompactionStrategy(org.apache.hudi.table.action.compact.strategy.CompactionStrategy) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieCompactionHandler(org.apache.hudi.table.HoodieCompactionHandler) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) IOUtils(org.apache.hudi.io.IOUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)

Example 22 with HoodieEngineContext

use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

the class BootstrapUtils method getAllLeafFoldersWithFiles.

/**
 * Returns leaf folders with files under a path.
 * @param metaClient Hoodie table metadata client
 * @param fs File system
 * @param basePathStr Base path under which to list
 * @param context HoodieEngineContext
 * @return list of partition paths with files under them.
 * @throws IOException upon file system errors.
 */
public static List<Pair<String, List<HoodieFileStatus>>> getAllLeafFoldersWithFiles(
        HoodieTableMetaClient metaClient, FileSystem fs, String basePathStr,
        HoodieEngineContext context) throws IOException {
    final Path basePath = new Path(basePathStr);
    final String baseFileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
    final Map<Integer, List<String>> levelToPartitions = new HashMap<>();
    final Map<String, List<HoodieFileStatus>> partitionToFiles = new HashMap<>();
    PathFilter filePathFilter = getFilePathFilter(baseFileExtension);
    PathFilter metaPathFilter = getExcludeMetaPathFilter();
    FileStatus[] topLevelStatuses = fs.listStatus(basePath);
    List<String> subDirectories = new ArrayList<>();
    List<Pair<HoodieFileStatus, Pair<Integer, String>>> result = new ArrayList<>();
    for (FileStatus topLevelStatus : topLevelStatuses) {
        if (topLevelStatus.isFile() && filePathFilter.accept(topLevelStatus.getPath())) {
            String relativePath = FSUtils.getRelativePartitionPath(basePath, topLevelStatus.getPath().getParent());
            Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
            HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(topLevelStatus);
            result.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
        } else if (topLevelStatus.isDirectory() && metaPathFilter.accept(topLevelStatus.getPath())) {
            subDirectories.add(topLevelStatus.getPath().toString());
        }
    }
    if (!subDirectories.isEmpty()) {
        result.addAll(context.flatMap(subDirectories, directory -> {
            PathFilter pathFilter = getFilePathFilter(baseFileExtension);
            Path path = new Path(directory);
            FileSystem fileSystem = path.getFileSystem(new Configuration());
            RemoteIterator<LocatedFileStatus> itr = fileSystem.listFiles(path, true);
            List<Pair<HoodieFileStatus, Pair<Integer, String>>> res = new ArrayList<>();
            while (itr.hasNext()) {
                FileStatus status = itr.next();
                if (pathFilter.accept(status.getPath())) {
                    String relativePath = FSUtils.getRelativePartitionPath(new Path(basePathStr), status.getPath().getParent());
                    Integer level = (int) relativePath.chars().filter(ch -> ch == '/').count();
                    HoodieFileStatus hoodieFileStatus = FileStatusUtils.fromFileStatus(status);
                    res.add(Pair.of(hoodieFileStatus, Pair.of(level, relativePath)));
                }
            }
            return res.stream();
        }, subDirectories.size()));
    }
    result.forEach(val -> {
        String relativePath = val.getRight().getRight();
        List<HoodieFileStatus> statusList = partitionToFiles.get(relativePath);
        if (statusList == null) {
            Integer level = val.getRight().getLeft();
            levelToPartitions.computeIfAbsent(level, k -> new ArrayList<>()).add(relativePath);
            statusList = new ArrayList<>();
            partitionToFiles.put(relativePath, statusList);
        }
        statusList.add(val.getLeft());
    });
    OptionalInt maxLevelOpt = levelToPartitions.keySet().stream().mapToInt(x -> x).max();
    int maxLevel = maxLevelOpt.orElse(-1);
    return maxLevel >= 0
        ? levelToPartitions.get(maxLevel).stream()
            .map(d -> Pair.of(d, partitionToFiles.get(d)))
            .collect(Collectors.toList())
        : new ArrayList<>();
}
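A usage sketch follows; metaClient and context are assumed to be initialized elsewhere, and the base path is taken from the meta client.

// Hypothetical invocation; metaClient and context are assumed to exist.
FileSystem fs = FSUtils.getFs(metaClient.getBasePath(), context.getHadoopConf().get());
List<Pair<String, List<HoodieFileStatus>>> partitions =
    BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, fs, metaClient.getBasePath(), context);
for (Pair<String, List<HoodieFileStatus>> partition : partitions) {
    // Each entry maps a relative partition path to the file statuses under it.
    System.out.println(partition.getLeft() + " -> " + partition.getRight().size() + " files");
}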
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) PathFilter(org.apache.hadoop.fs.PathFilter) IOException(java.io.IOException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) OptionalInt(java.util.OptionalInt) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) List(java.util.List) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)

Example 23 with HoodieEngineContext

use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

the class CleanActionExecutor method clean.

/**
 * Performs cleaning of partition paths according to the cleaning policy and returns the number of files cleaned.
 * Handles skew across partitions by making individual files, rather than partitions, the unit of task distribution.
 *
 * @throws IllegalArgumentException if an unknown cleaning policy is provided
 */
List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
    // Cap parallelism at the total number of files to delete; files, not partitions, are the unit of distribution.
    int cleanerParallelism = Math.min(
        cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).sum(),
        config.getCleanerParallelism());
    LOG.info("Using cleanerParallelism: " + cleanerParallelism);
    context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
    Stream<Pair<String, CleanFileInfo>> filesToBeDeletedPerPartition =
        cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
            .flatMap(x -> x.getValue().stream()
                .map(y -> new ImmutablePair<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))));
    Stream<ImmutablePair<String, PartitionCleanStat>> partitionCleanStats =
        context.mapPartitionsToPairAndReduceByKey(
            filesToBeDeletedPerPartition,
            iterator -> deleteFilesFunc(iterator, table),
            PartitionCleanStat::merge,
            cleanerParallelism);
    Map<String, PartitionCleanStat> partitionCleanStatsMap =
        partitionCleanStats.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Build a HoodieCleanStat for each partition in the plan.
    return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
        PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath)
            ? partitionCleanStatsMap.get(partitionPath)
            : new PartitionCleanStat(partitionPath);
        HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
        return HoodieCleanStat.newBuilder()
            .withPolicy(config.getCleanerPolicy())
            .withPartitionPath(partitionPath)
            .withEarliestCommitRetained(Option.ofNullable(actionInstant != null
                ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()),
                    actionInstant.getAction(), actionInstant.getTimestamp())
                : null))
            .withDeletePathPattern(partitionCleanStat.deletePathPatterns())
            .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
            .withFailedDeletes(partitionCleanStat.failedDeleteFiles())
            .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
            .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
            .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
            .build();
    }).collect(Collectors.toList());
}
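To make the skew handling concrete: the parallelism bound depends on the total number of files to delete, not the number of partitions. A sketch restating the first lines of the method (the figures in the comment are invented for illustration):

// With two partitions holding 100 and 300 files and a configured cleaner parallelism
// of 200, the effective parallelism is min(100 + 300, 200) = 200 file-level tasks.
int totalFiles = cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream()
    .mapToInt(List::size)
    .sum();
int effectiveParallelism = Math.min(totalFiles, config.getCleanerParallelism());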
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) BaseActionExecutor(org.apache.hudi.table.action.BaseActionExecutor) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) TransactionManager(org.apache.hudi.client.transaction.TransactionManager) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Stream(java.util.stream.Stream) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)

Example 24 with HoodieEngineContext

use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

the class HoodieTimelineArchiver method mergeArchiveFilesIfNecessary.

/**
 * Merges small archive files into a new, larger one.
 * Only used for filesystems that do not support the append operation.
 * The merge of small archive files has four stages:
 * 1. Build the merge plan with the merge candidates and the merged file name.
 * 2. Do the merge.
 * 3. Delete all the candidates.
 * 4. Delete the merge plan.
 * @param context HoodieEngineContext
 * @throws IOException upon file system errors.
 */
private void mergeArchiveFilesIfNecessary(HoodieEngineContext context) throws IOException {
    Path planPath = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME);
    // Flush any remaining content and open a new writer
    reOpenWriter();
    // List all archive files
    FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
    // Sort files by version suffix in reverse (implies reverse chronological order)
    Arrays.sort(fsStatuses, new HoodieArchivedTimeline.ArchiveFileVersionComparator());
    int archiveMergeFilesBatchSize = config.getArchiveMergeFilesBatchSize();
    long smallFileLimitBytes = config.getArchiveMergeSmallFileLimitBytes();
    List<FileStatus> mergeCandidate = getMergeCandidates(smallFileLimitBytes, fsStatuses);
    if (mergeCandidate.size() >= archiveMergeFilesBatchSize) {
        List<String> candidateFiles = mergeCandidate.stream().map(fs -> fs.getPath().toString()).collect(Collectors.toList());
        // Build the merge plan before merging the archive files
        String logFileName = computeLogFileName();
        buildArchiveMergePlan(candidateFiles, planPath, logFileName);
        // merge archive files
        mergeArchiveFiles(mergeCandidate);
        // after merge, delete the small archive files.
        deleteFilesParallelize(metaClient, candidateFiles, context, true);
        LOG.info("Success to delete replaced small archive files.");
        // Finally, delete the merge plan, which marks the merge of the small archive files as succeeded.
        metaClient.getFs().delete(planPath, false);
        LOG.info("Success to merge small archive files.");
    }
}
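The method relies on getMergeCandidates, whose body is not shown here. A plausible sketch, assuming the candidates are the leading run of sorted archive files at or under the small-file limit:

// Sketch only, not the actual implementation: since fsStatuses is already sorted,
// take the leading run of files whose length is at most the small-file limit.
private List<FileStatus> getMergeCandidates(long smallFileLimitBytes, FileStatus[] fsStatuses) {
    int index = 0;
    for (; index < fsStatuses.length; index++) {
        if (fsStatuses[index].getLen() > smallFileLimitBytes) {
            break;
        }
    }
    return Arrays.stream(fsStatuses).limit(index).collect(Collectors.toList());
}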
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieArchivedMetaEntry(org.apache.hudi.avro.model.HoodieArchivedMetaEntry) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Map(java.util.Map) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) WriteMarkers(org.apache.hudi.table.marker.WriteMarkers) Schema(org.apache.avro.Schema) Collection(java.util.Collection) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) HoodieMergeArchiveFilePlan(org.apache.hudi.avro.model.HoodieMergeArchiveFilePlan) HoodieArchivedLogFile(org.apache.hudi.common.model.HoodieArchivedLogFile) LESSER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) GREATER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) CompactionTriggerStrategy(org.apache.hudi.table.action.compact.CompactionTriggerStrategy) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) MetadataConversionUtils(org.apache.hudi.client.utils.MetadataConversionUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) LESSER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) IOException(java.io.IOException) StorageSchemes(org.apache.hudi.common.fs.StorageSchemes) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)

Example 25 with HoodieEngineContext

use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

the class HoodieRepairTool method listFilesFromBasePath.

/**
 * Lists all Hoodie data files from the table base path.
 *
 * @param context       {@link HoodieEngineContext} instance.
 * @param basePathStr   Table base path.
 * @param expectedLevel Expected level in the directory hierarchy to include the file status.
 * @param parallelism   Parallelism for the file listing.
 * @return A list of absolute file paths of all Hoodie data files.
 */
static List<String> listFilesFromBasePath(HoodieEngineContext context, String basePathStr, int expectedLevel, int parallelism) {
    FileSystem fs = FSUtils.getFs(basePathStr, context.getHadoopConf().get());
    Path basePath = new Path(basePathStr);
    return FSUtils.getFileStatusAtLevel(context, fs, basePath, expectedLevel, parallelism).stream()
        .filter(fileStatus -> fileStatus.isFile() && FSUtils.isDataFile(fileStatus.getPath()))
        .map(fileStatus -> fileStatus.getPath().toString())
        .collect(Collectors.toList());
}
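A usage sketch: the method is package-private, so a real caller would live in the same package. Here expectedLevel = 3 assumes a yyyy/mm/dd partition layout, and jsc and basePath are assumed to exist.

// Hypothetical call site with listing parallelism 10.
HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
List<String> dataFiles = listFilesFromBasePath(context, basePath, 3, 10);
dataFiles.forEach(System.out::println);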
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) SecureRandom(java.security.SecureRandom) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) JCommander(com.beust.jcommander.JCommander) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) List(java.util.List) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RepairUtils(org.apache.hudi.table.repair.RepairUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)

Aggregations

HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 36
List (java.util.List): 29
ArrayList (java.util.ArrayList): 27
IOException (java.io.IOException): 25
LogManager (org.apache.log4j.LogManager): 25
Logger (org.apache.log4j.Logger): 25
Map (java.util.Map): 23
Collectors (java.util.stream.Collectors): 23
Path (org.apache.hadoop.fs.Path): 23
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 23
Option (org.apache.hudi.common.util.Option): 23
FileSystem (org.apache.hadoop.fs.FileSystem): 21
Pair (org.apache.hudi.common.util.collection.Pair): 19
FSUtils (org.apache.hudi.common.fs.FSUtils): 18
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 18
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 18
HashMap (java.util.HashMap): 16
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16
HoodieTable (org.apache.hudi.table.HoodieTable): 15
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 14