Search in sources :

Example 1 with ROOT_INPUT_PATH_PROP

Use of org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP in the Apache Hudi project.

From the class DatePartitionPathSelector, method getNextFilePathsAndMaxModificationTime:

/**
 * Selects the next batch of files to read from the pruned date partitions under the root input path.
 *
 * <p>Eligible files are gathered per partition, ordered by ascending modification time and accepted
 * one by one. Selection stops at the first file that would bring the accumulated size to
 * {@code sourceLimit} or beyond while also being newer than the last accepted file; a file sharing
 * the modification time of the last accepted one is still taken, so that it is not skipped by the
 * next read.
 *
 * @param sparkContext      Spark context, wrapped in a {@code HoodieSparkEngineContext} to list partitions
 * @param lastCheckpointStr previous checkpoint, parsed as a {@code long}; {@code Long.MIN_VALUE} is used when absent
 * @param sourceLimit       size threshold, compared against the summed {@code FileStatus#getLen()} of accepted files
 * @return a pair of (comma-separated paths of the selected files, or an empty option when none were
 *         selected) and the new checkpoint, i.e. the modification time of the last accepted file
 *         (or the previous checkpoint value when nothing was accepted), rendered as a string
 */
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(JavaSparkContext sparkContext, Option<String> lastCheckpointStr, long sourceLimit) {
    // The configured current date wins; today's date is only the fallback.
    final LocalDate currentDate = LocalDate.parse(props.getString(Config.CURRENT_DATE, LocalDate.now().toString()));
    final String rootPath = props.getString(ROOT_INPUT_PATH_PROP);
    LOG.info("Root path => " + rootPath + " source limit => " + sourceLimit + " depth of day partition => " + datePartitionDepth + " num prev days to list => " + numPrevDaysToList + " from current date => " + currentDate);

    final long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);

    // Collect every eligible file from the partitions that survive date pruning.
    final HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(sparkContext);
    final SerializableConfiguration hadoopConf = new SerializableConfiguration(fs.getConf());
    final List<String> partitionPaths = pruneDatePartitionPaths(engineContext, fs, rootPath, currentDate);
    final List<FileStatus> candidates = engineContext.flatMap(partitionPaths, partition -> {
        Path partitionPath = new Path(partition);
        FileSystem partitionFs = partitionPath.getFileSystem(hadoopConf.get());
        return listEligibleFiles(partitionFs, partitionPath, lastCheckpointTime).stream();
    }, partitionsListParallelism);

    // Oldest first, so the checkpoint only ever moves forward while we accept files.
    final List<FileStatus> oldestFirst = new ArrayList<>(candidates);
    oldestFirst.sort(Comparator.comparingLong(FileStatus::getModificationTime));

    // Accept files until the size threshold is reached.
    long bytesSoFar = 0;
    long checkpointTime = lastCheckpointTime;
    final List<FileStatus> selected = new ArrayList<>();
    for (FileStatus candidate : oldestFirst) {
        boolean reachesLimit = bytesSoFar + candidate.getLen() >= sourceLimit;
        boolean newerThanCheckpoint = candidate.getModificationTime() > checkpointTime;
        if (reachesLimit && newerThanCheckpoint) {
            // Only cut between distinct modification times, so files sharing a
            // timestamp with the last accepted one are not skipped in the next read.
            break;
        }
        selected.add(candidate);
        bytesSoFar += candidate.getLen();
        checkpointTime = candidate.getModificationTime();
    }

    final String newCheckpoint = String.valueOf(checkpointTime);
    if (selected.isEmpty()) {
        // Nothing to read.
        return new ImmutablePair<>(Option.empty(), newCheckpoint);
    }

    // Hand the selected files back as one comma-separated string.
    final String joinedPaths = selected.stream()
        .map(FileStatus::getPath)
        .map(Path::toString)
        .collect(Collectors.joining(","));
    return new ImmutablePair<>(Option.ofNullable(joinedPaths), newCheckpoint);
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) DEFAULT_PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_PARTITIONS_LIST_PARALLELISM) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) ROOT_INPUT_PATH_PROP(org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP) TypedProperties(org.apache.hudi.common.config.TypedProperties) LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.LOOKBACK_DAYS) DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_PARTITION_DEPTH) DEFAULT_DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_PARTITION_DEPTH) Collectors(java.util.stream.Collectors) DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_FORMAT) DEFAULT_DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_FORMAT) PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.PARTITIONS_LIST_PARALLELISM) List(java.util.List) LocalDate(java.time.LocalDate) DateTimeFormatter(java.time.format.DateTimeFormatter) DEFAULT_LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_LOOKBACK_DAYS) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) 
LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) ArrayList(java.util.ArrayList) LocalDate(java.time.LocalDate) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSystem(org.apache.hadoop.fs.FileSystem)

Aggregations

LocalDate (java.time.LocalDate)1 DateTimeFormatter (java.time.format.DateTimeFormatter)1 ArrayList (java.util.ArrayList)1 Comparator (java.util.Comparator)1 List (java.util.List)1 Collectors (java.util.stream.Collectors)1 Configuration (org.apache.hadoop.conf.Configuration)1 FileStatus (org.apache.hadoop.fs.FileStatus)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 Path (org.apache.hadoop.fs.Path)1 HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext)1 SerializableConfiguration (org.apache.hudi.common.config.SerializableConfiguration)1 TypedProperties (org.apache.hudi.common.config.TypedProperties)1 Option (org.apache.hudi.common.util.Option)1 ValidationUtils (org.apache.hudi.common.util.ValidationUtils)1 ImmutablePair (org.apache.hudi.common.util.collection.ImmutablePair)1 Pair (org.apache.hudi.common.util.collection.Pair)1 ROOT_INPUT_PATH_PROP (org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP)1 DATE_FORMAT (org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_FORMAT)1 DATE_PARTITION_DEPTH (org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_PARTITION_DEPTH)1