Example 6 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class TestDataSourceUtils, the method testCreateHoodieConfigWithAsyncClustering:

@Test
public void testCreateHoodieConfigWithAsyncClustering() {
    ArrayList<ImmutablePair<String, Boolean>> asyncClusteringKeyValues = new ArrayList<>(4);
    asyncClusteringKeyValues.add(new ImmutablePair<>(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), true));
    asyncClusteringKeyValues.add(new ImmutablePair<>(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE.key(), true));
    asyncClusteringKeyValues.add(new ImmutablePair<>("hoodie.datasource.clustering.async.enable", true));
    asyncClusteringKeyValues.add(new ImmutablePair<>("hoodie.clustering.async.enabled", true));
    asyncClusteringKeyValues.forEach(pair -> {
        HashMap<String, String> params = new HashMap<>(3);
        params.put(DataSourceWriteOptions.TABLE_TYPE().key(), DataSourceWriteOptions.TABLE_TYPE().defaultValue());
        params.put(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key(), DataSourceWriteOptions.PAYLOAD_CLASS_NAME().defaultValue());
        params.put(pair.left, pair.right.toString());
        HoodieWriteConfig hoodieConfig = DataSourceUtils.createHoodieConfig(avroSchemaString, config.getBasePath(), "test", params);
        assertEquals(pair.right, hoodieConfig.isAsyncClusteringEnabled());
        TypedProperties prop = new TypedProperties();
        prop.putAll(params);
        assertEquals(pair.right, HoodieClusteringConfig.from(prop).isAsyncClusteringEnabled());
    });
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) ArgumentMatchers.anyString(org.mockito.ArgumentMatchers.anyString) TypedProperties(org.apache.hudi.common.config.TypedProperties) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
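
For orientation, here is a minimal, self-contained sketch of the pair-driven pattern the test relies on: each ImmutablePair carries a config key on the left and the expected flag value on the right, and the loop reads both sides back. It assumes only the Hudi ImmutablePair class on the classpath; the keys shown are taken from the test above, while the printout stands in for the assertions.

import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.ImmutablePair;

public class PairDrivenChecksSketch {
    public static void main(String[] args) {
        // Pair each alias of the async-clustering switch with the value we expect it to yield.
        List<ImmutablePair<String, Boolean>> cases = new ArrayList<>();
        cases.add(new ImmutablePair<>("hoodie.datasource.clustering.async.enable", true));
        cases.add(new ImmutablePair<>("hoodie.clustering.async.enabled", true));

        cases.forEach(pair -> {
            // left holds the config key, right the expected flag; getLeft()/getRight() mirror the public fields.
            System.out.printf("key=%s expected=%s%n", pair.getLeft(), pair.getRight());
        });
    }
}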

Example 7 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class SavepointActionExecutor, the method execute:

@Override
public HoodieSavepointMetadata execute() {
    Option<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
    if (!table.getCompletedCommitsTimeline().containsInstant(instantTime)) {
        throw new HoodieSavepointException("Could not savepoint non-existing commit " + instantTime);
    }
    try {
        // Find the last commit that has not been cleaned and verify the savepoint time is not earlier than it
        String lastCommitRetained;
        if (cleanInstant.isPresent()) {
            HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils.deserializeHoodieCleanMetadata(table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
            lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
        } else {
            lastCommitRetained = table.getCompletedCommitsTimeline().firstInstant().get().getTimestamp();
        }
        // Cannot allow savepoint time on a commit that could have been cleaned
        ValidationUtils.checkArgument(HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.GREATER_THAN_OR_EQUALS, lastCommitRetained), "Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained);
        context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime);
        List<String> partitions = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), table.getMetaClient().getBasePath());
        Map<String, List<String>> latestFilesMap = context.mapToPair(partitions, partitionPath -> {
            // Scan all partitions files with this commit time
            LOG.info("Collecting latest files in partition path " + partitionPath);
            TableFileSystemView.BaseFileOnlyView view = table.getBaseFileOnlyView();
            List<String> latestFiles = view.getLatestBaseFilesBeforeOrOn(partitionPath, instantTime).map(HoodieBaseFile::getFileName).collect(Collectors.toList());
            return new ImmutablePair<>(partitionPath, latestFiles);
        }, null);
        HoodieSavepointMetadata metadata = TimelineMetadataUtils.convertSavepointMetadata(user, comment, latestFilesMap);
        // Create the savepoint instant and mark it complete with the collected metadata
        table.getActiveTimeline().createNewInstant(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, instantTime));
        table.getActiveTimeline().saveAsComplete(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, instantTime), TimelineMetadataUtils.serializeSavepointMetadata(metadata));
        LOG.info("Savepoint " + instantTime + " created");
        return metadata;
    } catch (IOException e) {
        throw new HoodieSavepointException("Failed to savepoint " + instantTime, e);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) IOException(java.io.IOException) HoodieSavepointMetadata(org.apache.hudi.avro.model.HoodieSavepointMetadata) HoodieSavepointException(org.apache.hudi.exception.HoodieSavepointException) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) List(java.util.List) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView)
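
As a rough, single-process sketch of the mapToPair step above, the snippet below pairs each partition path with its latest base file names using plain Java streams instead of HoodieEngineContext; the partition names and the latestBaseFiles helper are hypothetical stand-ins for FSUtils.getAllPartitionPaths and the file-system view.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class SavepointPairingSketch {
    public static void main(String[] args) {
        // Hypothetical partition paths; the executor obtains these from FSUtils.getAllPartitionPaths.
        List<String> partitions = Arrays.asList("2021/01/01", "2021/01/02");

        // For each partition, emit (partitionPath, latestBaseFileNames), mirroring the mapToPair lambda.
        Map<String, List<String>> latestFilesMap = partitions.stream()
            .map(partitionPath -> new ImmutablePair<>(partitionPath, latestBaseFiles(partitionPath)))
            .collect(Collectors.toMap(Pair::getLeft, Pair::getRight));

        latestFilesMap.forEach((partition, files) -> System.out.println(partition + " -> " + files));
    }

    // Stand-in for view.getLatestBaseFilesBeforeOrOn(...).map(HoodieBaseFile::getFileName); returns made-up names.
    private static List<String> latestBaseFiles(String partitionPath) {
        return Arrays.asList(partitionPath.replace('/', '-') + "_0_latest.parquet");
    }
}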

Example 8 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class CleanActionExecutor, the method clean:

/**
 * Performs cleaning of partition paths according to cleaning policy and returns the number of files cleaned. Handles
 * skews in partitions to clean by making files to clean as the unit of task distribution.
 *
 * @throws IllegalArgumentException if unknown cleaning policy is provided
 */
List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
    // Cap cleaner parallelism at the total number of files to delete across all partitions
    int cleanerParallelism = Math.min(cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).sum(), config.getCleanerParallelism());
    LOG.info("Using cleanerParallelism: " + cleanerParallelism);
    context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
    Stream<Pair<String, CleanFileInfo>> filesToBeDeletedPerPartition = cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
        .flatMap(x -> x.getValue().stream()
            .map(y -> new ImmutablePair<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))));
    Stream<ImmutablePair<String, PartitionCleanStat>> partitionCleanStats = context.mapPartitionsToPairAndReduceByKey(
        filesToBeDeletedPerPartition, iterator -> deleteFilesFunc(iterator, table), PartitionCleanStat::merge, cleanerParallelism);
    Map<String, PartitionCleanStat> partitionCleanStatsMap = partitionCleanStats.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
    // Return PartitionCleanStat for each partition passed.
    return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
        PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath) ? partitionCleanStatsMap.get(partitionPath) : new PartitionCleanStat(partitionPath);
        HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
        return HoodieCleanStat.newBuilder()
            .withPolicy(config.getCleanerPolicy())
            .withPartitionPath(partitionPath)
            .withEarliestCommitRetained(Option.ofNullable(actionInstant != null
                ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), actionInstant.getAction(), actionInstant.getTimestamp())
                : null))
            .withDeletePathPattern(partitionCleanStat.deletePathPatterns())
            .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
            .withFailedDeletes(partitionCleanStat.failedDeleteFiles())
            .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
            .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
            .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
            .build();
    }).collect(Collectors.toList());
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) BaseActionExecutor(org.apache.hudi.table.action.BaseActionExecutor) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) TransactionManager(org.apache.hudi.client.transaction.TransactionManager) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) CleanFileInfo(org.apache.hudi.common.model.CleanFileInfo) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Stream(java.util.stream.Stream) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
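
A local sketch of the reduce-by-key shape used above: pairs keyed by partition are merged per key, much as mapPartitionsToPairAndReduceByKey merges PartitionCleanStat instances, except here plain Collectors.toMap with a merge function does the reduction and an integer file count stands in for the stat object. The partitions and counts are made up for illustration.

import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class CleanReduceByKeySketch {
    public static void main(String[] args) {
        // One (partition, deletedFileCount) pair per deletion batch; values are illustrative only.
        Stream<ImmutablePair<String, Integer>> perBatch = Stream.of(
            new ImmutablePair<>("2021/01/01", 3),
            new ImmutablePair<>("2021/01/01", 2),
            new ImmutablePair<>("2021/01/02", 5));

        // Merge pairs that share a key, analogous to PartitionCleanStat::merge in the executor.
        Map<String, Integer> perPartition = perBatch.collect(
            Collectors.toMap(Pair::getKey, Pair::getValue, Integer::sum));

        perPartition.forEach((partition, count) -> System.out.println(partition + " -> " + count + " files cleaned"));
    }
}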

Example 9 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class DFSPathSelector, the method getNextFilePathsAndMaxModificationTime:

/**
 * Get the list of files changed since last checkpoint.
 *
 * @param lastCheckpointStr the last checkpoint time string, empty if first run
 * @param sourceLimit       max bytes to read each time
 * @return the list of files concatenated and their latest modified time
 */
@Deprecated
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(Option<String> lastCheckpointStr, long sourceLimit) {
    try {
        // obtain all eligible files under root folder.
        log.info("Root path => " + props.getString(Config.ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit);
        long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
        List<FileStatus> eligibleFiles = listEligibleFiles(fs, new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), lastCheckpointTime);
        // sort them by modification time.
        eligibleFiles.sort(Comparator.comparingLong(FileStatus::getModificationTime));
        // Filter based on checkpoint & input size, if needed
        long currentBytes = 0;
        long newCheckpointTime = lastCheckpointTime;
        List<FileStatus> filteredFiles = new ArrayList<>();
        for (FileStatus f : eligibleFiles) {
            if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
                // so that some files with the same modification time won't be skipped in next read
                break;
            }
            newCheckpointTime = f.getModificationTime();
            currentBytes += f.getLen();
            filteredFiles.add(f);
        }
        // no data to read
        if (filteredFiles.isEmpty()) {
            return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
        }
        // read the files out.
        String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
        return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
    } catch (IOException ioe) {
        throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) TypedProperties(org.apache.hudi.common.config.TypedProperties) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) DataSourceUtils(org.apache.hudi.DataSourceUtils) FileStatus(org.apache.hadoop.fs.FileStatus) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) List(java.util.List) Configuration(org.apache.hadoop.conf.Configuration) HoodieIOException(org.apache.hudi.exception.HoodieIOException) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
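
The core of the method above is the size-and-checkpoint windowing loop, which returns an ImmutablePair of (comma-joined paths, new checkpoint). The sketch below isolates that loop with a tiny FakeFile class instead of Hadoop's FileStatus; only the Hudi Option and pair classes are assumed on the classpath, and the class and field names are invented for illustration.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class SourceLimitWindowSketch {

    // Minimal stand-in for FileStatus: a path, a length in bytes, and a modification time.
    static class FakeFile {
        final String path;
        final long len;
        final long modTime;
        FakeFile(String path, long len, long modTime) {
            this.path = path;
            this.len = len;
            this.modTime = modTime;
        }
    }

    static Pair<Option<String>, String> nextBatch(List<FakeFile> files, long lastCheckpoint, long sourceLimit) {
        // Sort by modification time so the checkpoint only ever moves forward.
        files.sort(Comparator.comparingLong((FakeFile f) -> f.modTime));
        long currentBytes = 0;
        long newCheckpoint = lastCheckpoint;
        List<String> picked = new ArrayList<>();
        for (FakeFile f : files) {
            // Stop at the size budget, but never split a group of files sharing one modification time.
            if (currentBytes + f.len >= sourceLimit && f.modTime > newCheckpoint) {
                break;
            }
            newCheckpoint = f.modTime;
            currentBytes += f.len;
            picked.add(f.path);
        }
        if (picked.isEmpty()) {
            return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpoint));
        }
        return new ImmutablePair<>(Option.of(String.join(",", picked)), String.valueOf(newCheckpoint));
    }

    public static void main(String[] args) {
        List<FakeFile> files = new ArrayList<>();
        files.add(new FakeFile("/in/a.json", 100, 1000L));
        files.add(new FakeFile("/in/b.json", 200, 2000L));
        // With a 250-byte budget only the older file is picked; the checkpoint advances to its mod time.
        Pair<Option<String>, String> next = nextBatch(files, Long.MIN_VALUE, 250);
        System.out.println(next.getLeft() + " @ checkpoint " + next.getRight());
    }
}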

Example 10 with ImmutablePair

Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.

From the class DatePartitionPathSelector, the method getNextFilePathsAndMaxModificationTime:

@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(JavaSparkContext sparkContext, Option<String> lastCheckpointStr, long sourceLimit) {
    // If not specified the current date is assumed by default.
    LocalDate currentDate = LocalDate.parse(props.getString(Config.CURRENT_DATE, LocalDate.now().toString()));
    // obtain all eligible files under root folder.
    LOG.info("Root path => " + props.getString(ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit + " depth of day partition => " + datePartitionDepth + " num prev days to list => " + numPrevDaysToList + " from current date => " + currentDate);
    long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
    HoodieSparkEngineContext context = new HoodieSparkEngineContext(sparkContext);
    SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
    List<String> prunedPartitionPaths = pruneDatePartitionPaths(context, fs, props.getString(ROOT_INPUT_PATH_PROP), currentDate);
    List<FileStatus> eligibleFiles = context.flatMap(prunedPartitionPaths, path -> {
        FileSystem fs = new Path(path).getFileSystem(serializedConf.get());
        return listEligibleFiles(fs, new Path(path), lastCheckpointTime).stream();
    }, partitionsListParallelism);
    // sort them by modification time ascending.
    List<FileStatus> sortedEligibleFiles = eligibleFiles.stream().sorted(Comparator.comparingLong(FileStatus::getModificationTime)).collect(Collectors.toList());
    // Filter based on checkpoint & input size, if needed
    long currentBytes = 0;
    long newCheckpointTime = lastCheckpointTime;
    List<FileStatus> filteredFiles = new ArrayList<>();
    for (FileStatus f : sortedEligibleFiles) {
        if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
            // so that some files with the same modification time won't be skipped in next read
            break;
        }
        newCheckpointTime = f.getModificationTime();
        currentBytes += f.getLen();
        filteredFiles.add(f);
    }
    // no data to read
    if (filteredFiles.isEmpty()) {
        return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
    }
    // read the files out.
    String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
    return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
}
Also used : Path(org.apache.hadoop.fs.Path) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) DEFAULT_PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_PARTITIONS_LIST_PARALLELISM) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) Configuration(org.apache.hadoop.conf.Configuration) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) ROOT_INPUT_PATH_PROP(org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP) TypedProperties(org.apache.hudi.common.config.TypedProperties) LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.LOOKBACK_DAYS) DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_PARTITION_DEPTH) DEFAULT_DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_PARTITION_DEPTH) Collectors(java.util.stream.Collectors) DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_FORMAT) DEFAULT_DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_FORMAT) PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.PARTITIONS_LIST_PARALLELISM) List(java.util.List) LocalDate(java.time.LocalDate) DateTimeFormatter(java.time.format.DateTimeFormatter) DEFAULT_LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_LOOKBACK_DAYS) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair)
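
pruneDatePartitionPaths itself is not shown above; as a rough, hypothetical illustration of what date-based pruning amounts to, the sketch below builds candidate "<root>/<date>" partition paths for the lookback window from LocalDate. The helper name, the yyyy-MM-dd format, and the flat layout are assumptions for the sketch, not the actual implementation (which also honors datePartitionDepth).

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;

public class DatePartitionPruningSketch {

    // Hypothetical helper: list "<root>/<yyyy-MM-dd>" paths for currentDate and the numPrevDaysToList days before it.
    static List<String> candidateDatePartitions(String rootPath, LocalDate currentDate, int numPrevDaysToList) {
        DateTimeFormatter fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd");
        List<String> paths = new ArrayList<>();
        for (int i = 0; i <= numPrevDaysToList; i++) {
            paths.add(rootPath + "/" + currentDate.minusDays(i).format(fmt));
        }
        return paths;
    }

    public static void main(String[] args) {
        // Prints /data/source/2022-01-05, /data/source/2022-01-04, /data/source/2022-01-03
        candidateDatePartitions("/data/source", LocalDate.parse("2022-01-05"), 2).forEach(System.out::println);
    }
}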

Aggregations

ImmutablePair (org.apache.hudi.common.util.collection.ImmutablePair): 14
ArrayList (java.util.ArrayList): 11
List (java.util.List): 10
Pair (org.apache.hudi.common.util.collection.Pair): 9
IOException (java.io.IOException): 8
Collectors (java.util.stream.Collectors): 8
Path (org.apache.hadoop.fs.Path): 8
Option (org.apache.hudi.common.util.Option): 8
Map (java.util.Map): 7
Arrays (java.util.Arrays): 5
HashMap (java.util.HashMap): 5
FileSystem (org.apache.hadoop.fs.FileSystem): 5
TypedProperties (org.apache.hudi.common.config.TypedProperties): 5
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 5
Configuration (org.apache.hadoop.conf.Configuration): 4
FileStatus (org.apache.hadoop.fs.FileStatus): 4
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 4
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 4
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 4
LogManager (org.apache.log4j.LogManager): 4