Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.
From the class TestDataSourceUtils, method testCreateHoodieConfigWithAsyncClustering.
@Test
public void testCreateHoodieConfigWithAsyncClustering() {
  ArrayList<ImmutablePair<String, Boolean>> asyncClusteringKeyValues = new ArrayList<>(4);
  asyncClusteringKeyValues.add(new ImmutablePair<>(DataSourceWriteOptions.ASYNC_CLUSTERING_ENABLE().key(), true));
  asyncClusteringKeyValues.add(new ImmutablePair<>(HoodieClusteringConfig.ASYNC_CLUSTERING_ENABLE.key(), true));
  asyncClusteringKeyValues.add(new ImmutablePair<>("hoodie.datasource.clustering.async.enable", true));
  asyncClusteringKeyValues.add(new ImmutablePair<>("hoodie.clustering.async.enabled", true));
  asyncClusteringKeyValues.forEach(pair -> {
    HashMap<String, String> params = new HashMap<>(3);
    params.put(DataSourceWriteOptions.TABLE_TYPE().key(), DataSourceWriteOptions.TABLE_TYPE().defaultValue());
    params.put(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key(), DataSourceWriteOptions.PAYLOAD_CLASS_NAME().defaultValue());
    params.put(pair.left, pair.right.toString());
    HoodieWriteConfig hoodieConfig = DataSourceUtils.createHoodieConfig(avroSchemaString, config.getBasePath(), "test", params);
    assertEquals(pair.right, hoodieConfig.isAsyncClusteringEnabled());
    TypedProperties prop = new TypedProperties();
    prop.putAll(params);
    assertEquals(pair.right, HoodieClusteringConfig.from(prop).isAsyncClusteringEnabled());
  });
}
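The test treats ImmutablePair purely as a small two-field holder for parameterized inputs, reading the public left and right fields directly rather than going through getters. Below is a minimal standalone sketch of that pattern, assuming only that hudi-common is on the classpath; the wrapper class is illustrative, while the config keys mirror the ones used in the test above.

import java.util.ArrayList;
import java.util.List;

import org.apache.hudi.common.util.collection.ImmutablePair;

public class ImmutablePairFieldAccessExample {

  public static void main(String[] args) {
    // Each pair carries a config key and the boolean value to apply, mirroring the test above.
    List<ImmutablePair<String, Boolean>> settings = new ArrayList<>();
    settings.add(new ImmutablePair<>("hoodie.clustering.async.enabled", true));
    settings.add(new ImmutablePair<>("hoodie.datasource.clustering.async.enable", false));

    for (ImmutablePair<String, Boolean> pair : settings) {
      // The fields are public and final, so they can be read directly, as the test does.
      System.out.println(pair.left + " => " + pair.right);
    }
  }
}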
Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.
From the class SavepointActionExecutor, method execute.
@Override
public HoodieSavepointMetadata execute() {
  Option<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
  if (!table.getCompletedCommitsTimeline().containsInstant(instantTime)) {
    throw new HoodieSavepointException("Could not savepoint non-existing commit " + instantTime);
  }
  try {
    // Find the earliest commit that has not been cleaned and verify the savepoint time is not earlier than it
    String lastCommitRetained;
    if (cleanInstant.isPresent()) {
      HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils.deserializeHoodieCleanMetadata(
          table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
      lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
    } else {
      lastCommitRetained = table.getCompletedCommitsTimeline().firstInstant().get().getTimestamp();
    }
    // Cannot allow a savepoint on a commit that could already have been cleaned
    ValidationUtils.checkArgument(
        HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.GREATER_THAN_OR_EQUALS, lastCommitRetained),
        "Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained);
    context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime);
    List<String> partitions = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), table.getMetaClient().getBasePath());
    Map<String, List<String>> latestFilesMap = context.mapToPair(partitions, partitionPath -> {
      // Collect the latest base files in this partition as of the savepointed commit time
      LOG.info("Collecting latest files in partition path " + partitionPath);
      TableFileSystemView.BaseFileOnlyView view = table.getBaseFileOnlyView();
      List<String> latestFiles = view.getLatestBaseFilesBeforeOrOn(partitionPath, instantTime)
          .map(HoodieBaseFile::getFileName).collect(Collectors.toList());
      return new ImmutablePair<>(partitionPath, latestFiles);
    }, null);
    HoodieSavepointMetadata metadata = TimelineMetadataUtils.convertSavepointMetadata(user, comment, latestFilesMap);
    // Create the savepoint instant and complete it with the collected metadata
    table.getActiveTimeline().createNewInstant(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, instantTime));
    table.getActiveTimeline().saveAsComplete(
        new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, instantTime),
        TimelineMetadataUtils.serializeSavepointMetadata(metadata));
    LOG.info("Savepoint " + instantTime + " created");
    return metadata;
  } catch (IOException e) {
    throw new HoodieSavepointException("Failed to savepoint " + instantTime, e);
  }
}
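In this executor, ImmutablePair is the element produced by the function handed to HoodieEngineContext.mapToPair, which then collapses the pairs into a Map keyed by partition path. Below is a rough local sketch of the same map-to-pairs-then-collect shape using plain Java streams instead of the engine context; the partition names and the listLatestFiles helper are invented for illustration.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.ImmutablePair;

public class LatestFilesPerPartitionSketch {

  public static void main(String[] args) {
    List<String> partitions = Arrays.asList("2021/01/01", "2021/01/02");

    // Map each partition to a (partitionPath, latestFiles) pair, then collect into a Map,
    // which is roughly what context.mapToPair does across the cluster in the executor above.
    Map<String, List<String>> latestFilesMap = partitions.stream()
        .map(partitionPath -> new ImmutablePair<>(partitionPath, listLatestFiles(partitionPath)))
        .collect(Collectors.toMap(ImmutablePair::getKey, ImmutablePair::getValue));

    latestFilesMap.forEach((partition, files) -> System.out.println(partition + " -> " + files));
  }

  // Stand-in for the file-system-view lookup done by the real executor.
  private static List<String> listLatestFiles(String partitionPath) {
    return Arrays.asList(partitionPath + "/file-1.parquet", partitionPath + "/file-2.parquet");
  }
}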
Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.
From the class CleanActionExecutor, method clean.
/**
 * Performs cleaning of partition paths according to the cleaning policy and returns the clean stats for each
 * partition. Handles skew across partitions by making individual files, rather than partitions, the unit of task
 * distribution.
 *
 * @throws IllegalArgumentException if an unknown cleaning policy is provided
 */
List<HoodieCleanStat> clean(HoodieEngineContext context, HoodieCleanerPlan cleanerPlan) {
  int cleanerParallelism = Math.min(
      (int) (cleanerPlan.getFilePathsToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()),
      config.getCleanerParallelism());
  LOG.info("Using cleanerParallelism: " + cleanerParallelism);
  context.setJobStatus(this.getClass().getSimpleName(), "Perform cleaning of partitions");
  Stream<Pair<String, CleanFileInfo>> filesToBeDeletedPerPartition =
      cleanerPlan.getFilePathsToBeDeletedPerPartition().entrySet().stream()
          .flatMap(x -> x.getValue().stream()
              .map(y -> new ImmutablePair<>(x.getKey(), new CleanFileInfo(y.getFilePath(), y.getIsBootstrapBaseFile()))));
  Stream<ImmutablePair<String, PartitionCleanStat>> partitionCleanStats =
      context.mapPartitionsToPairAndReduceByKey(
          filesToBeDeletedPerPartition, iterator -> deleteFilesFunc(iterator, table), PartitionCleanStat::merge, cleanerParallelism);
  Map<String, PartitionCleanStat> partitionCleanStatsMap =
      partitionCleanStats.collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  // Build a HoodieCleanStat for each partition passed.
  return cleanerPlan.getFilePathsToBeDeletedPerPartition().keySet().stream().map(partitionPath -> {
    PartitionCleanStat partitionCleanStat = partitionCleanStatsMap.containsKey(partitionPath)
        ? partitionCleanStatsMap.get(partitionPath)
        : new PartitionCleanStat(partitionPath);
    HoodieActionInstant actionInstant = cleanerPlan.getEarliestInstantToRetain();
    return HoodieCleanStat.newBuilder()
        .withPolicy(config.getCleanerPolicy())
        .withPartitionPath(partitionPath)
        .withEarliestCommitRetained(Option.ofNullable(actionInstant != null
            ? new HoodieInstant(HoodieInstant.State.valueOf(actionInstant.getState()), actionInstant.getAction(), actionInstant.getTimestamp())
            : null))
        .withDeletePathPattern(partitionCleanStat.deletePathPatterns())
        .withSuccessfulDeletes(partitionCleanStat.successDeleteFiles())
        .withFailedDeletes(partitionCleanStat.failedDeleteFiles())
        .withDeleteBootstrapBasePathPatterns(partitionCleanStat.getDeleteBootstrapBasePathPatterns())
        .withSuccessfulDeleteBootstrapBaseFiles(partitionCleanStat.getSuccessfulDeleteBootstrapBaseFiles())
        .withFailedDeleteBootstrapBaseFiles(partitionCleanStat.getFailedDeleteBootstrapBaseFiles())
        .build();
  }).collect(Collectors.toList());
}
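The cleaner flattens the Map of partition to files into a Stream of (partition, file) pairs, so that individual files rather than partitions become the unit of distribution, and later folds the per-file results back into a per-partition map. Below is a simplified local sketch of that flatten-then-regroup shape; the groupingBy stands in for mapPartitionsToPairAndReduceByKey, and the data is invented.

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class CleanPlanFlattenSketch {

  public static void main(String[] args) {
    Map<String, List<String>> filesToDeletePerPartition = new HashMap<>();
    filesToDeletePerPartition.put("2021/01/01", Arrays.asList("a.parquet", "b.parquet"));
    filesToDeletePerPartition.put("2021/01/02", Arrays.asList("c.parquet"));

    // Flatten to (partition, file) pairs so each file is an independent unit of work,
    // mirroring how the cleaner builds filesToBeDeletedPerPartition above.
    Stream<Pair<String, String>> fileWorkUnits = filesToDeletePerPartition.entrySet().stream()
        .flatMap(e -> e.getValue().stream().map(file -> new ImmutablePair<>(e.getKey(), file)));

    // Regroup the per-file units back by partition; the real executor instead reduces
    // per-file delete results into one PartitionCleanStat per partition.
    Map<String, List<String>> regrouped = fileWorkUnits
        .collect(Collectors.groupingBy(Pair::getKey, Collectors.mapping(Pair::getValue, Collectors.toList())));

    regrouped.forEach((partition, files) -> System.out.println(partition + " -> " + files));
  }
}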
Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.
From the class DFSPathSelector, method getNextFilePathsAndMaxModificationTime.
/**
 * Get the list of files changed since the last checkpoint.
 *
 * @param lastCheckpointStr the last checkpoint time string, empty if this is the first run
 * @param sourceLimit       max bytes to read each time
 * @return a pair of the comma-separated paths of the selected files (empty Option if there is nothing to read)
 *         and the new checkpoint time
 */
@Deprecated
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(Option<String> lastCheckpointStr, long sourceLimit) {
  try {
    // obtain all eligible files under root folder.
    log.info("Root path => " + props.getString(Config.ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit);
    long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
    List<FileStatus> eligibleFiles = listEligibleFiles(fs, new Path(props.getString(Config.ROOT_INPUT_PATH_PROP)), lastCheckpointTime);
    // sort them by modification time.
    eligibleFiles.sort(Comparator.comparingLong(FileStatus::getModificationTime));
    // Filter based on checkpoint & input size, if needed
    long currentBytes = 0;
    long newCheckpointTime = lastCheckpointTime;
    List<FileStatus> filteredFiles = new ArrayList<>();
    for (FileStatus f : eligibleFiles) {
      if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
        // so that some files with the same modification time won't be skipped in next read
        break;
      }
      newCheckpointTime = f.getModificationTime();
      currentBytes += f.getLen();
      filteredFiles.add(f);
    }
    // no data to read
    if (filteredFiles.isEmpty()) {
      return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
    }
    // read the files out.
    String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
    return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
  } catch (IOException ioe) {
    throw new HoodieIOException("Unable to read from source from checkpoint: " + lastCheckpointStr, ioe);
  }
}
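The selector packs its two results, the optional comma-separated path list and the new checkpoint, into a single ImmutablePair return value. Below is a hedged sketch of just that packing step pulled into a small helper; the class and method names are invented, and only the pair shape mirrors the selector above.

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class PathsAndCheckpointSketch {

  // Pack the optional comma-joined path list and the new checkpoint into one return value,
  // mirroring the shape returned by getNextFilePathsAndMaxModificationTime above.
  static Pair<Option<String>, String> toResult(List<String> selectedPaths, long newCheckpointTime) {
    if (selectedPaths.isEmpty()) {
      return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
    }
    String pathStr = selectedPaths.stream().collect(Collectors.joining(","));
    return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
  }

  public static void main(String[] args) {
    Pair<Option<String>, String> result = toResult(Arrays.asList("/in/f1", "/in/f2"), 1700000000000L);
    // Callers read the two halves back out of the pair.
    System.out.println("paths => " + result.getKey() + ", checkpoint => " + result.getValue());
  }
}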
Use of org.apache.hudi.common.util.collection.ImmutablePair in project hudi by apache.
From the class DatePartitionPathSelector, method getNextFilePathsAndMaxModificationTime.
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(JavaSparkContext sparkContext, Option<String> lastCheckpointStr, long sourceLimit) {
  // If not specified the current date is assumed by default.
  LocalDate currentDate = LocalDate.parse(props.getString(Config.CURRENT_DATE, LocalDate.now().toString()));
  // obtain all eligible files under root folder.
  LOG.info("Root path => " + props.getString(ROOT_INPUT_PATH_PROP) + " source limit => " + sourceLimit
      + " depth of day partition => " + datePartitionDepth + " num prev days to list => " + numPrevDaysToList
      + " from current date => " + currentDate);
  long lastCheckpointTime = lastCheckpointStr.map(Long::parseLong).orElse(Long.MIN_VALUE);
  HoodieSparkEngineContext context = new HoodieSparkEngineContext(sparkContext);
  SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
  List<String> prunedPartitionPaths = pruneDatePartitionPaths(context, fs, props.getString(ROOT_INPUT_PATH_PROP), currentDate);
  List<FileStatus> eligibleFiles = context.flatMap(prunedPartitionPaths, path -> {
    FileSystem fs = new Path(path).getFileSystem(serializedConf.get());
    return listEligibleFiles(fs, new Path(path), lastCheckpointTime).stream();
  }, partitionsListParallelism);
  // sort them by modification time ascending.
  List<FileStatus> sortedEligibleFiles = eligibleFiles.stream()
      .sorted(Comparator.comparingLong(FileStatus::getModificationTime)).collect(Collectors.toList());
  // Filter based on checkpoint & input size, if needed
  long currentBytes = 0;
  long newCheckpointTime = lastCheckpointTime;
  List<FileStatus> filteredFiles = new ArrayList<>();
  for (FileStatus f : sortedEligibleFiles) {
    if (currentBytes + f.getLen() >= sourceLimit && f.getModificationTime() > newCheckpointTime) {
      // so that some files with the same modification time won't be skipped in next read
      break;
    }
    newCheckpointTime = f.getModificationTime();
    currentBytes += f.getLen();
    filteredFiles.add(f);
  }
  // no data to read
  if (filteredFiles.isEmpty()) {
    return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
  }
  // read the files out.
  String pathStr = filteredFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
  return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
}
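Both path selectors share the same budget loop: accept files in modification-time order until sourceLimit is hit, advance the checkpoint with each accepted file, and return the new checkpoint in the right half of the pair. Below is a minimal standalone sketch of that loop, with an invented FileRecord type standing in for Hadoop's FileStatus; only the pair-returning shape mirrors the selectors above.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;

public class SourceLimitLoopSketch {

  // Invented stand-in for org.apache.hadoop.fs.FileStatus.
  static class FileRecord {
    final String path;
    final long len;
    final long modTime;

    FileRecord(String path, long len, long modTime) {
      this.path = path;
      this.len = len;
      this.modTime = modTime;
    }
  }

  static Pair<Option<String>, String> select(List<FileRecord> files, long lastCheckpointTime, long sourceLimit) {
    // Sort by modification time so the checkpoint only ever moves forward.
    files.sort(Comparator.comparingLong((FileRecord f) -> f.modTime));
    long currentBytes = 0;
    long newCheckpointTime = lastCheckpointTime;
    List<FileRecord> selected = new ArrayList<>();
    for (FileRecord f : files) {
      // Stop once the byte budget is exhausted, but only at a modification-time boundary,
      // so files sharing the last accepted timestamp are not silently skipped on the next run.
      if (currentBytes + f.len >= sourceLimit && f.modTime > newCheckpointTime) {
        break;
      }
      newCheckpointTime = f.modTime;
      currentBytes += f.len;
      selected.add(f);
    }
    if (selected.isEmpty()) {
      return new ImmutablePair<>(Option.empty(), String.valueOf(newCheckpointTime));
    }
    String pathStr = selected.stream().map(f -> f.path).collect(Collectors.joining(","));
    return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(newCheckpointTime));
  }

  public static void main(String[] args) {
    List<FileRecord> files = new ArrayList<>(Arrays.asList(
        new FileRecord("/in/2021/01/01/a.json", 100, 1000L),
        new FileRecord("/in/2021/01/01/b.json", 100, 2000L),
        new FileRecord("/in/2021/01/02/c.json", 100, 3000L)));
    // With a 250-byte budget, only the first two files are selected and the checkpoint advances to 2000.
    Pair<Option<String>, String> result = select(files, Long.MIN_VALUE, 250);
    System.out.println(result.getKey() + " checkpoint=" + result.getValue());
  }
}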