Search in sources :

Example 1 with HoodieClusteringStrategy

Use of org.apache.hudi.avro.model.HoodieClusteringStrategy in the Apache Hudi project.

From the class PartitionAwareClusteringPlanStrategy, the method generateClusteringPlan.

/**
 * Generates a clustering plan for the table, or an empty {@link Option} when there is nothing
 * to cluster.
 *
 * <p>All partition paths of the table are listed, narrowed by {@code getMatchedPartitions} and
 * {@code filterPartitionPaths}, and then turned into clustering groups through the engine
 * context. The configured maximum number of clustering groups is applied to the groups built
 * for each partition and again to the combined result.
 *
 * @return the clustering plan, or {@code Option.empty()} if no partition paths remain after
 *         filtering or no clustering groups were produced
 */
@Override
public Option<HoodieClusteringPlan> generateClusteringPlan() {
    final HoodieTableMetaClient tableMetaClient = getHoodieTable().getMetaClient();
    LOG.info("Scheduling clustering for " + tableMetaClient.getBasePath());
    final HoodieWriteConfig writeConfig = getWriteConfig();

    final List<String> allPartitions = FSUtils.getAllPartitionPaths(
        getEngineContext(), writeConfig.getMetadataConfig(), tableMetaClient.getBasePath());
    // Restrict to the matched partitions, if such matching is configured.
    final List<String> matchedPartitions = getMatchedPartitions(writeConfig, allPartitions);
    // Filter further so that fewer partitions have to be listed.
    final List<String> candidatePartitions = filterPartitionPaths(matchedPartitions);
    if (candidatePartitions.isEmpty()) {
        // Nothing could be picked, so there is no plan to produce.
        return Option.empty();
    }

    // The same cap is used per partition (inside the lambda) and for the overall result.
    final long maxGroups = writeConfig.getClusteringMaxNumGroups();
    final List<HoodieClusteringGroup> plannedGroups = getEngineContext()
        .flatMap(
            candidatePartitions,
            partition -> {
                List<FileSlice> eligibleSlices =
                    getFileSlicesEligibleForClustering(partition).collect(Collectors.toList());
                return buildClusteringGroupsForPartition(partition, eligibleSlices).limit(maxGroups);
            },
            candidatePartitions.size())
        .stream()
        .limit(maxGroups)
        .collect(Collectors.toList());
    if (plannedGroups.isEmpty()) {
        LOG.info("No data available to cluster");
        return Option.empty();
    }

    final HoodieClusteringStrategy executionStrategy = HoodieClusteringStrategy.newBuilder()
        .setStrategyClassName(writeConfig.getClusteringExecutionStrategyClass())
        .setStrategyParams(getStrategyParams())
        .build();
    final HoodieClusteringPlan plan = HoodieClusteringPlan.newBuilder()
        .setStrategy(executionStrategy)
        .setInputGroups(plannedGroups)
        .setExtraMetadata(getExtraMetadata())
        .setVersion(getPlanVersion())
        .setPreserveHoodieMetadata(writeConfig.isPreserveHoodieCommitMetadataForClustering())
        .build();
    return Option.of(plan);
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Collectors(java.util.stream.Collectors) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) List(java.util.List) Stream(java.util.stream.Stream) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) ClusteringPlanPartitionFilter(org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilter) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) List(java.util.List) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup)

Example 2 with HoodieClusteringStrategy

Use of org.apache.hudi.avro.model.HoodieClusteringStrategy in the Apache Hudi project.

From the class ClusteringUtils, the method createClusteringPlan.

/**
 * Assembles a clustering plan from the given groups of file slices.
 *
 * <p>Each element of {@code fileSliceGroups} becomes one {@link HoodieClusteringGroup}, carrying
 * the slice infos from {@code getFileSliceInfo} and the metrics from {@code buildMetrics}.
 *
 * @param strategyClassName class name recorded on the plan's clustering strategy
 * @param strategyParams    parameters recorded on the plan's clustering strategy
 * @param fileSliceGroups   groups of file slices, one array element per clustering group
 * @param extraMetadata     extra metadata attached to the plan
 * @return the clustering plan built from the inputs
 */
public static HoodieClusteringPlan createClusteringPlan(String strategyClassName, Map<String, String> strategyParams, List<FileSlice>[] fileSliceGroups, Map<String, String> extraMetadata) {
    final List<HoodieClusteringGroup> inputGroups = Arrays.stream(fileSliceGroups)
        .map(slices -> {
            Map<String, Double> metrics = buildMetrics(slices);
            List<HoodieSliceInfo> infos = getFileSliceInfo(slices);
            return HoodieClusteringGroup.newBuilder()
                .setMetrics(metrics)
                .setSlices(infos)
                .build();
        })
        .collect(Collectors.toList());

    final HoodieClusteringStrategy clusteringStrategy = HoodieClusteringStrategy.newBuilder()
        .setStrategyParams(strategyParams)
        .setStrategyClassName(strategyClassName)
        .build();

    return HoodieClusteringPlan.newBuilder()
        .setStrategy(clusteringStrategy)
        .setInputGroups(inputGroups)
        .setExtraMetadata(extraMetadata)
        .build();
}
Also used : Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) HashMap(java.util.HashMap) Logger(org.apache.log4j.Logger) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) AbstractMap(java.util.AbstractMap) List(java.util.List) Stream(java.util.stream.Stream) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieSliceInfo(org.apache.hudi.avro.model.HoodieSliceInfo) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) List(java.util.List) HashMap(java.util.HashMap) Map(java.util.Map) AbstractMap(java.util.AbstractMap) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup)

Aggregations

Arrays (java.util.Arrays)2 List (java.util.List)2 Collectors (java.util.stream.Collectors)2 Stream (java.util.stream.Stream)2 HoodieClusteringGroup (org.apache.hudi.avro.model.HoodieClusteringGroup)2 HoodieClusteringPlan (org.apache.hudi.avro.model.HoodieClusteringPlan)2 HoodieClusteringStrategy (org.apache.hudi.avro.model.HoodieClusteringStrategy)2 FSUtils (org.apache.hudi.common.fs.FSUtils)2 FileSlice (org.apache.hudi.common.model.FileSlice)2 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)2 LogManager (org.apache.log4j.LogManager)2 Logger (org.apache.log4j.Logger)2 IOException (java.io.IOException)1 AbstractMap (java.util.AbstractMap)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 Pattern (java.util.regex.Pattern)1 HoodieRequestedReplaceMetadata (org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata)1 HoodieSliceInfo (org.apache.hudi.avro.model.HoodieSliceInfo)1 HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext)1