Search in sources :

Example 11 with OptimizationFailureException

use of com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException in project cruise-control by linkedin.

the class CapacityGoal method rebalanceForBroker.

/**
 * Brings the given broker (and/or its host) back under the capacity limit of this goal's resource.
 *
 * <p>
 * Two strategies are attempted in order, and each stops as soon as the utilization drops under the limit:
 * <ol>
 *   <li>LEADERSHIP MOVEMENT (only for outbound network load and CPU): leadership of the leaders hosted on the
 *   broker is offered to the brokers of their followers.</li>
 *   <li>REPLICA MOVEMENT: replicas of the broker are offered to healthy brokers that are under the capacity
 *   threshold for the resource.</li>
 * </ol>
 * Whether the limit applies to the broker and/or its host depends on the resource (see
 * {@link Resource#isHostResource()} and {@link Resource#isBrokerResource()}).
 *
 * @param broker         Broker to be balanced.
 * @param clusterModel   The state of the cluster.
 * @param optimizedGoals Optimized goals.
 * @param excludedTopics The topics that should be excluded from the optimization action.
 * @throws OptimizationFailureException If the utilization is still over the capacity limit after both strategies
 *                                      have been tried.
 */
@Override
protected void rebalanceForBroker(Broker broker, ClusterModel clusterModel, Set<Goal> optimizedGoals, Set<String> excludedTopics) throws OptimizationFailureException {
    LOG.debug("balancing broker {}, optimized goals = {}", broker, optimizedGoals);
    final Resource resource = resource();
    final double threshold = _balancingConstraint.capacityThreshold(resource);
    final double brokerLimit = broker.capacityFor(resource) * threshold;
    final double hostLimit = broker.host().capacityFor(resource) * threshold;

    // Nothing to do when the broker and/or host is already under the capacity limit for this resource.
    if (!isUtilizationOverLimit(broker, resource, brokerLimit, hostLimit)) {
        return;
    }
    boolean stillOverLimit = true;

    // Step 1 -- leadership movement, attempted only for outbound network load and CPU.
    final boolean leadershipMovementApplies = resource == Resource.NW_OUT || resource == Resource.CPU;
    if (leadershipMovementApplies) {
        // Leaders on the source broker, sorted by descending preference to relocate (based on resource cost).
        for (Replica leader : broker.sortedLeadersFor(resource)) {
            if (!shouldExclude(leader, excludedTopics)) {
                // Candidate destinations: brokers of this leader's followers, least utilized first.
                List<Replica> followers = clusterModel.partition(leader.topicPartition()).followers();
                clusterModel.sortReplicasInAscendingOrderByBrokerResourceUtilization(followers, resource);
                List<Broker> candidates = followers.stream().map(Replica::broker).collect(Collectors.toList());
                Broker newLeaderBroker = maybeApplyBalancingAction(clusterModel, leader, candidates, ActionType.LEADERSHIP_MOVEMENT, optimizedGoals);
                if (newLeaderBroker == null) {
                    LOG.debug("Failed to move leader replica {} to any other brokers in {}", leader, candidates);
                }
                // Re-evaluate after every attempt; stop once the utilization is under the limit.
                stillOverLimit = isUtilizationOverLimit(broker, resource, brokerLimit, hostLimit);
                if (!stillOverLimit) {
                    break;
                }
            }
        }
    }

    // Step 2 -- replica movement, needed only if leadership movement was not applicable or not sufficient.
    if (stillOverLimit) {
        // Sorted healthy brokers under the host and/or broker capacity limit (depending on the resource).
        List<Broker> destinations = clusterModel.sortedHealthyBrokersUnderThreshold(resource, threshold);
        // Offer the broker's replicas, in the order given by sortedReplicas, until the limit is satisfied.
        for (Replica replica : broker.sortedReplicas(resource)) {
            if (!shouldExclude(replica, excludedTopics)) {
                // The movement succeeds unless it would push the target over the host- and/or broker-level capacity.
                Broker destination = maybeApplyBalancingAction(clusterModel, replica, destinations, ActionType.REPLICA_MOVEMENT, optimizedGoals);
                if (destination == null) {
                    LOG.debug("Failed to move replica {} to any broker in {}", replica, destinations);
                }
                stillOverLimit = isUtilizationOverLimit(broker, resource, brokerLimit, hostLimit);
                if (!stillOverLimit) {
                    break;
                }
            }
        }
    }

    if (!stillOverLimit) {
        return;
    }
    // Every candidate was tried and the utilization is still above the limit: report the violated limit.
    if (resource.isHostResource()) {
        throw new OptimizationFailureException("Violated capacity limit of " + hostLimit + " via host utilization of " + broker.host().load().expectedUtilizationFor(resource) + " with hostname " + broker.host().name() + " for resource " + resource);
    }
    throw new OptimizationFailureException("Violated capacity limit of " + brokerLimit + " via broker utilization of " + broker.load().expectedUtilizationFor(resource) + " with broker id " + broker.id() + " for resource " + resource);
}
Also used : Broker(com.linkedin.kafka.cruisecontrol.model.Broker) OptimizationFailureException(com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException) Resource(com.linkedin.kafka.cruisecontrol.common.Resource) Replica(com.linkedin.kafka.cruisecontrol.model.Replica)

Example 12 with OptimizationFailureException

use of com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException in project cruise-control by linkedin.

the class PotentialNwOutGoal method updateGoalState.

/**
 * Update goal state after one round of self-healing / rebalance.
 *
 * <p>
 * If no self-healing eligible replica is left on a dead broker, {@code finish()} is called. Otherwise the first
 * such round switches the goal to relocating dead-broker replicas only (the potential network outbound limit is
 * ignored from then on), and a second such round is reported as a failure.
 *
 * @param clusterModel   The state of the cluster.
 * @param excludedTopics The topics that should be excluded from the optimization action (not read by this method).
 * @throws OptimizationFailureException If a self-healing eligible replica is still on a dead broker although the
 *                                      goal was already restricted to dead-broker replicas.
 */
@Override
protected void updateGoalState(ClusterModel clusterModel, Set<String> excludedTopics) throws OptimizationFailureException {
    // Sanity check: No self-healing eligible replica should remain at a decommissioned broker.
    boolean replicaLeftOnDeadBroker = false;
    for (Replica candidate : clusterModel.selfHealingEligibleReplicas()) {
        if (!candidate.broker().isAlive()) {
            replicaLeftOnDeadBroker = true;
            break;
        }
    }
    if (!replicaLeftOnDeadBroker) {
        finish();
        return;
    }
    if (_selfHealingDeadBrokersOnly) {
        throw new OptimizationFailureException("Self healing failed to move the replica away from decommissioned brokers.");
    }
    // First failed round: relax the goal and return without finishing.
    _selfHealingDeadBrokersOnly = true;
    LOG.warn("Ignoring potential network outbound limit to relocate remaining replicas from dead brokers to healthy ones.");
}
Also used : OptimizationFailureException(com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException) Replica(com.linkedin.kafka.cruisecontrol.model.Replica)

Example 13 with OptimizationFailureException

use of com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException in project cruise-control by linkedin.

the class RackAwareGoal method ensureRackAware.

/**
 * Sanity check to confirm that the final distribution is rack aware: for every partition whose topic is not
 * excluded, the leader and the follower brokers must all reside in distinct racks.
 *
 * @param clusterModel   The state of the cluster.
 * @param excludedTopics The topics whose partitions are skipped by this check.
 * @throws OptimizationFailureException If two or more replica brokers of a checked partition share a rack.
 */
private void ensureRackAware(ClusterModel clusterModel, Set<String> excludedTopics) throws OptimizationFailureException {
    for (Replica leader : clusterModel.leaderReplicas()) {
        if (!excludedTopics.contains(leader.topicPartition().topic())) {
            Set<Broker> brokersOfFollowers = new HashSet<>(clusterModel.partition(leader.topicPartition()).followerBrokers());
            // Collect the distinct rack ids of the followers first, then of the leader.
            Set<String> distinctRackIds = new HashSet<>();
            for (Broker brokerOfFollower : brokersOfFollowers) {
                distinctRackIds.add(brokerOfFollower.rack().id());
            }
            distinctRackIds.add(leader.broker().rack().id());
            // One rack per replica broker is required: all follower brokers plus the leader broker.
            int requiredRackCount = brokersOfFollowers.size() + 1;
            if (distinctRackIds.size() != requiredRackCount) {
                throw new OptimizationFailureException("Optimization for goal " + name() + " failed for rack-awareness of partition " + leader.topicPartition());
            }
        }
    }
}
Also used : Broker(com.linkedin.kafka.cruisecontrol.model.Broker) OptimizationFailureException(com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException) Replica(com.linkedin.kafka.cruisecontrol.model.Replica) HashSet(java.util.HashSet)

Example 14 with OptimizationFailureException

use of com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException in project cruise-control by linkedin.

the class RackAwareGoal method rebalanceForBroker.

/**
 * Resolves rack-awareness violations of the replicas on the given broker via replica movements.
 *
 * <p>
 * A replica is left in place if its topic is excluded, or if the broker is alive and the replica already satisfies
 * rack awareness. Every other replica must be moved to one of its rack-aware eligible brokers.
 *
 * @param broker         Broker to be balanced.
 * @param clusterModel   The state of the cluster.
 * @param optimizedGoals Optimized goals.
 * @param excludedTopics The topics that should be excluded from the optimization action.
 * @throws OptimizationFailureException If a replica that has to move cannot be moved to any eligible broker.
 */
@Override
protected void rebalanceForBroker(Broker broker, ClusterModel clusterModel, Set<Goal> optimizedGoals, Set<String> excludedTopics) throws OptimizationFailureException {
    LOG.debug("balancing broker {}, optimized goals = {}", broker, optimizedGoals);
    // Iterate over a sorted copy of the broker's replicas rather than the collection returned by the broker.
    SortedSet<Replica> replicasOnBroker = new TreeSet<>(broker.replicas());
    for (Replica candidate : replicasOnBroker) {
        boolean alreadyRackAware = broker.isAlive() && satisfiedRackAwareness(candidate, clusterModel);
        if (alreadyRackAware || shouldExclude(candidate, excludedTopics)) {
            continue;
        }
        // Rack awareness is violated. Move replica to a broker in another rack.
        Broker destination = maybeApplyBalancingAction(clusterModel, candidate, rackAwareEligibleBrokers(candidate, clusterModel), ActionType.REPLICA_MOVEMENT, optimizedGoals);
        if (destination == null) {
            throw new OptimizationFailureException("Violated rack-awareness requirement for broker with id " + broker.id() + ".");
        }
    }
}
Also used : OptimizationFailureException(com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException) TreeSet(java.util.TreeSet) Replica(com.linkedin.kafka.cruisecontrol.model.Replica)

Example 15 with OptimizationFailureException

use of com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException in project cruise-control by linkedin.

the class ReplicaCapacityGoal method initGoalState.

/**
 * This is a hard goal; hence, the proposals are not limited to dead broker replicas in case of self-healing.
 *
 * <p>
 * Performs two sanity checks before optimization:
 * <ol>
 *   <li>On every alive broker to balance, the replicas of excluded topics must not exceed the maximum number of
 *   replicas allowed per broker.</li>
 *   <li>The total number of replicas counted on those brokers must not exceed the maximum number of replicas
 *   allowed per broker multiplied by the number of healthy brokers.</li>
 * </ol>
 * Encountering a dead broker switches the goal into self-healing mode. A warning is logged if every topic is
 * excluded.
 *
 * @param clusterModel   The state of the cluster.
 * @param excludedTopics The topics that should be excluded from the optimization proposals.
 * @throws OptimizationFailureException If either sanity check fails.
 */
@Override
protected void initGoalState(ClusterModel clusterModel, Set<String> excludedTopics) throws OptimizationFailureException {
    List<String> topicsToRebalance = new ArrayList<>(clusterModel.topics());
    topicsToRebalance.removeAll(excludedTopics);
    if (topicsToRebalance.isEmpty()) {
        LOG.warn("All topics are excluded from {}.", name());
    }
    // Read the per-broker limit once and widen it to long, so that the cluster-wide product below is computed in
    // 64-bit arithmetic and cannot overflow even when the configured limit is close to Integer.MAX_VALUE.
    long maxReplicasPerBroker = _balancingConstraint.maxReplicasPerBroker();
    // Sanity check: excluded topic replicas in a broker cannot exceed the max number of allowed replicas per broker.
    int totalReplicasInCluster = 0;
    for (Broker broker : brokersToBalance(clusterModel)) {
        if (!broker.isAlive()) {
            // NOTE(review): replicas hosted on dead brokers are not added to the total below -- confirm this is intended.
            _isSelfHealingMode = true;
            continue;
        }
        int excludedReplicasInBroker = 0;
        for (String topic : excludedTopics) {
            excludedReplicasInBroker += broker.replicasOfTopicInBroker(topic).size();
        }
        if (excludedReplicasInBroker > maxReplicasPerBroker) {
            throw new OptimizationFailureException(String.format("Replicas of excluded topics in broker: %d exceeds the maximum " + "allowed number of replicas: %d.", excludedReplicasInBroker, maxReplicasPerBroker));
        }
        // Calculate total number of replicas for the next sanity check.
        totalReplicasInCluster += broker.replicas().size();
    }
    // Sanity check: total replicas in the cluster cannot be more than the allowed replicas in the cluster.
    long maxReplicasInCluster = maxReplicasPerBroker * clusterModel.healthyBrokers().size();
    if (totalReplicasInCluster > maxReplicasInCluster) {
        throw new OptimizationFailureException(String.format("Total replicas in cluster: %d exceeds the maximum allowed " + "replicas in cluster: %d.", totalReplicasInCluster, maxReplicasInCluster));
    }
}
Also used : Broker(com.linkedin.kafka.cruisecontrol.model.Broker) OptimizationFailureException(com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException) ArrayList(java.util.ArrayList) BalancingConstraint(com.linkedin.kafka.cruisecontrol.analyzer.BalancingConstraint)

Aggregations

OptimizationFailureException (com.linkedin.kafka.cruisecontrol.exception.OptimizationFailureException)15 Broker (com.linkedin.kafka.cruisecontrol.model.Broker)8 Replica (com.linkedin.kafka.cruisecontrol.model.Replica)8 HashSet (java.util.HashSet)3 Map (java.util.Map)3 BalancingConstraint (com.linkedin.kafka.cruisecontrol.analyzer.BalancingConstraint)2 Resource (com.linkedin.kafka.cruisecontrol.common.Resource)2 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 ClusterModelStats (com.linkedin.kafka.cruisecontrol.model.ClusterModelStats)1 Load (com.linkedin.kafka.cruisecontrol.model.Load)1 Partition (com.linkedin.kafka.cruisecontrol.model.Partition)1 List (java.util.List)1 TreeSet (java.util.TreeSet)1