Search in sources:

Example 1 with BrokerCapacityResolutionException

use of com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException in project cruise-control by linkedin.

the class LoadMonitor method populateClusterCapacity.

/**
 * Create a rack and a broker in the given cluster model for every node of the given cluster, resolving each
 * broker's capacity through the broker capacity config resolver.
 *
 * @param populateReplicaPlacementInfo If {@code true}, each resolved capacity must carry disk capacity by logdir;
 *                                     the flag is also forwarded to the cluster model when creating the broker.
 * @param allowCapacityEstimation If {@code true}, nodes are visited in a shuffled order and the capacity resolver
 *                                is asked with capacity estimation allowed.
 * @param clusterModel The cluster model that receives the racks and brokers.
 * @param cluster The Kafka cluster whose nodes are used to populate the model.
 * @throws TimeoutException Rethrown from the capacity resolver after logging a warning.
 * @throws BrokerCapacityResolutionException Rethrown from the capacity resolver after logging a warning.
 * @throws IllegalStateException If replica placement info is requested but a broker has no disk capacity by logdir.
 */
private void populateClusterCapacity(boolean populateReplicaPlacementInfo, boolean allowCapacityEstimation, ClusterModel clusterModel, Cluster cluster) throws TimeoutException, BrokerCapacityResolutionException {
    // Why the node order is shuffled when capacity estimation is allowed:
    // if the capacity of a node cannot be retrieved (e.g. due to some 3rd party service issue), the capacity
    // resolver may want to fall back to the capacity of a peer node. That only works if the resolver has already
    // seen a healthy peer, which would never happen if the problematic node were always visited first.
    // Visiting a shuffled copy makes it unlikely that the problematic node is always the first one.
    List<Node> nodesToVisit = cluster.nodes();
    if (allowCapacityEstimation) {
        // Shuffle a private copy rather than the list handed out by the cluster.
        nodesToVisit = new ArrayList<>(nodesToVisit);
        Collections.shuffle(nodesToVisit);
    }

    for (Node broker : nodesToVisit) {
        int brokerId = broker.id();
        String host = broker.host();
        // A node without a specified rack uses its host info as the rack info.
        String rackId = getRackHandleNull(broker);
        clusterModel.createRack(rackId);

        BrokerCapacityInfo capacityInfo;
        try {
            capacityInfo = _brokerCapacityConfigResolver.capacityForBroker(rackId, host, brokerId, BROKER_CAPACITY_FETCH_TIMEOUT_MS, allowCapacityEstimation);
            LOG.debug("Capacity of broker {}: {}, (LogDir: {}, Cores: {}).", brokerId, capacityInfo.capacity(), capacityInfo.diskCapacityByLogDir(), capacityInfo.numCpuCores());
            if (populateReplicaPlacementInfo) {
                // Replica placement is tracked per logdir, so per-logdir disk capacity is mandatory here.
                if (capacityInfo.diskCapacityByLogDir() == null) {
                    throw new IllegalStateException(String.format("Missing disk capacity information for logDirs on broker %d. Are you trying to use a JBOD feature on a non-JBOD Kafka deployment?", brokerId));
                }
            }
        } catch (TimeoutException | BrokerCapacityResolutionException capacityFailure) {
            // Log with broker context, then let the original exception propagate unchanged.
            String warning = String.format("Unable to retrieve capacity for broker %d. This may be caused by churn in the cluster, please retry.", brokerId);
            LOG.warn(warning, capacityFailure);
            throw capacityFailure;
        }

        clusterModel.createBroker(rackId, host, brokerId, capacityInfo, populateReplicaPlacementInfo);
    }
}
Also used : BrokerCapacityResolutionException(com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException) BrokerCapacityInfo(com.linkedin.kafka.cruisecontrol.config.BrokerCapacityInfo) Node(org.apache.kafka.common.Node) TimeoutException(java.util.concurrent.TimeoutException)

Example 2 with BrokerCapacityResolutionException

use of com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException in project cruise-control by linkedin.

the class CruiseControlMetricsProcessorTest method testWithCpuCapacityEstimation.

/**
 * Verify which brokers end up in the processor's cached number of cores when the capacity resolver fails for
 * all brokers (with either exception type) or only for some of them.
 */
@Test
public void testWithCpuCapacityEstimation() throws TimeoutException, BrokerCapacityResolutionException {
    Set<CruiseControlMetric> sampledMetrics = getCruiseControlMetrics();

    // Scenario 1 ("all estimated"): the resolver throws BrokerCapacityResolutionException for every broker
    // when estimation is not allowed, so no core count may be cached.
    BrokerCapacityConfigResolver brokerCapacityConfigResolverAllEstimated = EasyMock.mock(BrokerCapacityConfigResolver.class);
    EasyMock.expect(brokerCapacityConfigResolverAllEstimated.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.anyInt(), EasyMock.anyLong(), EasyMock.eq(false)))
            .andThrow(new BrokerCapacityResolutionException("Unable to resolve capacity."))
            .anyTimes();
    EasyMock.replay(brokerCapacityConfigResolverAllEstimated);
    CruiseControlMetricsProcessor allEstimatedProcessor = new CruiseControlMetricsProcessor(brokerCapacityConfigResolverAllEstimated, false);
    for (CruiseControlMetric sample : sampledMetrics) {
        allEstimatedProcessor.addMetric(sample);
    }
    Cluster firstCluster = getCluster();
    allEstimatedProcessor.process(firstCluster, TEST_PARTITIONS, MetricSampler.SamplingMode.ALL);
    for (Node broker : firstCluster.nodes()) {
        assertNull(allEstimatedProcessor.cachedNumCoresByBroker().get(broker.id()));
    }

    // Scenario 2: the resolver times out for every broker, so again no core count may be cached.
    BrokerCapacityConfigResolver brokerCapacityConfigResolverTimeout = EasyMock.mock(BrokerCapacityConfigResolver.class);
    EasyMock.expect(brokerCapacityConfigResolverTimeout.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.anyInt(), EasyMock.anyLong(), EasyMock.anyBoolean()))
            .andThrow(new TimeoutException("Unable to resolve capacity."))
            .anyTimes();
    EasyMock.replay(brokerCapacityConfigResolverTimeout);
    CruiseControlMetricsProcessor timeoutProcessor = new CruiseControlMetricsProcessor(brokerCapacityConfigResolverTimeout, false);
    for (CruiseControlMetric sample : sampledMetrics) {
        timeoutProcessor.addMetric(sample);
    }
    Cluster secondCluster = getCluster();
    timeoutProcessor.process(secondCluster, TEST_PARTITIONS, MetricSampler.SamplingMode.ALL);
    for (Node broker : secondCluster.nodes()) {
        assertNull(timeoutProcessor.cachedNumCoresByBroker().get(broker.id()));
    }

    // Scenario 3 ("some estimated"): broker 1 times out while broker 0 resolves successfully, so only
    // broker 0 may have a cached core count. The cluster of scenario 2 is reused.
    BrokerCapacityConfigResolver brokerCapacityConfigResolverSomeEstimated = EasyMock.mock(BrokerCapacityConfigResolver.class);
    EasyMock.expect(brokerCapacityConfigResolverSomeEstimated.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.eq(BROKER_ID_1), EasyMock.anyLong(), EasyMock.anyBoolean()))
            .andThrow(new TimeoutException("Unable to resolve capacity."))
            .anyTimes();
    EasyMock.expect(brokerCapacityConfigResolverSomeEstimated.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.eq(BROKER_ID_0), EasyMock.anyLong(), EasyMock.anyBoolean()))
            .andReturn(new BrokerCapacityInfo(EMPTY_BROKER_CAPACITY, Collections.emptyMap(), MOCK_NUM_CPU_CORES))
            .anyTimes();
    EasyMock.replay(brokerCapacityConfigResolverSomeEstimated);
    CruiseControlMetricsProcessor someEstimatedProcessor = new CruiseControlMetricsProcessor(brokerCapacityConfigResolverSomeEstimated, false);
    for (CruiseControlMetric sample : sampledMetrics) {
        someEstimatedProcessor.addMetric(sample);
    }
    someEstimatedProcessor.process(secondCluster, TEST_PARTITIONS, MetricSampler.SamplingMode.ALL);
    assertEquals(MOCK_NUM_CPU_CORES, (short) someEstimatedProcessor.cachedNumCoresByBroker().get(BROKER_ID_0));
    assertNull(someEstimatedProcessor.cachedNumCoresByBroker().get(BROKER_ID_1));

    EasyMock.verify(brokerCapacityConfigResolverTimeout, brokerCapacityConfigResolverSomeEstimated, brokerCapacityConfigResolverAllEstimated);
}
Also used : BrokerCapacityResolutionException(com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException) CruiseControlMetric(com.linkedin.kafka.cruisecontrol.metricsreporter.metric.CruiseControlMetric) BrokerCapacityInfo(com.linkedin.kafka.cruisecontrol.config.BrokerCapacityInfo) BrokerCapacityConfigResolver(com.linkedin.kafka.cruisecontrol.config.BrokerCapacityConfigResolver) Node(org.apache.kafka.common.Node) Cluster(org.apache.kafka.common.Cluster) TimeoutException(java.util.concurrent.TimeoutException) Test(org.junit.Test)

Example 3 with BrokerCapacityResolutionException

use of com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException in project cruise-control by linkedin.

the class MonitorUtils method populatePartitionLoad.

/**
 * Create replicas of the partition with the given (1) identifier and (2) load information to populate the given cluster model.
 * If partition with the given identifier does not exist in the given cluster, do nothing.
 *
 * <p>If the capacity of a broker that is not among the cluster's nodes cannot be resolved, the broker is given an
 * empty capacity instead of failing the whole operation.
 *
 * @param cluster Kafka cluster.
 * @param clusterModel The cluster model to populate load information.
 * @param tp Topic partition that identifies the partition to populate the load for.
 * @param valuesAndExtrapolations The values and extrapolations of the leader replica.
 * @param replicaPlacementInfo The distribution of replicas over broker logdirs if available, {@code null} otherwise.
 * @param brokerCapacityConfigResolver The resolver for retrieving broker capacities.
 * @param allowCapacityEstimation whether allow capacity estimation in cluster model if the underlying live broker capacity is unavailable.
 * @throws TimeoutException If the capacity of an alive broker hosting a replica of the partition cannot be retrieved
 *                          (whether due to a timeout or a capacity resolution failure). The underlying failure is
 *                          attached as the cause.
 */
static void populatePartitionLoad(Cluster cluster, ClusterModel clusterModel, TopicPartition tp, ValuesAndExtrapolations valuesAndExtrapolations, Map<TopicPartition, Map<Integer, String>> replicaPlacementInfo, BrokerCapacityConfigResolver brokerCapacityConfigResolver, boolean allowCapacityEstimation) throws TimeoutException {
    PartitionInfo partitionInfo = cluster.partition(tp);
    // If partition info does not exist, the topic may have been deleted.
    if (partitionInfo == null) {
        return;
    }

    // Brokers that appear as nodes of the cluster are considered alive.
    Set<Integer> aliveBrokers = cluster.nodes().stream().map(Node::id).collect(Collectors.toSet());
    // Passed as true only for the first replica whose load is set; false for every replica after it.
    boolean needToAdjustCpuUsage = true;
    Set<Integer> deadBrokersWithUnknownCapacity = new HashSet<>();
    Node[] replicas = partitionInfo.replicas();
    for (int index = 0; index < replicas.length; index++) {
        Node replica = replicas[index];
        String rack = getRackHandleNull(replica);
        BrokerCapacityInfo brokerCapacity;
        try {
            // Do not allow capacity estimation for dead brokers.
            brokerCapacity = brokerCapacityConfigResolver.capacityForBroker(rack, replica.host(), replica.id(), BROKER_CAPACITY_FETCH_TIMEOUT_MS, aliveBrokers.contains(replica.id()) && allowCapacityEstimation);
        } catch (TimeoutException | BrokerCapacityResolutionException e) {
            // Capacity resolver may not be able to return the capacity information of dead brokers.
            if (!aliveBrokers.contains(replica.id())) {
                brokerCapacity = new BrokerCapacityInfo(EMPTY_BROKER_CAPACITY);
                deadBrokersWithUnknownCapacity.add(replica.id());
            } else {
                String errorMessage = String.format("Unable to retrieve capacity for broker %d. This may be caused by churn in " + "the cluster, please retry.", replica.id());
                LOG.warn(errorMessage, e);
                // TimeoutException has no (message, cause) constructor, so attach the cause explicitly to keep
                // the original failure visible to callers.
                TimeoutException timeoutException = new TimeoutException(errorMessage);
                timeoutException.initCause(e);
                throw timeoutException;
            }
        }
        // Invoked for every replica, before the offline-partition check below.
        clusterModel.handleDeadBroker(rack, replica.id(), brokerCapacity);
        boolean isLeader;
        if (partitionInfo.leader() == null) {
            LOG.warn("Detected offline partition {}-{}, skipping", partitionInfo.topic(), partitionInfo.partition());
            continue;
        } else {
            isLeader = replica.id() == partitionInfo.leader().id();
        }
        boolean isOffline = Arrays.stream(partitionInfo.offlineReplicas()).anyMatch(offlineReplica -> offlineReplica.id() == replica.id());
        String logdir = replicaPlacementInfo == null ? null : replicaPlacementInfo.get(tp).get(replica.id());
        // If the replica's logdir is null, it is either because replica placement information is not populated for the cluster
        // model or this replica is hosted on a dead disk and is not considered for intra-broker replica operations.
        clusterModel.createReplica(rack, replica.id(), tp, index, isLeader, isOffline, logdir, false);
        // Reuse the partition info fetched above instead of looking it up again for every replica.
        clusterModel.setReplicaLoad(rack, replica.id(), tp, getAggregatedMetricValues(valuesAndExtrapolations, partitionInfo, isLeader, needToAdjustCpuUsage), valuesAndExtrapolations.windows());
        needToAdjustCpuUsage = false;
    }
    if (!deadBrokersWithUnknownCapacity.isEmpty()) {
        LOG.info("Assign empty capacity to brokers {} because they are dead and capacity resolver is unable to fetch their capacity.", deadBrokersWithUnknownCapacity);
    }
}
Also used : BrokerCapacityResolutionException(com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException) BrokerCapacityInfo(com.linkedin.kafka.cruisecontrol.config.BrokerCapacityInfo) Node(org.apache.kafka.common.Node) PartitionInfo(org.apache.kafka.common.PartitionInfo) HashSet(java.util.HashSet) TimeoutException(java.util.concurrent.TimeoutException)

Aggregations

BrokerCapacityInfo (com.linkedin.kafka.cruisecontrol.config.BrokerCapacityInfo): 3; BrokerCapacityResolutionException (com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException): 3; TimeoutException (java.util.concurrent.TimeoutException): 3; Node (org.apache.kafka.common.Node): 3; BrokerCapacityConfigResolver (com.linkedin.kafka.cruisecontrol.config.BrokerCapacityConfigResolver): 1; CruiseControlMetric (com.linkedin.kafka.cruisecontrol.metricsreporter.metric.CruiseControlMetric): 1; HashSet (java.util.HashSet): 1; Cluster (org.apache.kafka.common.Cluster): 1; PartitionInfo (org.apache.kafka.common.PartitionInfo): 1; Test (org.junit.Test): 1