use of com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException in project cruise-control by linkedin.
the class LoadMonitor method populateClusterCapacity.
private void populateClusterCapacity(boolean populateReplicaPlacementInfo,
                                     boolean allowCapacityEstimation,
                                     ClusterModel clusterModel,
                                     Cluster cluster) throws TimeoutException, BrokerCapacityResolutionException {
  // Create the racks and brokers.
  // If capacity estimation is allowed, shuffle the nodes before getting their capacity from the capacity resolver.
  // This helps the capacity resolver estimate the capacity of nodes for which capacity retrieval has failed.
  // The use case for this estimation is that if the capacity of one of the nodes is not available (e.g. due to a
  // 3rd party service issue), the capacity resolver may want to use the capacity of a peer node as the capacity for
  // that node.
  // To this end, Cruise Control must handle the case where the problematic node is the first one processed, so the
  // capacity resolver has not yet had a chance to retrieve the capacity of any other node to base an estimate on.
  // Shuffling the node order helps, as the problematic node is then unlikely to always be the first node in the list.
  List<Node> shuffledNodes = allowCapacityEstimation ? new ArrayList<>(cluster.nodes()) : cluster.nodes();
  if (allowCapacityEstimation) {
    Collections.shuffle(shuffledNodes);
  }
  for (Node node : shuffledNodes) {
    // If the rack is not specified, we use the host info as rack info.
    String rack = getRackHandleNull(node);
    clusterModel.createRack(rack);
    BrokerCapacityInfo brokerCapacity;
    try {
      brokerCapacity = _brokerCapacityConfigResolver.capacityForBroker(rack, node.host(), node.id(),
                                                                       BROKER_CAPACITY_FETCH_TIMEOUT_MS, allowCapacityEstimation);
      LOG.debug("Capacity of broker {}: {}, (LogDir: {}, Cores: {}).",
                node.id(), brokerCapacity.capacity(), brokerCapacity.diskCapacityByLogDir(), brokerCapacity.numCpuCores());
      if (populateReplicaPlacementInfo && brokerCapacity.diskCapacityByLogDir() == null) {
        throw new IllegalStateException(String.format("Missing disk capacity information for logDirs on broker %d. "
                                                      + "Are you trying to use a JBOD feature on a non-JBOD Kafka deployment?", node.id()));
      }
    } catch (TimeoutException | BrokerCapacityResolutionException e) {
      String errorMessage = String.format("Unable to retrieve capacity for broker %d. This may be caused by churn in "
                                          + "the cluster, please retry.", node.id());
      LOG.warn(errorMessage, e);
      throw e;
    }
    clusterModel.createBroker(rack, node.host(), node.id(), brokerCapacity, populateReplicaPlacementInfo);
  }
}
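The peer-based estimation described in the comments above is the responsibility of the BrokerCapacityConfigResolver implementation, not of LoadMonitor itself. Below is a minimal sketch of that idea: a wrapping resolver that remembers the last successfully resolved capacity and hands it out as an estimate when a node's own lookup fails. The class name, the delegate, and the configure/close methods are illustrative assumptions rather than part of cruise-control, and imports are omitted as in the snippets above.

// Illustrative sketch only: an estimating resolver that falls back to the capacity of the
// most recently resolved peer when a node's own capacity cannot be retrieved.
public class PeerEstimatingCapacityResolver implements BrokerCapacityConfigResolver {
  // Hypothetical delegate that performs the real (possibly failing) capacity lookup.
  private final BrokerCapacityConfigResolver _delegate;
  // Capacity of the last broker that was resolved successfully, if any.
  private volatile BrokerCapacityInfo _lastResolvedCapacity;

  public PeerEstimatingCapacityResolver(BrokerCapacityConfigResolver delegate) {
    _delegate = delegate;
  }

  @Override
  public BrokerCapacityInfo capacityForBroker(String rack, String host, int brokerId, long timeoutMs, boolean allowCapacityEstimation)
      throws TimeoutException, BrokerCapacityResolutionException {
    try {
      BrokerCapacityInfo resolved = _delegate.capacityForBroker(rack, host, brokerId, timeoutMs, allowCapacityEstimation);
      _lastResolvedCapacity = resolved;
      return resolved;
    } catch (TimeoutException | BrokerCapacityResolutionException e) {
      // Estimation only works if some peer's capacity was already retrieved -- hence the shuffle above.
      if (allowCapacityEstimation && _lastResolvedCapacity != null) {
        return _lastResolvedCapacity;
      }
      throw e;
    }
  }

  // configure/close are included only in case the interface requires them in your version of the project.
  public void configure(Map<String, ?> configs) {
    // No configuration needed for this sketch.
  }

  public void close() {
    // Nothing to close in this sketch.
  }
}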
use of com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException in project cruise-control by linkedin.
the class CruiseControlMetricsProcessorTest method testWithCpuCapacityEstimation.
@Test
public void testWithCpuCapacityEstimation() throws TimeoutException, BrokerCapacityResolutionException {
  Set<CruiseControlMetric> metrics = getCruiseControlMetrics();
  // All estimated.
  BrokerCapacityConfigResolver brokerCapacityConfigResolverAllEstimated = EasyMock.mock(BrokerCapacityConfigResolver.class);
  EasyMock.expect(brokerCapacityConfigResolverAllEstimated.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.anyInt(),
                                                                             EasyMock.anyLong(), EasyMock.eq(false)))
          .andThrow(new BrokerCapacityResolutionException("Unable to resolve capacity.")).anyTimes();
  EasyMock.replay(brokerCapacityConfigResolverAllEstimated);
  CruiseControlMetricsProcessor processor = new CruiseControlMetricsProcessor(brokerCapacityConfigResolverAllEstimated, false);
  for (CruiseControlMetric cruiseControlMetric : metrics) {
    processor.addMetric(cruiseControlMetric);
  }
  Cluster cluster = getCluster();
  processor.process(cluster, TEST_PARTITIONS, MetricSampler.SamplingMode.ALL);
  for (Node node : cluster.nodes()) {
    assertNull(processor.cachedNumCoresByBroker().get(node.id()));
  }
  // Capacity resolver unable to retrieve broker capacity.
  BrokerCapacityConfigResolver brokerCapacityConfigResolverTimeout = EasyMock.mock(BrokerCapacityConfigResolver.class);
  EasyMock.expect(brokerCapacityConfigResolverTimeout.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.anyInt(),
                                                                        EasyMock.anyLong(), EasyMock.anyBoolean()))
          .andThrow(new TimeoutException("Unable to resolve capacity.")).anyTimes();
  EasyMock.replay(brokerCapacityConfigResolverTimeout);
  processor = new CruiseControlMetricsProcessor(brokerCapacityConfigResolverTimeout, false);
  for (CruiseControlMetric cruiseControlMetric : metrics) {
    processor.addMetric(cruiseControlMetric);
  }
  cluster = getCluster();
  processor.process(cluster, TEST_PARTITIONS, MetricSampler.SamplingMode.ALL);
  for (Node node : cluster.nodes()) {
    assertNull(processor.cachedNumCoresByBroker().get(node.id()));
  }
  // Some estimated.
  BrokerCapacityConfigResolver brokerCapacityConfigResolverSomeEstimated = EasyMock.mock(BrokerCapacityConfigResolver.class);
  EasyMock.expect(brokerCapacityConfigResolverSomeEstimated.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.eq(BROKER_ID_1),
                                                                              EasyMock.anyLong(), EasyMock.anyBoolean()))
          .andThrow(new TimeoutException("Unable to resolve capacity.")).anyTimes();
  EasyMock.expect(brokerCapacityConfigResolverSomeEstimated.capacityForBroker(EasyMock.anyString(), EasyMock.anyString(), EasyMock.eq(BROKER_ID_0),
                                                                              EasyMock.anyLong(), EasyMock.anyBoolean()))
          .andReturn(new BrokerCapacityInfo(EMPTY_BROKER_CAPACITY, Collections.emptyMap(), MOCK_NUM_CPU_CORES)).anyTimes();
  EasyMock.replay(brokerCapacityConfigResolverSomeEstimated);
  processor = new CruiseControlMetricsProcessor(brokerCapacityConfigResolverSomeEstimated, false);
  for (CruiseControlMetric metric : metrics) {
    processor.addMetric(metric);
  }
  processor.process(cluster, TEST_PARTITIONS, MetricSampler.SamplingMode.ALL);
  assertEquals(MOCK_NUM_CPU_CORES, (short) processor.cachedNumCoresByBroker().get(BROKER_ID_0));
  assertNull(processor.cachedNumCoresByBroker().get(BROKER_ID_1));
  EasyMock.verify(brokerCapacityConfigResolverTimeout, brokerCapacityConfigResolverSomeEstimated, brokerCapacityConfigResolverAllEstimated);
}
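For comparison, the "some estimated" expectation above can also be expressed without EasyMock as a hand-rolled stub that makes the contract explicit: resolution succeeds (with a known core count) only for BROKER_ID_0 and fails for every other broker. This is an illustrative sketch only, assumed to live somewhere the test's constants (BROKER_ID_0, EMPTY_BROKER_CAPACITY, MOCK_NUM_CPU_CORES) are visible; the class is not part of the project, the configure/close methods are included only in case the BrokerCapacityConfigResolver interface requires them, and it throws BrokerCapacityResolutionException where the mock above throws TimeoutException (the test treats both as a failed capacity lookup).

// Illustrative stub: resolves capacity (with a known core count) only for BROKER_ID_0,
// and reports a resolution failure for any other broker.
public class SomeEstimatedCapacityResolver implements BrokerCapacityConfigResolver {
  @Override
  public BrokerCapacityInfo capacityForBroker(String rack, String host, int brokerId, long timeoutMs, boolean allowCapacityEstimation)
      throws BrokerCapacityResolutionException {
    if (brokerId == BROKER_ID_0) {
      return new BrokerCapacityInfo(EMPTY_BROKER_CAPACITY, Collections.emptyMap(), MOCK_NUM_CPU_CORES);
    }
    throw new BrokerCapacityResolutionException("Unable to resolve capacity for broker " + brokerId + ".");
  }

  public void configure(Map<String, ?> configs) {
    // No configuration needed for this stub.
  }

  public void close() {
    // Nothing to close.
  }
}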
use of com.linkedin.kafka.cruisecontrol.exception.BrokerCapacityResolutionException in project cruise-control by linkedin.
the class MonitorUtils method populatePartitionLoad.
/**
 * Create replicas of the partition with the given (1) identifier and (2) load information to populate the given cluster model.
 * If a partition with the given identifier does not exist in the given cluster, do nothing.
 *
 * @param cluster Kafka cluster.
 * @param clusterModel The cluster model to populate with load information.
 * @param tp Topic partition that identifies the partition to populate the load for.
 * @param valuesAndExtrapolations The values and extrapolations of the leader replica.
 * @param replicaPlacementInfo The distribution of replicas over broker logdirs if available, {@code null} otherwise.
 * @param brokerCapacityConfigResolver The resolver for retrieving broker capacities.
 * @param allowCapacityEstimation Whether to allow capacity estimation in the cluster model if the underlying live broker capacity is unavailable.
 */
static void populatePartitionLoad(Cluster cluster,
                                  ClusterModel clusterModel,
                                  TopicPartition tp,
                                  ValuesAndExtrapolations valuesAndExtrapolations,
                                  Map<TopicPartition, Map<Integer, String>> replicaPlacementInfo,
                                  BrokerCapacityConfigResolver brokerCapacityConfigResolver,
                                  boolean allowCapacityEstimation) throws TimeoutException {
  PartitionInfo partitionInfo = cluster.partition(tp);
  // If partition info does not exist, the topic may have been deleted.
  if (partitionInfo != null) {
    Set<Integer> aliveBrokers = cluster.nodes().stream().mapToInt(Node::id).boxed().collect(Collectors.toSet());
    boolean needToAdjustCpuUsage = true;
    Set<Integer> deadBrokersWithUnknownCapacity = new HashSet<>();
    for (int index = 0; index < partitionInfo.replicas().length; index++) {
      Node replica = partitionInfo.replicas()[index];
      String rack = getRackHandleNull(replica);
      BrokerCapacityInfo brokerCapacity;
      try {
        // Do not allow capacity estimation for dead brokers.
        brokerCapacity = brokerCapacityConfigResolver.capacityForBroker(rack, replica.host(), replica.id(), BROKER_CAPACITY_FETCH_TIMEOUT_MS,
                                                                        aliveBrokers.contains(replica.id()) && allowCapacityEstimation);
      } catch (TimeoutException | BrokerCapacityResolutionException e) {
        // Capacity resolver may not be able to return the capacity information of dead brokers.
        if (!aliveBrokers.contains(replica.id())) {
          brokerCapacity = new BrokerCapacityInfo(EMPTY_BROKER_CAPACITY);
          deadBrokersWithUnknownCapacity.add(replica.id());
        } else {
          String errorMessage = String.format("Unable to retrieve capacity for broker %d. This may be caused by churn in "
                                              + "the cluster, please retry.", replica.id());
          LOG.warn(errorMessage, e);
          throw new TimeoutException(errorMessage);
        }
      }
      clusterModel.handleDeadBroker(rack, replica.id(), brokerCapacity);
      boolean isLeader;
      if (partitionInfo.leader() == null) {
        LOG.warn("Detected offline partition {}-{}, skipping", partitionInfo.topic(), partitionInfo.partition());
        continue;
      } else {
        isLeader = replica.id() == partitionInfo.leader().id();
      }
      boolean isOffline = Arrays.stream(partitionInfo.offlineReplicas()).anyMatch(offlineReplica -> offlineReplica.id() == replica.id());
      String logdir = replicaPlacementInfo == null ? null : replicaPlacementInfo.get(tp).get(replica.id());
      // If the replica's logdir is null, it is either because replica placement information is not populated for the cluster
      // model or this replica is hosted on a dead disk and is not considered for intra-broker replica operations.
      clusterModel.createReplica(rack, replica.id(), tp, index, isLeader, isOffline, logdir, false);
      clusterModel.setReplicaLoad(rack, replica.id(), tp,
                                  getAggregatedMetricValues(valuesAndExtrapolations, cluster.partition(tp), isLeader, needToAdjustCpuUsage),
                                  valuesAndExtrapolations.windows());
      needToAdjustCpuUsage = false;
    }
    if (!deadBrokersWithUnknownCapacity.isEmpty()) {
      LOG.info("Assign empty capacity to brokers {} because they are dead and capacity resolver is unable to fetch their capacity.",
               deadBrokersWithUnknownCapacity);
    }
  }
}
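As a usage sketch: populatePartitionLoad is static and package-private, so a caller in the same package would typically invoke it once per partition for which windowed load values are available. The loadByPartition map and the surrounding variables below are hypothetical stand-ins; only the call signature comes from the method above.

// Hypothetical caller: populate the cluster model with per-partition load, one partition at a time.
// loadByPartition, clusterModel, cluster, replicaPlacementInfo, brokerCapacityConfigResolver and
// allowCapacityEstimation are assumed to exist in the calling code.
for (Map.Entry<TopicPartition, ValuesAndExtrapolations> entry : loadByPartition.entrySet()) {
  MonitorUtils.populatePartitionLoad(cluster, clusterModel, entry.getKey(), entry.getValue(),
                                     replicaPlacementInfo, brokerCapacityConfigResolver, allowCapacityEstimation);
}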