Search in sources :

Example 1 with PartitionState

use of com.linkedin.d2.balancer.strategies.relative.PartitionState in project rest.li by linkedin.

the class DegraderLoadBalancerStrategyV3 method updatePartitionState.

/**
 * Runs one update pass for the given partition.
 * <p>
 * Each tracker client is wrapped in a {@code TrackerClientUpdater}; the new partition state is
 * computed against those wrappers, stored on the partition, and only afterwards are the recorded
 * changes applied to the tracker clients. When quarantine is configured but not yet enabled, a
 * background check is submitted (for a bounded number of attempts) to see whether it can be enabled.
 *
 * @param clusterGenerationId generation id handed through to the state computation
 * @param partition the partition whose state is read and replaced
 * @param trackerClients the clients to wrap in updaters
 * @param config the strategy config used for the quarantine check and the state computation
 */
private void updatePartitionState(long clusterGenerationId, Partition partition, List<TrackerClient> trackerClients, DegraderLoadBalancerStrategyConfig config) {
    final PartitionDegraderLoadBalancerState currentState = partition.getState();

    final List<TrackerClientUpdater> clientUpdaters = new ArrayList<>();
    for (TrackerClient trackerClient : trackerClients) {
        TrackerClientUpdater updater = new TrackerClientUpdater(trackerClient, partition.getId());
        clientUpdaters.add(updater);
    }

    final boolean quarantineEnabled = _state._enableQuarantine.get();
    final boolean quarantineConfiguredButOff = config.getQuarantineMaxPercent() > 0.0 && !quarantineEnabled;
    // Probe the hosts to see whether quarantine can be switched on; the retry counter is only
    // incremented when quarantine is configured and still disabled (short-circuit evaluation).
    if (quarantineConfiguredButOff
        && _state._retryTimesForQuarantine.incrementAndGet() <= MAX_RETRIES_TO_CHECK_QUARANTINE) {
        _config.getExecutorService().submit(() -> checkQuarantineState(clientUpdaters, config));
    }

    // doUpdatePartitionState has no side effects on _state or trackerClients;
    // every change meant for the trackerClients is recorded in clientUpdaters instead.
    final PartitionDegraderLoadBalancerState newState = doUpdatePartitionState(
        clusterGenerationId, partition.getId(), currentState, config, clientUpdaters, quarantineEnabled);
    partition.setState(newState);

    // The state update went through, so now the recorded changes are applied to the trackerClients.
    clientUpdaters.forEach(TrackerClientUpdater::update);
}
Also used : TrackerClient(com.linkedin.d2.balancer.clients.TrackerClient) ArrayList(java.util.ArrayList)

Example 2 with PartitionState

use of com.linkedin.d2.balancer.strategies.relative.PartitionState in project rest.li by linkedin.

the class DegraderLoadBalancerStrategyV3 method updatePartitionState.

/**
 * Runs one update pass for the given partition.
 * <p>
 * Each tracker client is wrapped in a {@code DegraderTrackerClientUpdater}; the new partition state
 * is computed against those wrappers, stored on the partition, and only afterwards are the recorded
 * changes applied to the tracker clients. When quarantine is configured but not yet enabled, a
 * background check is submitted (for a bounded number of attempts) to see whether it can be enabled.
 *
 * @param clusterGenerationId generation id handed through to the state computation
 * @param partition the partition whose state is read and replaced
 * @param trackerClients the clients to wrap in updaters
 * @param config the strategy config used for the quarantine check and the state computation
 */
private void updatePartitionState(long clusterGenerationId, Partition partition, List<DegraderTrackerClient> trackerClients, DegraderLoadBalancerStrategyConfig config) {
    final PartitionDegraderLoadBalancerState currentState = partition.getState();

    final List<DegraderTrackerClientUpdater> clientUpdaters = new ArrayList<>();
    for (DegraderTrackerClient trackerClient : trackerClients) {
        DegraderTrackerClientUpdater updater = new DegraderTrackerClientUpdater(trackerClient, partition.getId());
        clientUpdaters.add(updater);
    }

    final boolean quarantineEnabled = _state.isQuarantineEnabled();
    final boolean quarantineConfiguredButOff = config.getQuarantineMaxPercent() > 0.0 && !quarantineEnabled;
    // Probe the hosts to see whether quarantine can be switched on; the retry counter is only
    // incremented when quarantine is configured and still disabled (short-circuit evaluation).
    if (quarantineConfiguredButOff
        && _state.incrementAndGetQuarantineRetries() <= MAX_RETRIES_TO_CHECK_QUARANTINE) {
        _config.getExecutorService().submit(() -> checkQuarantineState(clientUpdaters, config));
    }

    // doUpdatePartitionState has no side effects on _state or trackerClients;
    // every change meant for the trackerClients is recorded in clientUpdaters instead.
    final PartitionDegraderLoadBalancerState newState = doUpdatePartitionState(
        clusterGenerationId, partition.getId(), currentState, config, clientUpdaters, quarantineEnabled);
    partition.setState(newState);

    // The state update went through, so now the recorded changes are applied to the trackerClients.
    clientUpdaters.forEach(DegraderTrackerClientUpdater::update);
}
Also used : DegraderTrackerClient(com.linkedin.d2.balancer.clients.DegraderTrackerClient) ArrayList(java.util.ArrayList)

Example 3 with PartitionState

use of com.linkedin.d2.balancer.strategies.relative.PartitionState in project rest.li by linkedin.

the class QuarantineManager method preCheckQuarantineState.

/**
 * Probes a bounded number of hosts with health checks before quarantine is enabled outright.
 * The number of probed hosts is capped so that a large downstream cluster does not cause too many
 * connections to be opened at once.
 *
 * @param partitionState The state of the partition whose tracker clients are probed
 * @param quarantineLatency The quarantine latency threshold given to newly built health check clients
 */
private void preCheckQuarantineState(PartitionState partitionState, long quarantineLatency) {
    // One callback instance is shared by every health check issued in this pass.
    final Callback<None> sharedCallback = new HealthCheckCallBack<>();

    partitionState.getTrackerClients()
        .stream()
        .limit(MAX_HOSTS_TO_PRE_CHECK_QUARANTINE)
        .forEach(trackerClient -> {
            try {
                HealthCheck probe = partitionState.getHealthCheckMap().get(trackerClient);
                if (probe == null) {
                    // No health check client registered for this host yet: build one and remember it.
                    probe = new HealthCheckClientBuilder()
                        .setHealthCheckOperations(_healthCheckOperations)
                        .setHealthCheckPath(_quarantineProperties.getHealthCheckPath())
                        .setServicePath(_servicePath)
                        .setClock(_clock)
                        .setLatency(quarantineLatency)
                        .setMethod(_quarantineProperties.getHealthCheckMethod().toString())
                        .setClient(trackerClient)
                        .build();
                    partitionState.getHealthCheckMap().put(trackerClient, probe);
                }
                probe.checkHealth(sharedCallback);
            } catch (URISyntaxException e) {
                // A host whose health check client cannot be built is logged and skipped.
                LOG.error("Error to build healthCheckClient ", e);
            }
        });
}
Also used : HealthCheck(com.linkedin.d2.balancer.util.healthcheck.HealthCheck) HealthCheckClientBuilder(com.linkedin.d2.balancer.util.healthcheck.HealthCheckClientBuilder) URISyntaxException(java.net.URISyntaxException) None(com.linkedin.common.util.None)

Example 4 with PartitionState

use of com.linkedin.d2.balancer.strategies.relative.PartitionState in project rest.li by linkedin.

the class QuarantineManager method checkAndRemoveQuarantine.

/**
 * Re-evaluates the quarantine of every tracker client in the partition.
 * A client whose quarantine is no longer applicable is removed from the quarantine map, recorded
 * in the quarantine history, and placed into the recovery set right away.
 *
 * @param partitionState The current state of the partition
 */
private void checkAndRemoveQuarantine(PartitionState partitionState) {
    final Map<TrackerClient, LoadBalancerQuarantine> activeQuarantines = partitionState.getQuarantineMap();
    final Map<TrackerClient, LoadBalancerQuarantine> pastQuarantines = partitionState.getQuarantineHistory();
    final Set<TrackerClient> recoveringClients = partitionState.getRecoveryTrackerClients();

    for (TrackerClient candidate : partitionState.getTrackerClients()) {
        final LoadBalancerQuarantine entry = activeQuarantines.get(candidate);
        // Skip clients that are not quarantined, or whose quarantine check says it still applies.
        if (entry == null || !entry.checkUpdateQuarantineState()) {
            continue;
        }
        // Quarantine is over: evict the client, keep the record, and start recovery.
        activeQuarantines.remove(candidate);
        pastQuarantines.put(candidate, entry);
        recoveringClients.add(candidate);
    }
}
Also used : LoadBalancerQuarantine(com.linkedin.d2.balancer.strategies.LoadBalancerQuarantine) TrackerClient(com.linkedin.d2.balancer.clients.TrackerClient)

Example 5 with PartitionState

use of com.linkedin.d2.balancer.strategies.relative.PartitionState in project rest.li by linkedin.

the class StateUpdater method calculateBaseHealthScore.

/**
 * Recomputes the health score and health state of each given tracker client from its latest call
 * stats, creates a state entry for any client that does not have one yet, and finally records the
 * aggregated partition stats on the partition state.
 *
 * @param trackerClients the clients to evaluate
 * @param partitionState the partition state holding the per-client state map; its partition stats are overwritten
 * @param avgClusterLatency the average cluster latency used for the healthy/unhealthy decisions
 * @param lastCallStatsMap the latest call stats, looked up per tracker client
 */
private void calculateBaseHealthScore(Set<TrackerClient> trackerClients, PartitionState partitionState, long avgClusterLatency, Map<TrackerClient, CallTracker.CallStats> lastCallStatsMap) {
    Map<TrackerClient, TrackerClientState> trackerClientStateMap = partitionState.getTrackerClientStateMap();
    // Cluster-wide totals; only clients that take part in load balancing contribute to them.
    long clusterCallCount = 0;
    long clusterErrorCount = 0;
    for (TrackerClient trackerClient : trackerClients) {
        CallTracker.CallStats latestCallStats = lastCallStatsMap.get(trackerClient);
        // A single lookup replaces containsKey + get, avoiding a second hash lookup and the
        // window between the two calls.
        TrackerClientState trackerClientState = trackerClientStateMap.get(trackerClient);
        if (trackerClientState != null) {
            int callCount = latestCallStats.getCallCount() + latestCallStats.getOutstandingCount();
            if (trackerClient.doNotLoadBalance()) {
                // Clients excluded from load balancing are pinned to the maximum score.
                trackerClientState.setHealthState(TrackerClientState.HealthState.HEALTHY);
                trackerClientState.setHealthScore(MAX_HEALTH_SCORE);
                trackerClientState.setCallCount(callCount);
            } else {
                double errorRate = getErrorRate(latestCallStats.getErrorTypeCounts(), callCount);
                long avgLatency = getAvgHostLatency(latestCallStats);
                double oldHealthScore = trackerClientState.getHealthScore();
                double newHealthScore = oldHealthScore;
                clusterCallCount += callCount;
                // Round to the nearest count: a bare "+=" would narrow the double product to long by
                // truncation, so a product such as 28.999999999999996 would be counted as 28.
                // NOTE(review): assumes errorRate is (approximately) errorCount / callCount - confirm in getErrorRate.
                clusterErrorCount += Math.round(errorRate * callCount);
                if (isUnhealthy(trackerClientState, avgClusterLatency, callCount, avgLatency, errorRate)) {
                    // Unhealthy: lower the score by one down step, never below the minimum.
                    newHealthScore = Double.max(oldHealthScore - _relativeStrategyProperties.getDownStep(), MIN_HEALTH_SCORE);
                    trackerClientState.setHealthState(TrackerClientState.HealthState.UNHEALTHY);
                    // Only build the message when debug logging is on.
                    // NOTE(review): "latency" prints the cluster average, not this host's avgLatency - confirm intended.
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Host is unhealthy. Host: " + trackerClient.toString() + ", errorRate: " + errorRate + ", latency: " + avgClusterLatency + ", callCount: " + callCount + ", healthScore dropped from " + oldHealthScore + " to " + newHealthScore);
                    }
                } else if (oldHealthScore < MAX_HEALTH_SCORE && isHealthy(trackerClientState, avgClusterLatency, callCount, avgLatency, errorRate)) {
                    if (oldHealthScore < _relativeStrategyProperties.getSlowStartThreshold()) {
                        // Below the slow start threshold: multiply the score by the recovery factor,
                        // or restart from the slow start initial score when it sits at the minimum.
                        newHealthScore = oldHealthScore > MIN_HEALTH_SCORE ? Math.min(MAX_HEALTH_SCORE, SLOW_START_RECOVERY_FACTOR * oldHealthScore) : SLOW_START_INITIAL_HEALTH_SCORE;
                    } else {
                        // At or above the threshold: raise the score by one up step, capped at the maximum.
                        newHealthScore = Math.min(MAX_HEALTH_SCORE, oldHealthScore + _relativeStrategyProperties.getUpStep());
                    }
                    trackerClientState.setHealthState(TrackerClientState.HealthState.HEALTHY);
                } else {
                    // Neither unhealthy nor eligible for recovery: the score stays as it was.
                    trackerClientState.setHealthState(TrackerClientState.HealthState.NEUTRAL);
                }
                trackerClientState.setHealthScore(newHealthScore);
                trackerClientState.setCallCount(callCount);
            }
        } else {
            // First time this client is seen: create its state. Clients that opt out of slow start
            // or load balancing start at the maximum score, all others at the configured initial score.
            if (trackerClient.doNotSlowStart() || trackerClient.doNotLoadBalance()) {
                trackerClientStateMap.put(trackerClient, new TrackerClientState(MAX_HEALTH_SCORE, _relativeStrategyProperties.getMinCallCount()));
            } else {
                trackerClientStateMap.put(trackerClient, new TrackerClientState(_relativeStrategyProperties.getInitialHealthScore(), _relativeStrategyProperties.getMinCallCount()));
            }
        }
    }
    partitionState.setPartitionStats(avgClusterLatency, clusterCallCount, clusterErrorCount);
}
Also used : TrackerClient(com.linkedin.d2.balancer.clients.TrackerClient) CallTracker(com.linkedin.util.degrader.CallTracker)

Aggregations

TrackerClient (com.linkedin.d2.balancer.clients.TrackerClient)17 Test (org.testng.annotations.Test)12 D2RelativeStrategyProperties (com.linkedin.d2.D2RelativeStrategyProperties)10 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)9 URI (java.net.URI)6 LoadBalancerQuarantine (com.linkedin.d2.balancer.strategies.LoadBalancerQuarantine)5 HashMap (java.util.HashMap)5 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)5 CountDownLatch (java.util.concurrent.CountDownLatch)4 CallTracker (com.linkedin.util.degrader.CallTracker)2 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 None (com.linkedin.common.util.None)1 DegraderTrackerClient (com.linkedin.d2.balancer.clients.DegraderTrackerClient)1 DelegatingRingFactory (com.linkedin.d2.balancer.strategies.DelegatingRingFactory)1 PartitionStateUpdateListener (com.linkedin.d2.balancer.strategies.PartitionStateUpdateListener)1 PartitionState (com.linkedin.d2.balancer.strategies.relative.PartitionState)1 RelativeLoadBalancerStrategy (com.linkedin.d2.balancer.strategies.relative.RelativeLoadBalancerStrategy)1 TrackerClientState (com.linkedin.d2.balancer.strategies.relative.TrackerClientState)1 Ring (com.linkedin.d2.balancer.util.hashing.Ring)1