Example 6 with DegraderControl

Use of com.linkedin.util.degrader.DegraderControl in project rest.li by linkedin.

The class DegraderLoadBalancerTest, method testWeightedAndLatencyDegradationBalancingRing.

@Test(groups = { "small", "back-end" }, dataProvider = "consistentHashAlgorithms")
public void testWeightedAndLatencyDegradationBalancingRing(String consistentHashAlgorithm) throws URISyntaxException {
    DegraderLoadBalancerStrategyV3 strategy = getStrategy(consistentHashAlgorithm);
    List<TrackerClient> clients = new ArrayList<TrackerClient>();
    URI uri1 = URI.create("http://test.linkedin.com:3242/fdsaf");
    URI uri2 = URI.create("http://test.linkedin.com:3243/fdsaf");
    TestClock clock1 = new TestClock();
    TestClock clock2 = new TestClock();
    TrackerClient client1 = new TrackerClient(uri1, getDefaultPartitionData(1d), new TestLoadBalancerClient(uri1), clock1, null);
    TrackerClient client2 = new TrackerClient(uri2, getDefaultPartitionData(0.8d), new TestLoadBalancerClient(uri2), clock2, null);
    clients.add(client1);
    clients.add(client2);
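    // force client2 to be degraded: with minCallCount 1 and highErrorRate 0, a single
    // errored call raises its drop rate by upStep (0.4)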
    DegraderControl dcClient2Default = client2.getDegraderControl(DEFAULT_PARTITION_ID);
    dcClient2Default.setOverrideMinCallCount(1);
    dcClient2Default.setMinCallCount(1);
    dcClient2Default.setMaxDropRate(1d);
    dcClient2Default.setUpStep(0.4d);
    dcClient2Default.setHighErrorRate(0);
    CallCompletion cc = client2.getCallTracker().startCall();
    clock2.addMs(1);
    cc.endCallWithError();
    clock1.addMs(15000);
    clock2.addMs(5000);
    // trigger a state update
    assertNotNull(getTrackerClient(strategy, null, new RequestContext(), 1, clients));
    // now do a basic verification to verify getTrackerClient is properly weighting things
    double calls = 10000d;
    int client1Count = 0;
    int client2Count = 0;
    double tolerance = 0.05d;
    for (int i = 0; i < calls; ++i) {
        TrackerClient client = getTrackerClient(strategy, null, new RequestContext(), 1, clients);
        assertNotNull(client);
        if (client.getUri().equals(uri1)) {
            ++client1Count;
        } else {
            ++client2Count;
        }
    }
    assertTrue(Math.abs((client1Count / calls) - (100 / 148d)) < tolerance);
    assertTrue(Math.abs((client2Count / calls) - (48 / 148d)) < tolerance);
}
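The asserted ratios follow from how the strategy turns weights and drop rates into hash ring points (see doUpdatePartitionState in Example 8): roughly points = partitionWeight * (1 - dropRate) * pointsPerWeight. A minimal sketch of that arithmetic, assuming the default of 100 points per unit of weight (the names are illustrative, not rest.li source):

int pointsPerWeight = 100;                                      // assumed default
int client1Points = (int) (1.0 * (1 - 0.0) * pointsPerWeight);  // healthy, weight 1.0 -> 100
int client2Points = (int) (0.8 * (1 - 0.4) * pointsPerWeight);  // weight 0.8, degraded by upStep 0.4 -> 48
double total = client1Points + client2Points;                   // 148
System.out.println(client1Points / total);                      // ~0.6757, i.e. 100/148
System.out.println(client2Points / total);                      // ~0.3243, i.e. 48/148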
Also used: TrackerClient (com.linkedin.d2.balancer.clients.TrackerClient), CallCompletion (com.linkedin.util.degrader.CallCompletion), ArrayList (java.util.ArrayList), DegraderControl (com.linkedin.util.degrader.DegraderControl), RequestContext (com.linkedin.r2.message.RequestContext), URI (java.net.URI), Test (org.testng.annotations.Test), TrackerClientTest (com.linkedin.d2.balancer.clients.TrackerClientTest)

Example 7 with DegraderControl

Use of com.linkedin.util.degrader.DegraderControl in project rest.li by linkedin.

The class DegraderLoadBalancerTest, method testWeightedAndLatencyDegradationBalancingRingWithPartitions.

@Test(groups = { "small", "back-end" }, dataProvider = "consistentHashAlgorithms")
public void testWeightedAndLatencyDegradationBalancingRingWithPartitions(String consistentHashAlgorithm) throws URISyntaxException {
    DegraderLoadBalancerStrategyV3 strategy = getStrategy(consistentHashAlgorithm);
    List<TrackerClient> clientsForPartition0 = new ArrayList<TrackerClient>();
    List<TrackerClient> clientsForPartition1 = new ArrayList<TrackerClient>();
    URI uri1 = URI.create("http://someTestService/someTestUrl");
    URI uri2 = URI.create("http://abcxfweuoeueoueoueoukeueoueoueoueoueouo/2354");
    URI uri3 = URI.create("http://slashdot/blah");
    URI uri4 = URI.create("http://idle/server");
    TestClock clock1 = new TestClock();
    TestClock clock2 = new TestClock();
    TestClock clock3 = new TestClock();
    @SuppressWarnings("serial") TrackerClient client1 = new TrackerClient(uri1, new HashMap<Integer, PartitionData>() {

        {
            put(0, new PartitionData(1d));
        }
    }, new TestLoadBalancerClient(uri1), clock1, null);
    @SuppressWarnings("serial") TrackerClient client2 = new TrackerClient(uri2, new HashMap<Integer, PartitionData>() {

        {
            put(0, new PartitionData(0.5d));
            put(1, new PartitionData(0.5d));
        }
    }, new TestLoadBalancerClient(uri2), clock2, null);
    @SuppressWarnings("serial") TrackerClient client3 = new TrackerClient(uri3, new HashMap<Integer, PartitionData>() {

        {
            put(1, new PartitionData(1d));
        }
    }, new TestLoadBalancerClient(uri3), clock3, null);
    final int partitionId0 = 0;
    clientsForPartition0.add(client1);
    clientsForPartition0.add(client2);
    final int partitionId1 = 1;
    clientsForPartition1.add(client2);
    clientsForPartition1.add(client3);
    // force client2 to be disabled
    DegraderControl dcClient2Partition0 = client2.getDegraderControl(0);
    DegraderControl dcClient2Partition1 = client2.getDegraderControl(1);
    dcClient2Partition0.setOverrideMinCallCount(1);
    dcClient2Partition0.setMinCallCount(1);
    dcClient2Partition0.setMaxDropRate(1d);
    dcClient2Partition0.setUpStep(0.4d);
    dcClient2Partition0.setHighErrorRate(0);
    dcClient2Partition1.setOverrideMinCallCount(1);
    dcClient2Partition1.setMinCallCount(1);
    dcClient2Partition1.setMaxDropRate(1d);
    dcClient2Partition1.setUpStep(0.4d);
    dcClient2Partition1.setHighErrorRate(0);
    CallCompletion cc = client2.getCallTracker().startCall();
    clock2.addMs(1);
    cc.endCallWithError();
    // force client3 to be disabled
    DegraderControl dcClient3Partition1 = client3.getDegraderControl(1);
    dcClient3Partition1.setOverrideMinCallCount(1);
    dcClient3Partition1.setMinCallCount(1);
    dcClient3Partition1.setMaxDropRate(1d);
    dcClient3Partition1.setHighErrorRate(0);
    dcClient3Partition1.setUpStep(0.2d);
    CallCompletion cc3 = client3.getCallTracker().startCall();
    clock3.addMs(1);
    cc3.endCallWithError();
    clock1.addMs(15000);
    clock2.addMs(5000);
    clock3.addMs(5000);
    // trigger a state update
    assertNotNull(strategy.getTrackerClient(null, new RequestContext(), 1, partitionId0, clientsForPartition0));
    assertNotNull(strategy.getTrackerClient(null, new RequestContext(), 1, partitionId1, clientsForPartition1));
    assertNotNull(strategy.getRing(1, partitionId0, clientsForPartition0));
    assertNotNull(strategy.getRing(1, partitionId1, clientsForPartition1));
    // now do a basic verification to verify getTrackerClient is properly weighting things
    int calls = 10000;
    int client1Count = 0;
    int client2Count = 0;
    double tolerance = 0.05d;
    for (int i = 0; i < calls; ++i) {
        TrackerClient client = strategy.getTrackerClient(null, new RequestContext(), 1, partitionId0, clientsForPartition0);
        assertNotNull(client);
        if (client.getUri().equals(uri1)) {
            ++client1Count;
        } else {
            ++client2Count;
        }
    }
    assertTrue(Math.abs((client1Count / (double) calls) - (100 / 130d)) < tolerance);
    assertTrue(Math.abs((client2Count / (double) calls) - (30 / 130d)) < tolerance);
    client2Count = 0;
    int client3Count = 0;
    int client4Count = 0;
    for (int i = 0; i < calls; ++i) {
        TrackerClient client = strategy.getTrackerClient(null, new RequestContext(), 1, partitionId1, clientsForPartition1);
        assertNotNull(client);
        if (client.getUri().equals(uri3)) {
            ++client3Count;
        } else if (client.getUri().equals(uri2)) {
            ++client2Count;
        } else {
            ++client4Count;
        }
    }
    assertTrue(Math.abs((client3Count / (double) calls) - (80 / 110d)) < tolerance);
    assertTrue(Math.abs((client2Count / (double) calls) - (30 / 110d)) < tolerance);
    assertTrue(client4Count == 0);
}
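The same per-partition arithmetic explains these assertions: in partition 0, client1 contributes 1.0 × 100 = 100 points while the degraded client2 contributes 0.5 × (1 − 0.4) × 100 = 30, giving the 100/130 and 30/130 ratios; in partition 1, client3 contributes 1.0 × (1 − 0.2) × 100 = 80 and client2 again contributes 30, giving 80/110 and 30/110. The final check that client4Count stays 0 verifies that the ring for partition 1 only ever returns clients registered for that partition.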
Also used: ArrayList (java.util.ArrayList), DegraderControl (com.linkedin.util.degrader.DegraderControl), URI (java.net.URI), TrackerClient (com.linkedin.d2.balancer.clients.TrackerClient), CallCompletion (com.linkedin.util.degrader.CallCompletion), PartitionData (com.linkedin.d2.balancer.properties.PartitionData), RequestContext (com.linkedin.r2.message.RequestContext), Test (org.testng.annotations.Test), TrackerClientTest (com.linkedin.d2.balancer.clients.TrackerClientTest)

Example 8 with DegraderControl

Use of com.linkedin.util.degrader.DegraderControl in project rest.li by linkedin.

The class DegraderLoadBalancerStrategyV3, method doUpdatePartitionState.

/**
   * doUpdatePartitionState
   *
   * We have two mechanisms for influencing the health and traffic patterns of the client: load
   * balancing (switching traffic from one host to another) and degrading service (dropping
   * calls). We load balance by allocating points in a consistent hash ring based on the
   * computedDropRate of the individual TrackerClients, which takes into account the latency
   * seen by that TrackerClient's requests. Alternatively, if the cluster is unhealthy (as
   * determined by a high latency watermark), we can drop a portion of traffic across all
   * tracker clients corresponding to this cluster.
   *
   * The reason we do not currently consider error rate when adjusting the hash ring is that
   * there are legitimate errors that servers can send back for clients to handle, such as
   * 400 return codes. A potential improvement would be to also react to transport-level
   * exceptions and 500-level return codes, but the implications of that would need to be
   * carefully understood and documented.
   *
   * We don't want to both reduce hash points and let clients manage their own drop rates,
   * because the clients lack the global view that the load balancing strategy has. Without a
   * global view, a client won't know that it already has a reduced number of hash points. If
   * the client continued to drop at the same rate after its points had been reduced, its
   * outbound requests would be reduced twice: once by the reduction in points and again by the
   * client's drop rate. To avoid this, the drop rate is managed globally by the load balancing
   * strategy and provided to each client. The strategy alternates between adjusting the hash
   * ring points and the global drop rate in order to avoid double-penalizing a client.
   *
   * We also have a mechanism for recovery if the number of points in the hash ring is not
   * enough to receive traffic. The initialRecoveryLevel is a number between 0.0 and 1.0, and
   * corresponds to a fraction of the tracker client's full hash points.
   * The reason for using a fraction is to allow an initialRecoveryLevel that corresponds to
   * less than one hash point. This is useful if a "cooling off" period is desirable for
   * misbehaving tracker clients, i.e., given a full weight of 100 hash points, 0.005 means
   * there will be one cooling-off period before the client is reintroduced into the hash ring.
   *
   * The second configuration, rampFactor, geometrically increases the previous recoveryLevel
   * if traffic still hasn't been seen for that tracker client.
   *
   * For example, given initialRecoveryLevel = 0.01, rampFactor = 2, and a default tracker
   * client weight of 100 hash points, we will increase the hash points in this pattern on
   * successive state updates: 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, etc., aborting as soon as
   * calls are recorded for that tracker client.
   *
   * We also have highWaterMark and lowWaterMark as properties of the DegraderLoadBalancer
   * strategy so that the strategy can decide whether to start dropping traffic globally across
   * all tracker clients for this cluster. The amount of traffic to drop is controlled by the
   * globalStepUp and globalStepDown properties, where globalStepUp controls how much the global
   * drop rate increases per interval, and globalStepDown controls how much the global drop rate
   * decreases per interval. We only step up the global drop rate when the average cluster
   * latency is higher than the highWaterMark, and only step down the global drop rate when the
   * average cluster latency is lower than the lowWaterMark.
   *
   * This code is reentrant: multiple threads can potentially call it concurrently, so callers
   * must pass in the DegraderLoadBalancerState that they based their shouldUpdate() call on.
   * The threads may have different views of the trackerClients' latency, but this is OK because
   * the resulting state will have taken only one action (either load balancing or call dropping,
   * with at most one step). Currently we will not call this concurrently, as
   * checkUpdatePartitionState serializes entry to a single thread.
   *
   * @param clusterGenerationId the cluster generation this update is based on
   * @param partitionId the partition being updated
   * @param oldState the state this update is based on
   * @param config the strategy configuration
   * @param trackerClientUpdaters updaters for the tracker clients in this partition
   * @param isQuarantineEnabled whether quarantine is enabled for this strategy
   */
private static PartitionDegraderLoadBalancerState doUpdatePartitionState(long clusterGenerationId, int partitionId, PartitionDegraderLoadBalancerState oldState, DegraderLoadBalancerStrategyConfig config, List<TrackerClientUpdater> trackerClientUpdaters, boolean isQuarantineEnabled) {
    debug(_log, "updating state for: ", trackerClientUpdaters);
    double sumOfClusterLatencies = 0.0;
    long totalClusterCallCount = 0;
    double newMaxDropRate;
    boolean hashRingChanges = false;
    boolean recoveryMapChanges = false;
    boolean quarantineMapChanged = false;
    PartitionDegraderLoadBalancerState.Strategy strategy = oldState.getStrategy();
    Map<TrackerClient, Double> oldRecoveryMap = oldState.getRecoveryMap();
    Map<TrackerClient, Double> newRecoveryMap = new HashMap<TrackerClient, Double>(oldRecoveryMap);
    double currentOverrideDropRate = oldState.getCurrentOverrideDropRate();
    double initialRecoveryLevel = config.getInitialRecoveryLevel();
    double ringRampFactor = config.getRingRampFactor();
    int pointsPerWeight = config.getPointsPerWeight();
    PartitionDegraderLoadBalancerState newState;
    Map<TrackerClient, DegraderLoadBalancerQuarantine> quarantineMap = oldState.getQuarantineMap();
    Map<TrackerClient, DegraderLoadBalancerQuarantine> quarantineHistory = oldState.getQuarantineHistory();
    Set<TrackerClient> activeClients = new HashSet<>();
    long clk = config.getClock().currentTimeMillis();
    for (TrackerClientUpdater clientUpdater : trackerClientUpdaters) {
        TrackerClient client = clientUpdater.getTrackerClient();
        DegraderControl degraderControl = client.getDegraderControl(partitionId);
        double averageLatency = degraderControl.getLatency();
        long callCount = degraderControl.getCallCount();
        oldState.getPreviousMaxDropRate().put(client, clientUpdater.getMaxDropRate());
        sumOfClusterLatencies += averageLatency * callCount;
        totalClusterCallCount += callCount;
        boolean recoveryMapContainsClient = newRecoveryMap.containsKey(client);
        if (isQuarantineEnabled) {
            activeClients.add(client);
            // Check/update quarantine state if current client is already under quarantine
            DegraderLoadBalancerQuarantine quarantine = quarantineMap.get(client);
            if (quarantine != null && quarantine.checkUpdateQuarantineState()) {
                // Evict client from quarantine
                quarantineMap.remove(client);
                quarantineHistory.put(client, quarantine);
                _log.info("TrackerClient {} evicted from quarantine @ {}", client.getUri(), clk);
                // Next we need to put the client into slow-start/recovery mode so it can gradually pick up traffic.
                // For now, simply force the weight to the initialRecoveryLevel so the client can gradually recover.
                // The recoveryMap is used here to track the clients that were just evicted from quarantine;
                // they will not be quarantined again during the recovery phase even though the effective
                // weight is within the range.
                newRecoveryMap.put(client, degraderControl.getMaxDropRate());
                clientUpdater.setMaxDropRate(1.0 - initialRecoveryLevel);
                quarantineMapChanged = true;
            }
        }
        if (recoveryMapContainsClient) {
            // This client is enrolled in the recovery program: adjust its maxDropRate so it can
            // regain points in the hash ring.
            if (callCount == 0) {
                // if this client is enrolled in the program, decrease the maxDropRate
                // it is important to note that this excludes clients that haven't gotten traffic
                // due solely to low volume.
                double oldMaxDropRate = clientUpdater.getMaxDropRate();
                double transmissionRate = 1.0 - oldMaxDropRate;
                if (transmissionRate <= 0.0) {
                    // We use the initialRecoveryLevel to indicate how many points to initially set
                    // the tracker client to when traffic has stopped flowing to this node.
                    transmissionRate = initialRecoveryLevel;
                } else {
                    transmissionRate *= ringRampFactor;
                    transmissionRate = Math.min(transmissionRate, 1.0);
                }
                newMaxDropRate = 1.0 - transmissionRate;
                if (strategy == PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE) {
                    // if it's the hash ring's turn to adjust, then adjust the maxDropRate.
                    // Otherwise, we let the call dropping strategy take its turn, even if
                    // it may do nothing.
                    clientUpdater.setMaxDropRate(newMaxDropRate);
                }
                recoveryMapChanges = true;
            } else {
                // else if the recovery map contains the client and the call count was > 0
                // tough love here: once the rehab clients start taking traffic, we
                // restore their maxDropRate to its original value and unenroll them
                // from the program.
                // This is safe because the hash ring points are controlled by the
                // computedDropRate variable, and the call dropping rate is controlled by
                // the overrideDropRate. The maxDropRate only serves to cap the computedDropRate and
                // overrideDropRate.
                // We store the maxDropRate and restore it here because the initialRecoveryLevel could
                // potentially be higher than what the default maxDropRate allows (the maxDropRate doesn't
                // necessarily have to be 1.0). For instance, if the maxDropRate was 0.99 and the
                // initialRecoveryLevel was 0.05, then we need to store the old maxDropRate.
                clientUpdater.setMaxDropRate(newRecoveryMap.get(client));
                newRecoveryMap.remove(client);
                recoveryMapChanges = true;
            }
        }
    }
    // If quarantine is enabled, drop clients that are no longer present
    // in TrackerClientUpdaters -- those URIs were removed from zookeeper
    if (isQuarantineEnabled) {
        quarantineMap.entrySet().removeIf(e -> !activeClients.contains(e.getKey()));
        quarantineHistory.entrySet().removeIf(e -> !activeClients.contains(e.getKey()));
    }
    if (oldState.getClusterGenerationId() == clusterGenerationId && totalClusterCallCount <= 0 && !recoveryMapChanges && !quarantineMapChanged) {
        // if the cluster has not been called recently (total cluster call count is <= 0)
        // and we already have a state with the same set of URIs (same cluster generation),
        // and no clients are in rehab or evicted from quarantine, then don't change anything.
        debug(_log, "New state is the same as the old state so we're not changing anything. Old state = ", oldState, ", config= ", config);
        return new PartitionDegraderLoadBalancerState(oldState, clusterGenerationId, config.getClock().currentTimeMillis());
    }
    // update our overrides.
    double newCurrentAvgClusterLatency = -1;
    if (totalClusterCallCount > 0) {
        newCurrentAvgClusterLatency = sumOfClusterLatencies / totalClusterCallCount;
    }
    debug(_log, "average cluster latency: ", newCurrentAvgClusterLatency);
    // This points map stores how many hash ring points to allocate for each tracker client.
    Map<URI, Integer> points = new HashMap<URI, Integer>();
    Map<URI, Integer> oldPointsMap = oldState.getPointsMap();
    for (TrackerClientUpdater clientUpdater : trackerClientUpdaters) {
        TrackerClient client = clientUpdater.getTrackerClient();
        double successfulTransmissionWeight;
        URI clientUri = client.getUri();
        // Don't take into account cluster health when calculating the number of points
        // for each client. This is because the individual clients already take into account
        // latency and errors, and a successfulTransmissionWeight can and should be made
        // independent of other nodes in the cluster. Otherwise, one unhealthy client in a small
        // cluster can take down the entire cluster if the avg latency is too high.
        // The global drop rate will take into account the cluster latency. High cluster-wide error
        // rates are not something d2 can address.
        //
        // this client's maxDropRate and currentComputedDropRate may have been adjusted if it's in the
        // rehab program (to gradually send traffic its way).
        DegraderControl degraderControl = client.getDegraderControl(partitionId);
        double dropRate = Math.min(degraderControl.getCurrentComputedDropRate(), clientUpdater.getMaxDropRate());
        // Calculate the weight as the probability of a successful transmission to this node
        // multiplied by the client's self-defined weight. The node's final weight thus takes
        // into account both the configured weight (to account for different hardware in the
        // same cluster) and the performance of the node (as measured by the node's degrader).
        double clientWeight = client.getPartitionWeight(partitionId);
        successfulTransmissionWeight = clientWeight * (1.0 - dropRate);
        debug(_log, "computed new weight for uri ", clientUri, ": ", successfulTransmissionWeight);
        // keep track of whether we're making actual changes to the hash ring in this updatePartitionState.
        int newPoints = (int) (successfulTransmissionWeight * pointsPerWeight);
        boolean quarantineEffect = false;
        if (isQuarantineEnabled) {
            if (quarantineMap.containsKey(client)) {
                // If the client is still in quarantine, keep its points at 0 so no real traffic will be used
                newPoints = 0;
                quarantineEffect = true;
            } else if (successfulTransmissionWeight <= 0.0 && clientWeight > EPSILON && degraderControl.isHigh()) {
                // Quarantine candidate: no transmission weight left, the configured weight is
                // still positive, and the degrader reports an unhealthy state. Admission is
                // capped at quarantineMaxPercent of the cluster (HTTP_LB_QUARANTINE_MAX_PERCENT).
                if (1.0 * quarantineMap.size() < Math.ceil(trackerClientUpdaters.size() * config.getQuarantineMaxPercent())) {
                    // Put the client into quarantine
                    DegraderLoadBalancerQuarantine quarantine = quarantineHistory.remove(client);
                    if (quarantine == null) {
                        quarantine = new DegraderLoadBalancerQuarantine(clientUpdater, config, oldState.getServiceName());
                    }
                    // If the trackerClient was only recently evicted from quarantine, the service may
                    // still be in trouble even though the quarantine probing succeeded. In such a case
                    // we reuse the previous waiting duration instead of starting from scratch again.
                    quarantine.reset((clk - quarantine.getLastChecked()) > DegraderLoadBalancerStrategyConfig.DEFAULT_QUARANTINE_REENTRY_TIME);
                    quarantineMap.put(client, quarantine);
                    // reduce the points to 0 so no real traffic will be used
                    newPoints = 0;
                    _log.warn("TrackerClient {} is put into quarantine {}. OverrideDropRate = {}, callCount = {}, latency = {}," + " errorRate = {}", new Object[] { client.getUri(), quarantine, degraderControl.getMaxDropRate(), degraderControl.getCallCount(), degraderControl.getLatency(), degraderControl.getErrorRate() });
                    quarantineEffect = true;
                } else {
                    _log.error("Quarantine for service {} is full! Could not add {}", oldState.getServiceName(), client);
                }
            }
        }
        // Only enroll the client into the recovery program when it is not quarantined and its
        // weight is positive; if the weight is zero, we don't want this tracker client to get any traffic.
        if (!quarantineEffect && newPoints == 0 && clientWeight > EPSILON) {
            // We are choking off traffic to this tracker client.
            // Enroll this tracker client in the recovery program so that
            // we can make sure it still gets some traffic
            Double oldMaxDropRate = clientUpdater.getMaxDropRate();
            // set the default recovery level.
            newPoints = (int) (initialRecoveryLevel * pointsPerWeight);
            // Keep track of the original maxDropRate
            if (!newRecoveryMap.containsKey(client)) {
                // keep track of this client,
                newRecoveryMap.put(client, oldMaxDropRate);
                clientUpdater.setMaxDropRate(1.0 - initialRecoveryLevel);
            }
        }
        points.put(clientUri, newPoints);
        if (!oldPointsMap.containsKey(clientUri) || oldPointsMap.get(clientUri) != newPoints) {
            hashRingChanges = true;
        }
    }
    // if it's the hash ring's turn to adjust and there were ring changes, or if there were
    // changes to the members of the cluster
    if ((strategy == PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE && hashRingChanges) || oldState.getClusterGenerationId() != clusterGenerationId) {
        // atomic overwrite
        // try Call Dropping next time we updatePartitionState.
        newState = new PartitionDegraderLoadBalancerState(clusterGenerationId, config.getClock().currentTimeMillis(), true, oldState.getRingFactory(), points, PartitionDegraderLoadBalancerState.Strategy.CALL_DROPPING, currentOverrideDropRate, newCurrentAvgClusterLatency, newRecoveryMap, oldState.getServiceName(), oldState.getDegraderProperties(), totalClusterCallCount, quarantineMap, quarantineHistory);
        logState(oldState, newState, partitionId, config, trackerClientUpdaters);
    } else {
        // time to try call dropping strategy, if necessary.
        // we are explicitly setting the override drop rate to a number between 0 and 1, inclusive.
        double newDropLevel = Math.max(0.0, currentOverrideDropRate);
        // adjust the drop level, if warranted, to get the cluster latency stabilized
        if (newCurrentAvgClusterLatency > 0 && totalClusterCallCount >= config.getMinClusterCallCountHighWaterMark()) {
            // statistically significant
            if (newCurrentAvgClusterLatency >= config.getHighWaterMark() && currentOverrideDropRate != 1.0) {
                // if the cluster latency is too high and we can drop more traffic
                newDropLevel = Math.min(1.0, newDropLevel + config.getGlobalStepUp());
            } else if (newCurrentAvgClusterLatency <= config.getLowWaterMark() && currentOverrideDropRate != 0.0) {
                // else if the cluster latency is good and we can reduce the override drop rate
                newDropLevel = Math.max(0.0, newDropLevel - config.getGlobalStepDown());
            }
        // else the averageClusterLatency is between Low and High, or we can't change anything more,
        // then do not change anything.
        } else if (newCurrentAvgClusterLatency > 0 && totalClusterCallCount >= config.getMinClusterCallCountLowWaterMark()) {
            // but we might recover a bit if the latency is healthy
            if (newCurrentAvgClusterLatency <= config.getLowWaterMark() && currentOverrideDropRate != 0.0) {
                // the cluster latency is good and we can reduce the override drop rate
                newDropLevel = Math.max(0.0, newDropLevel - config.getGlobalStepDown());
            }
        // else the averageClusterLatency is somewhat high but since the qps is not that high, we shouldn't degrade
        } else {
            // If we enter here, it means we have very low traffic. We should reduce the overrideDropRate
            // if possible; with below 1 QPS we can be fairly confident that the cluster can handle such
            // low traffic (this depends, of course, on the MinClusterCallCountLowWaterMark that the
            // service owner sets). This can also happen if we had somehow choked off all traffic to the
            // cluster, most likely in a one-node/small-cluster scenario. Obviously we can't check latency
            // here, so we rely on the metrics in the next updatePartitionState. If the cluster is still
            // having latency problems, we will oscillate between dropping everything and letting a little
            // traffic through, and that is acceptable. If the latency, though high, is deemed acceptable,
            // the watermarks can be adjusted to let more traffic through.
            newDropLevel = Math.max(0.0, newDropLevel - config.getGlobalStepDown());
        }
        if (newDropLevel != currentOverrideDropRate) {
            overrideClusterDropRate(partitionId, newDropLevel, trackerClientUpdaters);
        }
        // don't change the points map or the recoveryMap, but try load balancing strategy next time.
        newState = new PartitionDegraderLoadBalancerState(clusterGenerationId, config.getClock().currentTimeMillis(), true, oldState.getRingFactory(), oldPointsMap, PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE, newDropLevel, newCurrentAvgClusterLatency, isQuarantineEnabled ? newRecoveryMap : oldRecoveryMap, oldState.getServiceName(), oldState.getDegraderProperties(), totalClusterCallCount, quarantineMap, quarantineHistory);
        logState(oldState, newState, partitionId, config, trackerClientUpdaters);
        points = oldPointsMap;
    }
    // adjust the min call count for each client based on the hash ring reduction and call dropping
    // fraction.
    overrideMinCallCount(partitionId, currentOverrideDropRate, trackerClientUpdaters, points, pointsPerWeight);
    return newState;
}
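To make the recovery ramp from the javadoc concrete, here is a small standalone sketch (not rest.li source; the variable names merely echo the config properties) that reproduces the 0.01, 0.02, 0.04, ... pattern the comment describes:

public class RecoveryRampSketch {
    public static void main(String[] args) {
        double initialRecoveryLevel = 0.01; // example values from the javadoc above
        double ringRampFactor = 2.0;
        int pointsPerWeight = 100;
        double transmissionRate = 0.0;      // 1.0 - maxDropRate; 0.0 means fully choked off
        for (int update = 1; update <= 6; update++) {
            if (transmissionRate <= 0.0) {
                transmissionRate = initialRecoveryLevel; // seed the ramp
            } else {
                transmissionRate = Math.min(transmissionRate * ringRampFactor, 1.0);
            }
            int points = (int) (transmissionRate * pointsPerWeight);
            System.out.printf("update %d: recoveryLevel=%.2f, ring points=%d%n",
                update, transmissionRate, points);
        }
        // Prints 0.01/1, 0.02/2, 0.04/4, 0.08/8, 0.16/16, 0.32/32. The real strategy
        // aborts the ramp as soon as calls are recorded for the tracker client.
    }
}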
Also used: HashMap (java.util.HashMap), ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap), DegraderControl (com.linkedin.util.degrader.DegraderControl), URI (java.net.URI), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), TrackerClient (com.linkedin.d2.balancer.clients.TrackerClient), HashSet (java.util.HashSet)

Example 9 with DegraderControl

Use of com.linkedin.util.degrader.DegraderControl in project rest.li by linkedin.

The class TrackerClientUpdater, method update.

void update() {
    DegraderControl degraderControl = _trackerClient.getDegraderControl(_partitionId);
    degraderControl.setOverrideDropRate(_overrideDropRate);
    degraderControl.setMaxDropRate(_maxDropRate);
    degraderControl.setOverrideMinCallCount(_overrideMinCallCount);
}
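For context, doUpdatePartitionState (Example 8) sets these pending values on the updater, and update() then applies all three to the client's DegraderControl at once. A hedged usage sketch follows; setMaxDropRate is visible in Example 8, but the constructor signature and the other two setters are assumptions made for illustration:

// Hypothetical usage; the constructor and two of the setters are assumed.
TrackerClientUpdater updater = new TrackerClientUpdater(trackerClient, partitionId);
updater.setOverrideDropRate(0.2);    // global call-dropping level chosen by the strategy
updater.setMaxDropRate(0.95);        // cap on the client's computed drop rate
updater.setOverrideMinCallCount(5);  // min call count scaled for the reduced traffic
updater.update();                    // pushes all three values into DegraderControl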
Also used: DegraderControl (com.linkedin.util.degrader.DegraderControl)

Example 10 with DegraderControl

Use of com.linkedin.util.degrader.DegraderControl in project rest.li by linkedin.

The class DegraderLoadBalancerTest, method testBalancingRing.

@Test(groups = { "small", "back-end" }, dataProvider = "consistentHashAlgorithms")
public void testBalancingRing(String consistentHashAlgorithm) throws URISyntaxException {
    DegraderLoadBalancerStrategyV3 strategy = getStrategy(consistentHashAlgorithm);
    List<TrackerClient> clients = new ArrayList<TrackerClient>();
    URI uri1 = URI.create("http://someTestService/someTestUrl");
    URI uri2 = URI.create("http://abcxfweuoeueoueoueoukeueoueoueoueoueouo/2354");
    TestClock clock1 = new TestClock();
    TestClock clock2 = new TestClock();
    TrackerClient client1 = getClient(uri1, clock1);
    TrackerClient client2 = getClient(uri2, clock2);
    clients.add(client1);
    clients.add(client2);
    // force client2 to be disabled
    DegraderControl dcClient2Default = client2.getDegraderControl(DEFAULT_PARTITION_ID);
    dcClient2Default.setOverrideMinCallCount(1);
    dcClient2Default.setMinCallCount(1);
    dcClient2Default.setMaxDropRate(1d);
    dcClient2Default.setUpStep(0.4d);
    dcClient2Default.setHighErrorRate(0);
    CallCompletion cc = client2.getCallTracker().startCall();
    clock2.addMs(1);
    cc.endCallWithError();
    clock1.addMs(15000);
    clock2.addMs(5000);
    // now do a basic verification to verify getTrackerClient is properly weighting things
    double calls = 10000d;
    int client1Count = 0;
    int client2Count = 0;
    double tolerance = 0.05d;
    for (int i = 0; i < calls; ++i) {
        TrackerClient client = getTrackerClient(strategy, null, new RequestContext(), 1, clients);
        assertNotNull(client);
        if (client.getUri().equals(uri1)) {
            ++client1Count;
        } else {
            ++client2Count;
        }
    }
    assertTrue(Math.abs((client1Count / calls) - (100 / 160d)) < tolerance);
    assertTrue(Math.abs((client2Count / calls) - (60 / 160d)) < tolerance);
}
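Here both clients start at the default weight of 1.0, so the healthy client1 keeps 100 points while client2, degraded by upStep = 0.4, is left with 1.0 × (1 − 0.4) × 100 = 60; hence the asserted 100/160 and 60/160 split.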
Also used: TrackerClient (com.linkedin.d2.balancer.clients.TrackerClient), CallCompletion (com.linkedin.util.degrader.CallCompletion), ArrayList (java.util.ArrayList), DegraderControl (com.linkedin.util.degrader.DegraderControl), RequestContext (com.linkedin.r2.message.RequestContext), URI (java.net.URI), Test (org.testng.annotations.Test), TrackerClientTest (com.linkedin.d2.balancer.clients.TrackerClientTest)

Aggregations

DegraderControl (com.linkedin.util.degrader.DegraderControl): 16
TrackerClient (com.linkedin.d2.balancer.clients.TrackerClient): 13
URI (java.net.URI): 13
RequestContext (com.linkedin.r2.message.RequestContext): 11
CallCompletion (com.linkedin.util.degrader.CallCompletion): 10
ArrayList (java.util.ArrayList): 10
Test (org.testng.annotations.Test): 10
HashMap (java.util.HashMap): 9
TrackerClientTest (com.linkedin.d2.balancer.clients.TrackerClientTest): 8
AtomicLong (java.util.concurrent.atomic.AtomicLong): 6
URIRequest (com.linkedin.d2.balancer.util.URIRequest): 5
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 3
Callback (com.linkedin.common.callback.Callback): 2
RemoteInvocationException (com.linkedin.r2.RemoteInvocationException): 2
RestRequest (com.linkedin.r2.message.rest.RestRequest): 2
TransportClient (com.linkedin.r2.transport.common.bridge.client.TransportClient): 2
TransportCallback (com.linkedin.r2.transport.common.bridge.common.TransportCallback): 2
SettableClock (com.linkedin.util.clock.SettableClock): 2
CallTracker (com.linkedin.util.degrader.CallTracker): 2
Map (java.util.Map): 2