Use of com.linkedin.d2.balancer.util.hashing.Ring in project rest.li by linkedin.
Class DegraderLoadBalancerTest, method testClusterRecovery2TC:
@Test(groups = { "small", "back-end" })
public void testClusterRecovery2TC() {
final int NUM_CHECKS = 5;
final Long TIME_INTERVAL = 5000L;
Map<String, Object> myMap = new HashMap<String, Object>();
// 1,2,4,8,16,32,64,100% steps, given a 2x recovery step coefficient
int localStepsToFullRecovery = 8;
myMap.put(PropertyKeys.HTTP_LB_INITIAL_RECOVERY_LEVEL, 0.005);
myMap.put(PropertyKeys.HTTP_LB_RING_RAMP_FACTOR, 2d);
myMap.put(PropertyKeys.HTTP_LB_STRATEGY_PROPERTIES_UPDATE_INTERVAL_MS, TIME_INTERVAL);
TestClock clock = new TestClock();
myMap.put(PropertyKeys.CLOCK, clock);
DegraderLoadBalancerStrategyConfig config = DegraderLoadBalancerStrategyConfig.createHttpConfigFromMap(myMap);
DegraderLoadBalancerStrategyV3 strategy = new DegraderLoadBalancerStrategyV3(config, "DegraderLoadBalancerTest", null);
List<TrackerClient> clients = new ArrayList<TrackerClient>();
URI uri1 = URI.create("http://test.linkedin.com:3242/fdsaf");
URI uri2 = URI.create("http://test.linkedin.com:3243/fdsaf");
URIRequest request = new URIRequest(uri1);
List<CallCompletion> ccList = new ArrayList<CallCompletion>();
CallCompletion cc;
TrackerClient client1 = new TrackerClient(uri1, getDefaultPartitionData(1d), new TestLoadBalancerClient(uri1), clock, null);
TrackerClient client2 = new TrackerClient(uri2, getDefaultPartitionData(1d), new TestLoadBalancerClient(uri2), clock, null);
clients.add(client1);
clients.add(client2);
// force client1 to be disabled if we encounter errors/high latency
DegraderControl dcClient1Default = client1.getDegraderControl(DEFAULT_PARTITION_ID);
dcClient1Default.setMinCallCount(5);
dcClient1Default.setOverrideMinCallCount(5);
dcClient1Default.setUpStep(1.0);
// force client2 to be disabled if we encounter errors/high latency
DegraderControl dcClient2Default = client2.getDegraderControl(DEFAULT_PARTITION_ID);
dcClient2Default.setOverrideMinCallCount(5);
dcClient2Default.setMinCallCount(5);
dcClient2Default.setUpStep(0.4);
// Have one cycle of successful calls to verify that valid tracker clients are returned.
// Try load balancing on this update; the state needs to be updated before we force the strategy.
TrackerClient resultTC = getTrackerClient(strategy, request, new RequestContext(), 1, clients);
strategy.setStrategy(DEFAULT_PARTITION_ID, DegraderLoadBalancerStrategyV3.PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE);
resultTC = getTrackerClient(strategy, request, new RequestContext(), 1, clients);
assertNotNull(resultTC, "expected non-null trackerclient");
for (int j = 0; j < NUM_CHECKS; j++) {
ccList.add(client1.getCallTracker().startCall());
ccList.add(client2.getCallTracker().startCall());
}
clock.addMs(1);
for (Iterator<CallCompletion> iter = ccList.listIterator(); iter.hasNext(); ) {
cc = iter.next();
cc.endCall();
}
// bump to next interval, and get stats.
clock.addMs(5000);
// try Load balancing on this updateState
strategy.setStrategy(DEFAULT_PARTITION_ID, DegraderLoadBalancerStrategyV3.PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE);
resultTC = getTrackerClient(strategy, request, new RequestContext(), 1, clients);
assertNotNull(resultTC, "expected non-null trackerclient");
Assert.assertEquals(dcClient1Default.getCurrentComputedDropRate(), 0.0);
Assert.assertEquals(dcClient2Default.getCurrentComputedDropRate(), 0.0);
// now simulate a bad cluster state with high error and high latency
for (int j = 0; j < NUM_CHECKS; j++) {
ccList.add(client1.getCallTracker().startCall());
ccList.add(client2.getCallTracker().startCall());
}
clock.addMs(3500);
for (Iterator<CallCompletion> iter = ccList.listIterator(); iter.hasNext(); ) {
cc = iter.next();
cc.endCallWithError();
}
// go to next interval
clock.addMs(5000);
Assert.assertEquals(dcClient1Default.getCurrentComputedDropRate(), 1.0);
Assert.assertEquals(dcClient2Default.getCurrentComputedDropRate(), 0.4);
// trigger a state update, the returned TrackerClient should be client2
// because client 1 should have gone up to a 1.0 drop rate, and the cluster should
// be unhealthy
strategy.setStrategy(DEFAULT_PARTITION_ID, DegraderLoadBalancerStrategyV3.PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE);
resultTC = getTrackerClient(strategy, request, new RequestContext(), 1, clients);
assertEquals(resultTC, client2);
// Step through recovery intervals: client1 ramps back into the hash ring (1, 2, 4, ... points)
// while client2 stays at its current drop rate.
do {
// go to next time interval.
clock.addMs(TIME_INTERVAL);
// adjust the hash ring this time.
strategy.setStrategy(DEFAULT_PARTITION_ID, DegraderLoadBalancerStrategyV3.PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE);
resultTC = getTrackerClient(strategy, request, new RequestContext(), 1, clients);
localStepsToFullRecovery--;
} while (localStepsToFullRecovery > 0);
assertNotNull(resultTC, "expected non-null trackerclient");
assertTrue(strategy.getState().getPartitionState(DEFAULT_PARTITION_ID).getPointsMap().get(client1.getUri()) == client1.getPartitionWeight(DEFAULT_PARTITION_ID) * config.getPointsPerWeight(), "client1 did not recover to full weight in hash map.");
Assert.assertEquals(dcClient2Default.getCurrentComputedDropRate(), 0.4, "client2 drop rate not as expected");
cc = client1.getCallTracker().startCall();
clock.addMs(10);
cc.endCall();
clock.addMs(TIME_INTERVAL);
resultTC = getTrackerClient(strategy, request, new RequestContext(), 1, clients);
assertNotNull(resultTC, "expected non-null trackerclient");
}
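The "1, 2, 4, 8, 16, 32, 64, 100%" schedule in the comment above follows from doubling the recovery level each interval and converting it to hash ring points. A minimal sketch of that arithmetic, assuming pointsPerWeight = 100; the helper below is an illustration, not rest.li API:

// Hypothetical helper illustrating the expected recovery ramp; not part of rest.li.
// With initialRecoveryLevel = 0.005, rampFactor = 2 and pointsPerWeight = 100, the
// points per interval are 1, 2, 4, 8, 16, 32, 64, 100 -- eight steps to full recovery,
// matching localStepsToFullRecovery = 8 in the test above.
static int[] recoverySchedule(double initialRecoveryLevel, double rampFactor,
                              int pointsPerWeight, int steps) {
    int[] points = new int[steps];
    double level = initialRecoveryLevel;
    for (int i = 0; i < steps; i++) {
        level = Math.min(1.0, level * rampFactor);                 // double, capped at 100%
        points[i] = Math.max(1, (int) (level * pointsPerWeight));  // at least one point
    }
    return points;
}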
Use of com.linkedin.d2.balancer.util.hashing.Ring in project rest.li by linkedin.
Class DegraderLoadBalancerTest, method testClusterRecoverySlow1TC:
@Test(groups = { "small", "back-end" })
public void testClusterRecoverySlow1TC() {
Map<String, Object> myMap = new HashMap<String, Object>();
Long timeInterval = 5000L;
TestClock clock = new TestClock();
myMap.put(PropertyKeys.CLOCK, clock);
// We want the degrader to have one cooling-off period and then re-enter the ring.
myMap.put(PropertyKeys.HTTP_LB_INITIAL_RECOVERY_LEVEL, 0.005);
myMap.put(PropertyKeys.HTTP_LB_RING_RAMP_FACTOR, 2.0);
myMap.put(PropertyKeys.HTTP_LB_STRATEGY_PROPERTIES_UPDATE_INTERVAL_MS, timeInterval);
// it will take two intervals for the TC to be reintroduced into the hash ring.
int stepsToFullRecovery = 1;
//test Strategy V3
DegraderLoadBalancerStrategyConfig config = DegraderLoadBalancerStrategyConfig.createHttpConfigFromMap(myMap);
DegraderLoadBalancerStrategyV3 strategyV3 = new DegraderLoadBalancerStrategyV3(config, "DegraderLoadBalancerTest", null);
DegraderLoadBalancerStrategyAdapter strategy = new DegraderLoadBalancerStrategyAdapter(strategyV3);
clusterRecovery1TC(myMap, clock, stepsToFullRecovery, timeInterval, strategy, null, DegraderLoadBalancerStrategyV3.PartitionDegraderLoadBalancerState.Strategy.LOAD_BALANCE);
//test Strategy V2
config = DegraderLoadBalancerStrategyConfig.createHttpConfigFromMap(myMap);
DegraderLoadBalancerStrategyV2_1 strategyV2 = new DegraderLoadBalancerStrategyV2_1(config, "DegraderLoadBalancerTest", null);
strategy = new DegraderLoadBalancerStrategyAdapter(strategyV2);
clusterRecovery1TC(myMap, clock, stepsToFullRecovery, timeInterval, strategy, DegraderLoadBalancerStrategyV2_1.DegraderLoadBalancerState.Strategy.LOAD_BALANCE, null);
}
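These tests drive interval boundaries with TestClock rather than the wall clock: each clock.addMs(timeInterval) crosses one update interval deterministically. A minimal sketch of such a settable clock, assuming rest.li's com.linkedin.util.clock.Clock interface; the real TestClock helper may differ:

import com.linkedin.util.clock.Clock;

// Illustrative settable clock for tests; a sketch, not the actual TestClock source.
public class TestClock implements Clock {
    private volatile long _now = 0;

    @Override
    public long currentTimeMillis() {
        return _now;
    }

    // Advance virtual time; e.g. clock.addMs(5000) crosses a 5000 ms update interval.
    public void addMs(long ms) {
        _now += ms;
    }
}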
Use of com.linkedin.d2.balancer.util.hashing.Ring in project rest.li by linkedin.
Class DegraderLoadBalancerTest, method testClusterRecoveryAfter100PercentDropCall:
@Test(groups = { "small", "back-end" })
public void testClusterRecoveryAfter100PercentDropCall() {
Map<String, Object> myMap = new HashMap<String, Object>();
Long timeInterval = 5000L;
TestClock clock = new TestClock();
myMap.put(PropertyKeys.CLOCK, clock);
// We want the degrader to have one cooling-off period and then re-enter the ring.
myMap.put(PropertyKeys.HTTP_LB_INITIAL_RECOVERY_LEVEL, 0.005);
myMap.put(PropertyKeys.HTTP_LB_RING_RAMP_FACTOR, 2.0);
myMap.put(PropertyKeys.HTTP_LB_STRATEGY_PROPERTIES_UPDATE_INTERVAL_MS, timeInterval);
myMap.put(PropertyKeys.HTTP_LB_GLOBAL_STEP_UP, 1.0);
myMap.put(PropertyKeys.HTTP_LB_GLOBAL_STEP_DOWN, 0.8);
myMap.put(PropertyKeys.HTTP_LB_CLUSTER_MIN_CALL_COUNT_HIGH_WATER_MARK, 1L);
myMap.put(PropertyKeys.HTTP_LB_CLUSTER_MIN_CALL_COUNT_LOW_WATER_MARK, 1L);
//test Strategy V3
DegraderLoadBalancerStrategyConfig config = DegraderLoadBalancerStrategyConfig.createHttpConfigFromMap(myMap);
DegraderLoadBalancerStrategyV3 strategyV3 = new DegraderLoadBalancerStrategyV3(config, "DegraderLoadBalancerTest", null);
DegraderLoadBalancerStrategyAdapter strategy = new DegraderLoadBalancerStrategyAdapter(strategyV3);
clusterTotalRecovery1TC(myMap, clock, timeInterval, strategy);
//test Strategy V2
config = DegraderLoadBalancerStrategyConfig.createHttpConfigFromMap(myMap);
DegraderLoadBalancerStrategyV2_1 strategyV2 = new DegraderLoadBalancerStrategyV2_1(config, "DegraderLoadBalancerTest", null);
strategy = new DegraderLoadBalancerStrategyAdapter(strategyV2);
clusterTotalRecovery1TC(myMap, clock, timeInterval, strategy);
}
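With HTTP_LB_GLOBAL_STEP_UP set to 1.0, a single unhealthy interval can push the override drop rate straight to 100%, and HTTP_LB_GLOBAL_STEP_DOWN of 0.8 lets it fall back almost as quickly. A sketch of that clamped stepping, under the assumption that the strategy moves the rate by fixed steps; the helper name is hypothetical:

// Hypothetical illustration of clamped drop-rate stepping; not rest.li code.
static double stepDropRate(double current, double step, boolean clusterUnhealthy) {
    double next = clusterUnhealthy ? current + step : current - step;
    return Math.min(1.0, Math.max(0.0, next)); // clamp to [0, 1]
}

// With stepUp = 1.0 and stepDown = 0.8, as configured above:
// stepDropRate(0.0, 1.0, true)  -> 1.0  (all calls dropped after one bad interval)
// stepDropRate(1.0, 0.8, false) -> 0.2  (one good interval restores most traffic)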
Use of com.linkedin.d2.balancer.util.hashing.Ring in project rest.li by linkedin.
Class DegraderLoadBalancerTest, method testDegraderLoadBalancerSimulator:
private void testDegraderLoadBalancerSimulator(DegraderLoadBalancerStrategyAdapter adapter, TestClock clock, long timeInterval, List<TrackerClient> clients, double qps, DegraderImpl.Config degraderConfig) {
long clusterGenerationId = 1;
double overrideDropRate = 0.0;
//simulate a latency of 4000 ms
//1st round: we use the LOAD_BALANCE strategy. Since latency is high, the number of points
//drops from 100 to 80 (transmissionRate * pointsPerWeight).
TrackerClient resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 80, true, 0.0, 4000, false, false);
assertNotNull(resultTC);
//2nd round: the drop rate should be increased by DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_UP
overrideDropRate += DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_UP;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 80, false, overrideDropRate, 4000, false, false);
//3rd round: we alternate back to the LOAD_BALANCE strategy and drop the points even further
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 39, true, overrideDropRate, 4000, false, false);
//4th round: the drop rate is increased again, as in the 2nd round
overrideDropRate += DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_UP;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 39, false, overrideDropRate, 4000, false, false);
//5th round: alternate back to adjusting the hash ring.
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, true, overrideDropRate, 4000, false, false);
//6th round: the points stay the same as in the 5th round, and we increase the drop rate
overrideDropRate += DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_UP;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, false, overrideDropRate, 4000, false, false);
//7th round: the number of points in the hash ring is at the minimum, so it can't be decreased further.
//At this point the client is in recovery mode, but since the hash ring can no longer change, we'll
//always be in CALL_DROPPING mode, so the next strategy is expected to be LOAD_BALANCE mode.
overrideDropRate += DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_UP;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, false, overrideDropRate, 4000, false, false);
//8th round: we increase the drop rate to the maximum.
overrideDropRate += DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_UP;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, false, overrideDropRate, 4000, false, false);
//9th round: simulate that there is still a call even though we drop 100% of requests for the
//tracker client. The assumption is that some thread still holds a tracker client, and we want
//to make sure we can handle that request without degrading the cluster even further.
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, false, overrideDropRate, 4000, false, false);
//10th round: simulate that there are no calls because we dropped all requests. Even though we are
//in LOAD_BALANCE mode and this tracker client is in recovery mode, there are no calls, so the hash
//ring doesn't change; instead we reduce the drop rate to 0.8, which means the next strategy is
//LOAD_BALANCE
overrideDropRate -= DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_DOWN;
resultTC = simulateAndTestOneInterval(timeInterval, clock, 0.0, clients, adapter, clusterGenerationId, 1, false, overrideDropRate, 4000, false, false);
//11th round: this time the simulated latency is 1000 ms (within the low and high water marks).
//The drop rate and everything else should stay the same.
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, false, overrideDropRate, 1000, false, false);
//we'll simulate the clients dying one by one until all of them are gone
int numberOfClients = clients.size();
HashSet<URI> uris = new HashSet<URI>();
HashSet<URI> removedUris = new HashSet<URI>();
for (TrackerClient client : clients) {
uris.add(client.getUri());
}
LinkedList<TrackerClient> removedClients = new LinkedList<TrackerClient>();
//the load balancing strategy will always be picked because there are no hash ring changes
boolean isLoadBalancingStrategyTurn = true;
for (int i = numberOfClients; i > 0; i--) {
TrackerClient removed = clients.remove(0);
uris.remove(removed.getUri());
removedClients.addLast(removed);
removedUris.add(removed.getUri());
clusterGenerationId++;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, isLoadBalancingStrategyTurn, overrideDropRate, 1000, false, false);
if (i == 1) {
assertNull(resultTC);
} else {
//resultTC may be null because the call was dropped (the override drop rate is 0.8)
if (resultTC != null) {
assertTrue(uris.contains(resultTC.getUri()));
assertFalse(removedUris.contains(resultTC.getUri()));
}
}
}
assertTrue(uris.isEmpty());
assertTrue(clients.isEmpty());
assertEquals(removedUris.size(), numberOfClients);
assertEquals(removedClients.size(), numberOfClients);
//we'll simulate the clients reviving one by one until all of them are back up again
for (int i = numberOfClients; i > 0; i--) {
TrackerClient added = removedClients.remove(0);
//we have to create a new client: the old one has a degraded DegraderImpl, and in a production
//environment a new client joining a cluster should be in a good state. This means there should be
//100 points in the hash ring for this client
TrackerClient newClient = new TrackerClient(added.getUri(), getDefaultPartitionData(1d), new TestLoadBalancerClient(added.getUri()), clock, degraderConfig);
clients.add(newClient);
uris.add(added.getUri());
removedUris.remove(added.getUri());
clusterGenerationId++;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 100, isLoadBalancingStrategyTurn, overrideDropRate, 1000, false, false);
if (resultTC != null) {
assertTrue(uris.contains(resultTC.getUri()));
assertFalse(removedUris.contains(resultTC.getUri()));
}
}
//step the override drop rate back down; the number of points stays the same because there are
//no hash ring changes
for (overrideDropRate -= DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_DOWN; overrideDropRate >= 0; overrideDropRate -= DegraderLoadBalancerStrategyConfig.DEFAULT_GLOBAL_STEP_DOWN) {
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 100, false, overrideDropRate, 300, false, false);
}
//we should have recovered fully by this time
overrideDropRate = 0.0;
resultTC = simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 100, false, overrideDropRate, 300, false, false);
assertNotNull(resultTC);
clusterGenerationId++;
//simulate that an increased rate of certain errors (ConnectException, ClosedChannelException) causes degradation.
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 80, true, 0.0, 300, false, true);
//switching to call dropping strategy
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 80, false, 0.0, 300, false, true);
//continue the degradation
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 39, true, 0.0, 300, false, true);
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 39, false, 0.0, 300, false, true);
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, true, 0.0, 300, false, true);
//now let's remove all the errors and see how the cluster recovers; we have to wait until the next
//round because this round uses the CALL_DROPPING strategy
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 1, false, 0.0, 300, false, false);
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 39, true, 0.0, 300, false, false);
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 39, false, 0.0, 300, false, false);
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 80, true, 0.0, 300, false, false);
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 80, false, 0.0, 300, false, false);
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 100, true, 0.0, 300, false, false);
//make sure that if we see errors other than CONNECT_EXCEPTION or CLOSED_CHANNEL_EXCEPTION we don't degrade
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 100, false, 0.0, 300, true, false);
//since the hash ring doesn't change for errors other than CONNECT_EXCEPTION or CLOSED_CHANNEL_EXCEPTION,
//the strategy won't switch to CALL_DROPPING
simulateAndTestOneInterval(timeInterval, clock, qps, clients, adapter, clusterGenerationId, 100, false, 0.0, 300, true, false);
}
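The expected point counts in the rounds above (80, 39, 1, then back to 100) track each client's transmission rate: as the round-1 comment notes, points come out to transmissionRate * pointsPerWeight, floored at a minimum of one point. A sketch of that mapping, inferred from the test comments rather than copied from the strategy source; the helper name is hypothetical:

// Illustrative mapping from a client's transmission rate to hash-ring points;
// an assumption inferred from the comments, not the strategy implementation.
static int expectedPoints(double transmissionRate, int pointsPerWeight) {
    int points = (int) (transmissionRate * pointsPerWeight);
    return Math.max(1, points); // the ring keeps at least one point per client
}

// expectedPoints(0.8, 100) -> 80, matching the first LOAD_BALANCE round above.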
Use of com.linkedin.d2.balancer.util.hashing.Ring in project rest.li by linkedin.
Class LoadBalancerSimulator, method getPoints:
/**
 * Given a service name and partition number, return the hash ring points for each URI.
 * @param serviceName the name of the d2 service
 * @param partition the partition id
 * @return a map from each URI in the ring to its number of points
 * @throws ServiceUnavailableException if the load balancer cannot resolve the service
 */
public Map<URI, Integer> getPoints(String serviceName, int partition) throws ServiceUnavailableException {
URI serviceUri = URI.create("d2://" + serviceName);
Ring<URI> ring = _loadBalancer.getRings(serviceUri).get(partition);
Map<URI, Integer> pointsMap = new HashMap<>();
Iterator<URI> iter = ring.getIterator(0);
iter.forEachRemaining(uri -> pointsMap.compute(uri, (k, v) -> v == null ? 1 : v + 1));
return pointsMap;
}
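A hedged usage sketch: given a running LoadBalancerSimulator (the variable, service name, and URI below are placeholders), the returned map can be used to assert how ring points shift as a host degrades:

// Hypothetical usage; 'simulator' and the service/URI names are placeholders.
Map<URI, Integer> points = simulator.getPoints("testService", 0);
URI host = URI.create("http://test.linkedin.com:3242/fdsaf");
// A fully healthy host should hold weight * pointsPerWeight points (100 by default).
assertEquals(points.get(host).intValue(), 100);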