use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class CoordinatorTests method testUnHealthyLeaderRemoved.
public void testUnHealthyLeaderRemoved() {
AtomicReference<StatusInfo> nodeHealthServiceStatus = new AtomicReference<>(new StatusInfo(HEALTHY, "healthy-info"));
try (Cluster cluster = new Cluster(randomIntBetween(1, 3), true, Settings.EMPTY, () -> nodeHealthServiceStatus.get())) {
cluster.runRandomly();
cluster.stabilise();
final ClusterNode leader = cluster.getAnyLeader();
logger.info("--> adding three new healthy nodes");
ClusterNode newNode1 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings, () -> new StatusInfo(HEALTHY, "healthy-info"));
ClusterNode newNode2 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings, () -> new StatusInfo(HEALTHY, "healthy-info"));
ClusterNode newNode3 = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings, () -> new StatusInfo(HEALTHY, "healthy-info"));
cluster.clusterNodes.add(newNode1);
cluster.clusterNodes.add(newNode2);
cluster.clusterNodes.add(newNode3);
cluster.stabilise(// The first pinging discovers the master
defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING) + // One message delay to send a join
DEFAULT_DELAY_VARIABILITY + // followup reconfiguration
3 * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
logger.info("--> changing health status of leader {} to unhealthy", leader);
nodeHealthServiceStatus.getAndSet(new StatusInfo(UNHEALTHY, "unhealthy-info"));
cluster.stabilise(// first wait for all the followers to notice the leader has gone
(defaultMillis(LEADER_CHECK_INTERVAL_SETTING) + defaultMillis(LEADER_CHECK_TIMEOUT_SETTING)) + // then wait for a follower to be promoted to leader
DEFAULT_ELECTION_DELAY + // and the first publication times out because of the unresponsive node
defaultMillis(PUBLISH_TIMEOUT_SETTING) + // there might be a term bump causing another election
DEFAULT_ELECTION_DELAY + // then wait for both of:
Math.max(// 1. the term bumping publication to time out
defaultMillis(PUBLISH_TIMEOUT_SETTING), // 2. the new leader to notice that the old leader is unresponsive
(defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING) + defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING))) + // then wait for the new leader to commit a state without the old leader
DEFAULT_CLUSTER_STATE_UPDATE_DELAY + // then wait for the followup reconfiguration
DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
assertThat(cluster.getAnyLeader().getId(), anyOf(equalTo(newNode1.getId()), equalTo(newNode2.getId()), equalTo(newNode3.getId())));
}
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class FollowersCheckerTests method testFailsNodeThatRejectsCheck.
public void testFailsNodeThatRejectsCheck() {
final Settings settings = randomSettings();
testBehaviourOfFailingNode(settings, () -> {
throw new OpenSearchException("simulated exception");
}, "followers check retry count exceeded", (FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis(), () -> new StatusInfo(HEALTHY, "healthy-info"));
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class FollowersCheckerTests method testResponder.
public void testResponder() {
final DiscoveryNode leader = new DiscoveryNode("leader", buildNewFakeTransportAddress(), Version.CURRENT);
final DiscoveryNode follower = new DiscoveryNode("follower", buildNewFakeTransportAddress(), Version.CURRENT);
final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), follower.getName()).build();
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
final MockTransport mockTransport = new MockTransport() {
@Override
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
throw new AssertionError("no requests expected");
}
};
final TransportService transportService = mockTransport.createTransportService(settings, deterministicTaskQueue.getThreadPool(), TransportService.NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> follower, null, emptySet());
transportService.start();
transportService.acceptIncomingRequests();
final AtomicBoolean calledCoordinator = new AtomicBoolean();
final AtomicReference<RuntimeException> coordinatorException = new AtomicReference<>();
final FollowersChecker followersChecker = new FollowersChecker(settings, transportService, fcr -> {
assertTrue(calledCoordinator.compareAndSet(false, true));
final RuntimeException exception = coordinatorException.get();
if (exception != null) {
throw exception;
}
}, (node, reason) -> {
assert false : node;
}, () -> new StatusInfo(HEALTHY, "healthy-info"));
{
// Does not call into the coordinator in the normal case
final long term = randomNonNegativeLong();
followersChecker.updateFastResponseState(term, Mode.FOLLOWER);
final ExpectsSuccess expectsSuccess = new ExpectsSuccess();
transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(term, leader), expectsSuccess);
deterministicTaskQueue.runAllTasks();
assertTrue(expectsSuccess.succeeded());
assertFalse(calledCoordinator.get());
}
{
// Does not call into the coordinator for a term that's too low, just rejects immediately
final long leaderTerm = randomLongBetween(1, Long.MAX_VALUE - 1);
final long followerTerm = randomLongBetween(leaderTerm + 1, Long.MAX_VALUE);
followersChecker.updateFastResponseState(followerTerm, Mode.FOLLOWER);
final AtomicReference<TransportException> receivedException = new AtomicReference<>();
transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(leaderTerm, leader), new TransportResponseHandler<TransportResponse.Empty>() {
@Override
public TransportResponse.Empty read(StreamInput in) {
return TransportResponse.Empty.INSTANCE;
}
@Override
public void handleResponse(TransportResponse.Empty response) {
fail("unexpected success");
}
@Override
public void handleException(TransportException exp) {
assertThat(exp, not(nullValue()));
assertTrue(receivedException.compareAndSet(null, exp));
}
@Override
public String executor() {
return Names.SAME;
}
});
deterministicTaskQueue.runAllTasks();
assertFalse(calledCoordinator.get());
assertThat(receivedException.get(), not(nullValue()));
}
{
// Calls into the coordinator if the term needs bumping
final long leaderTerm = randomLongBetween(2, Long.MAX_VALUE);
final long followerTerm = randomLongBetween(1, leaderTerm - 1);
followersChecker.updateFastResponseState(followerTerm, Mode.FOLLOWER);
final ExpectsSuccess expectsSuccess = new ExpectsSuccess();
transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(leaderTerm, leader), expectsSuccess);
deterministicTaskQueue.runAllTasks();
assertTrue(expectsSuccess.succeeded());
assertTrue(calledCoordinator.get());
calledCoordinator.set(false);
}
{
// Calls into the coordinator if not a follower
final long term = randomNonNegativeLong();
followersChecker.updateFastResponseState(term, randomFrom(Mode.LEADER, Mode.CANDIDATE));
final ExpectsSuccess expectsSuccess = new ExpectsSuccess();
transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(term, leader), expectsSuccess);
deterministicTaskQueue.runAllTasks();
assertTrue(expectsSuccess.succeeded());
assertTrue(calledCoordinator.get());
calledCoordinator.set(false);
}
{
// If it calls into the coordinator and the coordinator throws an exception then it's passed back to the caller
final long term = randomNonNegativeLong();
followersChecker.updateFastResponseState(term, randomFrom(Mode.LEADER, Mode.CANDIDATE));
final String exceptionMessage = "test simulated exception " + randomNonNegativeLong();
coordinatorException.set(new OpenSearchException(exceptionMessage));
final AtomicReference<TransportException> receivedException = new AtomicReference<>();
transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(term, leader), new TransportResponseHandler<TransportResponse.Empty>() {
@Override
public TransportResponse.Empty read(StreamInput in) {
return TransportResponse.Empty.INSTANCE;
}
@Override
public void handleResponse(TransportResponse.Empty response) {
fail("unexpected success");
}
@Override
public void handleException(TransportException exp) {
assertThat(exp, not(nullValue()));
assertTrue(receivedException.compareAndSet(null, exp));
}
@Override
public String executor() {
return Names.SAME;
}
});
deterministicTaskQueue.runAllTasks();
assertTrue(calledCoordinator.get());
assertThat(receivedException.get(), not(nullValue()));
assertThat(receivedException.get().getRootCause().getMessage(), equalTo(exceptionMessage));
}
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class JoinHelperTests method testJoinFailureOnUnhealthyNodes.
public void testJoinFailureOnUnhealthyNodes() {
DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(Settings.builder().put(NODE_NAME_SETTING.getKey(), "node0").build(), random());
CapturingTransport capturingTransport = new CapturingTransport();
DiscoveryNode localNode = new DiscoveryNode("node0", buildNewFakeTransportAddress(), Version.CURRENT);
TransportService transportService = capturingTransport.createTransportService(Settings.EMPTY, deterministicTaskQueue.getThreadPool(), TransportService.NOOP_TRANSPORT_INTERCEPTOR, x -> localNode, null, Collections.emptySet());
AtomicReference<StatusInfo> nodeHealthServiceStatus = new AtomicReference<>(new StatusInfo(UNHEALTHY, "unhealthy-info"));
JoinHelper joinHelper = new JoinHelper(Settings.EMPTY, null, null, transportService, () -> 0L, () -> null, (joinRequest, joinCallback) -> {
throw new AssertionError();
}, startJoinRequest -> {
throw new AssertionError();
}, Collections.emptyList(), (s, p, r) -> {
}, () -> nodeHealthServiceStatus.get());
transportService.start();
DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Version.CURRENT);
DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), Version.CURRENT);
assertFalse(joinHelper.isJoinPending());
// check that sending a join to node1 doesn't work
Optional<Join> optionalJoin1 = randomBoolean() ? Optional.empty() : Optional.of(new Join(localNode, node1, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
joinHelper.sendJoinRequest(node1, randomNonNegativeLong(), optionalJoin1);
CapturedRequest[] capturedRequests1 = capturingTransport.getCapturedRequestsAndClear();
assertThat(capturedRequests1.length, equalTo(0));
assertFalse(joinHelper.isJoinPending());
// check that sending a join to node2 doesn't work
Optional<Join> optionalJoin2 = randomBoolean() ? Optional.empty() : Optional.of(new Join(localNode, node2, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
transportService.start();
joinHelper.sendJoinRequest(node2, randomNonNegativeLong(), optionalJoin2);
CapturedRequest[] capturedRequests2 = capturingTransport.getCapturedRequestsAndClear();
assertThat(capturedRequests2.length, equalTo(0));
assertFalse(joinHelper.isJoinPending());
nodeHealthServiceStatus.getAndSet(new StatusInfo(HEALTHY, "healthy-info"));
// check that sending another join to node1 now works again
joinHelper.sendJoinRequest(node1, 0L, optionalJoin1);
CapturedRequest[] capturedRequests1a = capturingTransport.getCapturedRequestsAndClear();
assertThat(capturedRequests1a.length, equalTo(1));
CapturedRequest capturedRequest1a = capturedRequests1a[0];
assertEquals(node1, capturedRequest1a.node);
}
use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
the class ClusterFormationFailureHelperTests method testDescriptionOnMasterIneligibleNodes.
public void testDescriptionOnMasterIneligibleNodes() {
final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).version(12L).metadata(Metadata.builder().coordinationMetadata(CoordinationMetadata.builder().term(4L).build())).nodes(DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId())).build();
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet: have discovered []; discovery will continue using [] from hosts providers " + "and [] from last-known cluster state; node term 15, last-accepted version 12 in term 4"));
final TransportAddress otherAddress = buildNewFakeTransportAddress();
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 16L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet: have discovered []; discovery will continue using [" + otherAddress + "] from hosts providers and [] from last-known cluster state; node term 16, last-accepted version 12 in term 4"));
final DiscoveryNode otherNode = new DiscoveryNode("other", buildNewFakeTransportAddress(), Version.CURRENT);
assertThat(new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 17L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription(), is("master not discovered yet: have discovered [" + otherNode + "]; discovery will continue using [] from hosts providers " + "and [] from last-known cluster state; node term 17, last-accepted version 12 in term 4"));
}
Aggregations