Use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
From class FollowersCheckerTests, method testFailsNodeThatDoesNotRespond.
/**
 * Runs the shared failing-node scenario with a response supplier that yields {@code null}
 * (i.e. no response, per the test name) and expects the
 * "followers check retry count exceeded" failure reason.
 *
 * <p>The expected failure time handed to the helper is computed from the randomly chosen
 * settings: one check interval between each pair of consecutive attempts plus one full
 * timeout per attempt.
 */
public void testFailsNodeThatDoesNotRespond() {
    final Settings settings = randomSettings();

    final int retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
    final long intervalMillis = FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis();
    final long timeoutMillis = FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings).millis();

    // (retries - 1) intervals separate the attempts; every attempt waits out its timeout.
    final long expectedFailureTime = (retryCount - 1) * intervalMillis + retryCount * timeoutMillis;

    testBehaviourOfFailingNode(
        settings,
        () -> null,
        "followers check retry count exceeded",
        expectedFailureTime,
        () -> new StatusInfo(HEALTHY, "healthy-info")
    );
}
Use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
From class LeaderCheckerTests, method testFollowerBehaviour.
/**
 * Drives a {@link LeaderChecker} against a mock transport whose responses are randomly
 * delayed and randomly failed, using a deterministic task queue for time.
 *
 * <p>The test has two phases. In the first, checks are sent to {@code leader1} and the mock
 * never lets the count of consecutive failed/timed-out responses reach the configured retry
 * count, so the failure callback must not fire. In the second, checks go to {@code leader2}
 * (which may be the same node as {@code leader1}); after another failure-free run every
 * response is forced to fail, and the test asserts that the failure callback fires within a
 * bound derived from the check interval, timeout and retry count.
 */
public void testFollowerBehaviour() {
final DiscoveryNode leader1 = new DiscoveryNode("leader-1", buildNewFakeTransportAddress(), Version.CURRENT);
// leader2 is randomly either the very same node as leader1 or a distinct node.
final DiscoveryNode leader2 = randomBoolean() ? leader1 : new DiscoveryNode("leader-2", buildNewFakeTransportAddress(), Version.CURRENT);
final DiscoveryNode localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
Settings.Builder settingsBuilder = Settings.builder().put(NODE_NAME_SETTING.getKey(), localNode.getId());
// Each of interval, timeout and retry count is randomly either set explicitly or left at
// the value the setting yields for empty settings; the effective value is kept in a local.
final long leaderCheckIntervalMillis;
if (randomBoolean()) {
leaderCheckIntervalMillis = randomLongBetween(1000, 60000);
settingsBuilder.put(LEADER_CHECK_INTERVAL_SETTING.getKey(), leaderCheckIntervalMillis + "ms");
} else {
leaderCheckIntervalMillis = LEADER_CHECK_INTERVAL_SETTING.get(Settings.EMPTY).millis();
}
final long leaderCheckTimeoutMillis;
if (randomBoolean()) {
leaderCheckTimeoutMillis = randomLongBetween(1, 60000);
settingsBuilder.put(LEADER_CHECK_TIMEOUT_SETTING.getKey(), leaderCheckTimeoutMillis + "ms");
} else {
leaderCheckTimeoutMillis = LEADER_CHECK_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
}
final int leaderCheckRetryCount;
if (randomBoolean()) {
leaderCheckRetryCount = randomIntBetween(1, 10);
settingsBuilder.put(LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), leaderCheckRetryCount);
} else {
leaderCheckRetryCount = LEADER_CHECK_RETRY_COUNT_SETTING.get(Settings.EMPTY);
}
// checkCount counts every request the mock transport sees; allResponsesFail is flipped
// later in the test to force every subsequent response to be an error.
final AtomicLong checkCount = new AtomicLong();
final AtomicBoolean allResponsesFail = new AtomicBoolean();
final Settings settings = settingsBuilder.build();
logger.info("--> using {}", settings);
final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
final MockTransport mockTransport = new MockTransport() {
// Number of requests in a row that were answered with an error or at/after the timeout.
int consecutiveFailedRequestsCount;
@Override
protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
// Only leader checks, addressed to one of the two leaders, are expected here.
assertThat(action, equalTo(LEADER_CHECK_ACTION_NAME));
assertTrue(node.equals(leader1) || node.equals(leader2));
super.onSendRequest(requestId, action, request, node);
// Once (retry count - 1) consecutive failures have been recorded, the next response is
// made to succeed within the timeout so the consecutive-failure count stays below the
// retry count -- unless allResponsesFail has been set.
final boolean mustSucceed = leaderCheckRetryCount - 1 <= consecutiveFailedRequestsCount;
// Upper bound is timeout - 1 when the response must succeed, timeout + 60000 otherwise,
// so a non-mandatory response may arrive after the timeout.
// NOTE(review): assumes randomLongBetween treats both bounds as inclusive -- confirm.
final long responseDelay = randomLongBetween(0, leaderCheckTimeoutMillis + (mustSucceed ? -1 : 60000));
final boolean successResponse = allResponsesFail.get() == false && (mustSucceed || randomBoolean());
// A response counts as failed if it is an error or is delivered at/after the timeout.
if (responseDelay >= leaderCheckTimeoutMillis || successResponse == false) {
consecutiveFailedRequestsCount += 1;
} else {
consecutiveFailedRequestsCount = 0;
}
checkCount.incrementAndGet();
// Deliver the chosen response at (current time + delay) on the deterministic queue.
deterministicTaskQueue.scheduleAt(deterministicTaskQueue.getCurrentTimeMillis() + responseDelay, new Runnable() {
@Override
public void run() {
if (successResponse) {
handleResponse(requestId, Empty.INSTANCE);
} else {
handleRemoteError(requestId, new OpenSearchException("simulated error"));
}
}
// Description of the scheduled task (presumably surfaced in task-queue logging).
@Override
public String toString() {
return (successResponse ? "successful" : "unsuccessful") + " response to request " + requestId;
}
});
}
};
final TransportService transportService = mockTransport.createTransportService(settings, deterministicTaskQueue.getThreadPool(), NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> localNode, null, emptySet());
transportService.start();
transportService.acceptIncomingRequests();
final AtomicBoolean leaderFailed = new AtomicBoolean();
// The failure callback checks the message format and, via compareAndSet, that it is
// invoked at most once during the test.
final LeaderChecker leaderChecker = new LeaderChecker(settings, transportService, e -> {
assertThat(e.getMessage(), matchesRegex("node \\[.*\\] failed \\[[1-9][0-9]*\\] consecutive checks"));
assertTrue(leaderFailed.compareAndSet(false, true));
}, () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info"));
logger.info("--> creating first checker");
leaderChecker.updateLeader(leader1);
{
// Phase 1: let a random number of checks go out while failures stay below the limit.
final long maxCheckCount = randomLongBetween(2, 1000);
logger.info("--> checking that no failure is detected in {} checks", maxCheckCount);
while (checkCount.get() < maxCheckCount) {
deterministicTaskQueue.runAllRunnableTasks();
deterministicTaskQueue.advanceTime();
}
}
// Clear the leader (presumably stopping further checks), drain everything still queued,
// and confirm the failure callback never fired.
leaderChecker.updateLeader(null);
logger.info("--> running remaining tasks");
deterministicTaskQueue.runAllTasks();
assertFalse(leaderFailed.get());
logger.info("--> creating second checker");
leaderChecker.updateLeader(leader2);
{
// Phase 2: repeat the failure-free run against leader2, then force failures.
checkCount.set(0);
final long maxCheckCount = randomLongBetween(2, 1000);
logger.info("--> checking again that no failure is detected in {} checks", maxCheckCount);
while (checkCount.get() < maxCheckCount) {
deterministicTaskQueue.runAllRunnableTasks();
deterministicTaskQueue.advanceTime();
}
deterministicTaskQueue.runAllRunnableTasks();
final long failureTime = deterministicTaskQueue.getCurrentTimeMillis();
// From here on every response produced by the mock transport is an error.
allResponsesFail.set(true);
logger.info("--> failing at {}ms", failureTime);
while (leaderFailed.get() == false) {
deterministicTaskQueue.advanceTime();
deterministicTaskQueue.runAllRunnableTasks();
}
// Detection must take no longer than retryCount rounds of (interval + timeout), plus one
// extra timeout as explained by the inline comment below.
assertThat(deterministicTaskQueue.getCurrentTimeMillis() - failureTime, lessThanOrEqualTo((leaderCheckIntervalMillis + leaderCheckTimeoutMillis) * leaderCheckRetryCount + // needed because a successful check response might be in flight at the time of failure
leaderCheckTimeoutMillis));
}
leaderChecker.updateLeader(null);
}
Use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
From class CoordinatorTests, method testCannotJoinClusterWithDifferentUUID.
/**
 * Adds to one stabilised cluster a node built from the identity, persisted state and settings
 * of a node taken from a second, independently stabilised cluster, and expects the join to be
 * rejected: a "failed to join" INFO message must be logged by {@link JoinHelper} and the new
 * node must still be at applied cluster-state version 0.
 *
 * <p>Afterwards the node is closed and restarted through the {@code DetachClusterCommand}
 * transformations, swapped into the cluster in place of the original, and the cluster is
 * required to stabilise again.
 */
public void testCannotJoinClusterWithDifferentUUID() throws IllegalAccessException {
    try (Cluster targetCluster = new Cluster(randomIntBetween(1, 3))) {
        targetCluster.runRandomly();
        targetCluster.stabilise();

        // Pick a donor node from a separate three-node cluster; that cluster is closed
        // before the donor's state is reused below.
        final ClusterNode donor;
        try (Cluster otherCluster = new Cluster(3)) {
            otherCluster.runRandomly();
            otherCluster.stabilise();
            donor = randomFrom(otherCluster.clusterNodes);
        }

        final ClusterNode intruder = targetCluster.new ClusterNode(
            nextNodeIndex.getAndIncrement(),
            donor.getLocalNode(),
            n -> targetCluster.new MockPersistedState(n, donor.persistedState, Function.identity(), Function.identity()),
            donor.nodeSettings,
            () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info")
        );
        targetCluster.clusterNodes.add(intruder);

        try (MockLogAppender appender = MockLogAppender.createForLoggers(LogManager.getLogger(JoinHelper.class))) {
            final MockLogAppender.SeenEventExpectation joinFailureLogged = new MockLogAppender.SeenEventExpectation(
                "test1",
                JoinHelper.class.getCanonicalName(),
                Level.INFO,
                "*failed to join*"
            );
            appender.addExpectation(joinFailureLogged);
            targetCluster.runFor(DEFAULT_STABILISATION_TIME, "failing join validation");
            appender.assertAllExpectationsMatched();
        }
        // The rejected node must not have applied any cluster state.
        assertEquals(0, intruder.getLastAppliedClusterState().version());

        intruder.close();
        final ClusterNode detached = intruder.restartedNode(
            DetachClusterCommand::updateMetadata,
            term -> DetachClusterCommand.updateCurrentTerm(),
            intruder.nodeSettings
        );
        // Swap the closed node for its detached restart, leaving all other nodes as they are.
        targetCluster.clusterNodes.replaceAll(candidate -> {
            if (candidate == intruder) {
                return detached;
            }
            return candidate;
        });
        targetCluster.stabilise();
    }
}
Use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
From class JoinHelperTests, method testJoinDeduplication.
/**
 * Checks how {@link JoinHelper} de-duplicates outgoing join requests, using a capturing
 * transport to observe what is actually sent:
 * <ul>
 *   <li>a first join to each of two nodes is sent;</li>
 *   <li>repeating an identical join while the first is still outstanding sends nothing;</li>
 *   <li>once the outstanding join completes (successfully or with an error) the same join
 *       can be sent again;</li>
 *   <li>a join to the same node carrying a different optional {@code Join} is sent even
 *       while an earlier one is outstanding;</li>
 *   <li>{@code isJoinPending()} is true while any join is outstanding and false once all
 *       have completed.</li>
 * </ul>
 */
public void testJoinDeduplication() {
    final Settings queueSettings = Settings.builder().put(NODE_NAME_SETTING.getKey(), "node0").build();
    final DeterministicTaskQueue taskQueue = new DeterministicTaskQueue(queueSettings, random());
    final CapturingTransport transport = new CapturingTransport();
    final DiscoveryNode self = new DiscoveryNode("node0", buildNewFakeTransportAddress(), Version.CURRENT);
    final TransportService service = transport.createTransportService(
        Settings.EMPTY,
        taskQueue.getThreadPool(),
        TransportService.NOOP_TRANSPORT_INTERCEPTOR,
        x -> self,
        null,
        Collections.emptySet()
    );
    // The join and start-join handlers throw: this test never expects them to be invoked.
    final JoinHelper helper = new JoinHelper(
        Settings.EMPTY,
        null,
        null,
        service,
        () -> 0L,
        () -> null,
        (joinRequest, joinCallback) -> {
            throw new AssertionError();
        },
        startJoinRequest -> {
            throw new AssertionError();
        },
        Collections.emptyList(),
        (s, p, r) -> {
        },
        () -> new StatusInfo(HEALTHY, "info")
    );
    service.start();

    final DiscoveryNode peerOne = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Version.CURRENT);
    final DiscoveryNode peerTwo = new DiscoveryNode("node2", buildNewFakeTransportAddress(), Version.CURRENT);

    assertFalse(helper.isJoinPending());

    // A first join to node1 goes out on the wire.
    final Optional<Join> joinForPeerOne;
    if (randomBoolean()) {
        joinForPeerOne = Optional.empty();
    } else {
        joinForPeerOne = Optional.of(new Join(self, peerOne, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    helper.sendJoinRequest(peerOne, 0L, joinForPeerOne);
    final CapturedRequest[] sentToPeerOne = transport.getCapturedRequestsAndClear();
    assertThat(sentToPeerOne.length, equalTo(1));
    final CapturedRequest firstToPeerOne = sentToPeerOne[0];
    assertEquals(peerOne, firstToPeerOne.node);

    assertTrue(helper.isJoinPending());

    // A first join to node2 goes out as well.
    final Optional<Join> joinForPeerTwo;
    if (randomBoolean()) {
        joinForPeerTwo = Optional.empty();
    } else {
        joinForPeerTwo = Optional.of(new Join(self, peerTwo, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    helper.sendJoinRequest(peerTwo, 0L, joinForPeerTwo);
    final CapturedRequest[] sentToPeerTwo = transport.getCapturedRequestsAndClear();
    assertThat(sentToPeerTwo.length, equalTo(1));
    final CapturedRequest firstToPeerTwo = sentToPeerTwo[0];
    assertEquals(peerTwo, firstToPeerTwo.node);

    // Repeating the node1 join while the first one is still in flight sends nothing.
    helper.sendJoinRequest(peerOne, 0L, joinForPeerOne);
    assertThat(transport.getCapturedRequestsAndClear().length, equalTo(0));

    // Finish the in-flight node1 join, randomly as a success or as a rejection.
    if (randomBoolean()) {
        transport.handleResponse(firstToPeerOne.requestId, TransportResponse.Empty.INSTANCE);
    } else {
        transport.handleRemoteError(firstToPeerOne.requestId, new CoordinationStateRejectedException("dummy"));
    }

    // With the earlier join completed, the same join to node1 is sent again.
    helper.sendJoinRequest(peerOne, 0L, joinForPeerOne);
    final CapturedRequest[] resentToPeerOne = transport.getCapturedRequestsAndClear();
    assertThat(resentToPeerOne.length, equalTo(1));
    final CapturedRequest secondToPeerOne = resentToPeerOne[0];
    assertEquals(peerOne, secondToPeerOne.node);

    // A join to node2 with a different optional Join is sent despite the pending one.
    // Empty is only chosen when the first node2 join was present, so the two always differ
    // in presence or were built from separate random draws.
    final Optional<Join> otherJoinForPeerTwo;
    if (joinForPeerTwo.isPresent() && randomBoolean()) {
        otherJoinForPeerTwo = Optional.empty();
    } else {
        otherJoinForPeerTwo = Optional.of(new Join(self, peerTwo, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    helper.sendJoinRequest(peerTwo, 0L, otherJoinForPeerTwo);
    final CapturedRequest[] resentToPeerTwo = transport.getCapturedRequestsAndClear();
    assertThat(resentToPeerTwo.length, equalTo(1));
    final CapturedRequest secondToPeerTwo = resentToPeerTwo[0];
    assertEquals(peerTwo, secondToPeerTwo.node);

    // Fail every outstanding join and confirm nothing is left pending.
    assertTrue(helper.isJoinPending());
    transport.handleRemoteError(firstToPeerTwo.requestId, new CoordinationStateRejectedException("dummy"));
    transport.handleRemoteError(secondToPeerOne.requestId, new CoordinationStateRejectedException("dummy"));
    transport.handleRemoteError(secondToPeerTwo.requestId, new CoordinationStateRejectedException("dummy"));
    assertFalse(helper.isJoinPending());
}
Use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.
From class NodeJoinTests, method testBecomeFollowerFailsPendingJoin.
/**
 * Starts a join that cannot complete on its own, then delivers a follower check from another
 * node at the same, higher term and expects the pending join future to fail with a
 * {@link CoordinationStateRejectedException} whose message mentions "became follower".
 * The local node must not be the elected master at any point along the way.
 *
 * @throws Exception propagated from the test helpers
 */
public void testBecomeFollowerFailsPendingJoin() throws Exception {
    final DiscoveryNode joiner = newNode(0, true);
    final DiscoveryNode voter = newNode(1, true);
    final long startTerm = randomLongBetween(1, 10);
    final long startVersion = randomLongBetween(1, 10);
    // Only the second node is in the voting configuration of the initial state.
    setupFakeMasterServiceAndCoordinator(
        startTerm,
        initialState(joiner, startTerm, startVersion, VotingConfiguration.of(voter)),
        () -> new StatusInfo(HEALTHY, "healthy-info")
    );

    final long higherTerm = startTerm + randomLongBetween(1, 10);
    final Join selfVote = new Join(joiner, joiner, higherTerm, startTerm, startVersion);
    final JoinRequest selfJoin = new JoinRequest(joiner, higherTerm, Optional.of(selfVote));
    final SimpleFuture pendingJoin = joinNodeAsync(selfJoin);
    deterministicTaskQueue.runAllRunnableTasks();

    // The join is still outstanding and has not resulted in an election win.
    assertFalse(pendingJoin.isDone());
    assertFalse(isLocalNodeElectedMaster());

    handleFollowerCheckFrom(voter, higherTerm);
    assertFalse(isLocalNodeElectedMaster());

    final CoordinationStateRejectedException rejection = expectThrows(
        CoordinationStateRejectedException.class,
        () -> FutureUtils.get(pendingJoin)
    );
    assertThat(rejection.getMessage(), containsString("became follower"));
    assertFalse(isLocalNodeElectedMaster());
}
Aggregations