Search in sources :

Example 41 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class FollowersCheckerTests method testFailsNodeThatDoesNotRespond.

/**
 * Runs the shared failing-node scenario with a response supplier that yields {@code null}
 * and expects the failure reason "followers check retry count exceeded".
 */
public void testFailsNodeThatDoesNotRespond() {
    final Settings settings = randomSettings();
    final long retryCount = FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings);
    final long intervalMillis = FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis();
    final long timeoutMillis = FOLLOWER_CHECK_TIMEOUT_SETTING.get(settings).millis();
    // expected time to failure: (retries - 1) intervals plus one full timeout per retry
    final long expectedFailureTime = (retryCount - 1) * intervalMillis + retryCount * timeoutMillis;
    testBehaviourOfFailingNode(
        settings,
        () -> null,
        "followers check retry count exceeded",
        expectedFailureTime,
        () -> new StatusInfo(HEALTHY, "healthy-info")
    );
}
Also used : StatusInfo(org.opensearch.monitor.StatusInfo) Settings(org.opensearch.common.settings.Settings)

Example 42 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class LeaderCheckerTests method testFollowerBehaviour.

/**
 * Drives a {@code LeaderChecker} against a mock transport on a deterministic task queue.
 * <p>
 * The test first checks that no leader failure is reported while the mock transport never
 * produces {@code leaderCheckRetryCount} consecutive failed checks, for two successive
 * leaders, and then makes every response fail and asserts that the failure callback fires
 * within a bounded amount of simulated time.
 */
public void testFollowerBehaviour() {
    final DiscoveryNode leader1 = new DiscoveryNode("leader-1", buildNewFakeTransportAddress(), Version.CURRENT);
    // the second leader is randomly either the very same node or a distinct one
    final DiscoveryNode leader2 = randomBoolean() ? leader1 : new DiscoveryNode("leader-2", buildNewFakeTransportAddress(), Version.CURRENT);
    final DiscoveryNode localNode = new DiscoveryNode("local-node", buildNewFakeTransportAddress(), Version.CURRENT);
    Settings.Builder settingsBuilder = Settings.builder().put(NODE_NAME_SETTING.getKey(), localNode.getId());
    // each of interval, timeout and retry count is randomly either overridden or left at the
    // value the setting yields for empty settings; the effective value is remembered locally
    final long leaderCheckIntervalMillis;
    if (randomBoolean()) {
        leaderCheckIntervalMillis = randomLongBetween(1000, 60000);
        settingsBuilder.put(LEADER_CHECK_INTERVAL_SETTING.getKey(), leaderCheckIntervalMillis + "ms");
    } else {
        leaderCheckIntervalMillis = LEADER_CHECK_INTERVAL_SETTING.get(Settings.EMPTY).millis();
    }
    final long leaderCheckTimeoutMillis;
    if (randomBoolean()) {
        leaderCheckTimeoutMillis = randomLongBetween(1, 60000);
        settingsBuilder.put(LEADER_CHECK_TIMEOUT_SETTING.getKey(), leaderCheckTimeoutMillis + "ms");
    } else {
        leaderCheckTimeoutMillis = LEADER_CHECK_TIMEOUT_SETTING.get(Settings.EMPTY).millis();
    }
    final int leaderCheckRetryCount;
    if (randomBoolean()) {
        leaderCheckRetryCount = randomIntBetween(1, 10);
        settingsBuilder.put(LEADER_CHECK_RETRY_COUNT_SETTING.getKey(), leaderCheckRetryCount);
    } else {
        leaderCheckRetryCount = LEADER_CHECK_RETRY_COUNT_SETTING.get(Settings.EMPTY);
    }
    // checkCount counts requests seen by the mock transport; allResponsesFail switches the
    // transport into "every response is an error" mode later in the test
    final AtomicLong checkCount = new AtomicLong();
    final AtomicBoolean allResponsesFail = new AtomicBoolean();
    final Settings settings = settingsBuilder.build();
    logger.info("--> using {}", settings);
    final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
    final MockTransport mockTransport = new MockTransport() {

        // number of requests in a row that were answered too late or with an error
        int consecutiveFailedRequestsCount;

        @Override
        protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
            // only leader checks addressed to one of the two leaders are expected
            assertThat(action, equalTo(LEADER_CHECK_ACTION_NAME));
            assertTrue(node.equals(leader1) || node.equals(leader2));
            super.onSendRequest(requestId, action, request, node);
            // once retryCount - 1 consecutive failures have accumulated, the next response must
            // succeed in time (unless allResponsesFail is set, which overrides success below)
            final boolean mustSucceed = leaderCheckRetryCount - 1 <= consecutiveFailedRequestsCount;
            // a forced success is delayed by strictly less than the timeout; otherwise the delay
            // may exceed the timeout by up to 60000ms
            final long responseDelay = randomLongBetween(0, leaderCheckTimeoutMillis + (mustSucceed ? -1 : 60000));
            final boolean successResponse = allResponsesFail.get() == false && (mustSucceed || randomBoolean());
            // a response that is late or an error counts as a failed check
            if (responseDelay >= leaderCheckTimeoutMillis || successResponse == false) {
                consecutiveFailedRequestsCount += 1;
            } else {
                consecutiveFailedRequestsCount = 0;
            }
            checkCount.incrementAndGet();
            // deliver the simulated response after responseDelay of simulated time
            deterministicTaskQueue.scheduleAt(deterministicTaskQueue.getCurrentTimeMillis() + responseDelay, new Runnable() {

                @Override
                public void run() {
                    if (successResponse) {
                        handleResponse(requestId, Empty.INSTANCE);
                    } else {
                        handleRemoteError(requestId, new OpenSearchException("simulated error"));
                    }
                }

                @Override
                public String toString() {
                    return (successResponse ? "successful" : "unsuccessful") + " response to request " + requestId;
                }
            });
        }
    };
    final TransportService transportService = mockTransport.createTransportService(settings, deterministicTaskQueue.getThreadPool(), NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> localNode, null, emptySet());
    transportService.start();
    transportService.acceptIncomingRequests();
    final AtomicBoolean leaderFailed = new AtomicBoolean();
    // the failure callback checks the message format and, via compareAndSet, that it is
    // invoked at most once
    final LeaderChecker leaderChecker = new LeaderChecker(settings, transportService, e -> {
        assertThat(e.getMessage(), matchesRegex("node \\[.*\\] failed \\[[1-9][0-9]*\\] consecutive checks"));
        assertTrue(leaderFailed.compareAndSet(false, true));
    }, () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info"));
    logger.info("--> creating first checker");
    leaderChecker.updateLeader(leader1);
    {
        // phase 1: let a random number of checks go out against leader1
        final long maxCheckCount = randomLongBetween(2, 1000);
        logger.info("--> checking that no failure is detected in {} checks", maxCheckCount);
        while (checkCount.get() < maxCheckCount) {
            deterministicTaskQueue.runAllRunnableTasks();
            deterministicTaskQueue.advanceTime();
        }
    }
    // clear the leader, drain everything still queued, and confirm no failure was reported
    leaderChecker.updateLeader(null);
    logger.info("--> running remaining tasks");
    deterministicTaskQueue.runAllTasks();
    assertFalse(leaderFailed.get());
    logger.info("--> creating second checker");
    leaderChecker.updateLeader(leader2);
    {
        // phase 2: repeat the no-failure run against leader2, then force every response to fail
        checkCount.set(0);
        final long maxCheckCount = randomLongBetween(2, 1000);
        logger.info("--> checking again that no failure is detected in {} checks", maxCheckCount);
        while (checkCount.get() < maxCheckCount) {
            deterministicTaskQueue.runAllRunnableTasks();
            deterministicTaskQueue.advanceTime();
        }
        deterministicTaskQueue.runAllRunnableTasks();
        final long failureTime = deterministicTaskQueue.getCurrentTimeMillis();
        allResponsesFail.set(true);
        logger.info("--> failing at {}ms", failureTime);
        // advance simulated time until the failure callback has fired
        while (leaderFailed.get() == false) {
            deterministicTaskQueue.advanceTime();
            deterministicTaskQueue.runAllRunnableTasks();
        }
        // upper bound on detection time: retryCount rounds of (interval + timeout), plus one
        // extra timeout as explained by the inline comment below
        assertThat(deterministicTaskQueue.getCurrentTimeMillis() - failureTime, lessThanOrEqualTo((leaderCheckIntervalMillis + leaderCheckTimeoutMillis) * leaderCheckRetryCount + // needed because a successful check response might be in flight at the time of failure
        leaderCheckTimeoutMillis));
    }
    leaderChecker.updateLeader(null);
}
Also used : DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) TransportRequest(org.opensearch.transport.TransportRequest) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AtomicLong(java.util.concurrent.atomic.AtomicLong) TransportService(org.opensearch.transport.TransportService) StatusInfo(org.opensearch.monitor.StatusInfo) MockTransport(org.opensearch.test.transport.MockTransport) OpenSearchException(org.opensearch.OpenSearchException) Settings(org.opensearch.common.settings.Settings)

Example 43 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class CoordinatorTests method testCannotJoinClusterWithDifferentUUID.

/**
 * Adds a node whose persisted state comes from a second cluster to the first cluster,
 * expects a "failed to join" INFO log from {@code JoinHelper} and an applied cluster state
 * version of 0 on that node, then restarts the node through the {@code DetachClusterCommand}
 * hooks and stabilises the first cluster again.
 */
public void testCannotJoinClusterWithDifferentUUID() throws IllegalAccessException {
    try (Cluster cluster1 = new Cluster(randomIntBetween(1, 3))) {
        cluster1.runRandomly();
        cluster1.stabilise();

        // take a random node out of a second, separately stabilised cluster
        final ClusterNode foreignNode;
        try (Cluster cluster2 = new Cluster(3)) {
            cluster2.runRandomly();
            cluster2.stabilise();
            foreignNode = randomFrom(cluster2.clusterNodes);
        }

        // re-create that node inside cluster1, carrying over its persisted state and settings
        final ClusterNode joiningNode = cluster1.new ClusterNode(
            nextNodeIndex.getAndIncrement(),
            foreignNode.getLocalNode(),
            n -> cluster1.new MockPersistedState(n, foreignNode.persistedState, Function.identity(), Function.identity()),
            foreignNode.nodeSettings,
            () -> new StatusInfo(StatusInfo.Status.HEALTHY, "healthy-info")
        );
        cluster1.clusterNodes.add(joiningNode);

        try (MockLogAppender logAppender = MockLogAppender.createForLoggers(LogManager.getLogger(JoinHelper.class))) {
            final MockLogAppender.SeenEventExpectation joinFailureLogged = new MockLogAppender.SeenEventExpectation(
                "test1",
                JoinHelper.class.getCanonicalName(),
                Level.INFO,
                "*failed to join*"
            );
            logAppender.addExpectation(joinFailureLogged);
            cluster1.runFor(DEFAULT_STABILISATION_TIME, "failing join validation");
            logAppender.assertAllExpectationsMatched();
        }
        assertEquals(0, joiningNode.getLastAppliedClusterState().version());

        // restart the node via the detach-cluster hooks and swap it in for the old instance
        joiningNode.close();
        final ClusterNode detachedNode = joiningNode.restartedNode(
            DetachClusterCommand::updateMetadata,
            term -> DetachClusterCommand.updateCurrentTerm(),
            joiningNode.nodeSettings
        );
        cluster1.clusterNodes.replaceAll(existing -> existing == joiningNode ? detachedNode : existing);
        cluster1.stabilise();
    }
}
Also used : ClusterNode(org.opensearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode) MockLogAppender(org.opensearch.test.MockLogAppender) StatusInfo(org.opensearch.monitor.StatusInfo)

Example 44 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class JoinHelperTests method testJoinDeduplication.

/**
 * Exercises {@code JoinHelper#sendJoinRequest} against a capturing transport: a repeated
 * join to the same target is not sent again while the earlier one is outstanding, it is
 * sent again once the earlier one completed, a join with a different optional join is sent,
 * and {@code isJoinPending()} goes back to {@code false} once every request was answered.
 */
public void testJoinDeduplication() {
    final Settings queueSettings = Settings.builder().put(NODE_NAME_SETTING.getKey(), "node0").build();
    final DeterministicTaskQueue taskQueue = new DeterministicTaskQueue(queueSettings, random());
    final CapturingTransport transport = new CapturingTransport();
    final DiscoveryNode localNode = new DiscoveryNode("node0", buildNewFakeTransportAddress(), Version.CURRENT);
    final TransportService transportService = transport.createTransportService(
        Settings.EMPTY,
        taskQueue.getThreadPool(),
        TransportService.NOOP_TRANSPORT_INTERCEPTOR,
        x -> localNode,
        null,
        Collections.emptySet()
    );
    // the join and start-join handlers must never be invoked by this test
    final JoinHelper joinHelper = new JoinHelper(Settings.EMPTY, null, null, transportService, () -> 0L, () -> null, (joinRequest, joinCallback) -> {
        throw new AssertionError();
    }, startJoinRequest -> {
        throw new AssertionError();
    }, Collections.emptyList(), (s, p, r) -> {
    }, () -> new StatusInfo(HEALTHY, "info"));
    transportService.start();

    final DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Version.CURRENT);
    final DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), Version.CURRENT);
    assertFalse(joinHelper.isJoinPending());

    // a first join to node1 goes out
    final Optional<Join> joinToNode1;
    if (randomBoolean()) {
        joinToNode1 = Optional.empty();
    } else {
        joinToNode1 = Optional.of(new Join(localNode, node1, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    joinHelper.sendJoinRequest(node1, 0L, joinToNode1);
    final CapturedRequest[] sentToNode1 = transport.getCapturedRequestsAndClear();
    assertThat(sentToNode1.length, equalTo(1));
    final CapturedRequest firstRequestToNode1 = sentToNode1[0];
    assertEquals(node1, firstRequestToNode1.node);
    assertTrue(joinHelper.isJoinPending());

    // a first join to node2 goes out as well
    final Optional<Join> joinToNode2;
    if (randomBoolean()) {
        joinToNode2 = Optional.empty();
    } else {
        joinToNode2 = Optional.of(new Join(localNode, node2, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    joinHelper.sendJoinRequest(node2, 0L, joinToNode2);
    final CapturedRequest[] sentToNode2 = transport.getCapturedRequestsAndClear();
    assertThat(sentToNode2.length, equalTo(1));
    final CapturedRequest firstRequestToNode2 = sentToNode2[0];
    assertEquals(node2, firstRequestToNode2.node);

    // repeating the join to node1 is a noop while the previous one is still in progress
    joinHelper.sendJoinRequest(node1, 0L, joinToNode1);
    assertThat(transport.getCapturedRequestsAndClear().length, equalTo(0));

    // finish the outstanding join to node1, randomly with an error or a success
    if (randomBoolean() == false) {
        transport.handleRemoteError(firstRequestToNode1.requestId, new CoordinationStateRejectedException("dummy"));
    } else {
        transport.handleResponse(firstRequestToNode1.requestId, TransportResponse.Empty.INSTANCE);
    }

    // now the same join to node1 is sent again
    joinHelper.sendJoinRequest(node1, 0L, joinToNode1);
    final CapturedRequest[] resentToNode1 = transport.getCapturedRequestsAndClear();
    assertThat(resentToNode1.length, equalTo(1));
    final CapturedRequest secondRequestToNode1 = resentToNode1[0];
    assertEquals(node1, secondRequestToNode1.node);

    // a join to node2 carrying a different optional join is sent despite the pending one
    final Optional<Join> changedJoinToNode2;
    if (joinToNode2.isPresent() && randomBoolean()) {
        changedJoinToNode2 = Optional.empty();
    } else {
        changedJoinToNode2 = Optional.of(new Join(localNode, node2, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    joinHelper.sendJoinRequest(node2, 0L, changedJoinToNode2);
    final CapturedRequest[] resentToNode2 = transport.getCapturedRequestsAndClear();
    assertThat(resentToNode2.length, equalTo(1));
    final CapturedRequest secondRequestToNode2 = resentToNode2[0];
    assertEquals(node2, secondRequestToNode2.node);

    // answer every outstanding join and verify that nothing is pending afterwards
    assertTrue(joinHelper.isJoinPending());
    transport.handleRemoteError(firstRequestToNode2.requestId, new CoordinationStateRejectedException("dummy"));
    transport.handleRemoteError(secondRequestToNode1.requestId, new CoordinationStateRejectedException("dummy"));
    transport.handleRemoteError(secondRequestToNode2.requestId, new CoordinationStateRejectedException("dummy"));
    assertFalse(joinHelper.isJoinPending());
}
Also used : DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) TransportService(org.opensearch.transport.TransportService) StatusInfo(org.opensearch.monitor.StatusInfo) CapturingTransport(org.opensearch.test.transport.CapturingTransport) CapturedRequest(org.opensearch.test.transport.CapturingTransport.CapturedRequest)

Example 45 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class NodeJoinTests method testBecomeFollowerFailsPendingJoin.

/**
 * Submits a join asynchronously, checks that it neither completes nor makes the local node
 * the elected master, then delivers a follower check from the other node and expects the
 * pending join to fail with a {@code CoordinationStateRejectedException} whose message
 * contains "became follower".
 */
public void testBecomeFollowerFailsPendingJoin() throws Exception {
    final DiscoveryNode firstNode = newNode(0, true);
    final DiscoveryNode secondNode = newNode(1, true);
    final long startTerm = randomLongBetween(1, 10);
    final long startVersion = randomLongBetween(1, 10);
    // the initial voting configuration consists of the second node only
    setupFakeMasterServiceAndCoordinator(
        startTerm,
        initialState(firstNode, startTerm, startVersion, VotingConfiguration.of(secondNode)),
        () -> new StatusInfo(HEALTHY, "healthy-info")
    );

    final long higherTerm = startTerm + randomLongBetween(1, 10);
    final Join selfJoin = new Join(firstNode, firstNode, higherTerm, startTerm, startVersion);
    final SimpleFuture pendingJoin = joinNodeAsync(new JoinRequest(firstNode, higherTerm, Optional.of(selfJoin)));
    deterministicTaskQueue.runAllRunnableTasks();
    assertFalse(pendingJoin.isDone());
    assertFalse(isLocalNodeElectedMaster());

    handleFollowerCheckFrom(secondNode, higherTerm);
    assertFalse(isLocalNodeElectedMaster());

    final CoordinationStateRejectedException rejection = expectThrows(
        CoordinationStateRejectedException.class,
        () -> FutureUtils.get(pendingJoin)
    );
    assertThat(rejection.getMessage(), containsString("became follower"));
    assertFalse(isLocalNodeElectedMaster());
}
Also used : DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) StatusInfo(org.opensearch.monitor.StatusInfo)

Aggregations

StatusInfo (org.opensearch.monitor.StatusInfo): 45, DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode): 33, Settings (org.opensearch.common.settings.Settings): 18, TransportService (org.opensearch.transport.TransportService): 14, ClusterState (org.opensearch.cluster.ClusterState): 11, AtomicReference (java.util.concurrent.atomic.AtomicReference): 10, TransportRequest (org.opensearch.transport.TransportRequest): 10, AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 9, MockTransport (org.opensearch.test.transport.MockTransport): 9, ClusterFormationState (org.opensearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState): 8, DiscoveryNodes (org.opensearch.cluster.node.DiscoveryNodes): 7, ConnectTransportException (org.opensearch.transport.ConnectTransportException): 6, TransportException (org.opensearch.transport.TransportException): 6, TransportResponse (org.opensearch.transport.TransportResponse): 6, Empty (org.opensearch.transport.TransportResponse.Empty): 6, HashSet (java.util.HashSet): 5, VotingConfiguration (org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration): 5, TransportAddress (org.opensearch.common.transport.TransportAddress): 5, CapturingTransport (org.opensearch.test.transport.CapturingTransport): 5, Set (java.util.Set): 4