Search in sources :

Example 16 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class CoordinatorTests method testUnHealthyLeaderRemoved.

/**
 * Starts a cluster whose health status is supplied through a shared reference, adds three
 * nodes that always report HEALTHY, flips the shared status to UNHEALTHY and asserts that
 * the leader found afterwards is one of the three added nodes.
 */
public void testUnHealthyLeaderRemoved() {
    // Health status handed to the Cluster constructor; switched to UNHEALTHY further down.
    final AtomicReference<StatusInfo> sharedHealth = new AtomicReference<>(new StatusInfo(HEALTHY, "healthy-info"));
    try (Cluster cluster = new Cluster(randomIntBetween(1, 3), true, Settings.EMPTY, () -> sharedHealth.get())) {
        cluster.runRandomly();
        cluster.stabilise();
        final ClusterNode originalLeader = cluster.getAnyLeader();

        logger.info("--> adding three new healthy nodes");
        final ClusterNode healthyNodeA = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, originalLeader.nodeSettings, () -> new StatusInfo(HEALTHY, "healthy-info"));
        final ClusterNode healthyNodeB = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, originalLeader.nodeSettings, () -> new StatusInfo(HEALTHY, "healthy-info"));
        final ClusterNode healthyNodeC = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, originalLeader.nodeSettings, () -> new StatusInfo(HEALTHY, "healthy-info"));
        cluster.clusterNodes.add(healthyNodeA);
        cluster.clusterNodes.add(healthyNodeB);
        cluster.clusterNodes.add(healthyNodeC);

        final long joinStabilisationMillis =
            // The first pinging discovers the master
            defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING)
                // One message delay to send a join
                + DEFAULT_DELAY_VARIABILITY
                // followup reconfiguration
                + 3 * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY;
        cluster.stabilise(joinStabilisationMillis);

        logger.info("--> changing health status of leader {} to unhealthy", originalLeader);
        sharedHealth.set(new StatusInfo(UNHEALTHY, "unhealthy-info"));

        final long leaderReplacementMillis =
            // first wait for all the followers to notice the leader has gone
            (defaultMillis(LEADER_CHECK_INTERVAL_SETTING) + defaultMillis(LEADER_CHECK_TIMEOUT_SETTING))
                // then wait for a follower to be promoted to leader
                + DEFAULT_ELECTION_DELAY
                // and the first publication times out because of the unresponsive node
                + defaultMillis(PUBLISH_TIMEOUT_SETTING)
                // there might be a term bump causing another election
                + DEFAULT_ELECTION_DELAY
                // then wait for both of:
                + Math.max(
                    // 1. the term bumping publication to time out
                    defaultMillis(PUBLISH_TIMEOUT_SETTING),
                    // 2. the new leader to notice that the old leader is unresponsive
                    (defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING) + defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING)))
                // then wait for the new leader to commit a state without the old leader
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
                // then wait for the followup reconfiguration
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY;
        cluster.stabilise(leaderReplacementMillis);

        // The leader must now be one of the nodes that never reported UNHEALTHY.
        assertThat(
            cluster.getAnyLeader().getId(),
            anyOf(equalTo(healthyNodeA.getId()), equalTo(healthyNodeB.getId()), equalTo(healthyNodeC.getId())));
    }
}
Also used : ClusterNode(org.opensearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode) StatusInfo(org.opensearch.monitor.StatusInfo) AtomicReference(java.util.concurrent.atomic.AtomicReference)

Example 17 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class FollowersCheckerTests method testFailsNodeThatRejectsCheck.

/**
 * Runs testBehaviourOfFailingNode with a responder that always throws an
 * OpenSearchException, passing "followers check retry count exceeded" as the reason
 * argument and a health supplier that always reports HEALTHY.
 */
public void testFailsNodeThatRejectsCheck() {
    final Settings settings = randomSettings();
    // (configured retry count - 1) multiplied by the configured check interval in milliseconds
    final long expectedFailureMillis = (FOLLOWER_CHECK_RETRY_COUNT_SETTING.get(settings) - 1) * FOLLOWER_CHECK_INTERVAL_SETTING.get(settings).millis();
    testBehaviourOfFailingNode(
        settings,
        () -> {
            throw new OpenSearchException("simulated exception");
        },
        "followers check retry count exceeded",
        expectedFailureMillis,
        () -> new StatusInfo(HEALTHY, "healthy-info"));
}
Also used : StatusInfo(org.opensearch.monitor.StatusInfo) OpenSearchException(org.opensearch.OpenSearchException) Settings(org.opensearch.common.settings.Settings)

Example 18 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class FollowersCheckerTests method testResponder.

/**
 * Sends FOLLOWER_CHECK_ACTION_NAME requests to a transport service that a FollowersChecker
 * was constructed with, and checks for five scenarios whether the coordinator callback is
 * invoked and whether the request succeeds or fails.
 *
 * The scenarios run in order and share the calledCoordinator flag, the coordinatorException
 * holder and the checker's fast-response state, so each scenario resets what it changed.
 */
public void testResponder() {
    final DiscoveryNode leader = new DiscoveryNode("leader", buildNewFakeTransportAddress(), Version.CURRENT);
    final DiscoveryNode follower = new DiscoveryNode("follower", buildNewFakeTransportAddress(), Version.CURRENT);
    final Settings settings = Settings.builder().put(NODE_NAME_SETTING.getKey(), follower.getName()).build();
    final DeterministicTaskQueue deterministicTaskQueue = new DeterministicTaskQueue(settings, random());
    // Any outbound request through this transport fails the test.
    final MockTransport mockTransport = new MockTransport() {

        @Override
        protected void onSendRequest(long requestId, String action, TransportRequest request, DiscoveryNode node) {
            throw new AssertionError("no requests expected");
        }
    };
    // The transport service's local node is the follower.
    final TransportService transportService = mockTransport.createTransportService(settings, deterministicTaskQueue.getThreadPool(), TransportService.NOOP_TRANSPORT_INTERCEPTOR, boundTransportAddress -> follower, null, emptySet());
    transportService.start();
    transportService.acceptIncomingRequests();
    // Set by the coordinator callback below; scenarios that expect a call reset it afterwards.
    final AtomicBoolean calledCoordinator = new AtomicBoolean();
    // When non-null, the coordinator callback throws this exception.
    final AtomicReference<RuntimeException> coordinatorException = new AtomicReference<>();
    final FollowersChecker followersChecker = new FollowersChecker(settings, transportService, fcr -> {
        // Asserts the callback runs at most once between resets of the flag.
        assertTrue(calledCoordinator.compareAndSet(false, true));
        final RuntimeException exception = coordinatorException.get();
        if (exception != null) {
            throw exception;
        }
    }, (node, reason) -> {
        // This second callback is not expected to run in this test.
        assert false : node;
    }, () -> new StatusInfo(HEALTHY, "healthy-info"));
    {
        // Does not call into the coordinator in the normal case
        final long term = randomNonNegativeLong();
        followersChecker.updateFastResponseState(term, Mode.FOLLOWER);
        final ExpectsSuccess expectsSuccess = new ExpectsSuccess();
        transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(term, leader), expectsSuccess);
        deterministicTaskQueue.runAllTasks();
        assertTrue(expectsSuccess.succeeded());
        assertFalse(calledCoordinator.get());
    }
    {
        // Does not call into the coordinator for a term that's too low, just rejects immediately
        final long leaderTerm = randomLongBetween(1, Long.MAX_VALUE - 1);
        // followerTerm is drawn strictly greater than leaderTerm
        final long followerTerm = randomLongBetween(leaderTerm + 1, Long.MAX_VALUE);
        followersChecker.updateFastResponseState(followerTerm, Mode.FOLLOWER);
        final AtomicReference<TransportException> receivedException = new AtomicReference<>();
        transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(leaderTerm, leader), new TransportResponseHandler<TransportResponse.Empty>() {

            @Override
            public TransportResponse.Empty read(StreamInput in) {
                return TransportResponse.Empty.INSTANCE;
            }

            @Override
            public void handleResponse(TransportResponse.Empty response) {
                fail("unexpected success");
            }

            @Override
            public void handleException(TransportException exp) {
                assertThat(exp, not(nullValue()));
                // Records the failure and asserts it is delivered only once.
                assertTrue(receivedException.compareAndSet(null, exp));
            }

            @Override
            public String executor() {
                return Names.SAME;
            }
        });
        deterministicTaskQueue.runAllTasks();
        assertFalse(calledCoordinator.get());
        assertThat(receivedException.get(), not(nullValue()));
    }
    {
        // Calls into the coordinator if the term needs bumping
        final long leaderTerm = randomLongBetween(2, Long.MAX_VALUE);
        // followerTerm is drawn strictly less than leaderTerm
        final long followerTerm = randomLongBetween(1, leaderTerm - 1);
        followersChecker.updateFastResponseState(followerTerm, Mode.FOLLOWER);
        final ExpectsSuccess expectsSuccess = new ExpectsSuccess();
        transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(leaderTerm, leader), expectsSuccess);
        deterministicTaskQueue.runAllTasks();
        assertTrue(expectsSuccess.succeeded());
        assertTrue(calledCoordinator.get());
        // Reset so the next scenario's compareAndSet(false, true) can succeed.
        calledCoordinator.set(false);
    }
    {
        // Calls into the coordinator if not a follower
        final long term = randomNonNegativeLong();
        followersChecker.updateFastResponseState(term, randomFrom(Mode.LEADER, Mode.CANDIDATE));
        final ExpectsSuccess expectsSuccess = new ExpectsSuccess();
        transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(term, leader), expectsSuccess);
        deterministicTaskQueue.runAllTasks();
        assertTrue(expectsSuccess.succeeded());
        assertTrue(calledCoordinator.get());
        // Reset so the next scenario's compareAndSet(false, true) can succeed.
        calledCoordinator.set(false);
    }
    {
        // If it calls into the coordinator and the coordinator throws an exception then it's passed back to the caller
        final long term = randomNonNegativeLong();
        followersChecker.updateFastResponseState(term, randomFrom(Mode.LEADER, Mode.CANDIDATE));
        // The random suffix makes the message specific to this run for the root-cause check below.
        final String exceptionMessage = "test simulated exception " + randomNonNegativeLong();
        coordinatorException.set(new OpenSearchException(exceptionMessage));
        final AtomicReference<TransportException> receivedException = new AtomicReference<>();
        transportService.sendRequest(follower, FOLLOWER_CHECK_ACTION_NAME, new FollowerCheckRequest(term, leader), new TransportResponseHandler<TransportResponse.Empty>() {

            @Override
            public TransportResponse.Empty read(StreamInput in) {
                return TransportResponse.Empty.INSTANCE;
            }

            @Override
            public void handleResponse(TransportResponse.Empty response) {
                fail("unexpected success");
            }

            @Override
            public void handleException(TransportException exp) {
                assertThat(exp, not(nullValue()));
                // Records the failure and asserts it is delivered only once.
                assertTrue(receivedException.compareAndSet(null, exp));
            }

            @Override
            public String executor() {
                return Names.SAME;
            }
        });
        deterministicTaskQueue.runAllTasks();
        assertTrue(calledCoordinator.get());
        assertThat(receivedException.get(), not(nullValue()));
        // The root cause of the received failure must carry the coordinator's message.
        assertThat(receivedException.get().getRootCause().getMessage(), equalTo(exceptionMessage));
    }
}
Also used : DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) TransportRequest(org.opensearch.transport.TransportRequest) TransportResponseHandler(org.opensearch.transport.TransportResponseHandler) AtomicReference(java.util.concurrent.atomic.AtomicReference) FollowerCheckRequest(org.opensearch.cluster.coordination.FollowersChecker.FollowerCheckRequest) TransportResponse(org.opensearch.transport.TransportResponse) ConnectTransportException(org.opensearch.transport.ConnectTransportException) TransportException(org.opensearch.transport.TransportException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Empty(org.opensearch.transport.TransportResponse.Empty) TransportService(org.opensearch.transport.TransportService) StatusInfo(org.opensearch.monitor.StatusInfo) MockTransport(org.opensearch.test.transport.MockTransport) StreamInput(org.opensearch.common.io.stream.StreamInput) OpenSearchException(org.opensearch.OpenSearchException) Settings(org.opensearch.common.settings.Settings)

Example 19 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class JoinHelperTests method testJoinFailureOnUnhealthyNodes.

/**
 * Checks that JoinHelper.sendJoinRequest sends nothing over the transport while the
 * supplied health status is UNHEALTHY, and sends exactly one request to the target node
 * once the status has been switched to HEALTHY.
 */
public void testJoinFailureOnUnhealthyNodes() {
    final Settings queueSettings = Settings.builder().put(NODE_NAME_SETTING.getKey(), "node0").build();
    final DeterministicTaskQueue taskQueue = new DeterministicTaskQueue(queueSettings, random());
    final CapturingTransport transport = new CapturingTransport();
    final DiscoveryNode localNode = new DiscoveryNode("node0", buildNewFakeTransportAddress(), Version.CURRENT);
    final TransportService transportService = transport.createTransportService(
        Settings.EMPTY,
        taskQueue.getThreadPool(),
        TransportService.NOOP_TRANSPORT_INTERCEPTOR,
        x -> localNode,
        null,
        Collections.emptySet());

    // Starts out UNHEALTHY; switched to HEALTHY before the final join attempt.
    final AtomicReference<StatusInfo> localHealth = new AtomicReference<>(new StatusInfo(UNHEALTHY, "unhealthy-info"));
    final JoinHelper joinHelper = new JoinHelper(
        Settings.EMPTY,
        null,
        null,
        transportService,
        () -> 0L,
        () -> null,
        (joinRequest, joinCallback) -> {
            throw new AssertionError();
        },
        startJoinRequest -> {
            throw new AssertionError();
        },
        Collections.emptyList(),
        (s, p, r) -> {
        },
        () -> localHealth.get());
    transportService.start();

    final DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), Version.CURRENT);
    final DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), Version.CURRENT);
    assertFalse(joinHelper.isJoinPending());

    // check that sending a join to node1 doesn't work
    final Optional<Join> joinForNode1;
    if (randomBoolean()) {
        joinForNode1 = Optional.empty();
    } else {
        joinForNode1 = Optional.of(new Join(localNode, node1, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    joinHelper.sendJoinRequest(node1, randomNonNegativeLong(), joinForNode1);
    final CapturedRequest[] sentToNode1WhileUnhealthy = transport.getCapturedRequestsAndClear();
    assertThat(sentToNode1WhileUnhealthy.length, equalTo(0));
    assertFalse(joinHelper.isJoinPending());

    // check that sending a join to node2 doesn't work
    final Optional<Join> joinForNode2;
    if (randomBoolean()) {
        joinForNode2 = Optional.empty();
    } else {
        joinForNode2 = Optional.of(new Join(localNode, node2, randomNonNegativeLong(), randomNonNegativeLong(), randomNonNegativeLong()));
    }
    // NOTE(review): transportService was already started above, so this second start()
    // looks redundant - confirm before removing.
    transportService.start();
    joinHelper.sendJoinRequest(node2, randomNonNegativeLong(), joinForNode2);
    final CapturedRequest[] sentToNode2WhileUnhealthy = transport.getCapturedRequestsAndClear();
    assertThat(sentToNode2WhileUnhealthy.length, equalTo(0));
    assertFalse(joinHelper.isJoinPending());

    localHealth.set(new StatusInfo(HEALTHY, "healthy-info"));

    // check that sending another join to node1 now works again
    joinHelper.sendJoinRequest(node1, 0L, joinForNode1);
    final CapturedRequest[] sentAfterRecovery = transport.getCapturedRequestsAndClear();
    assertThat(sentAfterRecovery.length, equalTo(1));
    assertEquals(node1, sentAfterRecovery[0].node);
}
Also used : DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) CapturingTransport(org.opensearch.test.transport.CapturingTransport) AtomicReference(java.util.concurrent.atomic.AtomicReference) CapturedRequest(org.opensearch.test.transport.CapturingTransport.CapturedRequest) TransportService(org.opensearch.transport.TransportService) StatusInfo(org.opensearch.monitor.StatusInfo)

Example 20 with StatusInfo

use of org.opensearch.monitor.StatusInfo in project OpenSearch by opensearch-project.

the class ClusterFormationFailureHelperTests method testDescriptionOnMasterIneligibleNodes.

/**
 * Checks the text returned by ClusterFormationState.getDescription() for a local node that
 * is constructed with an empty role set, in three situations: nothing resolved or
 * discovered, one address resolved from the hosts providers, and one peer discovered.
 */
public void testDescriptionOnMasterIneligibleNodes() {
    final DiscoveryNode localNode = new DiscoveryNode("local", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);

    // Cluster state at version 12, accepted in term 4, containing only the local node.
    final CoordinationMetadata coordinationMetadata = CoordinationMetadata.builder().term(4L).build();
    final Metadata.Builder metadata = Metadata.builder().coordinationMetadata(coordinationMetadata);
    final DiscoveryNodes.Builder nodes = DiscoveryNodes.builder().add(localNode).localNodeId(localNode.getId());
    final ClusterState clusterState = ClusterState.builder(ClusterName.DEFAULT).version(12L).metadata(metadata).nodes(nodes).build();

    // Nothing resolved, nothing discovered.
    final String nothingFound = new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), emptyList(), 15L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription();
    assertThat(
        nothingFound,
        is("master not discovered yet: have discovered []; discovery will continue using [] from hosts providers " + "and [] from last-known cluster state; node term 15, last-accepted version 12 in term 4"));

    // One address resolved from the hosts providers.
    final TransportAddress otherAddress = buildNewFakeTransportAddress();
    final String addressResolved = new ClusterFormationState(Settings.EMPTY, clusterState, singletonList(otherAddress), emptyList(), 16L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription();
    assertThat(
        addressResolved,
        is("master not discovered yet: have discovered []; discovery will continue using [" + otherAddress + "] from hosts providers and [] from last-known cluster state; node term 16, last-accepted version 12 in term 4"));

    // One peer discovered.
    final DiscoveryNode otherNode = new DiscoveryNode("other", buildNewFakeTransportAddress(), Version.CURRENT);
    final String peerDiscovered = new ClusterFormationState(Settings.EMPTY, clusterState, emptyList(), singletonList(otherNode), 17L, electionStrategy, new StatusInfo(HEALTHY, "healthy-info")).getDescription();
    assertThat(
        peerDiscovered,
        is("master not discovered yet: have discovered [" + otherNode + "]; discovery will continue using [] from hosts providers " + "and [] from last-known cluster state; node term 17, last-accepted version 12 in term 4"));
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) StatusInfo(org.opensearch.monitor.StatusInfo) TransportAddress(org.opensearch.common.transport.TransportAddress) ClusterFormationState(org.opensearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState)

Aggregations

StatusInfo (org.opensearch.monitor.StatusInfo): 45; DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode): 33; Settings (org.opensearch.common.settings.Settings): 18; TransportService (org.opensearch.transport.TransportService): 14; ClusterState (org.opensearch.cluster.ClusterState): 11; AtomicReference (java.util.concurrent.atomic.AtomicReference): 10; TransportRequest (org.opensearch.transport.TransportRequest): 10; AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean): 9; MockTransport (org.opensearch.test.transport.MockTransport): 9; ClusterFormationState (org.opensearch.cluster.coordination.ClusterFormationFailureHelper.ClusterFormationState): 8; DiscoveryNodes (org.opensearch.cluster.node.DiscoveryNodes): 7; ConnectTransportException (org.opensearch.transport.ConnectTransportException): 6; TransportException (org.opensearch.transport.TransportException): 6; TransportResponse (org.opensearch.transport.TransportResponse): 6; Empty (org.opensearch.transport.TransportResponse.Empty): 6; HashSet (java.util.HashSet): 5; VotingConfiguration (org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration): 5; TransportAddress (org.opensearch.common.transport.TransportAddress): 5; CapturingTransport (org.opensearch.test.transport.CapturingTransport): 5; Set (java.util.Set): 4