Search in sources :

Example 11 with ClusterNode

use of org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode in project crate by crate.

the class CoordinatorTests method testLogsMessagesIfPublicationDelayed.

/**
 * Checks that a publication which is held up by one unresponsive node is reported in the logs: an INFO and a
 * WARN message logged under {@code Coordinator.CoordinatorPublication}, and a WARN message logged under
 * {@code LagDetector}.
 *
 * @throws IllegalAccessException declared by the test; presumably raised while setting up the mock appender
 */
public void testLogsMessagesIfPublicationDelayed() throws IllegalAccessException {
    try (Cluster cluster = new Cluster(between(3, 5))) {
        cluster.runRandomly();
        cluster.stabilise();
        final ClusterNode brokenNode = cluster.getAnyNodeExcept(cluster.getAnyLeader());
        final MockLogAppender mockLogAppender = new MockLogAppender();
        try {
            mockLogAppender.start();
            Loggers.addAppender(LogManager.getLogger(Coordinator.CoordinatorPublication.class), mockLogAppender);
            Loggers.addAppender(LogManager.getLogger(LagDetector.class), mockLogAppender);

            // the INFO and WARN publication messages share the same logger name and the same message pattern
            final String publicationLoggerName = Coordinator.CoordinatorPublication.class.getCanonicalName();
            final String stillWaitingPattern = "after [*] publication of cluster state version [*] is still waiting for "
                + brokenNode.getLocalNode() + " [" + Publication.PublicationTargetState.SENT_PUBLISH_REQUEST + ']';
            final String laggingPattern = "node [" + brokenNode
                + "] is lagging at cluster state version [*], although publication of cluster state version [*] completed [*] ago";

            mockLogAppender.addExpectation(new MockLogAppender.SeenEventExpectation(
                "publication info message", publicationLoggerName, Level.INFO, stillWaitingPattern));
            mockLogAppender.addExpectation(new MockLogAppender.SeenEventExpectation(
                "publication warning", publicationLoggerName, Level.WARN, stillWaitingPattern));
            mockLogAppender.addExpectation(new MockLogAppender.SeenEventExpectation(
                "lag warning", LagDetector.class.getCanonicalName(), Level.WARN, laggingPattern));

            // drop the publication messages to one node, but then restore connectivity so it remains in the
            // cluster and does not fail health checks
            brokenNode.blackhole();
            final long healTimeMillis = cluster.deterministicTaskQueue.getCurrentTimeMillis() + DEFAULT_CLUSTER_STATE_UPDATE_DELAY;
            // an anonymous class rather than a lambda, because the task also overrides toString()
            final Runnable healBrokenNode = new Runnable() {

                @Override
                public void run() {
                    brokenNode.heal();
                }

                @Override
                public String toString() {
                    return "healing " + brokenNode;
                }
            };
            cluster.deterministicTaskQueue.scheduleAt(healTimeMillis, healBrokenNode);

            cluster.getAnyLeader().submitValue(randomLong());
            final long runDurationMillis = defaultMillis(PUBLISH_TIMEOUT_SETTING)
                + 2 * DEFAULT_DELAY_VARIABILITY
                + defaultMillis(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING);
            cluster.runFor(runDurationMillis, "waiting for messages to be emitted");
            mockLogAppender.assertAllExpectationsMatched();
        } finally {
            // detach the appender from both loggers before stopping it
            Loggers.removeAppender(LogManager.getLogger(Coordinator.CoordinatorPublication.class), mockLogAppender);
            Loggers.removeAppender(LogManager.getLogger(LagDetector.class), mockLogAppender);
            mockLogAppender.stop();
        }
    }
}
Also used : ClusterNode(org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode) MockLogAppender(org.elasticsearch.test.MockLogAppender) Matchers.containsString(org.hamcrest.Matchers.containsString)

Example 12 with ClusterNode

use of org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode in project crate by crate.

the class CoordinatorTests method testLeaderDisconnectionWithDisconnectEventDetectedQuickly.

/**
 * Disconnects the current leader, delivers disconnect events in both directions between the leader and every
 * other node, and then checks that the cluster stabilises within the expected bound under a different leader.
 */
public void testLeaderDisconnectionWithDisconnectEventDetectedQuickly() {
    try (Cluster cluster = new Cluster(randomIntBetween(3, 5))) {
        cluster.runRandomly();
        cluster.stabilise();

        final ClusterNode originalLeader = cluster.getAnyLeader();
        logger.info("--> disconnecting leader {}", originalLeader);
        originalLeader.disconnect();

        logger.info("--> followers get disconnect event for leader {} ", originalLeader);
        cluster.getAllNodesExcept(originalLeader).forEach(follower -> follower.onDisconnectEventFrom(originalLeader));
        // turn leader into candidate, which stabilisation asserts at the end
        cluster.getAllNodesExcept(originalLeader).forEach(follower -> originalLeader.onDisconnectEventFrom(follower));

        final long stabilisationTimeMillis =
            // disconnect is scheduled
            DEFAULT_DELAY_VARIABILITY
            // then wait for a new election
            + DEFAULT_ELECTION_DELAY
            // wait for the removal to be committed
            + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
            // then wait for the followup reconfiguration
            + DEFAULT_CLUSTER_STATE_UPDATE_DELAY;
        cluster.stabilise(stabilisationTimeMillis);

        // leadership must have moved away from the disconnected node
        assertThat(cluster.getAnyLeader().getId(), not(equalTo(originalLeader.getId())));
    }
}
Also used : ClusterNode(org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode)

Example 13 with ClusterNode

use of org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode in project crate by crate.

the class CoordinatorTests method testFollowerDisconnectionWithoutDisconnectEventDetectedQuickly.

/**
 * Disconnects one follower without delivering any disconnect event, then checks that the cluster stabilises
 * within the larger of the leader-side and follower-side detection bounds and that the leader is unchanged.
 */
public void testFollowerDisconnectionWithoutDisconnectEventDetectedQuickly() {
    try (Cluster cluster = new Cluster(randomIntBetween(3, 5))) {
        cluster.runRandomly();
        cluster.stabilise();

        final ClusterNode leader = cluster.getAnyLeader();
        final ClusterNode follower = cluster.getAnyNodeExcept(leader);
        logger.info("--> disconnecting follower {}", follower);
        follower.disconnect();

        final long leaderSideMillis =
            // the leader may have just sent a follower check, which receives no response
            defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING)
            // wait for the leader to check the follower
            + defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING)
            // then wait for the exception response
            + DEFAULT_DELAY_VARIABILITY
            // then wait for the removal to be committed
            + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
            // then wait for the followup reconfiguration
            + DEFAULT_CLUSTER_STATE_UPDATE_DELAY;

        final long followerSideMillis =
            // ALSO the follower may have just sent a leader check, which receives no response
            defaultMillis(LEADER_CHECK_TIMEOUT_SETTING)
            // then wait for the follower to check the leader
            + defaultMillis(LEADER_CHECK_INTERVAL_SETTING)
            // then wait for the exception response, causing the follower to become a candidate
            + DEFAULT_DELAY_VARIABILITY;

        cluster.stabilise(Math.max(leaderSideMillis, followerSideMillis));

        // losing a follower must not change who the leader is
        assertThat(cluster.getAnyLeader().getId(), equalTo(leader.getId()));
    }
}
Also used : ClusterNode(org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode)

Example 14 with ClusterNode

use of org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode in project crate by crate.

the class CoordinatorTests method testExpandsConfigurationWhenGrowingFromThreeToFiveNodesAndShrinksBackToThreeOnFailure.

/**
 * Grows a three-node cluster to five nodes with auto-shrink enabled and checks that the committed voting
 * configuration covers all nodes. It then disconnects nodes one at a time, asserting the committed
 * configuration after the second and third loss, asserting that every node is a candidate after the fourth
 * loss and again after the first two nodes heal, and finally heals one more node and stabilises.
 */
public void testExpandsConfigurationWhenGrowingFromThreeToFiveNodesAndShrinksBackToThreeOnFailure() {
    try (Cluster cluster = new Cluster(3)) {
        cluster.runRandomly();
        cluster.stabilise();

        final ClusterNode leader = cluster.getAnyLeader();
        logger.info("setting auto-shrink reconfiguration to true");
        leader.submitSetAutoShrinkVotingConfiguration(true);
        cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
        assertTrue(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(leader.getLastAppliedClusterState().metadata().settings()));

        // after growing, the original leader still leads and the committed configuration holds every node
        cluster.addNodesAndStabilise(2);
        assertThat(leader.coordinator.getMode(), is(Mode.LEADER));
        final VotingConfiguration fullConfiguration = leader.getLastAppliedClusterState().getLastCommittedConfiguration();
        assertThat(
            fullConfiguration + " should be all nodes",
            fullConfiguration.getNodeIds(),
            equalTo(cluster.clusterNodes.stream().map(node -> node.getId()).collect(Collectors.toSet())));

        final ClusterNode disconnect1 = cluster.getAnyNode();
        final ClusterNode disconnect2 = cluster.getAnyNodeExcept(disconnect1);
        logger.info("--> disconnecting {} and {}", disconnect1, disconnect2);
        disconnect1.disconnect();
        disconnect2.disconnect();
        cluster.stabilise();

        // with two nodes gone the committed configuration is three nodes, excluding both disconnected ones
        final VotingConfiguration configAfterTwoLosses =
            cluster.getAnyLeader().getLastAppliedClusterState().getLastCommittedConfiguration();
        assertThat(configAfterTwoLosses + " should be 3 nodes", configAfterTwoLosses.getNodeIds().size(), equalTo(3));
        assertFalse(configAfterTwoLosses.getNodeIds().contains(disconnect1.getId()));
        assertFalse(configAfterTwoLosses.getNodeIds().contains(disconnect2.getId()));

        // we still tolerate the loss of one more node here
        final ClusterNode disconnect3 = cluster.getAnyNodeExcept(disconnect1, disconnect2);
        logger.info("--> disconnecting {}", disconnect3);
        disconnect3.disconnect();
        cluster.stabilise();

        // the configuration stays at three nodes and still contains the node that was just disconnected
        final VotingConfiguration configAfterThreeLosses =
            cluster.getAnyLeader().getLastAppliedClusterState().getLastCommittedConfiguration();
        assertThat(configAfterThreeLosses + " should be 3 nodes", configAfterThreeLosses.getNodeIds().size(), equalTo(3));
        assertFalse(configAfterThreeLosses.getNodeIds().contains(disconnect1.getId()));
        assertFalse(configAfterThreeLosses.getNodeIds().contains(disconnect2.getId()));
        assertTrue(configAfterThreeLosses.getNodeIds().contains(disconnect3.getId()));

        // however we do not tolerate the loss of yet another one
        final ClusterNode disconnect4 = cluster.getAnyNodeExcept(disconnect1, disconnect2, disconnect3);
        logger.info("--> disconnecting {}", disconnect4);
        disconnect4.disconnect();
        cluster.runFor(DEFAULT_STABILISATION_TIME, "allowing time for fault detection");
        cluster.clusterNodes.forEach(node ->
            assertThat(node.getId() + " should be a candidate", node.coordinator.getMode(), equalTo(Mode.CANDIDATE)));

        // moreover we are still stuck even if two other nodes heal
        logger.info("--> healing {} and {}", disconnect1, disconnect2);
        disconnect1.heal();
        disconnect2.heal();
        cluster.runFor(DEFAULT_STABILISATION_TIME, "allowing time for fault detection");
        cluster.clusterNodes.forEach(node ->
            assertThat(node.getId() + " should be a candidate", node.coordinator.getMode(), equalTo(Mode.CANDIDATE)));

        // we require another node to heal to recover
        final ClusterNode toHeal;
        if (randomBoolean()) {
            toHeal = disconnect3;
        } else {
            toHeal = disconnect4;
        }
        logger.info("--> healing {}", toHeal);
        toHeal.heal();
        cluster.stabilise();
    }
}
Also used : ClusterNode(org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode) VotingConfiguration(org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfiguration)

Example 15 with ClusterNode

use of org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode in project crate by crate.

the class CoordinatorTests method testCanUpdateClusterStateAfterStabilisation.

/**
 * Submits a random value to the leader of a stabilised cluster of one to five nodes and checks that, after
 * stabilising again, the last applied cluster state of every node carries that value.
 */
public void testCanUpdateClusterStateAfterStabilisation() {
    try (Cluster cluster = new Cluster(randomIntBetween(1, 5))) {
        cluster.runRandomly();
        cluster.stabilise();

        final ClusterNode leader = cluster.getAnyLeader();
        final long submittedValue = randomLong();
        logger.info("--> submitting value [{}] to [{}]", submittedValue, leader);
        leader.submitValue(submittedValue);
        cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);

        // every node, not just the leader, must have applied the submitted value
        cluster.clusterNodes.forEach(node ->
            assertThat(node.getId() + " has the applied value", value(node.getLastAppliedClusterState()), is(submittedValue)));
    }
}
Also used : ClusterNode(org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode) ClusterState(org.elasticsearch.cluster.ClusterState) Matchers.containsString(org.hamcrest.Matchers.containsString)

Aggregations

ClusterNode (org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode)35 Matchers.containsString (org.hamcrest.Matchers.containsString)8 VotingConfiguration (org.elasticsearch.cluster.coordination.CoordinationMetadata.VotingConfiguration)5 MockLogAppender (org.elasticsearch.test.MockLogAppender)5 HashSet (java.util.HashSet)4 ClusterState (org.elasticsearch.cluster.ClusterState)4 List (java.util.List)3 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)3 Logger (org.apache.logging.log4j.Logger)3 LogEvent (org.apache.logging.log4j.core.LogEvent)3 ElasticsearchException (org.elasticsearch.ElasticsearchException)3 IOException (java.io.IOException)2 Arrays (java.util.Arrays)2 Collections (java.util.Collections)2 Map (java.util.Map)2 Set (java.util.Set)2 Function (java.util.function.Function)2 Collectors (java.util.stream.Collectors)2 Level (org.apache.logging.log4j.Level)2 LogManager (org.apache.logging.log4j.LogManager)2