Use of org.elasticsearch.cluster.coordination.AbstractCoordinatorTestCase.Cluster.ClusterNode in project crate by crate.
Class CoordinatorTests, method testLogsMessagesIfPublicationDelayed.
public void testLogsMessagesIfPublicationDelayed() throws IllegalAccessException {
    try (Cluster cluster = new Cluster(between(3, 5))) {
        cluster.runRandomly();
        cluster.stabilise();
        final ClusterNode brokenNode = cluster.getAnyNodeExcept(cluster.getAnyLeader());
        final MockLogAppender mockLogAppender = new MockLogAppender();
        try {
            mockLogAppender.start();
            Loggers.addAppender(LogManager.getLogger(Coordinator.CoordinatorPublication.class), mockLogAppender);
            Loggers.addAppender(LogManager.getLogger(LagDetector.class), mockLogAppender);
            mockLogAppender.addExpectation(new MockLogAppender.SeenEventExpectation(
                "publication info message",
                Coordinator.CoordinatorPublication.class.getCanonicalName(),
                Level.INFO,
                "after [*] publication of cluster state version [*] is still waiting for "
                    + brokenNode.getLocalNode() + " [" + Publication.PublicationTargetState.SENT_PUBLISH_REQUEST + ']'));
            mockLogAppender.addExpectation(new MockLogAppender.SeenEventExpectation(
                "publication warning",
                Coordinator.CoordinatorPublication.class.getCanonicalName(),
                Level.WARN,
                "after [*] publication of cluster state version [*] is still waiting for "
                    + brokenNode.getLocalNode() + " [" + Publication.PublicationTargetState.SENT_PUBLISH_REQUEST + ']'));
            mockLogAppender.addExpectation(new MockLogAppender.SeenEventExpectation(
                "lag warning",
                LagDetector.class.getCanonicalName(),
                Level.WARN,
                "node [" + brokenNode + "] is lagging at cluster state version [*], "
                    + "although publication of cluster state version [*] completed [*] ago"));

            // drop the publication messages to one node, but then restore connectivity so it
            // remains in the cluster and does not fail health checks
            brokenNode.blackhole();
            cluster.deterministicTaskQueue.scheduleAt(
                cluster.deterministicTaskQueue.getCurrentTimeMillis() + DEFAULT_CLUSTER_STATE_UPDATE_DELAY,
                new Runnable() {
                    @Override
                    public void run() {
                        brokenNode.heal();
                    }

                    @Override
                    public String toString() {
                        return "healing " + brokenNode;
                    }
                });

            cluster.getAnyLeader().submitValue(randomLong());
            cluster.runFor(defaultMillis(PUBLISH_TIMEOUT_SETTING)
                    + 2 * DEFAULT_DELAY_VARIABILITY
                    + defaultMillis(LagDetector.CLUSTER_FOLLOWER_LAG_TIMEOUT_SETTING),
                "waiting for messages to be emitted");

            mockLogAppender.assertAllExpectationsMatched();
        } finally {
            Loggers.removeAppender(LogManager.getLogger(Coordinator.CoordinatorPublication.class), mockLogAppender);
            Loggers.removeAppender(LogManager.getLogger(LagDetector.class), mockLogAppender);
            mockLogAppender.stop();
        }
    }
}
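The core of the test above is the attach/expect/assert/detach lifecycle of MockLogAppender. Below is a minimal sketch of that pattern in isolation, assuming the same Elasticsearch test utilities (MockLogAppender, Loggers) used above; the target logger and the expected message are placeholders, and the `*` characters are the same wildcard syntax the expectations above rely on.

MockLogAppender appender = new MockLogAppender();
Logger lagLogger = LogManager.getLogger(LagDetector.class);
try {
    appender.start();
    // route LagDetector log events into the mock appender
    Loggers.addAppender(lagLogger, appender);
    appender.addExpectation(new MockLogAppender.SeenEventExpectation(
        "lag warning",                          // label reported if the expectation is not met
        LagDetector.class.getCanonicalName(),   // logger the event must be emitted on
        Level.WARN,                             // required log level
        "node [*] is lagging at cluster state version [*], *"));
    // ... run the code that is expected to emit the log line ...
    appender.assertAllExpectationsMatched();
} finally {
    // detach before stopping so the appender does not leak into other tests
    Loggers.removeAppender(lagLogger, appender);
    appender.stop();
}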
Class CoordinatorTests, method testLeaderDisconnectionWithDisconnectEventDetectedQuickly.
public void testLeaderDisconnectionWithDisconnectEventDetectedQuickly() {
    try (Cluster cluster = new Cluster(randomIntBetween(3, 5))) {
        cluster.runRandomly();
        cluster.stabilise();
        final ClusterNode originalLeader = cluster.getAnyLeader();

        logger.info("--> disconnecting leader {}", originalLeader);
        originalLeader.disconnect();
        logger.info("--> followers get disconnect event for leader {}", originalLeader);
        cluster.getAllNodesExcept(originalLeader).forEach(cn -> cn.onDisconnectEventFrom(originalLeader));
        // turn leader into candidate, which stabilisation asserts at the end
        cluster.getAllNodesExcept(originalLeader).forEach(originalLeader::onDisconnectEventFrom);

        cluster.stabilise(
            // disconnect is scheduled
            DEFAULT_DELAY_VARIABILITY
                // then wait for a new election
                + DEFAULT_ELECTION_DELAY
                // wait for the removal to be committed
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
                // then wait for the followup reconfiguration
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY);

        assertThat(cluster.getAnyLeader().getId(), not(equalTo(originalLeader.getId())));
    }
}
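The two forEach calls above deliver the disconnect event in both directions: each follower learns that the leader is gone, and the old leader learns that every follower is gone, which turns it into a candidate. A small sketch of a hypothetical helper that factors this out, assuming the same Cluster/ClusterNode harness (the helper name is invented):

// Hypothetical helper, not part of AbstractCoordinatorTestCase: simulate the transport
// layer reporting the disconnection to both sides at once.
private void deliverDisconnectEvents(Cluster cluster, ClusterNode disconnected) {
    // surviving nodes learn that `disconnected` went away ...
    cluster.getAllNodesExcept(disconnected).forEach(cn -> cn.onDisconnectEventFrom(disconnected));
    // ... and `disconnected` learns that everyone else went away, dropping it to candidate
    cluster.getAllNodesExcept(disconnected).forEach(disconnected::onDisconnectEventFrom);
}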
Class CoordinatorTests, method testFollowerDisconnectionWithoutDisconnectEventDetectedQuickly.
public void testFollowerDisconnectionWithoutDisconnectEventDetectedQuickly() {
    try (Cluster cluster = new Cluster(randomIntBetween(3, 5))) {
        cluster.runRandomly();
        cluster.stabilise();
        final ClusterNode leader = cluster.getAnyLeader();
        final ClusterNode follower = cluster.getAnyNodeExcept(leader);

        logger.info("--> disconnecting follower {}", follower);
        follower.disconnect();

        cluster.stabilise(Math.max(
            // the leader may have just sent a follower check, which receives no response
            defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING)
                // wait for the leader to check the follower
                + defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING)
                // then wait for the exception response
                + DEFAULT_DELAY_VARIABILITY
                // then wait for the removal to be committed
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
                // then wait for the followup reconfiguration
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY,

            // ALSO the follower may have just sent a leader check, which receives no response
            defaultMillis(LEADER_CHECK_TIMEOUT_SETTING)
                // then wait for the follower to check the leader
                + defaultMillis(LEADER_CHECK_INTERVAL_SETTING)
                // then wait for the exception response, causing the follower to become a candidate
                + DEFAULT_DELAY_VARIABILITY));

        assertThat(cluster.getAnyLeader().getId(), equalTo(leader.getId()));
    }
}
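The stabilisation budget above is the maximum of two independent detection paths, and reads more easily when each path gets a name. This is only a readability sketch built from the constants already used in the test; the local variable names are invented.

// Path 1: the leader detects the unresponsive follower via follower checks,
// removes it from the cluster state, then reconfigures.
final long leaderRemovesFollowerMillis =
    defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING)      // a check may already be in flight
    + defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING)   // wait for the next scheduled follower check
    + DEFAULT_DELAY_VARIABILITY                        // delivery of the failure response
    + DEFAULT_CLUSTER_STATE_UPDATE_DELAY               // commit the node removal
    + DEFAULT_CLUSTER_STATE_UPDATE_DELAY;              // follow-up reconfiguration

// Path 2: the follower detects the unreachable leader via leader checks and becomes a candidate.
final long followerBecomesCandidateMillis =
    defaultMillis(LEADER_CHECK_TIMEOUT_SETTING)
    + defaultMillis(LEADER_CHECK_INTERVAL_SETTING)
    + DEFAULT_DELAY_VARIABILITY;

cluster.stabilise(Math.max(leaderRemovesFollowerMillis, followerBecomesCandidateMillis));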
Class CoordinatorTests, method testExpandsConfigurationWhenGrowingFromThreeToFiveNodesAndShrinksBackToThreeOnFailure.
public void testExpandsConfigurationWhenGrowingFromThreeToFiveNodesAndShrinksBackToThreeOnFailure() {
    try (Cluster cluster = new Cluster(3)) {
        cluster.runRandomly();
        cluster.stabilise();
        final ClusterNode leader = cluster.getAnyLeader();

        logger.info("setting auto-shrink reconfiguration to true");
        leader.submitSetAutoShrinkVotingConfiguration(true);
        cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
        assertTrue(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(leader.getLastAppliedClusterState().metadata().settings()));

        cluster.addNodesAndStabilise(2);
        {
            assertThat(leader.coordinator.getMode(), is(Mode.LEADER));
            final VotingConfiguration lastCommittedConfiguration =
                leader.getLastAppliedClusterState().getLastCommittedConfiguration();
            assertThat(lastCommittedConfiguration + " should be all nodes",
                lastCommittedConfiguration.getNodeIds(),
                equalTo(cluster.clusterNodes.stream().map(ClusterNode::getId).collect(Collectors.toSet())));
        }

        final ClusterNode disconnect1 = cluster.getAnyNode();
        final ClusterNode disconnect2 = cluster.getAnyNodeExcept(disconnect1);
        logger.info("--> disconnecting {} and {}", disconnect1, disconnect2);
        disconnect1.disconnect();
        disconnect2.disconnect();
        cluster.stabilise();
        {
            final ClusterNode newLeader = cluster.getAnyLeader();
            final VotingConfiguration lastCommittedConfiguration =
                newLeader.getLastAppliedClusterState().getLastCommittedConfiguration();
            assertThat(lastCommittedConfiguration + " should be 3 nodes",
                lastCommittedConfiguration.getNodeIds().size(), equalTo(3));
            assertFalse(lastCommittedConfiguration.getNodeIds().contains(disconnect1.getId()));
            assertFalse(lastCommittedConfiguration.getNodeIds().contains(disconnect2.getId()));
        }

        // we still tolerate the loss of one more node here
        final ClusterNode disconnect3 = cluster.getAnyNodeExcept(disconnect1, disconnect2);
        logger.info("--> disconnecting {}", disconnect3);
        disconnect3.disconnect();
        cluster.stabilise();
        {
            final ClusterNode newLeader = cluster.getAnyLeader();
            final VotingConfiguration lastCommittedConfiguration =
                newLeader.getLastAppliedClusterState().getLastCommittedConfiguration();
            assertThat(lastCommittedConfiguration + " should be 3 nodes",
                lastCommittedConfiguration.getNodeIds().size(), equalTo(3));
            assertFalse(lastCommittedConfiguration.getNodeIds().contains(disconnect1.getId()));
            assertFalse(lastCommittedConfiguration.getNodeIds().contains(disconnect2.getId()));
            assertTrue(lastCommittedConfiguration.getNodeIds().contains(disconnect3.getId()));
        }

        // however we do not tolerate the loss of yet another one
        final ClusterNode disconnect4 = cluster.getAnyNodeExcept(disconnect1, disconnect2, disconnect3);
        logger.info("--> disconnecting {}", disconnect4);
        disconnect4.disconnect();
        cluster.runFor(DEFAULT_STABILISATION_TIME, "allowing time for fault detection");
        for (final ClusterNode clusterNode : cluster.clusterNodes) {
            assertThat(clusterNode.getId() + " should be a candidate",
                clusterNode.coordinator.getMode(), equalTo(Mode.CANDIDATE));
        }

        // moreover we are still stuck even if two other nodes heal
        logger.info("--> healing {} and {}", disconnect1, disconnect2);
        disconnect1.heal();
        disconnect2.heal();
        cluster.runFor(DEFAULT_STABILISATION_TIME, "allowing time for fault detection");
        for (final ClusterNode clusterNode : cluster.clusterNodes) {
            assertThat(clusterNode.getId() + " should be a candidate",
                clusterNode.coordinator.getMode(), equalTo(Mode.CANDIDATE));
        }

        // we require another node to heal to recover
        final ClusterNode toHeal = randomBoolean() ? disconnect3 : disconnect4;
        logger.info("--> healing {}", toHeal);
        toHeal.heal();
        cluster.stabilise();
    }
}
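The same three assertions on the committed voting configuration appear twice in the test above. A hypothetical extraction, assuming the same harness (the helper name and signature are invented):

// Hypothetical helper: the committed voting configuration seen by the current leader
// must have the expected size and must not contain any of the given disconnected nodes.
private void assertCommittedConfigurationExcludes(Cluster cluster, int expectedSize, ClusterNode... excluded) {
    final VotingConfiguration config =
        cluster.getAnyLeader().getLastAppliedClusterState().getLastCommittedConfiguration();
    assertThat(config + " should be " + expectedSize + " nodes", config.getNodeIds().size(), equalTo(expectedSize));
    for (final ClusterNode node : excluded) {
        assertFalse(config + " should not contain " + node, config.getNodeIds().contains(node.getId()));
    }
}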
Class CoordinatorTests, method testCanUpdateClusterStateAfterStabilisation.
public void testCanUpdateClusterStateAfterStabilisation() {
    try (Cluster cluster = new Cluster(randomIntBetween(1, 5))) {
        cluster.runRandomly();
        cluster.stabilise();
        final ClusterNode leader = cluster.getAnyLeader();

        long finalValue = randomLong();
        logger.info("--> submitting value [{}] to [{}]", finalValue, leader);
        leader.submitValue(finalValue);
        cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);

        for (final ClusterNode clusterNode : cluster.clusterNodes) {
            final String nodeId = clusterNode.getId();
            final ClusterState appliedState = clusterNode.getLastAppliedClusterState();
            assertThat(nodeId + " has the applied value", value(appliedState), is(finalValue));
        }
    }
}
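The closing loop is the usual way these tests verify that a published value reached the applied cluster state on every node. A hypothetical helper capturing that check, assuming the same harness and the value(ClusterState) accessor used above:

// Hypothetical helper: every node must have applied a cluster state carrying the expected value.
private void assertValueAppliedOnAllNodes(Cluster cluster, long expectedValue) {
    for (final ClusterNode clusterNode : cluster.clusterNodes) {
        final ClusterState appliedState = clusterNode.getLastAppliedClusterState();
        assertThat(clusterNode.getId() + " has the applied value", value(appliedState), is(expectedValue));
    }
}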