use of org.elasticsearch.test.disruption.SingleNodeDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testMasterNodeGCs.
/**
* Test that cluster recovers from a long GC on master that causes other nodes to elect a new one
*/
public void testMasterNodeGCs() throws Exception {
List<String> nodes = startCluster(3, -1);
String oldMasterNode = internalCluster().getMasterName();
// a very long GC, but it's OK as we remove the disruption when it has had an effect
SingleNodeDisruption masterNodeDisruption = new IntermittentLongGCDisruption(random(), oldMasterNode, 100, 200, 30000, 60000);
internalCluster().setDisruptionScheme(masterNodeDisruption);
masterNodeDisruption.startDisrupting();
Set<String> oldNonMasterNodesSet = new HashSet<>(nodes);
oldNonMasterNodesSet.remove(oldMasterNode);
List<String> oldNonMasterNodes = new ArrayList<>(oldNonMasterNodesSet);
logger.info("waiting for nodes to de-elect master [{}]", oldMasterNode);
for (String node : oldNonMasterNodesSet) {
assertDifferentMaster(node, oldMasterNode);
}
logger.info("waiting for nodes to elect a new master");
ensureStableCluster(2, oldNonMasterNodes.get(0));
logger.info("waiting for any pinging to stop");
assertDiscoveryCompleted(oldNonMasterNodes);
// restore GC
masterNodeDisruption.stopDisrupting();
ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + masterNodeDisruption.expectedTimeToHeal().millis()), false, oldNonMasterNodes.get(0));
// make sure all nodes agree on master
String newMaster = internalCluster().getMasterName();
assertThat(newMaster, not(equalTo(oldMasterNode)));
assertMaster(newMaster, nodes);
}
use of org.elasticsearch.test.disruption.SingleNodeDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testStaleMasterNotHijackingMajority.
/**
* Tests that emulates a frozen elected master node that unfreezes and pushes his cluster state to other nodes
* that already are following another elected master node. These nodes should reject this cluster state and prevent
* them from following the stale master.
*/
@TestLogging("_root:DEBUG,org.elasticsearch.cluster.service:TRACE,org.elasticsearch.test.disruption:TRACE")
public void testStaleMasterNotHijackingMajority() throws Exception {
// 3 node cluster with unicast discovery and minimum_master_nodes set to 2:
final List<String> nodes = startCluster(3, 2);
// Save the current master node as old master node, because that node will get frozen
final String oldMasterNode = internalCluster().getMasterName();
for (String node : nodes) {
ensureStableCluster(3, node);
}
assertMaster(oldMasterNode, nodes);
// Simulating a painful gc by suspending all threads for a long time on the current elected master node.
SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(random(), oldMasterNode);
// Save the majority side
final List<String> majoritySide = new ArrayList<>(nodes);
majoritySide.remove(oldMasterNode);
// Keeps track of the previous and current master when a master node transition took place on each node on the majority side:
final Map<String, List<Tuple<String, String>>> masters = Collections.synchronizedMap(new HashMap<String, List<Tuple<String, String>>>());
for (final String node : majoritySide) {
masters.put(node, new ArrayList<Tuple<String, String>>());
internalCluster().getInstance(ClusterService.class, node).addListener(event -> {
DiscoveryNode previousMaster = event.previousState().nodes().getMasterNode();
DiscoveryNode currentMaster = event.state().nodes().getMasterNode();
if (!Objects.equals(previousMaster, currentMaster)) {
logger.info("node {} received new cluster state: {} \n and had previous cluster state: {}", node, event.state(), event.previousState());
String previousMasterNodeName = previousMaster != null ? previousMaster.getName() : null;
String currentMasterNodeName = currentMaster != null ? currentMaster.getName() : null;
masters.get(node).add(new Tuple<>(previousMasterNodeName, currentMasterNodeName));
}
});
}
final CountDownLatch oldMasterNodeSteppedDown = new CountDownLatch(1);
internalCluster().getInstance(ClusterService.class, oldMasterNode).addListener(event -> {
if (event.state().nodes().getMasterNodeId() == null) {
oldMasterNodeSteppedDown.countDown();
}
});
internalCluster().setDisruptionScheme(masterNodeDisruption);
logger.info("freezing node [{}]", oldMasterNode);
masterNodeDisruption.startDisrupting();
// Wait for the majority side to get stable
assertDifferentMaster(majoritySide.get(0), oldMasterNode);
assertDifferentMaster(majoritySide.get(1), oldMasterNode);
// the test is periodically tripping on the following assertion. To find out which threads are blocking the nodes from making
// progress we print a stack dump
boolean failed = true;
try {
assertDiscoveryCompleted(majoritySide);
failed = false;
} finally {
if (failed) {
logger.error("discovery failed to complete, probably caused by a blocked thread: {}", new HotThreads().busiestThreads(Integer.MAX_VALUE).ignoreIdleThreads(false).detect());
}
}
// The old master node is frozen, but here we submit a cluster state update task that doesn't get executed,
// but will be queued and once the old master node un-freezes it gets executed.
// The old master node will send this update + the cluster state where he is flagged as master to the other
// nodes that follow the new master. These nodes should ignore this update.
internalCluster().getInstance(ClusterService.class, oldMasterNode).submitStateUpdateTask("sneaky-update", new ClusterStateUpdateTask(Priority.IMMEDIATE) {
@Override
public ClusterState execute(ClusterState currentState) throws Exception {
return ClusterState.builder(currentState).build();
}
@Override
public void onFailure(String source, Exception e) {
logger.warn((Supplier<?>) () -> new ParameterizedMessage("failure [{}]", source), e);
}
});
// Save the new elected master node
final String newMasterNode = internalCluster().getMasterName(majoritySide.get(0));
logger.info("new detected master node [{}]", newMasterNode);
// Stop disruption
logger.info("Unfreeze node [{}]", oldMasterNode);
masterNodeDisruption.stopDisrupting();
oldMasterNodeSteppedDown.await(30, TimeUnit.SECONDS);
// Make sure that the end state is consistent on all nodes:
assertDiscoveryCompleted(nodes);
assertMaster(newMasterNode, nodes);
assertThat(masters.size(), equalTo(2));
for (Map.Entry<String, List<Tuple<String, String>>> entry : masters.entrySet()) {
String nodeName = entry.getKey();
List<Tuple<String, String>> recordedMasterTransition = entry.getValue();
assertThat("[" + nodeName + "] Each node should only record two master node transitions", recordedMasterTransition.size(), equalTo(2));
assertThat("[" + nodeName + "] First transition's previous master should be [null]", recordedMasterTransition.get(0).v1(), equalTo(oldMasterNode));
assertThat("[" + nodeName + "] First transition's current master should be [" + newMasterNode + "]", recordedMasterTransition.get(0).v2(), nullValue());
assertThat("[" + nodeName + "] Second transition's previous master should be [null]", recordedMasterTransition.get(1).v1(), nullValue());
assertThat("[" + nodeName + "] Second transition's current master should be [" + newMasterNode + "]", recordedMasterTransition.get(1).v2(), equalTo(newMasterNode));
}
}
use of org.elasticsearch.test.disruption.SingleNodeDisruption in project crate by crate.
the class MasterDisruptionIT method testMasterNodeGCs.
/**
* Test that cluster recovers from a long GC on master that causes other nodes to elect a new one
*/
@Test
public void testMasterNodeGCs() throws Exception {
List<String> nodes = startCluster(3);
String oldMasterNode = internalCluster().getMasterName();
// a very long GC, but it's OK as we remove the disruption when it has had an effect
SingleNodeDisruption masterNodeDisruption = new IntermittentLongGCDisruption(random(), oldMasterNode, 100, 200, 30000, 60000);
internalCluster().setDisruptionScheme(masterNodeDisruption);
masterNodeDisruption.startDisrupting();
Set<String> oldNonMasterNodesSet = new HashSet<>(nodes);
oldNonMasterNodesSet.remove(oldMasterNode);
List<String> oldNonMasterNodes = new ArrayList<>(oldNonMasterNodesSet);
logger.info("waiting for nodes to de-elect master [{}]", oldMasterNode);
for (String node : oldNonMasterNodesSet) {
assertDifferentMaster(node, oldMasterNode);
}
logger.info("waiting for nodes to elect a new master");
ensureStableCluster(2, oldNonMasterNodes.get(0));
// restore GC
masterNodeDisruption.stopDisrupting();
final TimeValue waitTime = new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + masterNodeDisruption.expectedTimeToHeal().millis());
ensureStableCluster(3, waitTime, false, oldNonMasterNodes.get(0));
// make sure all nodes agree on master
String newMaster = internalCluster().getMasterName();
assertThat(newMaster, not(equalTo(oldMasterNode)));
assertMaster(newMaster, nodes);
}
Aggregations