Search in sources:

Example 1 with SingleNodeDisruption

use of org.elasticsearch.test.disruption.SingleNodeDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testMasterNodeGCs.

/**
 * Checks that the cluster recovers when the elected master goes through long GC pauses
 * that make the remaining nodes elect a different master.
 */
public void testMasterNodeGCs() throws Exception {
    final List<String> allNodes = startCluster(3, -1);
    final String pausedMaster = internalCluster().getMasterName();

    // The pauses are very long, but that is fine: the disruption is lifted once it has had an effect.
    final SingleNodeDisruption gcDisruption = new IntermittentLongGCDisruption(random(), pausedMaster, 100, 200, 30000, 60000);
    internalCluster().setDisruptionScheme(gcDisruption);
    gcDisruption.startDisrupting();

    // Every node except the one that was master when the disruption started.
    final Set<String> survivorSet = new HashSet<>(allNodes);
    survivorSet.remove(pausedMaster);
    final List<String> survivors = new ArrayList<>(survivorSet);

    logger.info("waiting for nodes to de-elect master [{}]", pausedMaster);
    for (String survivor : survivors) {
        assertDifferentMaster(survivor, pausedMaster);
    }

    logger.info("waiting for nodes to elect a new master");
    final String observer = survivors.get(0);
    ensureStableCluster(2, observer);

    logger.info("waiting for any pinging to stop");
    assertDiscoveryCompleted(survivors);

    // Lift the GC disruption and wait for a three node cluster, as seen from one of the surviving nodes.
    gcDisruption.stopDisrupting();
    final long healingMillis = DISRUPTION_HEALING_OVERHEAD.millis() + gcDisruption.expectedTimeToHeal().millis();
    ensureStableCluster(3, new TimeValue(healingMillis), false, observer);

    // All nodes must agree on a master that is not the previously paused one.
    final String electedMaster = internalCluster().getMasterName();
    assertThat(electedMaster, not(equalTo(pausedMaster)));
    assertMaster(electedMaster, allNodes);
}
Also used : ArrayList(java.util.ArrayList) SingleNodeDisruption(org.elasticsearch.test.disruption.SingleNodeDisruption) IntermittentLongGCDisruption(org.elasticsearch.test.disruption.IntermittentLongGCDisruption) TimeValue(org.elasticsearch.common.unit.TimeValue) HashSet(java.util.HashSet)

Example 2 with SingleNodeDisruption

use of org.elasticsearch.test.disruption.SingleNodeDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testStaleMasterNotHijackingMajority.

/**
 * Emulates a frozen elected master node that unfreezes and pushes its cluster state to other nodes
 * that are already following another elected master node. These nodes should reject that cluster
 * state, which prevents them from following the stale master.
 *
 * @throws Exception if any cluster helper or assertion used by the test fails
 */
@TestLogging("_root:DEBUG,org.elasticsearch.cluster.service:TRACE,org.elasticsearch.test.disruption:TRACE")
public void testStaleMasterNotHijackingMajority() throws Exception {
    // 3 node cluster with unicast discovery and minimum_master_nodes set to 2:
    final List<String> nodes = startCluster(3, 2);
    // Save the current master node as old master node, because that node will get frozen
    final String oldMasterNode = internalCluster().getMasterName();
    for (String node : nodes) {
        ensureStableCluster(3, node);
    }
    assertMaster(oldMasterNode, nodes);
    // Simulating a painful gc by suspending all threads for a long time on the current elected master node.
    SingleNodeDisruption masterNodeDisruption = new LongGCDisruption(random(), oldMasterNode);
    // Save the majority side
    final List<String> majoritySide = new ArrayList<>(nodes);
    majoritySide.remove(oldMasterNode);
    // Keeps track of the previous and current master when a master node transition took place on each node on the majority side:
    final Map<String, List<Tuple<String, String>>> masters = Collections.synchronizedMap(new HashMap<>());
    for (final String node : majoritySide) {
        masters.put(node, new ArrayList<>());
        internalCluster().getInstance(ClusterService.class, node).addListener(event -> {
            DiscoveryNode previousMaster = event.previousState().nodes().getMasterNode();
            DiscoveryNode currentMaster = event.state().nodes().getMasterNode();
            // Only record cluster state changes in which the master actually changed.
            if (!Objects.equals(previousMaster, currentMaster)) {
                logger.info("node {} received new cluster state: {} \n and had previous cluster state: {}", node, event.state(), event.previousState());
                String previousMasterNodeName = previousMaster != null ? previousMaster.getName() : null;
                String currentMasterNodeName = currentMaster != null ? currentMaster.getName() : null;
                masters.get(node).add(new Tuple<>(previousMasterNodeName, currentMasterNodeName));
            }
        });
    }
    // Released once the old master observes a cluster state without a master.
    final CountDownLatch oldMasterNodeSteppedDown = new CountDownLatch(1);
    internalCluster().getInstance(ClusterService.class, oldMasterNode).addListener(event -> {
        if (event.state().nodes().getMasterNodeId() == null) {
            oldMasterNodeSteppedDown.countDown();
        }
    });
    internalCluster().setDisruptionScheme(masterNodeDisruption);
    logger.info("freezing node [{}]", oldMasterNode);
    masterNodeDisruption.startDisrupting();
    // Wait for the majority side to get stable
    for (String node : majoritySide) {
        assertDifferentMaster(node, oldMasterNode);
    }
    // the test is periodically tripping on the following assertion. To find out which threads are blocking the nodes from making
    // progress we print a stack dump
    boolean failed = true;
    try {
        assertDiscoveryCompleted(majoritySide);
        failed = false;
    } finally {
        if (failed) {
            logger.error("discovery failed to complete, probably caused by a blocked thread: {}", new HotThreads().busiestThreads(Integer.MAX_VALUE).ignoreIdleThreads(false).detect());
        }
    }
    // The old master node is frozen, but here we submit a cluster state update task that doesn't get executed,
    // but will be queued and once the old master node un-freezes it gets executed.
    // The old master node will send this update + the cluster state where it is flagged as master to the other
    // nodes that follow the new master. These nodes should ignore this update.
    internalCluster().getInstance(ClusterService.class, oldMasterNode).submitStateUpdateTask("sneaky-update", new ClusterStateUpdateTask(Priority.IMMEDIATE) {

        @Override
        public ClusterState execute(ClusterState currentState) throws Exception {
            return ClusterState.builder(currentState).build();
        }

        @Override
        public void onFailure(String source, Exception e) {
            logger.warn((Supplier<?>) () -> new ParameterizedMessage("failure [{}]", source), e);
        }
    });
    // Save the new elected master node
    final String newMasterNode = internalCluster().getMasterName(majoritySide.get(0));
    logger.info("new detected master node [{}]", newMasterNode);
    // Stop disruption
    logger.info("Unfreeze node [{}]", oldMasterNode);
    masterNodeDisruption.stopDisrupting();
    // NOTE(review): the await result is not checked, so a timeout here is silent and only the
    // assertions below decide the outcome - confirm this best-effort wait is intentional.
    oldMasterNodeSteppedDown.await(30, TimeUnit.SECONDS);
    // Make sure that the end state is consistent on all nodes:
    assertDiscoveryCompleted(nodes);
    assertMaster(newMasterNode, nodes);
    assertThat(masters.size(), equalTo(2));
    for (Map.Entry<String, List<Tuple<String, String>>> entry : masters.entrySet()) {
        String nodeName = entry.getKey();
        List<Tuple<String, String>> recordedMasterTransition = entry.getValue();
        assertThat("[" + nodeName + "] Each node should only record two master node transitions", recordedMasterTransition.size(), equalTo(2));
        // First transition: the old master is dropped, leaving the node without a master.
        assertThat("[" + nodeName + "] First transition's previous master should be [" + oldMasterNode + "]", recordedMasterTransition.get(0).v1(), equalTo(oldMasterNode));
        assertThat("[" + nodeName + "] First transition's current master should be [null]", recordedMasterTransition.get(0).v2(), nullValue());
        // Second transition: from no master to the newly elected master.
        assertThat("[" + nodeName + "] Second transition's previous master should be [null]", recordedMasterTransition.get(1).v1(), nullValue());
        assertThat("[" + nodeName + "] Second transition's current master should be [" + newMasterNode + "]", recordedMasterTransition.get(1).v2(), equalTo(newMasterNode));
    }
}
Also used : DiscoveryNode(org.elasticsearch.cluster.node.DiscoveryNode) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List) Supplier(org.apache.logging.log4j.util.Supplier) ClusterState(org.elasticsearch.cluster.ClusterState) HotThreads(org.elasticsearch.monitor.jvm.HotThreads) ClusterStateUpdateTask(org.elasticsearch.cluster.ClusterStateUpdateTask) CountDownLatch(java.util.concurrent.CountDownLatch) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) ElasticsearchException(org.elasticsearch.ElasticsearchException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) NoShardAvailableActionException(org.elasticsearch.action.NoShardAvailableActionException) ClusterService(org.elasticsearch.cluster.service.ClusterService) IntermittentLongGCDisruption(org.elasticsearch.test.disruption.IntermittentLongGCDisruption) LongGCDisruption(org.elasticsearch.test.disruption.LongGCDisruption) SingleNodeDisruption(org.elasticsearch.test.disruption.SingleNodeDisruption) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) Tuple(org.elasticsearch.common.collect.Tuple) TestLogging(org.elasticsearch.test.junit.annotations.TestLogging)

Example 3 with SingleNodeDisruption

use of org.elasticsearch.test.disruption.SingleNodeDisruption in project crate by crate.

the class MasterDisruptionIT method testMasterNodeGCs.

/**
 * Checks that the cluster recovers when the elected master goes through long GC pauses
 * that make the remaining nodes elect a different master.
 */
@Test
public void testMasterNodeGCs() throws Exception {
    final List<String> allNodes = startCluster(3);
    final String pausedMaster = internalCluster().getMasterName();

    // The pauses are very long, but that is fine: the disruption is lifted once it has had an effect.
    final SingleNodeDisruption gcDisruption = new IntermittentLongGCDisruption(random(), pausedMaster, 100, 200, 30000, 60000);
    internalCluster().setDisruptionScheme(gcDisruption);
    gcDisruption.startDisrupting();

    // Every node except the one that was master when the disruption started.
    final Set<String> survivorSet = new HashSet<>(allNodes);
    survivorSet.remove(pausedMaster);
    final List<String> survivors = new ArrayList<>(survivorSet);

    logger.info("waiting for nodes to de-elect master [{}]", pausedMaster);
    for (String survivor : survivors) {
        assertDifferentMaster(survivor, pausedMaster);
    }

    logger.info("waiting for nodes to elect a new master");
    final String observer = survivors.get(0);
    ensureStableCluster(2, observer);

    // Lift the GC disruption and wait for a three node cluster, as seen from one of the surviving nodes.
    gcDisruption.stopDisrupting();
    final long healingMillis = DISRUPTION_HEALING_OVERHEAD.millis() + gcDisruption.expectedTimeToHeal().millis();
    ensureStableCluster(3, new TimeValue(healingMillis), false, observer);

    // All nodes must agree on a master that is not the previously paused one.
    final String electedMaster = internalCluster().getMasterName();
    assertThat(electedMaster, not(equalTo(pausedMaster)));
    assertMaster(electedMaster, allNodes);
}
Also used : ArrayList(java.util.ArrayList) SingleNodeDisruption(org.elasticsearch.test.disruption.SingleNodeDisruption) IntermittentLongGCDisruption(org.elasticsearch.test.disruption.IntermittentLongGCDisruption) TimeValue(io.crate.common.unit.TimeValue) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

ArrayList (java.util.ArrayList)3 IntermittentLongGCDisruption (org.elasticsearch.test.disruption.IntermittentLongGCDisruption)3 SingleNodeDisruption (org.elasticsearch.test.disruption.SingleNodeDisruption)3 HashSet (java.util.HashSet)2 TimeValue (io.crate.common.unit.TimeValue)1 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 CountDownLatch (java.util.concurrent.CountDownLatch)1 ExecutionException (java.util.concurrent.ExecutionException)1 ParameterizedMessage (org.apache.logging.log4j.message.ParameterizedMessage)1 Supplier (org.apache.logging.log4j.util.Supplier)1 CorruptIndexException (org.apache.lucene.index.CorruptIndexException)1 ElasticsearchException (org.elasticsearch.ElasticsearchException)1 NoShardAvailableActionException (org.elasticsearch.action.NoShardAvailableActionException)1 ClusterState (org.elasticsearch.cluster.ClusterState)1 ClusterStateUpdateTask (org.elasticsearch.cluster.ClusterStateUpdateTask)1 DiscoveryNode (org.elasticsearch.cluster.node.DiscoveryNode)1