Search in sources :

Example 1 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class MinimumMasterNodesIT method testCanNotPublishWithoutMinMastNodes.

/**
 * Starts three nodes with minimum master nodes set to 2, cuts the elected master off from the
 * other two nodes while it is computing a cluster state update, and verifies that the update
 * fails with a {@code FailedToCommitClusterStateException}, that the master's own cluster state
 * eventually reports no master, and that after healing no node holds the rejected setting.
 */
public void testCanNotPublishWithoutMinMastNodes() throws Exception {
    // short ping and commit timeouts speed things up
    final Settings settings = Settings.builder()
        .put(ZenDiscovery.PING_TIMEOUT_SETTING.getKey(), "200ms")
        .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), 2)
        .put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), "100ms")
        .build();
    internalCluster().startNodes(3, settings);
    // ensure cluster state is recovered before we disrupt things
    ensureGreen();

    final String master = internalCluster().getMasterName();
    final Set<String> nonMasterNodes = new HashSet<>(Arrays.asList(internalCluster().getNodeNames()));
    nonMasterNodes.remove(master);

    // one side holds only the master, the other side holds every remaining node
    final TwoPartitions masterVersusRest = new TwoPartitions(Collections.singleton(master), nonMasterNodes);
    final NetworkDisruption partition = new NetworkDisruption(masterVersusRest, new NetworkDisruption.NetworkDisconnect());
    internalCluster().setDisruptionScheme(partition);

    final CountDownLatch taskFinished = new CountDownLatch(1);
    final AtomicReference<Exception> failure = new AtomicReference<>();
    logger.debug("--> submitting for cluster state to be rejected");
    final ClusterService masterClusterService = internalCluster().clusterService(master);
    masterClusterService.submitStateUpdateTask("test", new ClusterStateUpdateTask() {

        @Override
        public ClusterState execute(ClusterState currentState) throws Exception {
            logger.debug("--> starting the disruption, preventing cluster state publishing");
            // the partition starts from inside the task, i.e. before the new state is returned
            partition.startDisrupting();
            MetaData.Builder metaData = MetaData.builder(currentState.metaData());
            Settings markedSettings = Settings.builder()
                .put(currentState.metaData().persistentSettings())
                .put("_SHOULD_NOT_BE_THERE_", true)
                .build();
            metaData.persistentSettings(markedSettings);
            return ClusterState.builder(currentState).metaData(metaData).build();
        }

        @Override
        public void onFailure(String source, Exception e) {
            // record the failure first so it is visible once the latch is released
            failure.set(e);
            taskFinished.countDown();
        }

        @Override
        public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
            taskFinished.countDown();
        }
    });

    logger.debug("--> waiting for cluster state to be processed/rejected");
    taskFinished.await();
    assertThat(failure.get(), instanceOf(Discovery.FailedToCommitClusterStateException.class));

    // the master's own view of the cluster must eventually report no master
    final Runnable masterReportsNoMaster = new Runnable() {

        @Override
        public void run() {
            assertThat(masterClusterService.state().nodes().getMasterNode(), nullValue());
        }
    };
    assertBusy(masterReportsNoMaster);

    partition.stopDisrupting();
    logger.debug("--> waiting for cluster to heal");
    assertNoTimeout(client().admin().cluster().prepareHealth().setWaitForNodes("3").setWaitForEvents(Priority.LANGUID));

    // the marker setting must not be present in any node's cluster state
    for (String nodeName : internalCluster().getNodeNames()) {
        ClusterState nodeState = internalCluster().clusterService(nodeName).state();
        Settings nodeSettings = nodeState.metaData().settings();
        assertThat(nodeName + " processed the cluster state despite of a min master node violation", nodeSettings.get("_SHOULD_NOT_BE_THERE_"), nullValue());
    }
}
Also used : TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) AtomicReference(java.util.concurrent.atomic.AtomicReference) CountDownLatch(java.util.concurrent.CountDownLatch) ExecutionException(java.util.concurrent.ExecutionException) ClusterService(org.elasticsearch.cluster.service.ClusterService) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) Settings(org.elasticsearch.common.settings.Settings) DiscoverySettings(org.elasticsearch.discovery.DiscoverySettings) HashSet(java.util.HashSet)

Example 2 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testElectMasterWithLatestVersion.

/**
 * Forces a full election by isolating every node, then isolates the resulting master and creates
 * the index "test" through one of the remaining nodes. After a second forced election the test
 * fails if the index is missing from the cluster state.
 */
public void testElectMasterWithLatestVersion() throws Exception {
    configureCluster(3, null, 2);
    final Set<String> allNodes = new HashSet<>(internalCluster().startNodes(3));
    ensureStableCluster(3);

    final NetworkDisruption.IsolateAllNodes everyNodeAlone = new NetworkDisruption.IsolateAllNodes(allNodes);
    final ServiceDisruptionScheme isolateAllNodes = new NetworkDisruption(everyNodeAlone, new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election to make sure \"preferred\" master is elected");
    isolateAllNodes.startDisrupting();
    for (String isolated : allNodes) {
        assertNoMaster(isolated);
    }
    internalCluster().clearDisruptionScheme();
    ensureStableCluster(3);

    // the elected master must have an id that is not greater than any node's id
    final String preferredMasterName = internalCluster().getMasterName();
    final DiscoveryNode preferredMaster = internalCluster().clusterService(preferredMasterName).localNode();
    for (String nodeName : allNodes) {
        final DiscoveryNode current = internalCluster().clusterService(nodeName).localNode();
        assertThat(current.getId(), greaterThanOrEqualTo(preferredMaster.getId()));
    }
    logger.info("--> preferred master is {}", preferredMaster);

    final Set<String> nonPreferredNodes = new HashSet<>(allNodes);
    nonPreferredNodes.remove(preferredMasterName);
    final NetworkDisruption.TwoPartitions masterVersusRest =
        new NetworkDisruption.TwoPartitions(Collections.singleton(preferredMasterName), nonPreferredNodes);
    final ServiceDisruptionScheme isolatePreferredMaster = new NetworkDisruption(masterVersusRest, new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolatePreferredMaster);
    isolatePreferredMaster.startDisrupting();

    // create the index through a node that is on the other side of the partition
    final String coordinatingNode = randomFrom(nonPreferredNodes);
    assertAcked(client(coordinatingNode).admin().indices().prepareCreate("test")
        .setSettings(INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1, INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0));

    internalCluster().clearDisruptionScheme(false);
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election again");
    isolateAllNodes.startDisrupting();
    for (String isolated : allNodes) {
        assertNoMaster(isolated);
    }
    isolateAllNodes.stopDisrupting();

    final ClusterState state = client().admin().cluster().prepareState().get().getState();
    final boolean indexSurvived = state.metaData().hasIndex("test");
    if (!indexSurvived) {
        fail("index 'test' was lost. current cluster state: " + state);
    }
}
Also used : ClusterState(org.elasticsearch.cluster.ClusterState) DiscoveryNode(org.elasticsearch.cluster.node.DiscoveryNode) TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) ServiceDisruptionScheme(org.elasticsearch.test.disruption.ServiceDisruptionScheme) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) NetworkDisconnect(org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect) HashSet(java.util.HashSet)

Example 3 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testNodesFDAfterMasterReelection.

/**
 * Verifies that nodes fault detection still works after a master (re-)election: the current
 * master is stopped, minimum master nodes is lowered to 2, one non-master node is isolated, and
 * the test waits for a stable two node cluster as seen from the master.
 */
public void testNodesFDAfterMasterReelection() throws Exception {
    startCluster(4);

    logger.info("--> stopping current master");
    internalCluster().stopCurrentMasterNode();
    ensureStableCluster(3);

    logger.info("--> reducing min master nodes to 2");
    final Settings.Builder minMasterNodes = Settings.builder()
        .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), 2);
    assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(minMasterNodes).get());

    final String master = internalCluster().getMasterName();
    String nodeToIsolate = null;
    for (String candidate : internalCluster().getNodeNames()) {
        if (candidate.equals(master)) {
            continue;
        }
        // no early exit: the last non-master in iteration order is the one that gets isolated
        nodeToIsolate = candidate;
    }

    logger.info("--> isolating [{}]", nodeToIsolate);
    final TwoPartitions partitions = isolateNode(nodeToIsolate);
    final NetworkDisruption disruption = addRandomDisruptionType(partitions);
    disruption.startDisrupting();

    logger.info("--> waiting for master to remove it");
    ensureStableCluster(2, master);
}
Also used : TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption)

Example 4 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testUnicastSinglePingResponseContainsMaster.

/**
 * A 4 node cluster with m_m_n set to 3 where each node has one unicast endpoint. One node is
 * partitioned from the master node while the temporal unicast responses are empty. Once the
 * partition is healed, the single ping response contains a master node, and the rejoining node
 * should accept this master node and connect to it.
 */
public void testUnicastSinglePingResponseContainsMaster() throws Exception {
    final List<String> nodes = startCluster(4, -1, new int[] { 0 });

    // Figure out which node is the elected master
    final String masterNode = internalCluster().getMasterName();
    logger.info("---> legit elected master node={}", masterNode);

    final List<String> isolationCandidates = new ArrayList<>(nodes);
    isolationCandidates.remove(masterNode);
    // Don't isolate the node that is the unicast endpoint for all the other nodes.
    final String unicastEndpointNode = nodes.get(0);
    isolationCandidates.remove(unicastEndpointNode);
    final String isolatedNode = isolationCandidates.get(0);

    // Forcefully clear the temporal response lists. Otherwise the node in the unicast host list
    // includes all the other nodes that have pinged it and the issue doesn't manifest.
    final Discovery discovery = internalCluster().getInstance(Discovery.class);
    final ZenPing zenPing = ((TestZenDiscovery) discovery).getZenPing();
    if (zenPing instanceof UnicastZenPing) {
        final UnicastZenPing unicastPing = (UnicastZenPing) zenPing;
        unicastPing.clearTemporalResponses();
    }

    // Simulate a network issue between the isolated node and the elected master in both directions.
    final TwoPartitions masterVersusIsolated = new TwoPartitions(masterNode, isolatedNode);
    final NetworkDisruption networkDisconnect = new NetworkDisruption(masterVersusIsolated, new NetworkDisconnect());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();

    // Wait until the elected master has removed the isolated node...
    ensureStableCluster(3, masterNode);
    // The isolated node must report no master, so it starts pinging
    assertNoMaster(isolatedNode);

    networkDisconnect.stopDisrupting();
    // Wait until the master node sees all 4 nodes again.
    ensureStableCluster(4);

    // The elected master shouldn't have changed, since the isolated node could never have elected
    // itself as master: an m_m_n of 3 could never be satisfied.
    assertMaster(masterNode, nodes);
}
Also used : UnicastZenPing(org.elasticsearch.discovery.zen.UnicastZenPing) TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) ArrayList(java.util.ArrayList) ZenDiscovery(org.elasticsearch.discovery.zen.ZenDiscovery) TestZenDiscovery(org.elasticsearch.test.discovery.TestZenDiscovery) ZenPing(org.elasticsearch.discovery.zen.ZenPing) UnicastZenPing(org.elasticsearch.discovery.zen.UnicastZenPing) TestZenDiscovery(org.elasticsearch.test.discovery.TestZenDiscovery) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) NetworkDisconnect(org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect)

Example 5 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testFailWithMinimumMasterNodesConfigured.

/**
 * Test that no split brain occurs under a partial network partition.
 * See https://github.com/elastic/elasticsearch/issues/2488
 */
public void testFailWithMinimumMasterNodesConfigured() throws Exception {
    final List<String> nodes = startCluster(3);

    // Figure out which node is the elected master
    final String masterNode = internalCluster().getMasterName();
    logger.info("---> legit elected master node={}", masterNode);

    // Pick a node that isn't the elected master.
    final Set<String> masterExcluded = new HashSet<>(nodes);
    masterExcluded.remove(masterNode);
    final String[] candidates = masterExcluded.toArray(Strings.EMPTY_ARRAY);
    final String unluckyNode = randomFrom(candidates);

    // Simulate a network issue between the unlucky node and the elected master in both directions.
    final TwoPartitions masterVersusUnlucky = new TwoPartitions(masterNode, unluckyNode);
    final NetworkDisruption networkDisconnect = new NetworkDisruption(masterVersusUnlucky, new NetworkDisconnect());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();

    // Wait until the elected master has removed the unlucky node...
    ensureStableCluster(2, masterNode);

    // The unlucky node must report *no* master node, since it can't connect to the master, and in
    // fact it should continuously ping until the network failures have been resolved. However,
    // it may take a bit before the node detects it has been cut off from the elected master.
    assertNoMaster(unluckyNode);

    networkDisconnect.stopDisrupting();
    // Wait until the master node sees all 3 nodes again.
    ensureStableCluster(3);

    // The elected master shouldn't have changed, since the unlucky node could never have elected
    // itself as master: an m_m_n of 2 could never be satisfied.
    assertMaster(masterNode, nodes);
}
Also used : TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) NetworkDisconnect(org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect) HashSet(java.util.HashSet)

Aggregations

NetworkDisruption (org.elasticsearch.test.disruption.NetworkDisruption)27 TwoPartitions (org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions)17 NetworkDisconnect (org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect)14 ClusterState (org.elasticsearch.cluster.ClusterState)12 HashSet (java.util.HashSet)8 Test (org.junit.Test)8 ServiceDisruptionScheme (org.elasticsearch.test.disruption.ServiceDisruptionScheme)7 Settings (org.elasticsearch.common.settings.Settings)6 NetworkLinkDisruptionType (org.elasticsearch.test.disruption.NetworkDisruption.NetworkLinkDisruptionType)6 TestLogging (org.elasticsearch.test.junit.annotations.TestLogging)6 ArrayList (java.util.ArrayList)5 TimeValue (io.crate.common.unit.TimeValue)4 CountDownLatch (java.util.concurrent.CountDownLatch)4 AtomicReference (java.util.concurrent.atomic.AtomicReference)4 NetworkUnresponsive (org.elasticsearch.test.disruption.NetworkDisruption.NetworkUnresponsive)4 Collection (java.util.Collection)3 List (java.util.List)3 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)3 Semaphore (java.util.concurrent.Semaphore)3 TimeUnit (java.util.concurrent.TimeUnit)3