Search in sources :

Example 16 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testIsolateMasterAndVerifyClusterStateConsensus.

/**
 * Isolates the current master from the rest of the cluster, waits for the remaining nodes to form a stable
 * two-node cluster, heals the partition and then verifies that all nodes agree on the resulting cluster state.
 *
 * @throws Exception if any of the cluster operations or waits fails
 */
@TestLogging("_root:DEBUG,org.elasticsearch.cluster.service:TRACE,org.elasticsearch.gateway:TRACE,org.elasticsearch.indices.store:TRACE")
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
    final List<String> nodes = startCluster(3);
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2)).put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))));
    ensureGreen();
    String isolatedNode = internalCluster().getMasterName();
    TwoPartitions partitions = isolateNode(isolatedNode);
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    networkDisruption.startDisrupting();
    String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
    // make sure the cluster reforms on the majority side
    ensureStableCluster(2, nonIsolatedNode);
    // make sure the isolated node picks up on things
    assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));
    // heal the partition
    networkDisruption.stopDisrupting();
    for (String node : nodes) {
        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()), true, node);
    }
    logger.info("issue a reroute");
    // trigger a reroute now, instead of waiting for the background reroute of RerouteService
    assertAcked(client().admin().cluster().prepareReroute());
    // and wait for it to finish and for the cluster to stabilize
    ensureGreen("test");
    // verify all cluster states are the same; the first node's state is the reference for all others
    ClusterState state = null;
    for (String node : nodes) {
        ClusterState nodeState = getNodeClusterState(node);
        if (state == null) {
            state = nodeState;
            continue;
        }
        // assert nodes are identical
        try {
            assertEquals("unequal versions", state.version(), nodeState.version());
            assertEquals("unequal node count", state.nodes().getSize(), nodeState.nodes().getSize());
            assertEquals("different masters ", state.nodes().getMasterNodeId(), nodeState.nodes().getMasterNodeId());
            assertEquals("different meta data version", state.metaData().version(), nodeState.metaData().version());
            if (!state.routingTable().toString().equals(nodeState.routingTable().toString())) {
                fail("different routing");
            }
        } catch (AssertionError t) {
            // Re-raise with both full cluster states attached, keeping the original assertion as the cause
            // so its stack trace is not lost.
            throw new AssertionError("failed comparing cluster state: " + t.getMessage() + "\n" + "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state + "\n--- cluster state [" + node + "]: ---\n" + nodeState, t);
        }
    }
}
Also used : ClusterState(org.elasticsearch.cluster.ClusterState) TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) TimeValue(org.elasticsearch.common.unit.TimeValue) TestLogging(org.elasticsearch.test.junit.annotations.TestLogging)

Example 17 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testVerifyApiBlocksDuringPartition.

/**
 * Verifies that the proper block is applied when nodes lose their master.
 *
 * <p>A random two-way partition (one node vs. two nodes) is applied twice: first with the default no-master
 * block, asserting {@code NO_MASTER_BLOCK_WRITES} on the isolated node, and then again after setting
 * {@code NO_MASTER_BLOCK_SETTING} to {@code "all"}, asserting {@code NO_MASTER_BLOCK_ALL}.
 *
 * @throws Exception if any of the cluster operations or waits fails
 */
public void testVerifyApiBlocksDuringPartition() throws Exception {
    startCluster(3);
    // Makes sure that the get request can be executed on each node locally:
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)));
    // Everything is stable now, it is now time to simulate evil...
    // but first make sure we have no initializing shards and all is green
    // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
    ensureGreen("test");
    TwoPartitions partitions = TwoPartitions.random(random(), internalCluster().getNodeNames());
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    assertEquals(1, partitions.getMinoritySide().size());
    final String isolatedNode = partitions.getMinoritySide().iterator().next();
    assertEquals(2, partitions.getMajoritySide().size());
    final String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
    // Simulate a network issue between the unlucky node and the rest of the cluster.
    networkDisruption.startDisrupting();
    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However,
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(10));
    // The stability check below is issued via the non-isolated node, so that is the node reported in the log.
    logger.info("wait until elected master has been removed and a new 2 node cluster was formed (via [{}])", nonIsolatedNode);
    ensureStableCluster(2, nonIsolatedNode);
    for (String node : partitions.getMajoritySide()) {
        ClusterState nodeState = getNodeClusterState(node);
        // every node on the majority side must have a master and no global blocks
        final boolean hasMaster = nodeState.nodes().getMasterNode() != null;
        final boolean hasNoGlobalBlocks = nodeState.blocks().global().isEmpty();
        if (!hasMaster || !hasNoGlobalBlocks) {
            fail("node [" + node + "] has no master or has blocks, despite of being on the right side of the partition. State dump:\n" + nodeState);
        }
    }
    networkDisruption.stopDisrupting();
    // Wait until the master node sees all 3 nodes again.
    ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()));
    logger.info("Verify no master block with {} set to {}", DiscoverySettings.NO_MASTER_BLOCK_SETTING.getKey(), "all");
    client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder().put(DiscoverySettings.NO_MASTER_BLOCK_SETTING.getKey(), "all")).get();
    networkDisruption.startDisrupting();
    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However,
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(10));
    // make sure we have stable cluster & cross partition recoveries are canceled by the removal of the missing node
    // the unresponsive partition causes recoveries to only time out after 15m (default) and these will cause
    // the test to fail due to unfreed resources
    ensureStableCluster(2, nonIsolatedNode);
}
Also used : ClusterState(org.elasticsearch.cluster.ClusterState) TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) TimeValue(org.elasticsearch.common.unit.TimeValue)

Example 18 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method testIndicesDeleted.

/**
 * Checks that an index deletion is still applied when a master transition happens in between.
 * Regression test for https://github.com/elastic/elasticsearch/issues/11665
 *
 * @throws Exception if any of the cluster operations or waits fails
 */
public void testIndicesDeleted() throws Exception {
    final String idxName = "test";
    final Settings settings = Settings.builder()
        .put(DEFAULT_SETTINGS)
        // don't wait on isolated data node
        .put(DiscoverySettings.PUBLISH_TIMEOUT_SETTING.getKey(), "0s")
        // wait till cluster state is committed
        .put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), "30s")
        .build();
    configureCluster(settings, 3, null, 2);
    final List<String> masterEligibleNodes = internalCluster().startMasterOnlyNodes(2);
    final String dataNode = internalCluster().startDataOnlyNode();
    ensureStableCluster(3);
    assertAcked(prepareCreate(idxName));

    // Cut the current master off from the data node.
    final String electedMaster = internalCluster().getMasterName();
    final TwoPartitions masterVersusData = new TwoPartitions(electedMaster, dataNode);
    final NetworkDisruption disruption = new NetworkDisruption(masterVersusData, new NetworkUnresponsive());
    internalCluster().setDisruptionScheme(disruption);
    disruption.startDisrupting();

    // We know this will time out due to the partition, we check manually below to not proceed until
    // the delete has been applied to the master node and the master eligible node.
    internalCluster().client(electedMaster).admin().indices().prepareDelete(idxName).setTimeout("0s").get();

    // Don't restart the master node until we know the index deletion has taken effect on master and the master eligible node.
    assertBusy(() -> {
        for (String candidate : masterEligibleNodes) {
            final ClusterState candidateState = internalCluster().clusterService(candidate).state();
            assertFalse("index not deleted on " + candidate, candidateState.metaData().hasIndex(idxName));
        }
    });

    internalCluster().restartNode(electedMaster, InternalTestCluster.EMPTY_CALLBACK);
    ensureYellow();
    assertFalse(client().admin().indices().prepareExists(idxName).get().isExists());
}
Also used : ClusterState(org.elasticsearch.cluster.ClusterState) TwoPartitions(org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkUnresponsive(org.elasticsearch.test.disruption.NetworkDisruption.NetworkUnresponsive) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) Settings(org.elasticsearch.common.settings.Settings) IndexSettings(org.elasticsearch.index.IndexSettings)

Example 19 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.

the class DiscoveryWithServiceDisruptionsIT method addRandomDisruptionType.

/**
 * Builds a {@link NetworkDisruption} over the given partitions using a randomly chosen link disruption type
 * (either {@link NetworkUnresponsive} or {@link NetworkDisconnect}) and registers it via
 * {@code setDisruptionScheme}. The disruption is not started here.
 *
 * @param partitions the two sides of the partition to disrupt links between
 * @return the newly created and registered disruption
 */
protected NetworkDisruption addRandomDisruptionType(TwoPartitions partitions) {
    final NetworkLinkDisruptionType linkDisruption = randomBoolean() ? new NetworkUnresponsive() : new NetworkDisconnect();
    final NetworkDisruption disruption = new NetworkDisruption(partitions, linkDisruption);
    setDisruptionScheme(disruption);
    return disruption;
}
Also used : NetworkLinkDisruptionType(org.elasticsearch.test.disruption.NetworkDisruption.NetworkLinkDisruptionType) NetworkUnresponsive(org.elasticsearch.test.disruption.NetworkDisruption.NetworkUnresponsive) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) NetworkDisconnect(org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect)

Example 20 with NetworkDisruption

use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.

the class SysNodeResiliencyIntegrationTest method testTimingOutNode.

/**
 * Checks that a {@code sys.nodes} query falls back to the basic information available from the
 * cluster state when the request to the queried node times out.
 *
 * @throws Exception if query execution or cluster set-up/tear-down fails
 */
@Test
public void testTimingOutNode() throws Exception {
    // wait until no master cluster state tasks are pending, otherwise this test may fail due to master task timeouts
    waitNoPendingTasksOnAll();

    final String[] clusterNodes = internalCluster().getNodeNames();
    final String queryingNode = clusterNodes[0];
    final String unreachableNode = clusterNodes[1];

    // Make the link between the two nodes unresponsive.
    final NetworkDisruption.TwoPartitions split = new NetworkDisruption.TwoPartitions(queryingNode, unreachableNode);
    final NetworkDisruption disruption = new NetworkDisruption(split, new NetworkDisruption.NetworkUnresponsive());
    setDisruptionScheme(disruption);
    disruption.startDisrupting();

    try {
        final String statement = "select version['number'], hostname, id, name from sys.nodes where name = ?";
        execute(statement, new Object[] { unreachableNode }, createSessionOnNode(queryingNode));

        // Exactly one row is expected: version and hostname are null, id and name are still populated.
        assertThat(response.rowCount(), is(1L));
        assertThat(response.rows()[0][0], is(nullValue()));
        assertThat(response.rows()[0][1], is(nullValue()));
        assertThat(response.rows()[0][2], is(notNullValue()));
        assertThat(response.rows()[0][3], is(unreachableNode));
    } finally {
        // Always heal the network and drop the scheme so the disruption does not outlive this test.
        disruption.stopDisrupting();
        internalCluster().clearDisruptionScheme(true);
        waitNoPendingTasksOnAll();
    }
}
Also used : NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) Test(org.junit.Test)

Aggregations

NetworkDisruption (org.elasticsearch.test.disruption.NetworkDisruption) 27, TwoPartitions (org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions) 17, NetworkDisconnect (org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect) 14, ClusterState (org.elasticsearch.cluster.ClusterState) 12, HashSet (java.util.HashSet) 8, Test (org.junit.Test) 8, ServiceDisruptionScheme (org.elasticsearch.test.disruption.ServiceDisruptionScheme) 7, Settings (org.elasticsearch.common.settings.Settings) 6, NetworkLinkDisruptionType (org.elasticsearch.test.disruption.NetworkDisruption.NetworkLinkDisruptionType) 6, TestLogging (org.elasticsearch.test.junit.annotations.TestLogging) 6, ArrayList (java.util.ArrayList) 5, TimeValue (io.crate.common.unit.TimeValue) 4, CountDownLatch (java.util.concurrent.CountDownLatch) 4, AtomicReference (java.util.concurrent.atomic.AtomicReference) 4, NetworkUnresponsive (org.elasticsearch.test.disruption.NetworkDisruption.NetworkUnresponsive) 4, Collection (java.util.Collection) 3, List (java.util.List) 3, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap) 3, Semaphore (java.util.concurrent.Semaphore) 3, TimeUnit (java.util.concurrent.TimeUnit) 3