Example use of org.elasticsearch.test.disruption.NetworkDisruption in the project elasticsearch by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method testIsolateMasterAndVerifyClusterStateConsensus.
/**
 * Isolates the current master from the rest of the cluster, waits for the remaining nodes to form a stable
 * two-node cluster, restores the partition and verifies that all nodes agree on the resulting cluster state.
 *
 * @throws Exception if any of the cluster helpers used by the test fails
 */
@TestLogging("_root:DEBUG,org.elasticsearch.cluster.service:TRACE,org.elasticsearch.gateway:TRACE,org.elasticsearch.indices.store:TRACE")
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
    final List<String> nodes = startCluster(3);

    assertAcked(prepareCreate("test").setSettings(Settings.builder()
        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))));
    ensureGreen();

    // cut the current master off from the other nodes
    String isolatedNode = internalCluster().getMasterName();
    TwoPartitions partitions = isolateNode(isolatedNode);
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    networkDisruption.startDisrupting();

    String nonIsolatedNode = partitions.getMajoritySide().iterator().next();

    // make sure cluster reforms
    ensureStableCluster(2, nonIsolatedNode);

    // make sure the isolated node picks up on things
    assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));

    // restore isolation
    networkDisruption.stopDisrupting();

    // every node, checked individually, must see the full three-node cluster again
    for (String node : nodes) {
        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()), true, node);
    }

    logger.info("issue a reroute");
    // trigger a reroute now, instead of waiting for the background reroute of RerouteService
    assertAcked(client().admin().cluster().prepareReroute());
    // and wait for it to finish and for the cluster to stabilize
    ensureGreen("test");

    // verify all cluster states are the same; the first node's state serves as the reference
    ClusterState state = null;
    for (String node : nodes) {
        ClusterState nodeState = getNodeClusterState(node);
        if (state == null) {
            state = nodeState;
            continue;
        }
        // assert nodes are identical
        try {
            assertEquals("unequal versions", state.version(), nodeState.version());
            assertEquals("unequal node count", state.nodes().getSize(), nodeState.nodes().getSize());
            assertEquals("different masters ", state.nodes().getMasterNodeId(), nodeState.nodes().getMasterNodeId());
            assertEquals("different meta data version", state.metaData().version(), nodeState.metaData().version());
            if (!state.routingTable().toString().equals(nodeState.routingTable().toString())) {
                fail("different routing");
            }
        } catch (AssertionError t) {
            // Re-throw with both full cluster states attached, keeping the original assertion as the cause
            // so that its stack trace is not lost.
            throw new AssertionError("failed comparing cluster state: " + t.getMessage() + "\n"
                + "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state
                + "\n--- cluster state [" + node + "]: ---\n" + nodeState, t);
        }
    }
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the project elasticsearch by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method testVerifyApiBlocksDuringPartition.
/**
 * Verify that the proper block is applied when nodes lose their master.
 * <p>
 * The check is done twice: first expecting {@code DiscoverySettings.NO_MASTER_BLOCK_WRITES} on the isolated node,
 * and then, after setting {@code DiscoverySettings.NO_MASTER_BLOCK_SETTING} to {@code "all"}, expecting
 * {@code DiscoverySettings.NO_MASTER_BLOCK_ALL}.
 *
 * @throws Exception if any of the cluster helpers used by the test fails
 */
public void testVerifyApiBlocksDuringPartition() throws Exception {
    startCluster(3);

    // Makes sure that the get request can be executed on each node locally:
    assertAcked(prepareCreate("test").setSettings(Settings.builder()
        .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)));

    // Everything is stable now, it is now time to simulate evil...
    // but first make sure we have no initializing shards and all is green
    // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
    ensureGreen("test");

    TwoPartitions partitions = TwoPartitions.random(random(), internalCluster().getNodeNames());
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);

    assertEquals(1, partitions.getMinoritySide().size());
    final String isolatedNode = partitions.getMinoritySide().iterator().next();
    assertEquals(2, partitions.getMajoritySide().size());
    final String nonIsolatedNode = partitions.getMajoritySide().iterator().next();

    // Simulate a network issue between the unlucky node and the rest of the cluster.
    networkDisruption.startDisrupting();

    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(10));

    // The stability check below is issued via the non-isolated node, so that is the node to report in the log.
    logger.info("wait until elected master has been removed and a new 2 node cluster was from (via [{}])", nonIsolatedNode);
    ensureStableCluster(2, nonIsolatedNode);

    // Nodes on the majority side must have a master and must not carry any global block.
    for (String node : partitions.getMajoritySide()) {
        ClusterState nodeState = getNodeClusterState(node);
        boolean hasMaster = nodeState.nodes().getMasterNode() != null;
        boolean hasNoGlobalBlocks = nodeState.blocks().global().isEmpty();
        if (!hasMaster || !hasNoGlobalBlocks) {
            fail("node [" + node + "] has no master or has blocks, despite of being on the right side of the partition. State dump:\n" + nodeState);
        }
    }

    networkDisruption.stopDisrupting();

    // Wait until the master node sees all 3 nodes again.
    ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()));

    logger.info("Verify no master block with {} set to {}", DiscoverySettings.NO_MASTER_BLOCK_SETTING.getKey(), "all");
    client().admin().cluster().prepareUpdateSettings()
        .setTransientSettings(Settings.builder().put(DiscoverySettings.NO_MASTER_BLOCK_SETTING.getKey(), "all"))
        .get();

    networkDisruption.startDisrupting();

    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(10));

    // make sure we have stable cluster & cross partition recoveries are canceled by the removal of the missing node
    // the unresponsive partition causes recoveries to only time out after 15m (default) and these will cause
    // the test to fail due to unfreed resources
    ensureStableCluster(2, nonIsolatedNode);
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the project elasticsearch by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method testIndicesDeleted.
/**
 * Checks that an index deletion still takes effect when a master transition happens in between.
 * Regression test for https://github.com/elastic/elasticsearch/issues/11665
 *
 * @throws Exception if any of the cluster helpers used by the test fails
 */
public void testIndicesDeleted() throws Exception {
    final String idxName = "test";

    final Settings.Builder settingsBuilder = Settings.builder().put(DEFAULT_SETTINGS);
    // don't wait on isolated data node
    settingsBuilder.put(DiscoverySettings.PUBLISH_TIMEOUT_SETTING.getKey(), "0s");
    // wait till cluster state is committed
    settingsBuilder.put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), "30s");
    final Settings settings = settingsBuilder.build();

    configureCluster(settings, 3, null, 2);
    final List<String> allMasterEligibleNodes = internalCluster().startMasterOnlyNodes(2);
    final String dataNode = internalCluster().startDataOnlyNode();
    ensureStableCluster(3);
    assertAcked(prepareCreate(idxName));

    // Partition the elected master from the data node with an unresponsive link.
    final String masterNode1 = internalCluster().getMasterName();
    final TwoPartitions masterVersusData = new TwoPartitions(masterNode1, dataNode);
    final NetworkDisruption partition = new NetworkDisruption(masterVersusData, new NetworkUnresponsive());
    internalCluster().setDisruptionScheme(partition);
    partition.startDisrupting();

    // We know this will time out due to the partition, we check manually below to not proceed until
    // the delete has been applied to the master node and the master eligible node.
    internalCluster().client(masterNode1).admin().indices().prepareDelete(idxName).setTimeout("0s").get();

    // Don't restart the master node until we know the index deletion has taken effect on master and the master eligible node.
    assertBusy(() -> {
        for (String masterNode : allMasterEligibleNodes) {
            final ClusterState masterState = internalCluster().clusterService(masterNode).state();
            final boolean stillPresent = masterState.metaData().hasIndex(idxName);
            assertFalse("index not deleted on " + masterNode, stillPresent);
        }
    });

    internalCluster().restartNode(masterNode1, InternalTestCluster.EMPTY_CALLBACK);
    ensureYellow();
    assertFalse(client().admin().indices().prepareExists(idxName).get().isExists());
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the project elasticsearch by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method addRandomDisruptionType.
/**
 * Builds a {@link NetworkDisruption} over the given partitions using a randomly chosen link disruption type
 * (either {@link NetworkUnresponsive} or {@link NetworkDisconnect}), registers it via
 * {@code setDisruptionScheme} and returns it. The disruption is not started here.
 *
 * @param partitions the two sides of the network partition
 * @return the registered disruption scheme
 */
protected NetworkDisruption addRandomDisruptionType(TwoPartitions partitions) {
    final NetworkLinkDisruptionType linkType = randomBoolean() ? new NetworkUnresponsive() : new NetworkDisconnect();
    final NetworkDisruption scheme = new NetworkDisruption(partitions, linkType);
    setDisruptionScheme(scheme);
    return scheme;
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the project crate by crate.
From the class SysNodeResiliencyIntegrationTest, method testTimingOutNode.
/**
 * Verifies that a {@code sys.nodes} query still returns a row for a node whose request times out:
 * {@code version['number']} and {@code hostname} are expected to be null, while {@code id} and
 * {@code name} are still populated.
 *
 * @throws Exception if any of the cluster helpers used by the test fails
 */
@Test
public void testTimingOutNode() throws Exception {
    // wait until no master cluster state tasks are pending, otherwise this test may fail due to master task timeouts
    waitNoPendingTasksOnAll();

    final String[] clusterNodes = internalCluster().getNodeNames();
    final String n1 = clusterNodes[0];
    final String n2 = clusterNodes[1];

    // Make the link between the first two nodes unresponsive.
    final NetworkDisruption.TwoPartitions sides = new NetworkDisruption.TwoPartitions(n1, n2);
    final NetworkDisruption partition = new NetworkDisruption(sides, new NetworkDisruption.NetworkUnresponsive());
    setDisruptionScheme(partition);
    partition.startDisrupting();

    try {
        // Query the second node's row through a session on the first node.
        execute(
            "select version['number'], hostname, id, name from sys.nodes where name = ?",
            new Object[] { n2 },
            createSessionOnNode(n1));

        assertThat(response.rowCount(), is(1L));

        final Object[] row = response.rows()[0];
        assertThat(row[0], is(nullValue()));
        assertThat(row[1], is(nullValue()));
        assertThat(row[2], is(notNullValue()));
        assertThat(row[3], is(n2));
    } finally {
        // Always heal the partition and drop the scheme.
        partition.stopDisrupting();
        internalCluster().clearDisruptionScheme(true);
        waitNoPendingTasksOnAll();
    }
}
Aggregations