use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testIsolatedUnicastNodes.
public void testIsolatedUnicastNodes() throws Exception {
List<String> nodes = startCluster(4, -1, new int[] { 0 });
// Figure out what is the elected master node
final String unicastTarget = nodes.get(0);
Set<String> unicastTargetSide = new HashSet<>();
unicastTargetSide.add(unicastTarget);
Set<String> restOfClusterSide = new HashSet<>();
restOfClusterSide.addAll(nodes);
restOfClusterSide.remove(unicastTarget);
// Forcefully clean temporal response lists on all nodes. Otherwise the node in the unicast host list
// includes all the other nodes that have pinged it and the issue doesn't manifest
ZenPing zenPing = ((TestZenDiscovery) internalCluster().getInstance(Discovery.class)).getZenPing();
if (zenPing instanceof UnicastZenPing) {
((UnicastZenPing) zenPing).clearTemporalResponses();
}
// Simulate a network issue between the unicast target node and the rest of the cluster
NetworkDisruption networkDisconnect = new NetworkDisruption(new TwoPartitions(unicastTargetSide, restOfClusterSide), new NetworkDisconnect());
setDisruptionScheme(networkDisconnect);
networkDisconnect.startDisrupting();
// Wait until elected master has removed that the unlucky node...
ensureStableCluster(3, nodes.get(1));
// The isolate master node must report no master, so it starts with pinging
assertNoMaster(unicastTarget);
networkDisconnect.stopDisrupting();
// Wait until the master node sees all 3 nodes again.
ensureStableCluster(4);
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testIsolateMasterAndVerifyClusterStateConsensus.
/**
* This test isolates the master from rest of the cluster, waits for a new master to be elected, restores the partition
* and verifies that all node agree on the new cluster state
*/
@TestLogging("_root:DEBUG,org.elasticsearch.cluster.service:TRACE,org.elasticsearch.gateway:TRACE,org.elasticsearch.indices.store:TRACE")
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
final List<String> nodes = startCluster(3);
assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2)).put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, randomInt(2))));
ensureGreen();
String isolatedNode = internalCluster().getMasterName();
TwoPartitions partitions = isolateNode(isolatedNode);
NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
networkDisruption.startDisrupting();
String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
// make sure cluster reforms
ensureStableCluster(2, nonIsolatedNode);
// make sure isolated need picks up on things.
assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));
// restore isolation
networkDisruption.stopDisrupting();
for (String node : nodes) {
ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()), true, node);
}
logger.info("issue a reroute");
// trigger a reroute now, instead of waiting for the background reroute of RerouteService
assertAcked(client().admin().cluster().prepareReroute());
// and wait for it to finish and for the cluster to stabilize
ensureGreen("test");
// verify all cluster states are the same
ClusterState state = null;
for (String node : nodes) {
ClusterState nodeState = getNodeClusterState(node);
if (state == null) {
state = nodeState;
continue;
}
// assert nodes are identical
try {
assertEquals("unequal versions", state.version(), nodeState.version());
assertEquals("unequal node count", state.nodes().getSize(), nodeState.nodes().getSize());
assertEquals("different masters ", state.nodes().getMasterNodeId(), nodeState.nodes().getMasterNodeId());
assertEquals("different meta data version", state.metaData().version(), nodeState.metaData().version());
if (!state.routingTable().toString().equals(nodeState.routingTable().toString())) {
fail("different routing");
}
} catch (AssertionError t) {
fail("failed comparing cluster state: " + t.getMessage() + "\n" + "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state + "\n--- cluster state [" + node + "]: ---\n" + nodeState);
}
}
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testVerifyApiBlocksDuringPartition.
/**
* Verify that the proper block is applied when nodes loose their master
*/
public void testVerifyApiBlocksDuringPartition() throws Exception {
startCluster(3);
// Makes sure that the get request can be executed on each node locally:
assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)));
// Everything is stable now, it is now time to simulate evil...
// but first make sure we have no initializing shards and all is green
// (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
ensureGreen("test");
TwoPartitions partitions = TwoPartitions.random(random(), internalCluster().getNodeNames());
NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
assertEquals(1, partitions.getMinoritySide().size());
final String isolatedNode = partitions.getMinoritySide().iterator().next();
assertEquals(2, partitions.getMajoritySide().size());
final String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
// Simulate a network issue between the unlucky node and the rest of the cluster.
networkDisruption.startDisrupting();
// The unlucky node must report *no* master node, since it can't connect to master and in fact it should
// continuously ping until network failures have been resolved. However
// It may a take a bit before the node detects it has been cut off from the elected master
logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(10));
logger.info("wait until elected master has been removed and a new 2 node cluster was from (via [{}])", isolatedNode);
ensureStableCluster(2, nonIsolatedNode);
for (String node : partitions.getMajoritySide()) {
ClusterState nodeState = getNodeClusterState(node);
boolean success = true;
if (nodeState.nodes().getMasterNode() == null) {
success = false;
}
if (!nodeState.blocks().global().isEmpty()) {
success = false;
}
if (!success) {
fail("node [" + node + "] has no master or has blocks, despite of being on the right side of the partition. State dump:\n" + nodeState);
}
}
networkDisruption.stopDisrupting();
// Wait until the master node sees al 3 nodes again.
ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()));
logger.info("Verify no master block with {} set to {}", DiscoverySettings.NO_MASTER_BLOCK_SETTING.getKey(), "all");
client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder().put(DiscoverySettings.NO_MASTER_BLOCK_SETTING.getKey(), "all")).get();
networkDisruption.startDisrupting();
// The unlucky node must report *no* master node, since it can't connect to master and in fact it should
// continuously ping until network failures have been resolved. However
// It may a take a bit before the node detects it has been cut off from the elected master
logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
assertNoMaster(isolatedNode, DiscoverySettings.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(10));
// make sure we have stable cluster & cross partition recoveries are canceled by the removal of the missing node
// the unresponsive partition causes recoveries to only time out after 15m (default) and these will cause
// the test to fail due to unfreed resources
ensureStableCluster(2, nonIsolatedNode);
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testIndicesDeleted.
/**
* Tests that indices are properly deleted even if there is a master transition in between.
* Test for https://github.com/elastic/elasticsearch/issues/11665
*/
public void testIndicesDeleted() throws Exception {
final Settings settings = Settings.builder().put(DEFAULT_SETTINGS).put(DiscoverySettings.PUBLISH_TIMEOUT_SETTING.getKey(), // don't wait on isolated data node
"0s").put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), // wait till cluster state is committed
"30s").build();
final String idxName = "test";
configureCluster(settings, 3, null, 2);
final List<String> allMasterEligibleNodes = internalCluster().startMasterOnlyNodes(2);
final String dataNode = internalCluster().startDataOnlyNode();
ensureStableCluster(3);
assertAcked(prepareCreate("test"));
final String masterNode1 = internalCluster().getMasterName();
NetworkDisruption networkDisruption = new NetworkDisruption(new TwoPartitions(masterNode1, dataNode), new NetworkUnresponsive());
internalCluster().setDisruptionScheme(networkDisruption);
networkDisruption.startDisrupting();
// We know this will time out due to the partition, we check manually below to not proceed until
// the delete has been applied to the master node and the master eligible node.
internalCluster().client(masterNode1).admin().indices().prepareDelete(idxName).setTimeout("0s").get();
// Don't restart the master node until we know the index deletion has taken effect on master and the master eligible node.
assertBusy(() -> {
for (String masterNode : allMasterEligibleNodes) {
final ClusterState masterState = internalCluster().clusterService(masterNode).state();
assertTrue("index not deleted on " + masterNode, masterState.metaData().hasIndex(idxName) == false);
}
});
internalCluster().restartNode(masterNode1, InternalTestCluster.EMPTY_CALLBACK);
ensureYellow();
assertFalse(client().admin().indices().prepareExists(idxName).get().isExists());
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method addRandomDisruptionType.
protected NetworkDisruption addRandomDisruptionType(TwoPartitions partitions) {
final NetworkLinkDisruptionType disruptionType;
if (randomBoolean()) {
disruptionType = new NetworkUnresponsive();
} else {
disruptionType = new NetworkDisconnect();
}
NetworkDisruption partition = new NetworkDisruption(partitions, disruptionType);
setDisruptionScheme(partition);
return partition;
}
Aggregations