Example use of org.elasticsearch.test.disruption.NetworkDisruption in the elasticsearch project by elastic.
From the class MinimumMasterNodesIT, method testCanNotPublishWithoutMinMastNodes.
/**
 * Starts three nodes with minimum master nodes set to 2, cuts the elected master off from the
 * other two nodes while it is computing a cluster state update, and checks that the update fails
 * with a {@link Discovery.FailedToCommitClusterStateException}. After the partition heals, no node
 * may expose the marker setting that the rejected update tried to introduce.
 */
public void testCanNotPublishWithoutMinMastNodes() throws Exception {
    // short ping/commit timeouts to speed things up
    final Settings.Builder nodeSettings = Settings.builder();
    nodeSettings.put(ZenDiscovery.PING_TIMEOUT_SETTING.getKey(), "200ms");
    nodeSettings.put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), 2);
    nodeSettings.put(DiscoverySettings.COMMIT_TIMEOUT_SETTING.getKey(), "100ms");
    internalCluster().startNodes(3, nodeSettings.build());

    // the cluster state must be recovered before anything is disrupted
    ensureGreen();

    final String masterName = internalCluster().getMasterName();
    final Set<String> followers = new HashSet<>();
    Collections.addAll(followers, internalCluster().getNodeNames());
    followers.remove(masterName);

    // one side holds only the master, the other side holds every remaining node
    final TwoPartitions masterVersusRest = new TwoPartitions(Collections.singleton(masterName), followers);
    final NetworkDisruption masterIsolation = new NetworkDisruption(masterVersusRest, new NetworkDisruption.NetworkDisconnect());
    internalCluster().setDisruptionScheme(masterIsolation);

    final CountDownLatch done = new CountDownLatch(1);
    final AtomicReference<Exception> publishFailure = new AtomicReference<>();

    logger.debug("--> submitting for cluster state to be rejected");
    final ClusterService masterClusterService = internalCluster().clusterService(masterName);
    masterClusterService.submitStateUpdateTask("test", new ClusterStateUpdateTask() {
        @Override
        public ClusterState execute(ClusterState currentState) throws Exception {
            logger.debug("--> starting the disruption, preventing cluster state publishing");
            // the disruption is started from inside the task, i.e. before the new state is returned
            masterIsolation.startDisrupting();
            final Settings withMarker = Settings.builder()
                .put(currentState.metaData().persistentSettings())
                .put("_SHOULD_NOT_BE_THERE_", true)
                .build();
            final MetaData.Builder metaData = MetaData.builder(currentState.metaData()).persistentSettings(withMarker);
            return ClusterState.builder(currentState).metaData(metaData).build();
        }

        @Override
        public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
            done.countDown();
        }

        @Override
        public void onFailure(String source, Exception e) {
            // record the exception first so it is visible once the latch is released
            publishFailure.set(e);
            done.countDown();
        }
    });

    logger.debug("--> waiting for cluster state to be processed/rejected");
    done.await();
    assertThat(publishFailure.get(), instanceOf(Discovery.FailedToCommitClusterStateException.class));

    // the isolated master is expected to end up without a master in its own cluster state
    assertBusy(() -> assertThat(masterClusterService.state().nodes().getMasterNode(), nullValue()));

    masterIsolation.stopDisrupting();
    logger.debug("--> waiting for cluster to heal");
    assertNoTimeout(client().admin().cluster().prepareHealth().setWaitForNodes("3").setWaitForEvents(Priority.LANGUID));

    for (String nodeName : internalCluster().getNodeNames()) {
        final String marker = internalCluster().clusterService(nodeName).state().metaData().settings().get("_SHOULD_NOT_BE_THERE_");
        assertThat(nodeName + " processed the cluster state despite of a min master node violation", marker, nullValue());
    }
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the elasticsearch project by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method testElectMasterWithLatestVersion.
/**
 * Creates an index while the "preferred" master (the node with the smallest node id) is
 * partitioned away, then forces a complete re-election by isolating every node, and finally
 * checks that the index {@code test} is still present in the cluster state.
 */
public void testElectMasterWithLatestVersion() throws Exception {
    configureCluster(3, null, 2);
    final Set<String> nodes = new HashSet<>(internalCluster().startNodes(3));
    ensureStableCluster(3);

    final ServiceDisruptionScheme isolateAllNodes =
        new NetworkDisruption(new NetworkDisruption.IsolateAllNodes(nodes), new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolateAllNodes);

    logger.info("--> forcing a complete election to make sure \"preferred\" master is elected");
    isolateAllNodes.startDisrupting();
    for (String isolated : nodes) {
        assertNoMaster(isolated);
    }
    internalCluster().clearDisruptionScheme();
    ensureStableCluster(3);

    final String preferredMasterName = internalCluster().getMasterName();
    final DiscoveryNode preferredMaster = internalCluster().clusterService(preferredMasterName).localNode();
    // every node id must compare greater than or equal to the id of the elected master
    for (String candidate : nodes) {
        final String candidateId = internalCluster().clusterService(candidate).localNode().getId();
        assertThat(candidateId, greaterThanOrEqualTo(preferredMaster.getId()));
    }
    logger.info("--> preferred master is {}", preferredMaster);

    final Set<String> nonPreferredNodes = new HashSet<>(nodes);
    nonPreferredNodes.remove(preferredMasterName);
    final ServiceDisruptionScheme isolatePreferredMaster = new NetworkDisruption(
        new NetworkDisruption.TwoPartitions(Collections.singleton(preferredMasterName), nonPreferredNodes),
        new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolatePreferredMaster);
    isolatePreferredMaster.startDisrupting();

    // the index is created through one of the nodes on the other side of the partition
    assertAcked(client(randomFrom(nonPreferredNodes)).admin().indices().prepareCreate("test")
        .setSettings(INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1, INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0));

    internalCluster().clearDisruptionScheme(false);
    internalCluster().setDisruptionScheme(isolateAllNodes);

    logger.info("--> forcing a complete election again");
    isolateAllNodes.startDisrupting();
    for (String isolated : nodes) {
        assertNoMaster(isolated);
    }
    isolateAllNodes.stopDisrupting();

    final ClusterState state = client().admin().cluster().prepareState().get().getState();
    final boolean indexSurvived = state.metaData().hasIndex("test");
    if (!indexSurvived) {
        fail("index 'test' was lost. current cluster state: " + state);
    }
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the elasticsearch project by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method testNodesFDAfterMasterReelection.
/**
 * Verifies that node fault detection still works after a master (re-)election: the current
 * master of a four node cluster is stopped, one non-master node of the remaining cluster is
 * isolated, and the new master is expected to shrink the cluster to two nodes.
 */
public void testNodesFDAfterMasterReelection() throws Exception {
    startCluster(4);

    logger.info("--> stopping current master");
    internalCluster().stopCurrentMasterNode();
    ensureStableCluster(3);

    logger.info("--> reducing min master nodes to 2");
    final Settings.Builder minMasterNodes =
        Settings.builder().put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), 2);
    assertAcked(client().admin().cluster().prepareUpdateSettings().setTransientSettings(minMasterNodes).get());

    final String master = internalCluster().getMasterName();
    // no early exit: the last non-master name returned by getNodeNames() is the one that is kept
    String nonMaster = null;
    for (String candidate : internalCluster().getNodeNames()) {
        if (candidate.equals(master)) {
            continue;
        }
        nonMaster = candidate;
    }

    logger.info("--> isolating [{}]", nonMaster);
    final NetworkDisruption networkDisruption = addRandomDisruptionType(isolateNode(nonMaster));
    networkDisruption.startDisrupting();

    logger.info("--> waiting for master to remove it");
    ensureStableCluster(2, master);
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the elasticsearch project by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method testUnicastSinglePingResponseContainsMaster.
/**
 * A 4 node cluster with m_m_n set to 3 in which each node has one unicast endpoint. One node is
 * partitioned from the master node while the temporal unicast responses are empty. Once the
 * partition is resolved, the single ping response contains a master node; the rejoining node
 * should accept that master and connect to it.
 */
public void testUnicastSinglePingResponseContainsMaster() throws Exception {
    final List<String> nodes = startCluster(4, -1, new int[] { 0 });

    // determine which node has been elected master
    final String masterNode = internalCluster().getMasterName();
    logger.info("---> legit elected master node={}", masterNode);

    final List<String> isolationCandidates = new ArrayList<>(nodes);
    isolationCandidates.remove(masterNode);
    // never isolate the node that serves as the unicast endpoint for all the other nodes
    isolationCandidates.remove(nodes.get(0));
    final String isolatedNode = isolationCandidates.get(0);

    // Forcefully clean the temporal response list. Otherwise the node in the unicast host list
    // includes all the other nodes that have pinged it and the issue doesn't manifest.
    // NOTE(review): only the single Discovery instance returned by getInstance(...) is cleared
    // here, although the intent appears to be "all nodes" - confirm.
    final TestZenDiscovery discovery = (TestZenDiscovery) internalCluster().getInstance(Discovery.class);
    final ZenPing zenPing = discovery.getZenPing();
    if (zenPing instanceof UnicastZenPing) {
        ((UnicastZenPing) zenPing).clearTemporalResponses();
    }

    // simulate a network issue between the isolated node and the elected master, in both directions
    final TwoPartitions masterVersusIsolated = new TwoPartitions(masterNode, isolatedNode);
    final NetworkDisruption networkDisconnect = new NetworkDisruption(masterVersusIsolated, new NetworkDisconnect());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();

    // wait until the elected master has removed the isolated node ...
    ensureStableCluster(3, masterNode);
    // ... and the isolated node reports no master, so that it starts pinging
    assertNoMaster(isolatedNode);

    networkDisconnect.stopDisrupting();

    // wait until the master node sees all 4 nodes again
    ensureStableCluster(4);

    // The elected master shouldn't have changed: the isolated node could never have elected
    // itself, because an m_m_n of 3 could never be satisfied.
    assertMaster(masterNode, nodes);
}
Example use of org.elasticsearch.test.disruption.NetworkDisruption in the elasticsearch project by elastic.
From the class DiscoveryWithServiceDisruptionsIT, method testFailWithMinimumMasterNodesConfigured.
/**
 * Tests that no split brain occurs under a partial network partition.
 * See https://github.com/elastic/elasticsearch/issues/2488
 */
public void testFailWithMinimumMasterNodesConfigured() throws Exception {
    final List<String> nodes = startCluster(3);

    // determine which node has been elected master
    final String masterNode = internalCluster().getMasterName();
    logger.info("---> legit elected master node={}", masterNode);

    // choose a random victim among the nodes that are not the elected master
    final Set<String> nonMasters = new HashSet<>(nodes);
    nonMasters.remove(masterNode);
    final String[] victimCandidates = nonMasters.toArray(Strings.EMPTY_ARRAY);
    final String unluckyNode = randomFrom(victimCandidates);

    // simulate a network issue between the unlucky node and the elected master, in both directions
    final TwoPartitions masterVersusUnlucky = new TwoPartitions(masterNode, unluckyNode);
    final NetworkDisruption networkDisconnect = new NetworkDisruption(masterVersusUnlucky, new NetworkDisconnect());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();

    // wait until the elected master has removed the unlucky node ...
    ensureStableCluster(2, masterNode);

    // The unlucky node must report *no* master node, since it can't connect to the master; it
    // should keep pinging until the network failure has been resolved. It may take a bit before
    // the node detects that it has been cut off from the elected master.
    assertNoMaster(unluckyNode);

    networkDisconnect.stopDisrupting();

    // wait until the master node sees all 3 nodes again
    ensureStableCluster(3);

    // The elected master shouldn't have changed: the unlucky node could never have elected
    // itself, because an m_m_n of 2 could never be satisfied.
    assertMaster(masterNode, nodes);
}
Aggregations