use of org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testElectMasterWithLatestVersion.
public void testElectMasterWithLatestVersion() throws Exception {
    configureCluster(3, null, 2);
    final Set<String> nodes = new HashSet<>(internalCluster().startNodes(3));
    ensureStableCluster(3);
    ServiceDisruptionScheme isolateAllNodes =
            new NetworkDisruption(new NetworkDisruption.IsolateAllNodes(nodes), new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election to make sure \"preferred\" master is elected");
    isolateAllNodes.startDisrupting();
    for (String node : nodes) {
        assertNoMaster(node);
    }
    internalCluster().clearDisruptionScheme();
    ensureStableCluster(3);
    final String preferredMasterName = internalCluster().getMasterName();
    final DiscoveryNode preferredMaster = internalCluster().clusterService(preferredMasterName).localNode();
    for (String node : nodes) {
        DiscoveryNode discoveryNode = internalCluster().clusterService(node).localNode();
        assertThat(discoveryNode.getId(), greaterThanOrEqualTo(preferredMaster.getId()));
    }
    logger.info("--> preferred master is {}", preferredMaster);
    final Set<String> nonPreferredNodes = new HashSet<>(nodes);
    nonPreferredNodes.remove(preferredMasterName);
    final ServiceDisruptionScheme isolatePreferredMaster = new NetworkDisruption(
            new NetworkDisruption.TwoPartitions(Collections.singleton(preferredMasterName), nonPreferredNodes),
            new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolatePreferredMaster);
    isolatePreferredMaster.startDisrupting();
    assertAcked(client(randomFrom(nonPreferredNodes)).admin().indices().prepareCreate("test").setSettings(
            INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1,
            INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0));
    internalCluster().clearDisruptionScheme(false);
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election again");
    isolateAllNodes.startDisrupting();
    for (String node : nodes) {
        assertNoMaster(node);
    }
    isolateAllNodes.stopDisrupting();
    final ClusterState state = client().admin().cluster().prepareState().get().getState();
    if (state.metaData().hasIndex("test") == false) {
        fail("index 'test' was lost. current cluster state: " + state);
    }
}
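Every snippet on this page runs the same lifecycle: build a NetworkDisruption from a description of which links to break (TwoPartitions, IsolateAllNodes) plus a NetworkDisconnect, register it on the test cluster, start it, make assertions while the network is broken, then heal. The following is a minimal sketch of that pattern, not code taken from the project; it assumes an ESIntegTestCase subclass, and the class name and the placeholder assertion step are hypothetical.

import java.util.HashSet;
import java.util.Set;

import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.test.disruption.NetworkDisruption;
import org.elasticsearch.test.disruption.NetworkDisruption.IsolateAllNodes;
import org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect;

// Hypothetical test class; only the lifecycle calls are taken from the examples on this page.
public class NetworkDisconnectLifecycleIT extends ESIntegTestCase {

    public void testDisruptionLifecycle() throws Exception {
        final Set<String> nodes = new HashSet<>(internalCluster().startNodes(3));

        // Describe which links to break (every node isolated from every other node)
        // and how to break them (a hard disconnect rather than a delay).
        NetworkDisruption disruption =
                new NetworkDisruption(new IsolateAllNodes(nodes), new NetworkDisconnect());

        // Register the scheme with the test cluster, then activate it.
        internalCluster().setDisruptionScheme(disruption);
        disruption.startDisrupting();

        // ... assertions that should hold while the network is broken ...

        // Heal: stop this scheme, or clear (and heal) whatever scheme the cluster holds.
        disruption.stopDisrupting();
        internalCluster().clearDisruptionScheme();
    }
}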
use of org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testUnicastSinglePingResponseContainsMaster.
/**
 * A 4-node cluster with minimum_master_nodes set to 3, where each node has one unicast endpoint.
 * One node gets partitioned from the master node. The temporal unicast responses are empty. Once
 * the partition is healed, the single ping response contains a master node, and the rejoining node
 * should pick up that master node and connect to it.
 */
public void testUnicastSinglePingResponseContainsMaster() throws Exception {
    List<String> nodes = startCluster(4, -1, new int[] { 0 });
    // Figure out which node is the elected master
    final String masterNode = internalCluster().getMasterName();
    logger.info("---> legit elected master node={}", masterNode);
    List<String> otherNodes = new ArrayList<>(nodes);
    otherNodes.remove(masterNode);
    // <-- Don't isolate the node that serves as the unicast endpoint for all the other nodes.
    otherNodes.remove(nodes.get(0));
    final String isolatedNode = otherNodes.get(0);
    // Forcefully clear the temporal response lists on all nodes. Otherwise the node in the unicast host list
    // includes all the other nodes that have pinged it and the issue doesn't manifest.
    ZenPing zenPing = ((TestZenDiscovery) internalCluster().getInstance(Discovery.class)).getZenPing();
    if (zenPing instanceof UnicastZenPing) {
        ((UnicastZenPing) zenPing).clearTemporalResponses();
    }
    // Simulate a network issue between the unlucky node and the elected master node in both directions.
    NetworkDisruption networkDisconnect =
            new NetworkDisruption(new TwoPartitions(masterNode, isolatedNode), new NetworkDisconnect());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();
    // Wait until the elected master has removed the unlucky node...
    ensureStableCluster(3, masterNode);
    // The isolated node must report no master, so it starts pinging
    assertNoMaster(isolatedNode);
    networkDisconnect.stopDisrupting();
    // Wait until the master node sees all 4 nodes again.
    ensureStableCluster(4);
    // The elected master shouldn't have changed, since the isolated node could never have elected itself
    // as master because minimum_master_nodes of 3 could never be satisfied.
    assertMaster(masterNode, nodes);
}
use of org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testFailWithMinimumMasterNodesConfigured.
/**
* Test that no split brain occurs under partial network partition. See https://github.com/elastic/elasticsearch/issues/2488
*/
public void testFailWithMinimumMasterNodesConfigured() throws Exception {
    List<String> nodes = startCluster(3);
    // Figure out which node is the elected master
    final String masterNode = internalCluster().getMasterName();
    logger.info("---> legit elected master node={}", masterNode);
    // Pick a node that isn't the elected master.
    Set<String> nonMasters = new HashSet<>(nodes);
    nonMasters.remove(masterNode);
    final String unluckyNode = randomFrom(nonMasters.toArray(Strings.EMPTY_ARRAY));
    // Simulate a network issue between the unlucky node and the elected master node in both directions.
    NetworkDisruption networkDisconnect =
            new NetworkDisruption(new TwoPartitions(masterNode, unluckyNode), new NetworkDisconnect());
    setDisruptionScheme(networkDisconnect);
    networkDisconnect.startDisrupting();
    // Wait until the elected master has removed the unlucky node...
    ensureStableCluster(2, masterNode);
    // The unlucky node must report *no* master node, since it can't connect to the master and should
    // continuously ping until the network failures have been resolved. However, it may take a bit
    // before the node detects it has been cut off from the elected master.
    assertNoMaster(unluckyNode);
    networkDisconnect.stopDisrupting();
    // Wait until the master node sees all 3 nodes again.
    ensureStableCluster(3);
    // The elected master shouldn't have changed, since the unlucky node could never have elected itself
    // as master because minimum_master_nodes of 2 could never be satisfied.
    assertMaster(masterNode, nodes);
}
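The tests above and below use TwoPartitions in two forms, both paired with a NetworkDisconnect: a convenience constructor that splits exactly two named nodes, and a constructor that takes an explicit node set for each side. The sketch below isolates just that construction step; the helper class and its parameter names are hypothetical, only the constructor calls mirror the examples on this page.

import java.util.Collections;
import java.util.Set;

import org.elasticsearch.common.util.set.Sets;
import org.elasticsearch.test.disruption.NetworkDisruption;
import org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect;
import org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions;

// Hypothetical helper illustrating the two TwoPartitions constructors seen on this page.
final class TwoPartitionsExamples {

    // One node on each side, as in testFailWithMinimumMasterNodesConfigured above.
    static NetworkDisruption cutPair(String masterNode, String unluckyNode) {
        return new NetworkDisruption(new TwoPartitions(masterNode, unluckyNode), new NetworkDisconnect());
    }

    // An explicit node set on each side, as in createStaleReplicaScenario below.
    static NetworkDisruption cutGroups(String master, String replicaNode, String primaryNode) {
        Set<String> sideOne = Sets.newHashSet(master, replicaNode);
        Set<String> sideTwo = Collections.singleton(primaryNode);
        return new NetworkDisruption(new TwoPartitions(sideOne, sideTwo), new NetworkDisconnect());
    }
}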
use of org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect in project elasticsearch by elastic.
the class PrimaryAllocationIT method createStaleReplicaScenario.
private void createStaleReplicaScenario() throws Exception {
    logger.info("--> starting 3 nodes, 1 master, 2 data");
    String master = internalCluster().startMasterOnlyNode(Settings.EMPTY);
    internalCluster().startDataOnlyNodes(2);
    assertAcked(client().admin().indices().prepareCreate("test")
            .setSettings(Settings.builder().put("index.number_of_shards", 1).put("index.number_of_replicas", 1))
            .get());
    ensureGreen();
    logger.info("--> indexing...");
    client().prepareIndex("test", "type1")
            .setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
    refresh();
    ClusterState state = client().admin().cluster().prepareState().all().get().getState();
    List<ShardRouting> shards = state.routingTable().allShards("test");
    assertThat(shards.size(), equalTo(2));
    final String primaryNode;
    final String replicaNode;
    if (shards.get(0).primary()) {
        primaryNode = state.getRoutingNodes().node(shards.get(0).currentNodeId()).node().getName();
        replicaNode = state.getRoutingNodes().node(shards.get(1).currentNodeId()).node().getName();
    } else {
        primaryNode = state.getRoutingNodes().node(shards.get(1).currentNodeId()).node().getName();
        replicaNode = state.getRoutingNodes().node(shards.get(0).currentNodeId()).node().getName();
    }
    NetworkDisruption partition = new NetworkDisruption(
            new TwoPartitions(Sets.newHashSet(master, replicaNode), Collections.singleton(primaryNode)),
            new NetworkDisconnect());
    internalCluster().setDisruptionScheme(partition);
    logger.info("--> partitioning node with primary shard from rest of cluster");
    partition.startDisrupting();
    ensureStableCluster(2, master);
    logger.info("--> index a document into previous replica shard (that is now primary)");
    client(replicaNode).prepareIndex("test", "type1")
            .setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
    logger.info("--> shut down node that has new acknowledged document");
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
    ensureStableCluster(1, master);
    partition.stopDisrupting();
    logger.info("--> waiting for node with old primary shard to rejoin the cluster");
    ensureStableCluster(2, master);
    logger.info("--> check that old primary shard does not get promoted to primary again");
    // kick reroute and wait for all shard states to be fetched
    client(master).admin().cluster().prepareReroute().get();
    assertBusy(() -> assertThat(internalCluster().getInstance(GatewayAllocator.class, master).getNumberOfInFlightFetch(),
            equalTo(0)));
    // kick reroute a second time and check that all shards are unassigned
    assertThat(client(master).admin().cluster().prepareReroute().get().getState().getRoutingNodes().unassigned().size(),
            equalTo(2));
}
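createStaleReplicaScenario deliberately leaves the cluster with only a stale shard copy, which is why the final reroute reports every shard as unassigned. In PrimaryAllocationIT such a state is typically resolved by explicitly allocating the stale copy as primary. The fragment below is a hedged sketch of that follow-up step written from memory of that test class, not code reproduced from this page; the method and its node-name parameter are hypothetical, and it is assumed to live in the same ESIntegTestCase subclass so that client() and ensureYellow() are available.

import org.elasticsearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand;

// Hedged sketch: force-allocate the surviving stale copy of shard 0 of "test" as primary,
// explicitly accepting data loss, then wait for the single-copy index to become yellow.
private void promoteStaleCopy(String dataNodeWithStaleCopy) {
    client().admin().cluster().prepareReroute()
            .add(new AllocateStalePrimaryAllocationCommand("test", 0, dataNodeWithStaleCopy, true))
            .get();
    ensureYellow("test");
}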
use of org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect in project elasticsearch by elastic.
the class IndexingMasterFailoverIT method testMasterFailoverDuringIndexingWithMappingChanges.
/**
* Indexing operations which entail mapping changes require a blocking request to the master node to update the mapping.
* If the master node is being disrupted or if it cannot commit cluster state changes, it needs to retry within timeout limits.
* This retry logic is implemented in TransportMasterNodeAction and tested by the following master failover scenario.
*/
public void testMasterFailoverDuringIndexingWithMappingChanges() throws Throwable {
    logger.info("--> start 4 nodes, 3 master, 1 data");
    final Settings sharedSettings = Settings.builder()
            .put(FaultDetection.PING_TIMEOUT_SETTING.getKey(), "1s") // for hitting simulated network failures quickly
            .put(FaultDetection.PING_RETRIES_SETTING.getKey(), "1") // for hitting simulated network failures quickly
            .put("discovery.zen.join_timeout", "10s") // still long enough to induce failures, but not so long that the test times out
            .put(DiscoverySettings.PUBLISH_TIMEOUT_SETTING.getKey(), "1s") // <-- for hitting simulated network failures quickly
            .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), 2)
            .build();
    internalCluster().startMasterOnlyNodes(3, sharedSettings);
    String dataNode = internalCluster().startDataOnlyNode(sharedSettings);
    logger.info("--> wait for all nodes to join the cluster");
    ensureStableCluster(4);
    // We index data with mapping changes into the cluster and have a master failover at the same time
    client().admin().indices().prepareCreate("myindex")
            .setSettings(Settings.builder().put("index.number_of_shards", 1).put("index.number_of_replicas", 0))
            .get();
    ensureGreen("myindex");
    final CyclicBarrier barrier = new CyclicBarrier(2);
    Thread indexingThread = new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                barrier.await();
            } catch (InterruptedException e) {
                logger.warn("Barrier interrupted", e);
                return;
            } catch (BrokenBarrierException e) {
                logger.warn("Broken barrier", e);
                return;
            }
            for (int i = 0; i < 10; i++) {
                // index data with mapping changes
                IndexResponse response = client(dataNode).prepareIndex("myindex", "mytype").setSource("field_" + i, "val").get();
                assertEquals(DocWriteResponse.Result.CREATED, response.getResult());
            }
        }
    });
    indexingThread.setName("indexingThread");
    indexingThread.start();
    barrier.await();
    // interrupt communication between the master and the other nodes in the cluster
    String master = internalCluster().getMasterName();
    Set<String> otherNodes = new HashSet<>(Arrays.asList(internalCluster().getNodeNames()));
    otherNodes.remove(master);
    NetworkDisruption partition =
            new NetworkDisruption(new TwoPartitions(Collections.singleton(master), otherNodes), new NetworkDisconnect());
    internalCluster().setDisruptionScheme(partition);
    logger.info("--> disrupting network");
    partition.startDisrupting();
    logger.info("--> waiting for new master to be elected");
    ensureStableCluster(3, dataNode);
    partition.stopDisrupting();
    logger.info("--> waiting to heal");
    ensureStableCluster(4);
    indexingThread.join();
    ensureGreen("myindex");
    refresh();
    assertThat(client().prepareSearch("myindex").get().getHits().getTotalHits(), equalTo(10L));
}