Use of org.opensearch.test.disruption.NetworkDisruption.TwoPartitions in project OpenSearch by opensearch-project.
Class PrimaryAllocationIT, method createStaleReplicaScenario.
// returns data path settings of the in-sync shard copy
private Settings createStaleReplicaScenario(String master) throws Exception {
    client().prepareIndex("test").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
    refresh();
    ClusterState state = client().admin().cluster().prepareState().all().get().getState();
    List<ShardRouting> shards = state.routingTable().allShards("test");
    assertThat(shards.size(), equalTo(2));
    final String primaryNode;
    final String replicaNode;
    if (shards.get(0).primary()) {
        primaryNode = state.getRoutingNodes().node(shards.get(0).currentNodeId()).node().getName();
        replicaNode = state.getRoutingNodes().node(shards.get(1).currentNodeId()).node().getName();
    } else {
        primaryNode = state.getRoutingNodes().node(shards.get(1).currentNodeId()).node().getName();
        replicaNode = state.getRoutingNodes().node(shards.get(0).currentNodeId()).node().getName();
    }
    NetworkDisruption partition = new NetworkDisruption(
        new TwoPartitions(Sets.newHashSet(master, replicaNode), Collections.singleton(primaryNode)),
        NetworkDisruption.DISCONNECT
    );
    internalCluster().setDisruptionScheme(partition);
    logger.info("--> partitioning node with primary shard from rest of cluster");
    partition.startDisrupting();
    ensureStableCluster(2, master);
    logger.info("--> index a document into previous replica shard (that is now primary)");
    client(replicaNode).prepareIndex("test").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
    logger.info("--> shut down node that has new acknowledged document");
    final Settings inSyncDataPathSettings = internalCluster().dataPathSettings(replicaNode);
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
    ensureStableCluster(1, master);
    partition.stopDisrupting();
    logger.info("--> waiting for node with old primary shard to rejoin the cluster");
    ensureStableCluster(2, master);
    logger.info("--> check that old primary shard does not get promoted to primary again");
    // kick reroute and wait for all shard states to be fetched
    client(master).admin().cluster().prepareReroute().get();
    assertBusy(() -> assertThat(internalCluster().getInstance(GatewayAllocator.class, master).getNumberOfInFlightFetches(), equalTo(0)));
    // kick reroute a second time and check that all shards are unassigned
    assertThat(client(master).admin().cluster().prepareReroute().get().getState().getRoutingNodes().unassigned().size(), equalTo(2));
    return inSyncDataPathSettings;
}
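The Settings returned by this helper capture the data path of the shut-down node that held the only in-sync copy, so a later test can bring that copy back and verify it is promoted. The following is a minimal usage sketch, not taken from this page: it assumes the standard OpenSearch integration-test helpers (startMasterOnlyNode, startDataOnlyNode, assertHitCount, matchAllQuery), and the test name is illustrative only.
// Illustrative caller (sketch, not from this page): restart the in-sync copy and check it is promoted.
public void testInSyncCopyIsPromotedAgain() throws Exception {
    String master = internalCluster().startMasterOnlyNode(Settings.EMPTY);
    internalCluster().startDataOnlyNodes(2);
    assertAcked(client().admin().indices().prepareCreate("test")
        .setSettings(Settings.builder().put("index.number_of_shards", 1).put("index.number_of_replicas", 1)));
    ensureGreen();
    // Build the stale-replica scenario and remember where the in-sync copy lived.
    final Settings inSyncDataPathSettings = createStaleReplicaScenario(master);
    // Start a node that reuses the data folder holding the up-to-date copy.
    internalCluster().startDataOnlyNode(inSyncDataPathSettings);
    // The up-to-date copy should be promoted; both indexed documents must be visible.
    ensureYellow("test");
    assertHitCount(client().prepareSearch("test").setSize(0).setQuery(matchAllQuery()).get(), 2L);
}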
Use of org.opensearch.test.disruption.NetworkDisruption.TwoPartitions in project OpenSearch by opensearch-project.
Class MasterDisruptionIT, method testVerifyApiBlocksDuringPartition.
/**
* Verify that the proper block is applied when nodes lose their master
*/
public void testVerifyApiBlocksDuringPartition() throws Exception {
    internalCluster().startNodes(3, Settings.builder().putNull(NoMasterBlockService.NO_MASTER_BLOCK_SETTING.getKey()).build());
    // Makes sure that the get request can be executed on each node locally:
    assertAcked(prepareCreate("test").setSettings(
        Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 2)
    ));
    // Everything is stable now, it is now time to simulate evil...
    // but first make sure we have no initializing shards and all is green
    // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
    ensureGreen("test");
    TwoPartitions partitions = TwoPartitions.random(random(), internalCluster().getNodeNames());
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    assertEquals(1, partitions.getMinoritySide().size());
    final String isolatedNode = partitions.getMinoritySide().iterator().next();
    assertEquals(2, partitions.getMajoritySide().size());
    final String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
    // Simulate a network issue between the unlucky node and the rest of the cluster.
    networkDisruption.startDisrupting();
    // The unlucky node must report *no* master node, since it can't connect to the master and should keep
    // pinging until the network failure is resolved. However, it may take a bit before the node detects
    // that it has been cut off from the elected master.
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, NoMasterBlockService.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(30));
    logger.info("wait until the elected master has been removed and a new 2-node cluster has formed (via [{}])", isolatedNode);
    ensureStableCluster(2, nonIsolatedNode);
    for (String node : partitions.getMajoritySide()) {
        ClusterState nodeState = getNodeClusterState(node);
        boolean success = true;
        if (nodeState.nodes().getMasterNode() == null) {
            success = false;
        }
        if (!nodeState.blocks().global().isEmpty()) {
            success = false;
        }
        if (!success) {
            fail("node [" + node + "] has no master or has blocks, despite being on the right side of the partition. State dump:\n" + nodeState);
        }
    }
    networkDisruption.stopDisrupting();
    // Wait until the master node sees all 3 nodes again.
    ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()));
    logger.info("Verify no master block with {} set to {}", NoMasterBlockService.NO_MASTER_BLOCK_SETTING.getKey(), "all");
    client().admin().cluster().prepareUpdateSettings()
        .setTransientSettings(Settings.builder().put(NoMasterBlockService.NO_MASTER_BLOCK_SETTING.getKey(), "all"))
        .get();
    networkDisruption.startDisrupting();
    // With the block now set to "all", the isolated node must again report *no* master; it may take a bit
    // before the node detects that it has been cut off from the elected master.
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, NoMasterBlockService.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(30));
    // make sure we have a stable cluster and that cross-partition recoveries are canceled by the removal of the missing node;
    // the unresponsive partition causes recoveries to time out only after 15m (default), which would fail
    // the test due to unfreed resources
    ensureStableCluster(2, nonIsolatedNode);
}
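assertNoMaster and getNodeClusterState come from the test's disruption base class and are not shown on this page. Below is a minimal sketch of what such a no-master assertion could look like, assuming assertBusy and a local cluster-state read; it is an illustration, not the project's exact implementation.
// Sketch only: poll the node's local cluster state until it reports no master
// and (optionally) carries the expected global block.
private void assertNoMaster(final String node, final ClusterBlock expectedBlock, TimeValue maxWaitTime) throws Exception {
    assertBusy(() -> {
        ClusterState state = client(node).admin().cluster().prepareState().setLocal(true).get().getState();
        assertNull("node [" + node + "] still has a master: " + state.nodes().getMasterNode(), state.nodes().getMasterNode());
        if (expectedBlock != null) {
            assertTrue("expected block " + expectedBlock + " on node [" + node + "]",
                state.blocks().hasGlobalBlockWithId(expectedBlock.id()));
        }
    }, maxWaitTime.millis(), TimeUnit.MILLISECONDS);
}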
Use of org.opensearch.test.disruption.NetworkDisruption.TwoPartitions in project OpenSearch by opensearch-project.
Class MasterDisruptionIT, method testIsolateMasterAndVerifyClusterStateConsensus.
/**
 * This test isolates the master from the rest of the cluster, waits for a new master to be elected, heals the partition
 * and verifies that all nodes agree on the new cluster state
 */
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
    final List<String> nodes = startCluster(3);
    assertAcked(prepareCreate("test").setSettings(
        Settings.builder()
            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2))
            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomInt(2))
    ));
    ensureGreen();
    String isolatedNode = internalCluster().getMasterName();
    TwoPartitions partitions = isolateNode(isolatedNode);
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    networkDisruption.startDisrupting();
    String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
    // make sure the cluster reforms on the majority side
    ensureStableCluster(2, nonIsolatedNode);
    // make sure the isolated node picks up on things
    assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));
    // heal the partition
    networkDisruption.stopDisrupting();
    for (String node : nodes) {
        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()), true, node);
    }
    logger.info("issue a reroute");
    // trigger a reroute now, instead of waiting for the background reroute of RerouteService
    assertAcked(client().admin().cluster().prepareReroute());
    // and wait for it to finish and for the cluster to stabilize
    ensureGreen("test");
    // verify all cluster states are the same
    // use assertBusy to wait for cluster states to be applied (as publish_timeout has a low value)
    assertBusy(() -> {
        ClusterState state = null;
        for (String node : nodes) {
            ClusterState nodeState = getNodeClusterState(node);
            if (state == null) {
                state = nodeState;
                continue;
            }
            // assert the node states are identical
            try {
                assertEquals("unequal versions", state.version(), nodeState.version());
                assertEquals("unequal node count", state.nodes().getSize(), nodeState.nodes().getSize());
                assertEquals("different masters", state.nodes().getMasterNodeId(), nodeState.nodes().getMasterNodeId());
                assertEquals("different metadata version", state.metadata().version(), nodeState.metadata().version());
                assertEquals("different routing", state.routingTable().toString(), nodeState.routingTable().toString());
            } catch (AssertionError t) {
                fail("failed comparing cluster state: " + t.getMessage() + "\n"
                    + "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state
                    + "\n--- cluster state [" + node + "]: ---\n" + nodeState);
            }
        }
    });
}
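getNodeClusterState is another base-class helper not reproduced on this page. Under the assumption that it simply reads the state as currently applied on that node (so per-node disagreements remain visible), a sketch could look like this:
// Sketch only: read the cluster state applied on one specific node,
// without routing the request through the elected master.
private ClusterState getNodeClusterState(String node) {
    return client(node).admin().cluster().prepareState().setLocal(true).get().getState();
}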
Use of org.opensearch.test.disruption.NetworkDisruption.TwoPartitions in project OpenSearch by opensearch-project.
Class NetworkDisruptionTests, method testTwoPartitions.
public void testTwoPartitions() {
    Set<String> partition1 = generateRandomStringSet(1, 10);
    Set<String> partition2 = generateRandomStringSet(1, 10);
    TwoPartitions topology = new TwoPartitions(partition1, partition2);
    checkTwoPartitions(topology, partition1, partition2);
}
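checkTwoPartitions and generateRandomStringSet are private helpers of NetworkDisruptionTests that are not shown here. A hedged sketch of what the partition check could assert, assuming TwoPartitions exposes a disrupt(node1, node2) predicate that is true only for links crossing the two sides:
// Sketch only: links inside a side stay intact, links across the sides are disrupted.
private void checkTwoPartitions(TwoPartitions topology, Set<String> side1, Set<String> side2) {
    for (String n1 : side1) {
        for (String n2 : side1) {
            assertFalse("link inside side1 should not be disrupted", topology.disrupt(n1, n2));
        }
        for (String n2 : side2) {
            assertTrue("link across the partition should be disrupted", topology.disrupt(n1, n2));
            assertTrue("disruption should be symmetric", topology.disrupt(n2, n1));
        }
    }
    for (String n1 : side2) {
        for (String n2 : side2) {
            assertFalse("link inside side2 should not be disrupted", topology.disrupt(n1, n2));
        }
    }
}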
Use of org.opensearch.test.disruption.NetworkDisruption.TwoPartitions in project OpenSearch by opensearch-project.
Class NetworkDisruptionIT, method prepareDisruptedCluster.
/**
 * Creates a mixed-node cluster of 3 to 5 nodes and splits it into two parts.
 * The first part is guaranteed to hold at least a majority of the nodes,
 * so that a master can be elected on that side.
 */
private Tuple<Set<String>, Set<String>> prepareDisruptedCluster() {
    int numOfNodes = randomIntBetween(3, 5);
    internalCluster().setBootstrapMasterNodeIndex(numOfNodes - 1);
    Set<String> nodes = new HashSet<>(internalCluster().startNodes(numOfNodes, DISRUPTION_TUNED_SETTINGS));
    ensureGreen();
    assertThat(nodes.size(), greaterThanOrEqualTo(3));
    int majority = nodes.size() / 2 + 1;
    Set<String> side1 = new HashSet<>(randomSubsetOf(randomIntBetween(majority, nodes.size() - 1), nodes));
    assertThat(side1.size(), greaterThanOrEqualTo(majority));
    Set<String> side2 = new HashSet<>(nodes);
    side2.removeAll(side1);
    assertThat(side2.size(), greaterThanOrEqualTo(1));
    NetworkDisruption networkDisruption = new NetworkDisruption(new TwoPartitions(side1, side2), NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(networkDisruption);
    networkDisruption.startDisrupting();
    return Tuple.tuple(side1, side2);
}
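A caller receives the two sides as a Tuple, with the majority side first. The following is a hedged usage sketch, assuming the standard ensureStableCluster and getMasterName(viaNode) test helpers; the test name is illustrative only.
// Illustrative test (sketch): after the split, the elected master must sit on the majority side.
public void testMasterIsElectedOnMajoritySide() throws Exception {
    Tuple<Set<String>, Set<String>> sides = prepareDisruptedCluster();
    Set<String> majoritySide = sides.v1();
    String viaNode = majoritySide.iterator().next();
    // The majority side keeps (or re-elects) a master and forms a stable sub-cluster.
    ensureStableCluster(majoritySide.size(), viaNode);
    String master = internalCluster().getMasterName(viaNode);
    assertTrue("master [" + master + "] should be on the majority side", majoritySide.contains(master));
}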