use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testRejoinDocumentExistsInAllShardCopies.
/**
 * Verifies that a document indexed on the majority side of a partition can be read through every
 * node, including the previously isolated one, once the partition has been healed.
 */
public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
    final List<String> startedNodes = startCluster(3);
    // a single shard with two replicas on the three-node cluster
    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2))
            .get());
    ensureGreen("test");

    // shuffle a private copy so the isolated and the connected node are picked at random
    final List<String> shuffledNodes = new ArrayList<>(startedNodes);
    Collections.shuffle(shuffledNodes, random());
    final String isolatedNode = shuffledNodes.get(0);
    final String notIsolatedNode = shuffledNodes.get(1);

    final TwoPartitions isolation = isolateNode(isolatedNode);
    final NetworkDisruption disruption = addRandomDisruptionType(isolation);
    disruption.startDisrupting();
    ensureStableCluster(2, notIsolatedNode);
    final boolean healthTimedOut =
        client(notIsolatedNode).admin().cluster().prepareHealth("test").setWaitForYellowStatus().get().isTimedOut();
    assertFalse(healthTimedOut);

    // index through a node that is still connected while the disruption is active
    final IndexResponse indexResponse =
        internalCluster().client(notIsolatedNode).prepareIndex("test", "type").setSource("field", "value").get();
    assertThat(indexResponse.getVersion(), equalTo(1L));

    logger.info("Verifying if document exists via node[{}]", notIsolatedNode);
    final GetResponse duringDisruption = internalCluster().client(notIsolatedNode)
        .prepareGet("test", "type", indexResponse.getId())
        .setPreference("_local")
        .get();
    assertThat(duringDisruption.isExists(), is(true));
    assertThat(duringDisruption.getVersion(), equalTo(1L));
    assertThat(duringDisruption.getId(), equalTo(indexResponse.getId()));

    // stop the disruption and wait for the full cluster to come back
    disruption.stopDisrupting();
    ensureStableCluster(3);
    ensureGreen("test");

    // every node must now return the document when asked with the "_local" preference
    for (String queriedNode : shuffledNodes) {
        logger.info("Verifying if document exists after isolating node[{}] via node[{}]", isolatedNode, queriedNode);
        final GetResponse afterHeal = internalCluster().client(queriedNode)
            .prepareGet("test", "type", indexResponse.getId())
            .setPreference("_local")
            .get();
        assertThat(afterHeal.isExists(), is(true));
        assertThat(afterHeal.getVersion(), equalTo(1L));
        assertThat(afterHeal.getId(), equalTo(indexResponse.getId()));
    }
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class PrimaryAllocationIT method createStaleReplicaScenario.
/**
 * Builds a scenario in which the only surviving shard copy is stale.
 * <p>
 * Steps: start one master-only and two data-only nodes, create a one-shard/one-replica index and
 * index a document; disconnect the node holding the primary from the master and the replica node;
 * index another document through the replica node; stop that node; heal the partition; finally
 * verify via reroute that both shard copies are reported as unassigned.
 */
private void createStaleReplicaScenario() throws Exception {
    logger.info("--> starting 3 nodes, 1 master, 2 data");
    final String master = internalCluster().startMasterOnlyNode(Settings.EMPTY);
    internalCluster().startDataOnlyNodes(2);
    assertAcked(
        client().admin().indices().prepareCreate("test")
            .setSettings(Settings.builder().put("index.number_of_shards", 1).put("index.number_of_replicas", 1))
            .get());
    ensureGreen();

    logger.info("--> indexing...");
    client().prepareIndex("test", "type1")
        .setSource(jsonBuilder().startObject().field("field", "value1").endObject())
        .get();
    refresh();

    final ClusterState clusterState = client().admin().cluster().prepareState().all().get().getState();
    final List<ShardRouting> shardCopies = clusterState.routingTable().allShards("test");
    assertThat(shardCopies.size(), equalTo(2));

    // if the first copy is not the primary, the second one is taken to be it
    final int primaryIdx = shardCopies.get(0).primary() ? 0 : 1;
    final int replicaIdx = 1 - primaryIdx;
    final String primaryNode =
        clusterState.getRoutingNodes().node(shardCopies.get(primaryIdx).currentNodeId()).node().getName();
    final String replicaNode =
        clusterState.getRoutingNodes().node(shardCopies.get(replicaIdx).currentNodeId()).node().getName();

    // master + replica node on one side, the primary's node alone on the other
    final NetworkDisruption partition = new NetworkDisruption(
        new TwoPartitions(Sets.newHashSet(master, replicaNode), Collections.singleton(primaryNode)),
        new NetworkDisconnect());
    internalCluster().setDisruptionScheme(partition);

    logger.info("--> partitioning node with primary shard from rest of cluster");
    partition.startDisrupting();
    ensureStableCluster(2, master);

    logger.info("--> index a document into previous replica shard (that is now primary)");
    client(replicaNode).prepareIndex("test", "type1")
        .setSource(jsonBuilder().startObject().field("field", "value1").endObject())
        .get();

    logger.info("--> shut down node that has new acknowledged document");
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
    ensureStableCluster(1, master);

    partition.stopDisrupting();
    logger.info("--> waiting for node with old primary shard to rejoin the cluster");
    ensureStableCluster(2, master);

    logger.info("--> check that old primary shard does not get promoted to primary again");
    // first reroute: triggers the shard state fetches; wait until none are in flight any more
    client(master).admin().cluster().prepareReroute().get();
    assertBusy(() -> {
        final GatewayAllocator allocator = internalCluster().getInstance(GatewayAllocator.class, master);
        assertThat(allocator.getNumberOfInFlightFetch(), equalTo(0));
    });
    // second reroute: both shard copies are expected to remain unassigned
    final ClusterState reroutedState = client(master).admin().cluster().prepareReroute().get().getState();
    assertThat(reroutedState.getRoutingNodes().unassigned().size(), equalTo(2));
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class IndexingMasterFailoverIT method testMasterFailoverDuringIndexingWithMappingChanges.
/**
 * Indexing operations that introduce mapping changes need a blocking request to the master node to
 * update the mapping. If the master is being disrupted, or cannot commit cluster state changes, the
 * request has to be retried within the timeout limits. That retry logic lives in
 * TransportMasterNodeAction and is exercised here by a master failover that happens while documents
 * with new fields are being indexed.
 */
public void testMasterFailoverDuringIndexingWithMappingChanges() throws Throwable {
    logger.info("--> start 4 nodes, 3 master, 1 data");
    final Settings sharedSettings = Settings.builder()
        // short fault-detection settings, for hitting simulated network failures quickly
        .put(FaultDetection.PING_TIMEOUT_SETTING.getKey(), "1s")
        .put(FaultDetection.PING_RETRIES_SETTING.getKey(), "1")
        // still long enough to induce failures, but not so long that the test times out
        .put("discovery.zen.join_timeout", "10s")
        // <-- for hitting simulated network failures quickly
        .put(DiscoverySettings.PUBLISH_TIMEOUT_SETTING.getKey(), "1s")
        .put(ElectMasterService.DISCOVERY_ZEN_MINIMUM_MASTER_NODES_SETTING.getKey(), 2)
        .build();
    internalCluster().startMasterOnlyNodes(3, sharedSettings);
    final String dataNode = internalCluster().startDataOnlyNode(sharedSettings);

    logger.info("--> wait for all nodes to join the cluster");
    ensureStableCluster(4);

    // We index data with mapping changes into cluster and have master failover at same time
    client().admin().indices().prepareCreate("myindex")
        .setSettings(Settings.builder().put("index.number_of_shards", 1).put("index.number_of_replicas", 0))
        .get();
    ensureGreen("myindex");

    // two parties: the indexing thread and this test thread
    final CyclicBarrier startBarrier = new CyclicBarrier(2);
    final Thread indexingThread = new Thread(() -> {
        try {
            startBarrier.await();
        } catch (InterruptedException e) {
            logger.warn("Barrier interrupted", e);
            return;
        } catch (BrokenBarrierException e) {
            logger.warn("Broken barrier", e);
            return;
        }
        for (int docNumber = 0; docNumber < 10; docNumber++) {
            // index data with mapping changes: each document uses a field name not used before
            final IndexResponse response =
                client(dataNode).prepareIndex("myindex", "mytype").setSource("field_" + docNumber, "val").get();
            assertEquals(DocWriteResponse.Result.CREATED, response.getResult());
        }
    }, "indexingThread");
    indexingThread.start();
    startBarrier.await();

    // interrupt communication between master and other nodes in cluster
    final String master = internalCluster().getMasterName();
    final Set<String> otherNodes = new HashSet<>(Arrays.asList(internalCluster().getNodeNames()));
    otherNodes.remove(master);
    final NetworkDisruption partition =
        new NetworkDisruption(new TwoPartitions(Collections.singleton(master), otherNodes), new NetworkDisconnect());
    internalCluster().setDisruptionScheme(partition);

    logger.info("--> disrupting network");
    partition.startDisrupting();

    logger.info("--> waiting for new master to be elected");
    ensureStableCluster(3, dataNode);

    partition.stopDisrupting();
    logger.info("--> waiting to heal");
    ensureStableCluster(4);

    indexingThread.join();
    ensureGreen("myindex");
    refresh();
    // all ten documents written by the indexing thread must be searchable
    assertThat(client().prepareSearch("myindex").get().getHits().getTotalHits(), equalTo(10L));
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method testSendingShardFailure.
/**
 * Simulates the handling of a "shard failed" notification that is sent while a node is isolated.
 * After the partition heals, the listener must have been notified of success and the failed shard's
 * allocation id must no longer appear in the routing table of the "test" index.
 */
public void testSendingShardFailure() throws Exception {
    final List<String> allNodes = startCluster(3, 2);
    final String masterNode = internalCluster().getMasterName();
    final List<String> nonMasterNodes = new ArrayList<>();
    for (String candidate : allNodes) {
        if (!candidate.equals(masterNode)) {
            nonMasterNodes.add(candidate);
        }
    }
    final String nonMasterNode = randomFrom(nonMasterNodes);

    assertAcked(
        prepareCreate("test")
            .setSettings(
                Settings.builder()
                    .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 3)
                    .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 2)));
    ensureGreen();

    final String nonMasterNodeId = internalCluster().clusterService(nonMasterNode).localNode().getId();

    // fail a random shard
    final List<ShardRouting> startedShards =
        clusterService().state().getRoutingNodes().node(nonMasterNodeId).shardsWithState(ShardRoutingState.STARTED);
    final ShardRouting failedShard = randomFrom(startedShards);
    final ShardStateAction shardStateAction = internalCluster().getInstance(ShardStateAction.class, nonMasterNode);
    final CountDownLatch listenerNotified = new CountDownLatch(1);
    final AtomicBoolean success = new AtomicBoolean();

    final String isolatedNode = randomBoolean() ? masterNode : nonMasterNode;
    final TwoPartitions partitions = isolateNode(isolatedNode);
    // we cannot use the NetworkUnresponsive disruption type here as it will swallow the "shard failed" request, calling neither
    // onSuccess nor onFailure on the provided listener.
    final NetworkLinkDisruptionType disruptionType = new NetworkDisconnect();
    final NetworkDisruption networkDisruption = new NetworkDisruption(partitions, disruptionType);
    setDisruptionScheme(networkDisruption);
    networkDisruption.startDisrupting();

    final ShardStateAction.Listener listener = new ShardStateAction.Listener() {
        @Override
        public void onSuccess() {
            success.set(true);
            listenerNotified.countDown();
        }

        @Override
        public void onFailure(Exception e) {
            success.set(false);
            listenerNotified.countDown();
            assert false;
        }
    };
    shardStateAction.localShardFailed(failedShard, "simulated", new CorruptIndexException("simulated", (String) null), listener);

    if (!isolatedNode.equals(nonMasterNode)) {
        // the node reporting the failure was not the isolated one: wait for a stable two-node cluster as seen from it
        ensureStableCluster(2, nonMasterNode);
    } else {
        // the node reporting the failure was itself isolated
        assertNoMaster(nonMasterNode);
    }

    // heal the partition
    networkDisruption.removeAndEnsureHealthy(internalCluster());

    // the cluster should stabilize
    ensureStableCluster(3);

    listenerNotified.await();

    // the listener should be notified
    assertTrue(success.get());

    // the failed shard should be gone
    for (ShardRouting routing : clusterService().state().getRoutingTable().allShards("test")) {
        assertThat(routing.allocationId(), not(equalTo(failedShard.allocationId())));
    }
}
use of org.elasticsearch.test.disruption.NetworkDisruption in project elasticsearch by elastic.
the class DiscoveryWithServiceDisruptionsIT method addRandomDisruptionScheme.
/**
 * Picks a random disruption scheme, registers it through {@code setDisruptionScheme} and returns it.
 * <p>
 * The link topology and the link disruption type are always drawn, even when the rarely-taken
 * {@code SlowClusterStateProcessing} branch is chosen and they end up unused.
 *
 * @return the scheme that was registered
 */
private ServiceDisruptionScheme addRandomDisruptionScheme() {
    // TODO: add partial partitions
    final DisruptedLinks links = randomBoolean()
        ? TwoPartitions.random(random(), internalCluster().getNodeNames())
        : Bridge.random(random(), internalCluster().getNodeNames());

    final int typeChoice = randomInt(2);
    final NetworkLinkDisruptionType linkDisruption;
    if (typeChoice == 0) {
        linkDisruption = new NetworkUnresponsive();
    } else if (typeChoice == 1) {
        linkDisruption = new NetworkDisconnect();
    } else if (typeChoice == 2) {
        linkDisruption = NetworkDelay.random(random());
    } else {
        throw new IllegalArgumentException();
    }

    final ServiceDisruptionScheme chosen = rarely()
        ? new SlowClusterStateProcessing(random())
        : new NetworkDisruption(links, linkDisruption);
    setDisruptionScheme(chosen);
    return chosen;
}
Aggregations