use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.
the class NoClusterManagerNodeIT method testNoClusterManagerActionsWriteClusterManagerBlock.
public void testNoClusterManagerActionsWriteClusterManagerBlock() throws Exception {
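// Disable automatic index creation and limit the no-cluster-manager block to writes.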
Settings settings = Settings.builder()
    .put(AutoCreateIndex.AUTO_CREATE_INDEX_SETTING.getKey(), false)
    .put(NoMasterBlockService.NO_CLUSTER_MANAGER_BLOCK_SETTING.getKey(), "write")
    .build();
final List<String> nodes = internalCluster().startNodes(3, settings);
prepareCreate("test1").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 2)).get();
prepareCreate("test2").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 3).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)).get();
client().admin().cluster().prepareHealth("_all").setWaitForGreenStatus().get();
client().prepareIndex("test1").setId("1").setSource("field", "value1").get();
client().prepareIndex("test2").setId("1").setSource("field", "value1").get();
refresh();
ensureSearchable("test1", "test2");
ClusterStateResponse clusterState = client().admin().cluster().prepareState().get();
logger.info("Cluster state:\n{}", clusterState.getState());
final NetworkDisruption disruptionScheme = new NetworkDisruption(new IsolateAllNodes(new HashSet<>(nodes)), NetworkDisruption.DISCONNECT);
internalCluster().setDisruptionScheme(disruptionScheme);
disruptionScheme.startDisrupting();
final Client clientToClusterManagerlessNode = client();
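// Wait until the node-local cluster state carries the no-cluster-manager block.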
assertBusy(() -> {
ClusterState state = clientToClusterManagerlessNode.admin().cluster().prepareState().setLocal(true).get().getState();
assertTrue(state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID));
});
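// Reads and searches should still succeed, since only writes are blocked.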
GetResponse getResponse = clientToClusterManagerlessNode.prepareGet("test1", "1").get();
assertExists(getResponse);
SearchResponse countResponse = clientToClusterManagerlessNode.prepareSearch("test1").setAllowPartialSearchResults(true).setSize(0).get();
assertHitCount(countResponse, 1L);
logger.info("--> here 3");
SearchResponse searchResponse = clientToClusterManagerlessNode.prepareSearch("test1").setAllowPartialSearchResults(true).get();
assertHitCount(searchResponse, 1L);
countResponse = clientToClusterManagerlessNode.prepareSearch("test2").setAllowPartialSearchResults(true).setSize(0).get();
assertThat(countResponse.getTotalShards(), equalTo(3));
assertThat(countResponse.getSuccessfulShards(), equalTo(1));
TimeValue timeout = TimeValue.timeValueMillis(200);
long now = System.currentTimeMillis();
try {
clientToClusterManagerlessNode.prepareUpdate("test1", "1").setDoc(Requests.INDEX_CONTENT_TYPE, "field", "value2").setTimeout(timeout).get();
fail("Expected ClusterBlockException");
} catch (ClusterBlockException e) {
assertThat(System.currentTimeMillis() - now, greaterThan(timeout.millis() - 50));
assertThat(e.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
} catch (Exception e) {
logger.info("unexpected", e);
throw e;
}
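// Index requests are rejected the same way.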
try {
clientToClusterManagerlessNode.prepareIndex("test1").setId("1").setSource(XContentFactory.jsonBuilder().startObject().endObject()).setTimeout(timeout).get();
fail("Expected ClusterBlockException");
} catch (ClusterBlockException e) {
assertThat(e.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
}
internalCluster().clearDisruptionScheme(true);
}
use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.
the class PrimaryAllocationIT method testPrimaryReplicaResyncFailed.
/**
* This test asserts that replicas that fail to execute resync operations are failed but not marked as stale.
*/
public void testPrimaryReplicaResyncFailed() throws Exception {
String clusterManager = internalCluster().startClusterManagerOnlyNode(Settings.EMPTY);
final int numberOfReplicas = between(2, 3);
final String oldPrimary = internalCluster().startDataOnlyNode();
assertAcked(prepareCreate("test", Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1).put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas)));
final ShardId shardId = new ShardId(clusterService().state().metadata().index("test").getIndex(), 0);
final Set<String> replicaNodes = new HashSet<>(internalCluster().startDataOnlyNodes(numberOfReplicas));
ensureGreen();
String timeout = randomFrom("0s", "1s", "2s");
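// Prevent shard reallocation and bound the replication retry timeout before disrupting the cluster.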
assertAcked(
    client(clusterManager).admin().cluster().prepareUpdateSettings()
        .setTransientSettings(Settings.builder().put("cluster.routing.allocation.enable", "none"))
        .setPersistentSettings(Settings.builder().put("indices.replication.retry_timeout", timeout))
        .get()
);
logger.info("--> Indexing with gap in seqno to ensure that some operations will be replayed in resync");
long numDocs = scaledRandomIntBetween(5, 50);
for (int i = 0; i < numDocs; i++) {
IndexResponse indexResult = index("test", "doc", Long.toString(i));
assertThat(indexResult.getShardInfo().getSuccessful(), equalTo(numberOfReplicas + 1));
}
final IndexShard oldPrimaryShard = internalCluster().getInstance(IndicesService.class, oldPrimary).getShardOrNull(shardId);
// Make gap in seqno.
EngineTestCase.generateNewSeqNo(IndexShardTestCase.getEngine(oldPrimaryShard));
long moreDocs = scaledRandomIntBetween(1, 10);
for (int i = 0; i < moreDocs; i++) {
IndexResponse indexResult = index("test", "doc", Long.toString(numDocs + i));
assertThat(indexResult.getShardInfo().getSuccessful(), equalTo(numberOfReplicas + 1));
}
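// Split the replica nodes into two sides and disconnect them from each other.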
final Set<String> replicasSide1 = Sets.newHashSet(randomSubsetOf(between(1, numberOfReplicas - 1), replicaNodes));
final Set<String> replicasSide2 = Sets.difference(replicaNodes, replicasSide1);
NetworkDisruption partition = new NetworkDisruption(new TwoPartitions(replicasSide1, replicasSide2), NetworkDisruption.DISCONNECT);
internalCluster().setDisruptionScheme(partition);
logger.info("--> isolating some replicas during primary-replica resync");
partition.startDisrupting();
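// Stop the old primary so one replica is promoted and runs a primary-replica resync across the partition.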
internalCluster().stopRandomNode(InternalTestCluster.nameFilter(oldPrimary));
// Check that replicas on one side are failed but not marked as stale.
assertBusy(() -> {
ClusterState state = client(clusterManager).admin().cluster().prepareState().get().getState();
final IndexShardRoutingTable shardRoutingTable = state.routingTable().shardRoutingTable(shardId);
final String newPrimaryNode = state.getRoutingNodes().node(shardRoutingTable.primaryShard().currentNodeId()).node().getName();
assertThat(newPrimaryNode, not(equalTo(oldPrimary)));
Set<String> selectedPartition = replicasSide1.contains(newPrimaryNode) ? replicasSide1 : replicasSide2;
assertThat(shardRoutingTable.activeShards(), hasSize(selectedPartition.size()));
for (ShardRouting activeShard : shardRoutingTable.activeShards()) {
assertThat(state.getRoutingNodes().node(activeShard.currentNodeId()).node().getName(), is(in(selectedPartition)));
}
assertThat(state.metadata().index("test").inSyncAllocationIds(shardId.id()), hasSize(numberOfReplicas + 1));
}, 1, TimeUnit.MINUTES);
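// Re-enable allocation and heal the partition so the failed replicas can recover.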
assertAcked(
    client(clusterManager).admin().cluster().prepareUpdateSettings()
        .setTransientSettings(Settings.builder().put("cluster.routing.allocation.enable", "all"))
        .get()
);
partition.stopDisrupting();
partition.ensureHealthy(internalCluster());
logger.info("--> stop disrupting network and re-enable allocation");
assertBusy(() -> {
ClusterState state = client(clusterManager).admin().cluster().prepareState().get().getState();
assertThat(state.routingTable().shardRoutingTable(shardId).activeShards(), hasSize(numberOfReplicas));
assertThat(state.metadata().index("test").inSyncAllocationIds(shardId.id()), hasSize(numberOfReplicas + 1));
for (String node : replicaNodes) {
IndexShard shard = internalCluster().getInstance(IndicesService.class, node).getShardOrNull(shardId);
assertThat(shard.getLocalCheckpoint(), equalTo(numDocs + moreDocs));
}
}, 30, TimeUnit.SECONDS);
internalCluster().assertConsistentHistoryBetweenTranslogAndLuceneIndex();
}
use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.
the class SnapshotShardsServiceIT method testRetryPostingSnapshotStatusMessages.
public void testRetryPostingSnapshotStatusMessages() throws Exception {
internalCluster().startClusterManagerOnlyNode();
internalCluster().startDataOnlyNode();
createRepository("test-repo", "mock");
final int shards = between(1, 10);
assertAcked(prepareCreate("test-index", 0, indexSettingsNoReplicas(shards)));
ensureGreen();
indexRandomDocs("test-index", scaledRandomIntBetween(50, 100));
logger.info("--> blocking repository");
String blockedNode = blockNodeWithIndex("test-repo", "test-index");
dataNodeClient().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(false).setIndices("test-index").get();
waitForBlock(blockedNode, "test-repo", TimeValue.timeValueSeconds(60));
final SnapshotId snapshotId = getSnapshot("test-repo", "test-snap").snapshotId();
logger.info("--> start disrupting cluster");
final NetworkDisruption networkDisruption = isolateClusterManagerDisruption(NetworkDisruption.NetworkDelay.random(random()));
internalCluster().setDisruptionScheme(networkDisruption);
networkDisruption.startDisrupting();
logger.info("--> unblocking repository");
unblockNode("test-repo", blockedNode);
// Retrieve snapshot status from the data node.
SnapshotShardsService snapshotShardsService = internalCluster().getInstance(SnapshotShardsService.class, blockedNode);
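// The data node should eventually report every shard of the snapshot as DONE, even though the cluster-manager is unreachable.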
assertBusy(() -> {
final Snapshot snapshot = new Snapshot("test-repo", snapshotId);
List<IndexShardSnapshotStatus.Stage> stages = snapshotShardsService.currentSnapshotShards(snapshot)
    .values()
    .stream()
    .map(status -> status.asCopy().getStage())
    .collect(Collectors.toList());
assertThat(stages, hasSize(shards));
assertThat(stages, everyItem(equalTo(IndexShardSnapshotStatus.Stage.DONE)));
}, 30L, TimeUnit.SECONDS);
logger.info("--> stop disrupting cluster");
networkDisruption.stopDisrupting();
internalCluster().clearDisruptionScheme(true);
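// After the disruption is cleared, the retried status messages should let the snapshot complete with all shards successful.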
assertBusy(() -> {
SnapshotInfo snapshotInfo = getSnapshot("test-repo", "test-snap");
logger.info("Snapshot status [{}], successfulShards [{}]", snapshotInfo.state(), snapshotInfo.successfulShards());
assertThat(snapshotInfo.state(), equalTo(SnapshotState.SUCCESS));
assertThat(snapshotInfo.successfulShards(), equalTo(shards));
}, 30L, TimeUnit.SECONDS);
}
use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.
the class ConcurrentSnapshotsIT method testQueuedSnapshotOperationsAndBrokenRepoOnClusterManagerFailOver2.
public void testQueuedSnapshotOperationsAndBrokenRepoOnClusterManagerFailOver2() throws Exception {
disableRepoConsistencyCheck("This test corrupts the repository on purpose");
internalCluster().startMasterOnlyNodes(3);
final String dataNode = internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
final Path repoPath = randomRepoPath();
createRepository(repoName, "mock", repoPath);
createIndexWithContent("index-one");
createNSnapshots(repoName, randomIntBetween(2, 5));
final long generation = getRepositoryData(repoName).getGenId();
final String clusterManagerNode = internalCluster().getMasterName();
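// Block the current cluster-manager from finalizing snapshots in the repository.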
blockMasterFromFinalizingSnapshotOnIndexFile(repoName);
final ActionFuture<CreateSnapshotResponse> snapshotThree = startFullSnapshotFromNonClusterManagerClient(repoName, "snapshot-three");
waitForBlock(clusterManagerNode, repoName, TimeValue.timeValueSeconds(30L));
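// Corrupt the repository's index-N blob at the current generation.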
corruptIndexN(repoPath, generation);
final ActionFuture<CreateSnapshotResponse> snapshotFour = startFullSnapshotFromNonClusterManagerClient(repoName, "snapshot-four");
awaitNumberOfSnapshotsInProgress(2);
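// Isolate the current cluster-manager so a new one takes over while two snapshots are queued.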
final NetworkDisruption networkDisruption = isolateClusterManagerDisruption(NetworkDisruption.DISCONNECT);
internalCluster().setDisruptionScheme(networkDisruption);
networkDisruption.startDisrupting();
ensureStableCluster(3, dataNode);
unblockNode(repoName, clusterManagerNode);
networkDisruption.stopDisrupting();
awaitNoMoreRunningOperations();
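// With the repository corrupted, both queued snapshots are expected to fail.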
expectThrows(OpenSearchException.class, snapshotThree::actionGet);
expectThrows(OpenSearchException.class, snapshotFour::actionGet);
}
use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.
the class ConcurrentSnapshotsIT method testClusterManagerFailoverOnFinalizationLoop.
public void testClusterManagerFailoverOnFinalizationLoop() throws Exception {
internalCluster().startMasterOnlyNodes(3);
final String dataNode = internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "mock");
createIndexWithContent("index-test");
final NetworkDisruption networkDisruption = isolateClusterManagerDisruption(NetworkDisruption.DISCONNECT);
internalCluster().setDisruptionScheme(networkDisruption);
final List<String> snapshotNames = createNSnapshots(repoName, randomIntBetween(2, 5));
final String clusterManagerName = internalCluster().getMasterName();
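// Block the cluster-manager from deleting the repository's index-N blob.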
blockMasterFromDeletingIndexNFile(repoName);
final ActionFuture<CreateSnapshotResponse> snapshotThree = startFullSnapshotFromClusterManagerClient(repoName, "snap-other");
waitForBlock(clusterManagerName, repoName, TimeValue.timeValueSeconds(30L));
final String snapshotOne = snapshotNames.get(0);
final ActionFuture<AcknowledgedResponse> deleteSnapshotOne = startDeleteSnapshot(repoName, snapshotOne);
awaitNDeletionsInProgress(1);
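// Fail over the cluster-manager while one snapshot and one snapshot deletion are queued.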
networkDisruption.startDisrupting();
ensureStableCluster(3, dataNode);
unblockNode(repoName, clusterManagerName);
networkDisruption.stopDisrupting();
ensureStableCluster(4);
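// The queued snapshot should still complete on the new cluster-manager; the delete may fail with a RepositoryException.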
assertSuccessful(snapshotThree);
try {
deleteSnapshotOne.actionGet();
} catch (RepositoryException re) {
// ignored
}
awaitNoMoreRunningOperations();
}