Search in sources :

Example 26 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class NoClusterManagerNodeIT method testNoClusterManagerActionsWriteClusterManagerBlock.

/**
 * Checks the behaviour of a three-node cluster whose no-cluster-manager block is configured as
 * {@code "write"} once every node has been isolated from the others.
 * <p>
 * After the no-cluster-manager global block shows up in the local cluster state, the test expects:
 * <ul>
 *   <li>a get and searches against {@code test1} (1 shard, 2 replicas) to still return the indexed document;</li>
 *   <li>a partial-results search against {@code test2} (3 shards, 0 replicas) to report 3 total shards
 *       of which 1 is successful;</li>
 *   <li>an update and an index request (each with a 200ms timeout) to fail with a
 *       {@link ClusterBlockException} whose status is {@code SERVICE_UNAVAILABLE}.</li>
 * </ul>
 *
 * @throws Exception if any cluster operation fails unexpectedly
 */
public void testNoClusterManagerActionsWriteClusterManagerBlock() throws Exception {
    Settings settings = Settings.builder()
        .put(AutoCreateIndex.AUTO_CREATE_INDEX_SETTING.getKey(), false)
        .put(NoMasterBlockService.NO_CLUSTER_MANAGER_BLOCK_SETTING.getKey(), "write")
        .build();
    final List<String> nodes = internalCluster().startNodes(3, settings);
    prepareCreate("test1").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 2)).get();
    prepareCreate("test2").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 3).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)).get();
    client().admin().cluster().prepareHealth("_all").setWaitForGreenStatus().get();
    client().prepareIndex("test1").setId("1").setSource("field", "value1").get();
    client().prepareIndex("test2").setId("1").setSource("field", "value1").get();
    refresh();
    ensureSearchable("test1", "test2");
    ClusterStateResponse clusterState = client().admin().cluster().prepareState().get();
    logger.info("Cluster state:\n{}", clusterState.getState());

    // Cut every node off from every other node so that no cluster manager can be elected.
    final NetworkDisruption disruptionScheme = new NetworkDisruption(new IsolateAllNodes(new HashSet<>(nodes)), NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(disruptionScheme);
    disruptionScheme.startDisrupting();
    final Client clientToClusterManagerlessNode = client();

    // Wait until the node behind the client has applied the no-cluster-manager block locally.
    assertBusy(() -> {
        ClusterState state = clientToClusterManagerlessNode.admin().cluster().prepareState().setLocal(true).get().getState();
        assertTrue(state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID));
    });

    // Reads against test1 must keep working while the block is in place.
    GetResponse getResponse = clientToClusterManagerlessNode.prepareGet("test1", "1").get();
    assertExists(getResponse);
    SearchResponse countResponse = clientToClusterManagerlessNode.prepareSearch("test1").setAllowPartialSearchResults(true).setSize(0).get();
    assertHitCount(countResponse, 1L);
    logger.info("--> here 3");
    SearchResponse searchResponse = clientToClusterManagerlessNode.prepareSearch("test1").setAllowPartialSearchResults(true).get();
    assertHitCount(searchResponse, 1L);

    // test2 has no replicas: only one of its three shards is expected to answer.
    countResponse = clientToClusterManagerlessNode.prepareSearch("test2").setAllowPartialSearchResults(true).setSize(0).get();
    assertThat(countResponse.getTotalShards(), equalTo(3));
    assertThat(countResponse.getSuccessfulShards(), equalTo(1));

    TimeValue timeout = TimeValue.timeValueMillis(200);
    // Use the monotonic clock for the elapsed-time check: wall-clock time (currentTimeMillis) may jump
    // backwards or forwards when the system clock is adjusted, which would make the assertion flaky.
    final long startNanos = System.nanoTime();
    try {
        clientToClusterManagerlessNode.prepareUpdate("test1", "1").setDoc(Requests.INDEX_CONTENT_TYPE, "field", "value2").setTimeout(timeout).get();
        fail("Expected ClusterBlockException");
    } catch (ClusterBlockException e) {
        // The request should have waited for (roughly) the whole timeout before giving up; allow 50ms of slack.
        final long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
        assertThat(elapsedMillis, greaterThan(timeout.millis() - 50));
        assertThat(e.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
    } catch (Exception e) {
        // Log any other failure before propagating it so the cause is visible in the test output.
        logger.info("unexpected", e);
        throw e;
    }
    try {
        clientToClusterManagerlessNode.prepareIndex("test1").setId("1").setSource(XContentFactory.jsonBuilder().startObject().endObject()).setTimeout(timeout).get();
        fail("Expected ClusterBlockException");
    } catch (ClusterBlockException e) {
        assertThat(e.status(), equalTo(RestStatus.SERVICE_UNAVAILABLE));
    }
    internalCluster().clearDisruptionScheme(true);
}
Also used : IsolateAllNodes(org.opensearch.test.disruption.NetworkDisruption.IsolateAllNodes) ClusterStateResponse(org.opensearch.action.admin.cluster.state.ClusterStateResponse) GetResponse(org.opensearch.action.get.GetResponse) ClusterBlockException(org.opensearch.cluster.block.ClusterBlockException) MasterNotDiscoveredException(org.opensearch.discovery.MasterNotDiscoveredException) SearchResponse(org.opensearch.action.search.SearchResponse) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) Client(org.opensearch.client.Client) Settings(org.opensearch.common.settings.Settings) TimeValue(org.opensearch.common.unit.TimeValue) HashSet(java.util.HashSet)

Example 27 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class PrimaryAllocationIT method testPrimaryReplicaResyncFailed.

/**
 * This test asserts that replicas which fail to execute resync operations are failed but not marked as stale.
 * <p>
 * Outline: index documents with a gap in the sequence numbers, partition the replica nodes into two sides,
 * stop the node holding the primary, and check that only the shard copies on the new primary's side remain
 * active while the in-sync allocation id set keeps its original size. After the partition is healed and
 * allocation is re-enabled, every replica node is expected to reach the same local checkpoint.
 *
 * @throws Exception if any cluster operation fails unexpectedly
 */
public void testPrimaryReplicaResyncFailed() throws Exception {
    String clusterManager = internalCluster().startClusterManagerOnlyNode(Settings.EMPTY);
    final int numberOfReplicas = between(2, 3);
    final String oldPrimary = internalCluster().startDataOnlyNode();
    assertAcked(prepareCreate("test", Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, 1).put(SETTING_NUMBER_OF_REPLICAS, numberOfReplicas)));
    final ShardId shardId = new ShardId(clusterService().state().metadata().index("test").getIndex(), 0);
    final Set<String> replicaNodes = new HashSet<>(internalCluster().startDataOnlyNodes(numberOfReplicas));
    ensureGreen();
    String timeout = randomFrom("0s", "1s", "2s");
    // Disable allocation so that failed replicas are not re-assigned while the partition is active.
    assertAcked(client(clusterManager).admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder().put("cluster.routing.allocation.enable", "none")).setPersistentSettings(Settings.builder().put("indices.replication.retry_timeout", timeout)).get());
    logger.info("--> Indexing with gap in seqno to ensure that some operations will be replayed in resync");
    long numDocs = scaledRandomIntBetween(5, 50);
    for (int i = 0; i < numDocs; i++) {
        IndexResponse indexResult = index("test", "doc", Long.toString(i));
        assertThat(indexResult.getShardInfo().getSuccessful(), equalTo(numberOfReplicas + 1));
    }
    final IndexShard oldPrimaryShard = internalCluster().getInstance(IndicesService.class, oldPrimary).getShardOrNull(shardId);
    // Make gap in seqno.
    EngineTestCase.generateNewSeqNo(IndexShardTestCase.getEngine(oldPrimaryShard));
    long moreDocs = scaledRandomIntBetween(1, 10);
    for (int i = 0; i < moreDocs; i++) {
        IndexResponse indexResult = index("test", "doc", Long.toString(numDocs + i));
        assertThat(indexResult.getShardInfo().getSuccessful(), equalTo(numberOfReplicas + 1));
    }
    // Split the replica nodes into two non-empty sides that cannot talk to each other.
    final Set<String> replicasSide1 = Sets.newHashSet(randomSubsetOf(between(1, numberOfReplicas - 1), replicaNodes));
    final Set<String> replicasSide2 = Sets.difference(replicaNodes, replicasSide1);
    NetworkDisruption partition = new NetworkDisruption(new TwoPartitions(replicasSide1, replicasSide2), NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(partition);
    logger.info("--> isolating some replicas during primary-replica resync");
    partition.startDisrupting();
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(oldPrimary));
    // Checks that we fail the replicas on one side but do not mark them as stale.
    assertBusy(() -> {
        ClusterState state = client(clusterManager).admin().cluster().prepareState().get().getState();
        final IndexShardRoutingTable shardRoutingTable = state.routingTable().shardRoutingTable(shardId);
        final String newPrimaryNode = state.getRoutingNodes().node(shardRoutingTable.primary.currentNodeId()).node().getName();
        assertThat(newPrimaryNode, not(equalTo(oldPrimary)));
        // Only the copies on the same side of the partition as the new primary should stay active.
        Set<String> selectedPartition = replicasSide1.contains(newPrimaryNode) ? replicasSide1 : replicasSide2;
        assertThat(shardRoutingTable.activeShards(), hasSize(selectedPartition.size()));
        for (ShardRouting activeShard : shardRoutingTable.activeShards()) {
            assertThat(state.getRoutingNodes().node(activeShard.currentNodeId()).node().getName(), is(in(selectedPartition)));
        }
        // The in-sync set still has its original size, i.e. the failed copies were not removed from it.
        assertThat(state.metadata().index("test").inSyncAllocationIds(shardId.id()), hasSize(numberOfReplicas + 1));
    }, 1, TimeUnit.MINUTES);
    // Announce the step before performing it so the log reflects the actual order of events.
    logger.info("--> stop disrupting network and re-enable allocation");
    assertAcked(client(clusterManager).admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder().put("cluster.routing.allocation.enable", "all")).get());
    partition.stopDisrupting();
    partition.ensureHealthy(internalCluster());
    assertBusy(() -> {
        ClusterState state = client(clusterManager).admin().cluster().prepareState().get().getState();
        assertThat(state.routingTable().shardRoutingTable(shardId).activeShards(), hasSize(numberOfReplicas));
        assertThat(state.metadata().index("test").inSyncAllocationIds(shardId.id()), hasSize(numberOfReplicas + 1));
        for (String node : replicaNodes) {
            IndexShard shard = internalCluster().getInstance(IndicesService.class, node).getShardOrNull(shardId);
            // getShardOrNull may return null; turn that into an assertion failure rather than a
            // NullPointerException (assumption: assertBusy only retries on AssertionError - confirm).
            assertNotNull("no shard instance for " + shardId + " on node [" + node + "]", shard);
            // numDocs + moreDocs operations were indexed plus one skipped seqno, so the highest
            // sequence number (0-based) is expected to be numDocs + moreDocs.
            assertThat(shard.getLocalCheckpoint(), equalTo(numDocs + moreDocs));
        }
    }, 30, TimeUnit.SECONDS);
    internalCluster().assertConsistentHistoryBetweenTranslogAndLuceneIndex();
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) IndexShard(org.opensearch.index.shard.IndexShard) IndicesService(org.opensearch.indices.IndicesService) ShardId(org.opensearch.index.shard.ShardId) IndexResponse(org.opensearch.action.index.IndexResponse) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) HashSet(java.util.HashSet)

Example 28 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class SnapshotShardsServiceIT method testRetryPostingSnapshotStatusMessages.

/**
 * Starts a snapshot on a blocked data node, disrupts the network around the cluster manager with a
 * random delay, and then verifies that every shard snapshot reaches the {@code DONE} stage on the
 * data node and that, once the disruption is lifted, the snapshot is reported as {@code SUCCESS}
 * with all shards successful.
 *
 * @throws Exception if any cluster operation fails unexpectedly
 */
public void testRetryPostingSnapshotStatusMessages() throws Exception {
    final String repository = "test-repo";
    final String index = "test-index";
    final String snapshotName = "test-snap";

    internalCluster().startClusterManagerOnlyNode();
    internalCluster().startDataOnlyNode();
    createRepository(repository, "mock");

    final int shardCount = between(1, 10);
    assertAcked(prepareCreate(index, 0, indexSettingsNoReplicas(shardCount)));
    ensureGreen();
    indexRandomDocs(index, scaledRandomIntBetween(50, 100));

    logger.info("--> blocking repository");
    final String nodeWithBlock = blockNodeWithIndex(repository, index);
    dataNodeClient().admin()
        .cluster()
        .prepareCreateSnapshot(repository, snapshotName)
        .setWaitForCompletion(false)
        .setIndices(index)
        .get();
    waitForBlock(nodeWithBlock, repository, TimeValue.timeValueSeconds(60));
    final SnapshotId snapshotId = getSnapshot(repository, snapshotName).snapshotId();

    logger.info("--> start disrupting cluster");
    final NetworkDisruption disruption = isolateClusterManagerDisruption(NetworkDisruption.NetworkDelay.random(random()));
    internalCluster().setDisruptionScheme(disruption);
    disruption.startDisrupting();

    logger.info("--> unblocking repository");
    unblockNode(repository, nodeWithBlock);

    // Retrieve snapshot status from the data node.
    final SnapshotShardsService shardsService = internalCluster().getInstance(SnapshotShardsService.class, nodeWithBlock);
    final Snapshot snapshot = new Snapshot(repository, snapshotId);
    assertBusy(() -> {
        List<IndexShardSnapshotStatus.Stage> shardStages = shardsService.currentSnapshotShards(snapshot)
            .values()
            .stream()
            .map(shardStatus -> shardStatus.asCopy().getStage())
            .collect(Collectors.toList());
        assertThat(shardStages, hasSize(shardCount));
        assertThat(shardStages, everyItem(equalTo(IndexShardSnapshotStatus.Stage.DONE)));
    }, 30L, TimeUnit.SECONDS);

    logger.info("--> stop disrupting cluster");
    disruption.stopDisrupting();
    internalCluster().clearDisruptionScheme(true);

    // With the network healed, the snapshot as a whole is expected to complete successfully.
    assertBusy(() -> {
        SnapshotInfo info = getSnapshot(repository, snapshotName);
        logger.info("Snapshot status [{}], successfulShards [{}]", info.state(), info.successfulShards());
        assertThat(info.state(), equalTo(SnapshotState.SUCCESS));
        assertThat(info.successfulShards(), equalTo(shardCount));
    }, 30L, TimeUnit.SECONDS);
}
Also used : OpenSearchAssertions.assertAcked(org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked) Arrays(java.util.Arrays) TimeValue(org.opensearch.common.unit.TimeValue) Collection(java.util.Collection) MockTransportService(org.opensearch.test.transport.MockTransportService) Collectors(java.util.stream.Collectors) Plugin(org.opensearch.plugins.Plugin) MockRepository(org.opensearch.snapshots.mockstore.MockRepository) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) IndexShardSnapshotStatus(org.opensearch.index.snapshots.IndexShardSnapshotStatus) Matchers.everyItem(org.hamcrest.Matchers.everyItem) Matchers.equalTo(org.hamcrest.Matchers.equalTo) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) Matchers.hasSize(org.hamcrest.Matchers.hasSize) OpenSearchIntegTestCase(org.opensearch.test.OpenSearchIntegTestCase)

Example 29 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class ConcurrentSnapshotsIT method testQueuedSnapshotOperationsAndBrokenRepoOnClusterManagerFailOver2.

/**
 * Queues two snapshots against a repository whose index file is corrupted while the first of them is
 * blocked in finalization, isolates the cluster manager, and expects both snapshot futures to fail
 * with an {@code OpenSearchException} once no operations are running any more.
 *
 * @throws Exception if any cluster operation fails unexpectedly
 */
public void testQueuedSnapshotOperationsAndBrokenRepoOnClusterManagerFailOver2() throws Exception {
    disableRepoConsistencyCheck("This test corrupts the repository on purpose");
    internalCluster().startMasterOnlyNodes(3);
    final String dataNodeName = internalCluster().startDataOnlyNode();

    final String repoName = "test-repo";
    final Path repoLocation = randomRepoPath();
    createRepository(repoName, "mock", repoLocation);
    createIndexWithContent("index-one");
    createNSnapshots(repoName, randomIntBetween(2, 5));

    final long repoGeneration = getRepositoryData(repoName).getGenId();
    final String electedClusterManager = internalCluster().getMasterName();
    blockMasterFromFinalizingSnapshotOnIndexFile(repoName);

    // First snapshot: wait until its finalization is blocked on the cluster manager.
    final ActionFuture<CreateSnapshotResponse> thirdSnapshot = startFullSnapshotFromNonClusterManagerClient(
        repoName,
        "snapshot-three"
    );
    final TimeValue blockTimeout = TimeValue.timeValueSeconds(30L);
    waitForBlock(electedClusterManager, repoName, blockTimeout);

    // Corrupt the repository index at the recorded generation, then queue a second snapshot.
    corruptIndexN(repoLocation, repoGeneration);
    final ActionFuture<CreateSnapshotResponse> fourthSnapshot = startFullSnapshotFromNonClusterManagerClient(
        repoName,
        "snapshot-four"
    );
    awaitNumberOfSnapshotsInProgress(2);

    final NetworkDisruption isolation = isolateClusterManagerDisruption(NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(isolation);
    isolation.startDisrupting();
    // Wait for a stable three-node cluster as seen from the data node.
    ensureStableCluster(3, dataNodeName);

    unblockNode(repoName, electedClusterManager);
    isolation.stopDisrupting();
    awaitNoMoreRunningOperations();

    expectThrows(OpenSearchException.class, () -> thirdSnapshot.actionGet());
    expectThrows(OpenSearchException.class, () -> fourthSnapshot.actionGet());
}
Also used : Path(java.nio.file.Path) CreateSnapshotResponse(org.opensearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse) Matchers.containsString(org.hamcrest.Matchers.containsString) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption)

Example 30 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class ConcurrentSnapshotsIT method testClusterManagerFailoverOnFinalizationLoop.

/**
 * Blocks the cluster manager while a snapshot and a snapshot deletion are in flight, isolates the
 * cluster manager, and then verifies after healing the network that the snapshot succeeded and that
 * no snapshot operations remain running. A {@code RepositoryException} from the deletion is tolerated.
 *
 * @throws Exception if any cluster operation fails unexpectedly
 */
public void testClusterManagerFailoverOnFinalizationLoop() throws Exception {
    internalCluster().startMasterOnlyNodes(3);
    final String dataNodeName = internalCluster().startDataOnlyNode();

    final String repoName = "test-repo";
    createRepository(repoName, "mock");
    createIndexWithContent("index-test");

    // The disruption is registered up front but only started once the operations are queued.
    final NetworkDisruption isolation = isolateClusterManagerDisruption(NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(isolation);

    final List<String> existingSnapshots = createNSnapshots(repoName, randomIntBetween(2, 5));
    final String electedClusterManager = internalCluster().getMasterName();
    blockMasterFromDeletingIndexNFile(repoName);

    final ActionFuture<CreateSnapshotResponse> otherSnapshot = startFullSnapshotFromClusterManagerClient(repoName, "snap-other");
    waitForBlock(electedClusterManager, repoName, TimeValue.timeValueSeconds(30L));

    final String firstSnapshotName = existingSnapshots.get(0);
    final ActionFuture<AcknowledgedResponse> firstSnapshotDeletion = startDeleteSnapshot(repoName, firstSnapshotName);
    awaitNDeletionsInProgress(1);

    isolation.startDisrupting();
    // Wait for a stable three-node cluster as seen from the data node.
    ensureStableCluster(3, dataNodeName);

    unblockNode(repoName, electedClusterManager);
    isolation.stopDisrupting();
    ensureStableCluster(4);

    assertSuccessful(otherSnapshot);
    try {
        firstSnapshotDeletion.actionGet();
    } catch (RepositoryException ignored) {
        // ignored: the deletion is allowed to fail with a repository error in this scenario
    }
    awaitNoMoreRunningOperations();
}
Also used : CreateSnapshotResponse(org.opensearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse) AcknowledgedResponse(org.opensearch.action.support.master.AcknowledgedResponse) RepositoryException(org.opensearch.repositories.RepositoryException) Matchers.containsString(org.hamcrest.Matchers.containsString) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption)

Aggregations

NetworkDisruption (org.opensearch.test.disruption.NetworkDisruption)41 HashSet (java.util.HashSet)15 TwoPartitions (org.opensearch.test.disruption.NetworkDisruption.TwoPartitions)14 ClusterState (org.opensearch.cluster.ClusterState)13 Settings (org.opensearch.common.settings.Settings)13 TimeValue (org.opensearch.common.unit.TimeValue)13 CreateSnapshotResponse (org.opensearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse)12 Matchers.containsString (org.hamcrest.Matchers.containsString)8 Client (org.opensearch.client.Client)8 DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode)8 List (java.util.List)7 CountDownLatch (java.util.concurrent.CountDownLatch)7 Collectors (java.util.stream.Collectors)7 Matchers.equalTo (org.hamcrest.Matchers.equalTo)7 GetResponse (org.opensearch.action.get.GetResponse)7 Collections (java.util.Collections)6 IndexResponse (org.opensearch.action.index.IndexResponse)6 AcknowledgedResponse (org.opensearch.action.support.master.AcknowledgedResponse)6 OpenSearchIntegTestCase (org.opensearch.test.OpenSearchIntegTestCase)6 Collection (java.util.Collection)5