Search in sources :

Example 1 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class PrimaryAllocationIT method createStaleReplicaScenario.

/**
 * Sets up a scenario in which the node holding the original primary of index "test" is
 * partitioned away, a further document is indexed through the former replica's node, and that
 * node is then shut down before the partition is healed.
 *
 * @param master name of the node used as the view point for the cluster-stability checks and reroutes
 * @return data paths settings of in-sync shard copy
 * @throws Exception if any cluster operation or busy-assertion fails
 */
private Settings createStaleReplicaScenario(String master) throws Exception {
    client().prepareIndex("test").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();
    refresh();

    final ClusterState clusterState = client().admin().cluster().prepareState().all().get().getState();
    final List<ShardRouting> testShards = clusterState.routingTable().allShards("test");
    assertThat(testShards.size(), equalTo(2));

    // Exactly two copies exist, so whichever entry is not the primary is the replica.
    final int primaryIdx = testShards.get(0).primary() ? 0 : 1;
    final int replicaIdx = 1 - primaryIdx;
    final String primaryNode = clusterState.getRoutingNodes().node(testShards.get(primaryIdx).currentNodeId()).node().getName();
    final String replicaNode = clusterState.getRoutingNodes().node(testShards.get(replicaIdx).currentNodeId()).node().getName();

    // Put the primary's node alone on one side; master and replica stay together on the other.
    final TwoPartitions sides = new TwoPartitions(Sets.newHashSet(master, replicaNode), Collections.singleton(primaryNode));
    final NetworkDisruption partition = new NetworkDisruption(sides, NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(partition);

    logger.info("--> partitioning node with primary shard from rest of cluster");
    partition.startDisrupting();
    ensureStableCluster(2, master);

    logger.info("--> index a document into previous replica shard (that is now primary)");
    client(replicaNode).prepareIndex("test").setSource(jsonBuilder().startObject().field("field", "value1").endObject()).get();

    logger.info("--> shut down node that has new acknowledged document");
    // Capture the data path settings before the node is stopped; they are the method's result.
    final Settings inSyncDataPathSettings = internalCluster().dataPathSettings(replicaNode);
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode));
    ensureStableCluster(1, master);

    partition.stopDisrupting();
    logger.info("--> waiting for node with old primary shard to rejoin the cluster");
    ensureStableCluster(2, master);

    logger.info("--> check that old primary shard does not get promoted to primary again");
    // kick reroute and wait for all shard states to be fetched
    client(master).admin().cluster().prepareReroute().get();
    assertBusy(() -> {
        final GatewayAllocator allocator = internalCluster().getInstance(GatewayAllocator.class, master);
        assertThat(allocator.getNumberOfInFlightFetches(), equalTo(0));
    });

    // kick reroute a second time and check that all shards are unassigned
    final int unassignedCount = client(master).admin().cluster().prepareReroute().get().getState().getRoutingNodes().unassigned().size();
    assertThat(unassignedCount, equalTo(2));
    return inSyncDataPathSettings;
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) GatewayAllocator(org.opensearch.gateway.GatewayAllocator) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) Settings(org.opensearch.common.settings.Settings)

Example 2 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class MasterDisruptionIT method testVerifyApiBlocksDuringPartition.

/**
 * Verify that the proper block is applied when nodes lose their master.
 * <p>
 * The scenario is run twice against the same random two-way partition of a three node cluster:
 * first expecting {@code NO_MASTER_BLOCK_WRITES} on the isolated node, then, after the
 * no-master-block setting has been updated to "all", expecting {@code NO_MASTER_BLOCK_ALL}.
 *
 * @throws Exception if a cluster operation or one of the wait conditions fails
 */
public void testVerifyApiBlocksDuringPartition() throws Exception {
    internalCluster().startNodes(3, Settings.builder().putNull(NoMasterBlockService.NO_MASTER_BLOCK_SETTING.getKey()).build());
    // Makes sure that the get request can be executed on each node locally:
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 2)));
    // Everything is stable now, it is now time to simulate evil...
    // but first make sure we have no initializing shards and all is green
    // (waiting for green here, because indexing / search in a yellow index is fine as long as no other nodes go down)
    ensureGreen("test");
    TwoPartitions partitions = TwoPartitions.random(random(), internalCluster().getNodeNames());
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    assertEquals(1, partitions.getMinoritySide().size());
    final String isolatedNode = partitions.getMinoritySide().iterator().next();
    assertEquals(2, partitions.getMajoritySide().size());
    final String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
    // Simulate a network issue between the unlucky node and the rest of the cluster.
    networkDisruption.startDisrupting();
    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, NoMasterBlockService.NO_MASTER_BLOCK_WRITES, TimeValue.timeValueSeconds(30));
    // The stability check below runs against the non-isolated node, so that is the node to report.
    logger.info("wait until elected master has been removed and a new 2 node cluster was formed (via [{}])", nonIsolatedNode);
    ensureStableCluster(2, nonIsolatedNode);
    for (String node : partitions.getMajoritySide()) {
        ClusterState nodeState = getNodeClusterState(node);
        final boolean hasMaster = nodeState.nodes().getMasterNode() != null;
        final boolean hasGlobalBlocks = !nodeState.blocks().global().isEmpty();
        if (!hasMaster || hasGlobalBlocks) {
            fail("node [" + node + "] has no master or has blocks, despite of being on the right side of the partition. State dump:\n" + nodeState);
        }
    }
    networkDisruption.stopDisrupting();
    // Wait until the master node sees all 3 nodes again.
    ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()));
    logger.info("Verify no master block with {} set to {}", NoMasterBlockService.NO_MASTER_BLOCK_SETTING.getKey(), "all");
    client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder().put(NoMasterBlockService.NO_MASTER_BLOCK_SETTING.getKey(), "all")).get();
    networkDisruption.startDisrupting();
    // The unlucky node must report *no* master node, since it can't connect to master and in fact it should
    // continuously ping until network failures have been resolved. However
    // it may take a bit before the node detects it has been cut off from the elected master
    logger.info("waiting for isolated node [{}] to have no master", isolatedNode);
    assertNoMaster(isolatedNode, NoMasterBlockService.NO_MASTER_BLOCK_ALL, TimeValue.timeValueSeconds(30));
    // make sure we have stable cluster & cross partition recoveries are canceled by the removal of the missing node
    // the unresponsive partition causes recoveries to only time out after 15m (default) and these will cause
    // the test to fail due to unfreed resources
    ensureStableCluster(2, nonIsolatedNode);
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) TimeValue(org.opensearch.common.unit.TimeValue)

Example 3 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class MasterDisruptionIT method testIsolateMasterAndVerifyClusterStateConsensus.

/**
 * This test isolates the master from the rest of the cluster, waits for a new master to be elected,
 * heals the partition and verifies that all nodes agree on the new cluster state.
 *
 * @throws Exception if a cluster operation or one of the wait conditions fails
 */
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
    final List<String> nodes = startCluster(3);
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2)).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomInt(2))));
    ensureGreen();
    String isolatedNode = internalCluster().getMasterName();
    TwoPartitions partitions = isolateNode(isolatedNode);
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    networkDisruption.startDisrupting();
    String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
    // make sure cluster reforms
    ensureStableCluster(2, nonIsolatedNode);
    // make sure the isolated node picks up on things.
    assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));
    // heal the partition
    networkDisruption.stopDisrupting();
    for (String node : nodes) {
        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()), true, node);
    }
    logger.info("issue a reroute");
    // trigger a reroute now, instead of waiting for the background reroute of RerouteService
    assertAcked(client().admin().cluster().prepareReroute());
    // and wait for it to finish and for the cluster to stabilize
    ensureGreen("test");
    // verify all cluster states are the same
    // use assert busy to wait for cluster states to be applied (as publish_timeout has low value)
    assertBusy(() -> {
        // state of the first node in the list; every other node is compared against it
        ClusterState state = null;
        for (String node : nodes) {
            ClusterState nodeState = getNodeClusterState(node);
            if (state == null) {
                state = nodeState;
                continue;
            }
            // assert nodes are identical
            try {
                assertEquals("unequal versions", state.version(), nodeState.version());
                assertEquals("unequal node count", state.nodes().getSize(), nodeState.nodes().getSize());
                assertEquals("different masters ", state.nodes().getMasterNodeId(), nodeState.nodes().getMasterNodeId());
                assertEquals("different meta data version", state.metadata().version(), nodeState.metadata().version());
                assertEquals("different routing", state.routingTable().toString(), nodeState.routingTable().toString());
            } catch (AssertionError t) {
                // Rethrow as an AssertionError with both state dumps in the message, keeping the
                // original failure attached as the cause so its stack trace is not lost.
                throw new AssertionError(
                    "failed comparing cluster state: " + t.getMessage() + "\n" + "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state + "\n--- cluster state [" + node + "]: ---\n" + nodeState,
                    t
                );
            }
        }
    });
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) TimeValue(org.opensearch.common.unit.TimeValue)

Example 4 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class ConcurrentSnapshotsIT method testQueuedSnapshotOperationsAndBrokenRepoOnMasterFailOver2.

/**
 * Starts two full snapshots while the master is blocked from finalizing and the repository has
 * been deliberately corrupted, then isolates the master and expects both snapshot requests to
 * fail with an {@code OpenSearchException} once all operations have stopped running.
 *
 * @throws Exception if a cluster operation or one of the wait conditions fails
 */
public void testQueuedSnapshotOperationsAndBrokenRepoOnMasterFailOver2() throws Exception {
    disableRepoConsistencyCheck("This test corrupts the repository on purpose");

    // Cluster layout: three master-only nodes plus a single data-only node.
    internalCluster().startMasterOnlyNodes(3);
    final String dataNodeName = internalCluster().startDataOnlyNode();

    // Repository with some existing snapshots; remember its generation before anything is blocked.
    final String repository = "test-repo";
    final Path repositoryLocation = randomRepoPath();
    createRepository(repository, "mock", repositoryLocation);
    createIndexWithContent("index-one");
    createNSnapshots(repository, randomIntBetween(2, 5));
    final long repoGeneration = getRepositoryData(repository).getGenId();

    // First snapshot: started from a non-master client and awaited until the master hits the block.
    final String electedMaster = internalCluster().getMasterName();
    blockMasterFromFinalizingSnapshotOnIndexFile(repository);
    final ActionFuture<CreateSnapshotResponse> thirdSnapshot = startFullSnapshotFromNonMasterClient(repository, "snapshot-three");
    waitForBlock(electedMaster, repository, TimeValue.timeValueSeconds(30L));

    // Corrupt the repository at the recorded generation, then start a second snapshot.
    corruptIndexN(repositoryLocation, repoGeneration);
    final ActionFuture<CreateSnapshotResponse> fourthSnapshot = startFullSnapshotFromNonMasterClient(repository, "snapshot-four");
    awaitNumberOfSnapshotsInProgress(2);

    // Isolate the master and wait for a stable three node cluster as seen from the data node.
    final NetworkDisruption masterIsolation = isolateMasterDisruption(NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(masterIsolation);
    masterIsolation.startDisrupting();
    ensureStableCluster(3, dataNodeName);

    // Release the blocked node, heal the network and let all operations finish.
    unblockNode(repository, electedMaster);
    masterIsolation.stopDisrupting();
    awaitNoMoreRunningOperations();

    expectThrows(OpenSearchException.class, thirdSnapshot::actionGet);
    expectThrows(OpenSearchException.class, fourthSnapshot::actionGet);
}
Also used : Path(java.nio.file.Path) CreateSnapshotResponse(org.opensearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse) Matchers.containsString(org.hamcrest.Matchers.containsString) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption)

Example 5 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class IndexingMasterFailoverIT method testMasterFailoverDuringIndexingWithMappingChanges.

/**
 * Indexing operations which entail mapping changes require a blocking request to the master node to update the mapping.
 * If the master node is being disrupted or if it cannot commit cluster state changes, it needs to retry within timeout limits.
 * This retry logic is implemented in TransportMasterNodeAction and tested by the following master failover scenario.
 * <p>
 * A background thread indexes ten documents, each introducing a new field, while the master is
 * disconnected from the other nodes and later reconnected; all ten documents must be searchable afterwards.
 */
public void testMasterFailoverDuringIndexingWithMappingChanges() throws Throwable {
    logger.info("--> start 4 nodes, 3 master, 1 data");
    internalCluster().setBootstrapMasterNodeIndex(2);
    internalCluster().startMasterOnlyNodes(3, Settings.EMPTY);
    final String dataNode = internalCluster().startDataOnlyNode(Settings.EMPTY);

    logger.info("--> wait for all nodes to join the cluster");
    ensureStableCluster(4);

    // We index data with mapping changes into cluster and have master failover at same time
    client().admin().indices().prepareCreate("myindex").setSettings(Settings.builder().put("index.number_of_shards", 1).put("index.number_of_replicas", 0)).get();
    ensureGreen("myindex");

    // Two parties: the indexing thread and this test thread, so both proceed together.
    final CyclicBarrier startSignal = new CyclicBarrier(2);
    final Thread indexer = new Thread(() -> {
        try {
            startSignal.await();
        } catch (InterruptedException e) {
            logger.warn("Barrier interrupted", e);
            return;
        } catch (BrokenBarrierException e) {
            logger.warn("Broken barrier", e);
            return;
        }
        for (int docNumber = 0; docNumber < 10; docNumber++) {
            // index data; a distinct field name per document is used for each request
            final IndexResponse indexed = client(dataNode).prepareIndex("myindex").setSource("field_" + docNumber, "val").get();
            assertEquals(DocWriteResponse.Result.CREATED, indexed.getResult());
        }
    }, "indexingThread");
    indexer.start();
    startSignal.await();

    // interrupt communication between master and other nodes in cluster
    final NetworkDisruption masterIsolation = isolateMasterDisruption(NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(masterIsolation);
    logger.info("--> disrupting network");
    masterIsolation.startDisrupting();

    logger.info("--> waiting for new master to be elected");
    ensureStableCluster(3, dataNode);

    masterIsolation.stopDisrupting();
    logger.info("--> waiting to heal");
    ensureStableCluster(4);

    indexer.join();
    ensureGreen("myindex");
    refresh();
    final long totalHits = client().prepareSearch("myindex").get().getHits().getTotalHits().value;
    assertThat(totalHits, equalTo(10L));
}
Also used : BrokenBarrierException(java.util.concurrent.BrokenBarrierException) IndexResponse(org.opensearch.action.index.IndexResponse) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) CyclicBarrier(java.util.concurrent.CyclicBarrier)

Aggregations

NetworkDisruption (org.opensearch.test.disruption.NetworkDisruption)26 TwoPartitions (org.opensearch.test.disruption.NetworkDisruption.TwoPartitions)10 ClusterState (org.opensearch.cluster.ClusterState)9 HashSet (java.util.HashSet)8 Settings (org.opensearch.common.settings.Settings)8 TimeValue (org.opensearch.common.unit.TimeValue)8 CreateSnapshotResponse (org.opensearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse)7 List (java.util.List)5 CountDownLatch (java.util.concurrent.CountDownLatch)5 Collectors (java.util.stream.Collectors)5 Matchers.equalTo (org.hamcrest.Matchers.equalTo)5 GetResponse (org.opensearch.action.get.GetResponse)5 IndexResponse (org.opensearch.action.index.IndexResponse)5 Client (org.opensearch.client.Client)5 OpenSearchIntegTestCase (org.opensearch.test.OpenSearchIntegTestCase)5 Collections (java.util.Collections)4 TimeUnit (java.util.concurrent.TimeUnit)4 Matchers.containsString (org.hamcrest.Matchers.containsString)4 DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode)4 ServiceDisruptionScheme (org.opensearch.test.disruption.ServiceDisruptionScheme)4