Search in sources :

Example 16 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class ClusterDisruptionIT method testSendingShardFailure.

// simulate handling of sending shard failure during an isolation
public void testSendingShardFailure() throws Exception {
    List<String> nodes = startCluster(3);
    String clusterManagerNode = internalCluster().getMasterName();
    List<String> nonClusterManagerNodes = nodes.stream().filter(node -> !node.equals(clusterManagerNode)).collect(Collectors.toList());
    String nonClusterManagerNode = randomFrom(nonClusterManagerNodes);
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 3).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 2)));
    ensureGreen();
    String nonClusterManagerNodeId = internalCluster().clusterService(nonClusterManagerNode).localNode().getId();
    // fail a random shard
    ShardRouting failedShard = randomFrom(clusterService().state().getRoutingNodes().node(nonClusterManagerNodeId).shardsWithState(ShardRoutingState.STARTED));
    ShardStateAction service = internalCluster().getInstance(ShardStateAction.class, nonClusterManagerNode);
    CountDownLatch latch = new CountDownLatch(1);
    AtomicBoolean success = new AtomicBoolean();
    String isolatedNode = randomBoolean() ? clusterManagerNode : nonClusterManagerNode;
    TwoPartitions partitions = isolateNode(isolatedNode);
    // we cannot use the NetworkUnresponsive disruption type here as it will swallow the "shard failed" request, calling neither
    // onSuccess nor onFailure on the provided listener.
    NetworkDisruption networkDisruption = new NetworkDisruption(partitions, NetworkDisruption.DISCONNECT);
    setDisruptionScheme(networkDisruption);
    networkDisruption.startDisrupting();
    service.localShardFailed(failedShard, "simulated", new CorruptIndexException("simulated", (String) null), new ActionListener<Void>() {

        @Override
        public void onResponse(final Void aVoid) {
            success.set(true);
            latch.countDown();
        }

        @Override
        public void onFailure(Exception e) {
            success.set(false);
            latch.countDown();
            assert false;
        }
    });
    if (isolatedNode.equals(nonClusterManagerNode)) {
        assertNoClusterManager(nonClusterManagerNode);
    } else {
        ensureStableCluster(2, nonClusterManagerNode);
    }
    // heal the partition
    networkDisruption.removeAndEnsureHealthy(internalCluster());
    // the cluster should stabilize
    ensureStableCluster(3);
    latch.await();
    // the listener should be notified
    assertTrue(success.get());
    // the failed shard should be gone
    List<ShardRouting> shards = clusterService().state().getRoutingTable().allShards("test");
    for (ShardRouting shard : shards) {
        assertThat(shard.allocationId(), not(equalTo(failedShard.allocationId())));
    }
}
Also used : IndexRequestBuilder(org.opensearch.action.index.IndexRequestBuilder) ClusterBootstrapService(org.opensearch.cluster.coordination.ClusterBootstrapService) IndexResponse(org.opensearch.action.index.IndexResponse) Matchers.not(org.hamcrest.Matchers.not) OpenSearchException(org.opensearch.OpenSearchException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) ConcurrentCollections(org.opensearch.common.util.concurrent.ConcurrentCollections) Bridge(org.opensearch.test.disruption.NetworkDisruption.Bridge) IndexShardTestCase(org.opensearch.index.shard.IndexShardTestCase) Matchers.everyItem(org.hamcrest.Matchers.everyItem) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ActionListener(org.opensearch.action.ActionListener) GetResponse(org.opensearch.action.get.GetResponse) ShardStateAction(org.opensearch.cluster.action.shard.ShardStateAction) Client(org.opensearch.client.Client) TimeValue(org.opensearch.common.unit.TimeValue) IndicesService(org.opensearch.indices.IndicesService) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) Settings(org.opensearch.common.settings.Settings) ServiceDisruptionScheme(org.opensearch.test.disruption.ServiceDisruptionScheme) Collectors(java.util.stream.Collectors) CountDownLatch(java.util.concurrent.CountDownLatch) VersionType(org.opensearch.index.VersionType) List(java.util.List) UPDATED(org.opensearch.action.DocWriteResponse.Result.UPDATED) CREATED(org.opensearch.action.DocWriteResponse.Result.CREATED) Matchers.equalTo(org.hamcrest.Matchers.equalTo) LagDetector(org.opensearch.cluster.coordination.LagDetector) XContentType(org.opensearch.common.xcontent.XContentType) Matchers.is(org.hamcrest.Matchers.is) OpenSearchIntegTestCase(org.opensearch.test.OpenSearchIntegTestCase) Matchers.in(org.hamcrest.Matchers.in) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) IntStream(java.util.stream.IntStream) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) InternalTestCluster(org.opensearch.test.InternalTestCluster) AtomicReference(java.util.concurrent.atomic.AtomicReference) ArrayList(java.util.ArrayList) ClusterState(org.opensearch.cluster.ClusterState) IndexShard(org.opensearch.index.shard.IndexShard) Murmur3HashFunction(org.opensearch.cluster.routing.Murmur3HashFunction) ShardRoutingState(org.opensearch.cluster.routing.ShardRoutingState) TestIssueLogging(org.opensearch.test.junit.annotations.TestIssueLogging) OpenSearchAssertions.assertAcked(org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked) Matchers.oneOf(org.hamcrest.Matchers.oneOf) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) Semaphore(java.util.concurrent.Semaphore) ShardRouting(org.opensearch.cluster.routing.ShardRouting) TimeUnit(java.util.concurrent.TimeUnit) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) NoShardAvailableActionException(org.opensearch.action.NoShardAvailableActionException) Collections(java.util.Collections) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) ShardStateAction(org.opensearch.cluster.action.shard.ShardStateAction) CountDownLatch(java.util.concurrent.CountDownLatch) OpenSearchException(org.opensearch.OpenSearchException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) NoShardAvailableActionException(org.opensearch.action.NoShardAvailableActionException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ShardRouting(org.opensearch.cluster.routing.ShardRouting) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption)

Example 17 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class ClusterDisruptionIT method testIndicesDeleted.

/**
 * Tests that indices are properly deleted even if there is a cluster-manager transition in between.
 * Test for https://github.com/elastic/elasticsearch/issues/11665
 */
public void testIndicesDeleted() throws Exception {
    final String idxName = "test";
    final List<String> allClusterManagerEligibleNodes = internalCluster().startMasterOnlyNodes(2);
    final String dataNode = internalCluster().startDataOnlyNode();
    ensureStableCluster(3);
    assertAcked(prepareCreate("test"));
    final String clusterManagerNode1 = internalCluster().getMasterName();
    NetworkDisruption networkDisruption = new NetworkDisruption(new TwoPartitions(clusterManagerNode1, dataNode), NetworkDisruption.UNRESPONSIVE);
    internalCluster().setDisruptionScheme(networkDisruption);
    networkDisruption.startDisrupting();
    // We know this will time out due to the partition, we check manually below to not proceed until
    // the delete has been applied to the cluster-manager node and the cluster-manager eligible node.
    internalCluster().client(clusterManagerNode1).admin().indices().prepareDelete(idxName).setTimeout("0s").get();
    // Don't restart the cluster-manager node until we know the index deletion has taken effect on cluster-manager and the
    // cluster-manager eligible node.
    assertBusy(() -> {
        for (String clusterManagerNode : allClusterManagerEligibleNodes) {
            final ClusterState clusterManagerState = internalCluster().clusterService(clusterManagerNode).state();
            assertTrue("index not deleted on " + clusterManagerNode, clusterManagerState.metadata().hasIndex(idxName) == false);
        }
    });
    internalCluster().restartNode(clusterManagerNode1, InternalTestCluster.EMPTY_CALLBACK);
    ensureYellow();
    assertFalse(client().admin().indices().prepareExists(idxName).get().isExists());
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption)

Example 18 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class ClusterDisruptionIT method testAckedIndexing.

/**
 * Test that we do not loose document whose indexing request was successful, under a randomly selected disruption scheme
 * We also collect &amp; report the type of indexing failures that occur.
 * <p>
 * This test is a superset of tests run in the Jepsen test suite, with the exception of versioned updates
 */
@TestIssueLogging(value = "_root:DEBUG,org.opensearch.action.bulk:TRACE,org.opensearch.action.get:TRACE," + "org.opensearch.discovery:TRACE,org.opensearch.action.support.replication:TRACE," + "org.opensearch.cluster.service:TRACE,org.opensearch.indices.recovery:TRACE," + "org.opensearch.indices.cluster:TRACE,org.opensearch.index.shard:TRACE", issueUrl = "https://github.com/elastic/elasticsearch/issues/41068")
public void testAckedIndexing() throws Exception {
    final int seconds = !(TEST_NIGHTLY && rarely()) ? 1 : 5;
    final String timeout = seconds + "s";
    final List<String> nodes = startCluster(rarely() ? 5 : 3);
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(indexSettings()).put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2)).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomInt(2))));
    ensureGreen();
    ServiceDisruptionScheme disruptionScheme = addRandomDisruptionScheme();
    logger.info("disruption scheme [{}] added", disruptionScheme);
    // id -> node sent.
    final ConcurrentHashMap<String, String> ackedDocs = new ConcurrentHashMap<>();
    final AtomicBoolean stop = new AtomicBoolean(false);
    List<Thread> indexers = new ArrayList<>(nodes.size());
    List<Semaphore> semaphores = new ArrayList<>(nodes.size());
    final AtomicInteger idGenerator = new AtomicInteger(0);
    final AtomicReference<CountDownLatch> countDownLatchRef = new AtomicReference<>();
    final List<Exception> exceptedExceptions = new CopyOnWriteArrayList<>();
    final ConflictMode conflictMode = ConflictMode.randomMode();
    final List<String> fieldNames = IntStream.rangeClosed(0, randomInt(10)).mapToObj(n -> "f" + n).collect(Collectors.toList());
    logger.info("starting indexers using conflict mode " + conflictMode);
    try {
        for (final String node : nodes) {
            final Semaphore semaphore = new Semaphore(0);
            semaphores.add(semaphore);
            final Client client = client(node);
            final String name = "indexer_" + indexers.size();
            final int numPrimaries = getNumShards("test").numPrimaries;
            Thread thread = new Thread(() -> {
                while (!stop.get()) {
                    String id = null;
                    try {
                        if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
                            continue;
                        }
                        logger.info("[{}] Acquired semaphore and it has {} permits left", name, semaphore.availablePermits());
                        try {
                            id = Integer.toString(idGenerator.incrementAndGet());
                            int shard = Math.floorMod(Murmur3HashFunction.hash(id), numPrimaries);
                            logger.trace("[{}] indexing id [{}] through node [{}] targeting shard [{}]", name, id, node, shard);
                            IndexRequestBuilder indexRequestBuilder = client.prepareIndex("test").setId(id).setSource(Collections.singletonMap(randomFrom(fieldNames), randomNonNegativeLong()), XContentType.JSON).setTimeout(timeout);
                            if (conflictMode == ConflictMode.external) {
                                indexRequestBuilder.setVersion(randomIntBetween(1, 10)).setVersionType(VersionType.EXTERNAL);
                            } else if (conflictMode == ConflictMode.create) {
                                indexRequestBuilder.setCreate(true);
                            }
                            IndexResponse response = indexRequestBuilder.get(timeout);
                            assertThat(response.getResult(), is(oneOf(CREATED, UPDATED)));
                            ackedDocs.put(id, node);
                            logger.trace("[{}] indexed id [{}] through node [{}], response [{}]", name, id, node, response);
                        } catch (OpenSearchException e) {
                            exceptedExceptions.add(e);
                            final String docId = id;
                            logger.trace(() -> new ParameterizedMessage("[{}] failed id [{}] through node [{}]", name, docId, node), e);
                        } finally {
                            countDownLatchRef.get().countDown();
                            logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
                        }
                    } catch (InterruptedException e) {
                    // fine - semaphore interrupt
                    } catch (AssertionError | Exception e) {
                        logger.info(() -> new ParameterizedMessage("unexpected exception in background thread of [{}]", node), e);
                    }
                }
            });
            thread.setName(name);
            thread.start();
            indexers.add(thread);
        }
        int docsPerIndexer = randomInt(3);
        logger.info("indexing {} docs per indexer before partition", docsPerIndexer);
        countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
        for (Semaphore semaphore : semaphores) {
            semaphore.release(docsPerIndexer);
        }
        assertTrue(countDownLatchRef.get().await(1, TimeUnit.MINUTES));
        for (int iter = 1 + randomInt(2); iter > 0; iter--) {
            logger.info("starting disruptions & indexing (iteration [{}])", iter);
            disruptionScheme.startDisrupting();
            docsPerIndexer = 1 + randomInt(5);
            logger.info("indexing {} docs per indexer during partition", docsPerIndexer);
            countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
            Collections.shuffle(semaphores, random());
            for (Semaphore semaphore : semaphores) {
                assertThat(semaphore.availablePermits(), equalTo(0));
                semaphore.release(docsPerIndexer);
            }
            logger.info("waiting for indexing requests to complete");
            assertTrue(countDownLatchRef.get().await(docsPerIndexer * seconds * 1000 + 2000, TimeUnit.MILLISECONDS));
            logger.info("stopping disruption");
            disruptionScheme.stopDisrupting();
            for (String node : internalCluster().getNodeNames()) {
                ensureStableCluster(nodes.size(), TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + DISRUPTION_HEALING_OVERHEAD.millis()), true, node);
            }
            // is the super-connected node and recovery source and target are on opposite sides of the bridge
            if (disruptionScheme instanceof NetworkDisruption && ((NetworkDisruption) disruptionScheme).getDisruptedLinks() instanceof Bridge) {
                assertBusy(() -> assertAcked(client().admin().cluster().prepareReroute().setRetryFailed(true)));
            }
            ensureGreen("test");
            logger.info("validating successful docs");
            assertBusy(() -> {
                for (String node : nodes) {
                    try {
                        logger.debug("validating through node [{}] ([{}] acked docs)", node, ackedDocs.size());
                        for (String id : ackedDocs.keySet()) {
                            assertTrue("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found", client(node).prepareGet("test", id).setPreference("_local").get().isExists());
                        }
                    } catch (AssertionError | NoShardAvailableActionException e) {
                        throw new AssertionError(e.getMessage() + " (checked via node [" + node + "]", e);
                    }
                }
            }, 30, TimeUnit.SECONDS);
            logger.info("done validating (iteration [{}])", iter);
        }
    } finally {
        logger.info("shutting down indexers");
        stop.set(true);
        for (Thread indexer : indexers) {
            indexer.interrupt();
            indexer.join(60000);
        }
        if (exceptedExceptions.size() > 0) {
            StringBuilder sb = new StringBuilder();
            for (Exception e : exceptedExceptions) {
                sb.append("\n").append(e.getMessage());
            }
            logger.debug("Indexing exceptions during disruption: {}", sb);
        }
    }
}
Also used : IndexRequestBuilder(org.opensearch.action.index.IndexRequestBuilder) ClusterBootstrapService(org.opensearch.cluster.coordination.ClusterBootstrapService) IndexResponse(org.opensearch.action.index.IndexResponse) Matchers.not(org.hamcrest.Matchers.not) OpenSearchException(org.opensearch.OpenSearchException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) ConcurrentCollections(org.opensearch.common.util.concurrent.ConcurrentCollections) Bridge(org.opensearch.test.disruption.NetworkDisruption.Bridge) IndexShardTestCase(org.opensearch.index.shard.IndexShardTestCase) Matchers.everyItem(org.hamcrest.Matchers.everyItem) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ActionListener(org.opensearch.action.ActionListener) GetResponse(org.opensearch.action.get.GetResponse) ShardStateAction(org.opensearch.cluster.action.shard.ShardStateAction) Client(org.opensearch.client.Client) TimeValue(org.opensearch.common.unit.TimeValue) IndicesService(org.opensearch.indices.IndicesService) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) Settings(org.opensearch.common.settings.Settings) ServiceDisruptionScheme(org.opensearch.test.disruption.ServiceDisruptionScheme) Collectors(java.util.stream.Collectors) CountDownLatch(java.util.concurrent.CountDownLatch) VersionType(org.opensearch.index.VersionType) List(java.util.List) UPDATED(org.opensearch.action.DocWriteResponse.Result.UPDATED) CREATED(org.opensearch.action.DocWriteResponse.Result.CREATED) Matchers.equalTo(org.hamcrest.Matchers.equalTo) LagDetector(org.opensearch.cluster.coordination.LagDetector) XContentType(org.opensearch.common.xcontent.XContentType) Matchers.is(org.hamcrest.Matchers.is) OpenSearchIntegTestCase(org.opensearch.test.OpenSearchIntegTestCase) Matchers.in(org.hamcrest.Matchers.in) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) IntStream(java.util.stream.IntStream) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) InternalTestCluster(org.opensearch.test.InternalTestCluster) AtomicReference(java.util.concurrent.atomic.AtomicReference) ArrayList(java.util.ArrayList) ClusterState(org.opensearch.cluster.ClusterState) IndexShard(org.opensearch.index.shard.IndexShard) Murmur3HashFunction(org.opensearch.cluster.routing.Murmur3HashFunction) ShardRoutingState(org.opensearch.cluster.routing.ShardRoutingState) TestIssueLogging(org.opensearch.test.junit.annotations.TestIssueLogging) OpenSearchAssertions.assertAcked(org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked) Matchers.oneOf(org.hamcrest.Matchers.oneOf) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) Semaphore(java.util.concurrent.Semaphore) ShardRouting(org.opensearch.cluster.routing.ShardRouting) TimeUnit(java.util.concurrent.TimeUnit) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) NoShardAvailableActionException(org.opensearch.action.NoShardAvailableActionException) Collections(java.util.Collections) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ArrayList(java.util.ArrayList) ServiceDisruptionScheme(org.opensearch.test.disruption.ServiceDisruptionScheme) Semaphore(java.util.concurrent.Semaphore) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Client(org.opensearch.client.Client) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) AtomicReference(java.util.concurrent.atomic.AtomicReference) CountDownLatch(java.util.concurrent.CountDownLatch) OpenSearchException(org.opensearch.OpenSearchException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) NoShardAvailableActionException(org.opensearch.action.NoShardAvailableActionException) IndexRequestBuilder(org.opensearch.action.index.IndexRequestBuilder) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) NoShardAvailableActionException(org.opensearch.action.NoShardAvailableActionException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IndexResponse(org.opensearch.action.index.IndexResponse) OpenSearchException(org.opensearch.OpenSearchException) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) Bridge(org.opensearch.test.disruption.NetworkDisruption.Bridge) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) TestIssueLogging(org.opensearch.test.junit.annotations.TestIssueLogging)

Example 19 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class ClusterManagerDisruptionIT method testIsolateClusterManagerAndVerifyClusterStateConsensus.

/**
 * This test isolates the cluster-manager from rest of the cluster, waits for a new cluster-manager to be elected, restores the partition
 * and verifies that all node agree on the new cluster state
 */
public void testIsolateClusterManagerAndVerifyClusterStateConsensus() throws Exception {
    final List<String> nodes = startCluster(3);
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1 + randomInt(2)).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, randomInt(2))));
    ensureGreen();
    String isolatedNode = internalCluster().getMasterName();
    TwoPartitions partitions = isolateNode(isolatedNode);
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    networkDisruption.startDisrupting();
    String nonIsolatedNode = partitions.getMajoritySide().iterator().next();
    // make sure cluster reforms
    ensureStableCluster(2, nonIsolatedNode);
    // make sure isolated need picks up on things.
    assertNoClusterManager(isolatedNode, TimeValue.timeValueSeconds(40));
    // restore isolation
    networkDisruption.stopDisrupting();
    for (String node : nodes) {
        ensureStableCluster(3, new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()), true, node);
    }
    logger.info("issue a reroute");
    // trigger a reroute now, instead of waiting for the background reroute of RerouteService
    assertAcked(client().admin().cluster().prepareReroute());
    // and wait for it to finish and for the cluster to stabilize
    ensureGreen("test");
    // verify all cluster states are the same
    // use assert busy to wait for cluster states to be applied (as publish_timeout has low value)
    assertBusy(() -> {
        ClusterState state = null;
        for (String node : nodes) {
            ClusterState nodeState = getNodeClusterState(node);
            if (state == null) {
                state = nodeState;
                continue;
            }
            // assert nodes are identical
            try {
                assertEquals("unequal versions", state.version(), nodeState.version());
                assertEquals("unequal node count", state.nodes().getSize(), nodeState.nodes().getSize());
                assertEquals("different cluster-managers ", state.nodes().getMasterNodeId(), nodeState.nodes().getMasterNodeId());
                assertEquals("different meta data version", state.metadata().version(), nodeState.metadata().version());
                assertEquals("different routing", state.routingTable().toString(), nodeState.routingTable().toString());
            } catch (AssertionError t) {
                fail("failed comparing cluster state: " + t.getMessage() + "\n" + "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state + "\n--- cluster state [" + node + "]: ---\n" + nodeState);
            }
        }
    });
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) TwoPartitions(org.opensearch.test.disruption.NetworkDisruption.TwoPartitions) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) TimeValue(org.opensearch.common.unit.TimeValue)

Example 20 with NetworkDisruption

use of org.opensearch.test.disruption.NetworkDisruption in project OpenSearch by opensearch-project.

the class DiscoveryDisruptionIT method testElectClusterManagerWithLatestVersion.

public void testElectClusterManagerWithLatestVersion() throws Exception {
    final Set<String> nodes = new HashSet<>(internalCluster().startNodes(3));
    ensureStableCluster(3);
    ServiceDisruptionScheme isolateAllNodes = new NetworkDisruption(new NetworkDisruption.IsolateAllNodes(nodes), NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election to make sure \"preferred\" cluster-manager is elected");
    isolateAllNodes.startDisrupting();
    for (String node : nodes) {
        assertNoClusterManager(node);
    }
    internalCluster().clearDisruptionScheme();
    ensureStableCluster(3);
    final String preferredClusterManagerName = internalCluster().getMasterName();
    final DiscoveryNode preferredClusterManager = internalCluster().clusterService(preferredClusterManagerName).localNode();
    logger.info("--> preferred cluster-manager is {}", preferredClusterManager);
    final Set<String> nonPreferredNodes = new HashSet<>(nodes);
    nonPreferredNodes.remove(preferredClusterManagerName);
    final ServiceDisruptionScheme isolatePreferredClusterManager = isolateClusterManagerDisruption(NetworkDisruption.DISCONNECT);
    internalCluster().setDisruptionScheme(isolatePreferredClusterManager);
    isolatePreferredClusterManager.startDisrupting();
    client(randomFrom(nonPreferredNodes)).admin().indices().prepareCreate("test").setSettings(Settings.builder().put(INDEX_NUMBER_OF_SHARDS_SETTING.getKey(), 1).put(INDEX_NUMBER_OF_REPLICAS_SETTING.getKey(), 0)).get();
    internalCluster().clearDisruptionScheme(false);
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election again");
    isolateAllNodes.startDisrupting();
    for (String node : nodes) {
        assertNoClusterManager(node);
    }
    isolateAllNodes.stopDisrupting();
    final ClusterState state = client().admin().cluster().prepareState().get().getState();
    if (state.metadata().hasIndex("test") == false) {
        fail("index 'test' was lost. current cluster state: " + state);
    }
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) ServiceDisruptionScheme(org.opensearch.test.disruption.ServiceDisruptionScheme) NetworkDisruption(org.opensearch.test.disruption.NetworkDisruption) HashSet(java.util.HashSet)

Aggregations

NetworkDisruption (org.opensearch.test.disruption.NetworkDisruption)41 HashSet (java.util.HashSet)15 TwoPartitions (org.opensearch.test.disruption.NetworkDisruption.TwoPartitions)14 ClusterState (org.opensearch.cluster.ClusterState)13 Settings (org.opensearch.common.settings.Settings)13 TimeValue (org.opensearch.common.unit.TimeValue)13 CreateSnapshotResponse (org.opensearch.action.admin.cluster.snapshots.create.CreateSnapshotResponse)12 Matchers.containsString (org.hamcrest.Matchers.containsString)8 Client (org.opensearch.client.Client)8 DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode)8 List (java.util.List)7 CountDownLatch (java.util.concurrent.CountDownLatch)7 Collectors (java.util.stream.Collectors)7 Matchers.equalTo (org.hamcrest.Matchers.equalTo)7 GetResponse (org.opensearch.action.get.GetResponse)7 Collections (java.util.Collections)6 IndexResponse (org.opensearch.action.index.IndexResponse)6 AcknowledgedResponse (org.opensearch.action.support.master.AcknowledgedResponse)6 OpenSearchIntegTestCase (org.opensearch.test.OpenSearchIntegTestCase)6 Collection (java.util.Collection)5