Example 21 with NetworkDisruption

Use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.

Class ClusterDisruptionIT, method testAckedIndexing.

/**
 * Test that we do not lose documents whose indexing requests were acknowledged, under a randomly selected disruption scheme.
 * We also collect & report the type of indexing failures that occur.
 * <p>
 * This test is a superset of tests run in the Jepsen test suite, with the exception of versioned updates
 */
@TestLogging("_root:DEBUG,org.elasticsearch.action.bulk:TRACE,org.elasticsearch.action.get:TRACE," + "org.elasticsearch.discovery:TRACE,org.elasticsearch.action.support.replication:TRACE," + "org.elasticsearch.cluster.service:TRACE,org.elasticsearch.indices.recovery:TRACE," + "org.elasticsearch.indices.cluster:TRACE,org.elasticsearch.index.shard:TRACE")
@Test
public void testAckedIndexing() throws Exception {
    final List<String> nodes = startCluster(3);
    int numberOfShards = 1 + randomInt(2);
    int replicas = randomInt(2);
    logger.info("creating table t clustered into {} shards with {} replicas", numberOfShards, replicas);
    execute("create table t (id int primary key, x string) clustered into " + numberOfShards + " shards " + "with (number_of_replicas = " + replicas + ", \"write.wait_for_active_shards\" = 1, \"global_checkpoint_sync.interval\"='1s')");
    ensureGreen();
    ServiceDisruptionScheme disruptionScheme = addRandomDisruptionScheme();
    logger.info("disruption scheme [{}] added", disruptionScheme);
    // acked doc id -> node the insert was sent through.
    final ConcurrentHashMap<String, String> ackedDocs = new ConcurrentHashMap<>();
    final AtomicBoolean stop = new AtomicBoolean(false);
    List<Thread> indexers = new ArrayList<>(nodes.size());
    List<Semaphore> semaphores = new ArrayList<>(nodes.size());
    final AtomicInteger idGenerator = new AtomicInteger(0);
    final AtomicReference<CountDownLatch> countDownLatchRef = new AtomicReference<>();
    final List<Exception> exceptedExceptions = new CopyOnWriteArrayList<>();
    logger.info("starting indexers");
    try {
        for (final String node : nodes) {
            final Semaphore semaphore = new Semaphore(0);
            semaphores.add(semaphore);
            final String name = "indexer_" + indexers.size();
            Thread thread = new Thread(() -> {
                while (!stop.get()) {
                    String id = null;
                    try {
                        if (!semaphore.tryAcquire(10, TimeUnit.SECONDS)) {
                            continue;
                        }
                        logger.info("[{}] Acquired semaphore and it has {} permits left", name, semaphore.availablePermits());
                        try {
                            id = String.valueOf(idGenerator.incrementAndGet());
                            int shard = Math.floorMod(Murmur3HashFunction.hash(id), numberOfShards);
                            logger.trace("[{}] indexing id [{}] through node [{}] targeting shard [{}]", name, id, node, shard);
                            execute("insert into t (id, x) values (?, ?)", new Object[] { id, randomInt(100) }, node, TimeValue.timeValueSeconds(1L));
                            ackedDocs.put(id, node);
                            logger.trace("[{}] indexed id [{}] through node [{}], response [{}]", name, id, node, response);
                        } catch (ElasticsearchException | DuplicateKeyException e) {
                            exceptedExceptions.add(e);
                            final String rowId = id;
                            logger.trace(() -> new ParameterizedMessage("[{}] failed id [{}] through node [{}]", name, rowId, node), e);
                        } finally {
                            countDownLatchRef.get().countDown();
                            logger.trace("[{}] decreased counter : {}", name, countDownLatchRef.get().getCount());
                        }
                    } catch (InterruptedException e) {
                    // fine - semaphore interrupt
                    } catch (AssertionError | Exception e) {
                        logger.trace(() -> new ParameterizedMessage("unexpected exception in background thread of [{}]", node), e);
                    }
                }
            });
            thread.setName(name);
            thread.start();
            indexers.add(thread);
        }
        int docsPerIndexer = randomInt(3);
        logger.info("indexing {} docs per indexer before partition", docsPerIndexer);
        countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
        for (Semaphore semaphore : semaphores) {
            semaphore.release(docsPerIndexer);
        }
        assertTrue(countDownLatchRef.get().await(1, TimeUnit.MINUTES));
        for (int iter = 1 + randomInt(1); iter > 0; iter--) {
            logger.info("starting disruptions & indexing (iteration [{}])", iter);
            disruptionScheme.startDisrupting();
            docsPerIndexer = randomIntBetween(1, 4);
            logger.info("indexing {} docs per indexer during partition", docsPerIndexer);
            countDownLatchRef.set(new CountDownLatch(docsPerIndexer * indexers.size()));
            Collections.shuffle(semaphores, random());
            for (Semaphore semaphore : semaphores) {
                assertThat(semaphore.availablePermits(), equalTo(0));
                semaphore.release(docsPerIndexer);
            }
            logger.info("waiting for indexing requests to complete");
            assertThat("indexing requests must complete", countDownLatchRef.get().await(20, TimeUnit.SECONDS), is(true));
            logger.info("stopping disruption");
            disruptionScheme.stopDisrupting();
            for (String node : internalCluster().getNodeNames()) {
                ensureStableCluster(nodes.size(), TimeValue.timeValueMillis(disruptionScheme.expectedTimeToHeal().millis() + DISRUPTION_HEALING_OVERHEAD.millis()), true, node);
            }
            // in case of a bridge partition, shard allocation can fail "index.allocation.max_retries" times if the master
            // is the super-connected node and recovery source and target are on opposite sides of the bridge
            if (disruptionScheme instanceof NetworkDisruption && ((NetworkDisruption) disruptionScheme).getDisruptedLinks() instanceof Bridge) {
                logger.warn("retrying failed allocations in case of a bridge partition");
                execute("ALTER CLUSTER REROUTE RETRY FAILED");
            }
            ensureGreen();
            logger.info("validating successful docs");
            assertBusy(() -> {
                for (String node : nodes) {
                    try {
                        logger.debug("validating through node [{}] ([{}] acked docs)", node, ackedDocs.size());
                        for (String id : ackedDocs.keySet()) {
                            execute("select * from t where id = ?", new Object[] { id }, node);
                            assertThat("doc [" + id + "] indexed via node [" + ackedDocs.get(id) + "] not found", response.rowCount(), is(1L));
                        }
                    } catch (AssertionError | NoShardAvailableActionException e) {
                        throw new AssertionError(e.getMessage() + " (checked via node [" + node + "])", e);
                    }
                }
            }, 30, TimeUnit.SECONDS);
            logger.info("done validating (iteration [{}])", iter);
        }
    } finally {
        logger.info("shutting down indexers");
        stop.set(true);
        for (Thread indexer : indexers) {
            indexer.interrupt();
            indexer.join(60000);
        }
        if (exceptedExceptions.isEmpty() == false) {
            StringBuilder sb = new StringBuilder();
            for (Exception e : exceptedExceptions) {
                sb.append("\n").append(e.getMessage());
            }
            logger.debug("Indexing exceptions during disruption: {}", sb);
        }
    }
}
Also used : ArrayList(java.util.ArrayList) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ServiceDisruptionScheme(org.elasticsearch.test.disruption.ServiceDisruptionScheme) Semaphore(java.util.concurrent.Semaphore) ElasticsearchException(org.elasticsearch.ElasticsearchException) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) AtomicReference(java.util.concurrent.atomic.AtomicReference) CountDownLatch(java.util.concurrent.CountDownLatch) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) DuplicateKeyException(io.crate.exceptions.DuplicateKeyException) NoShardAvailableActionException(org.elasticsearch.action.NoShardAvailableActionException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) Bridge(org.elasticsearch.test.disruption.NetworkDisruption.Bridge) TestLogging(org.elasticsearch.test.junit.annotations.TestLogging) Test(org.junit.Test)

Example 22 with NetworkDisruption

Use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.

Class ClusterDisruptionIT, method testSendingShardFailure.

// simulate handling of sending shard failure during an isolation
@Test
public void testSendingShardFailure() throws Exception {
    List<String> nodes = startCluster(3);
    String masterNode = internalCluster().getMasterName();
    List<String> nonMasterNodes = nodes.stream().filter(node -> !node.equals(masterNode)).collect(Collectors.toList());
    String nonMasterNode = randomFrom(nonMasterNodes);
    execute("create table t (id int primary key, x string) clustered into 3 shards with (number_of_replicas = 2)");
    ensureGreen();
    String nonMasterNodeId = internalCluster().clusterService(nonMasterNode).localNode().getId();
    // fail a random shard
    ShardRouting failedShard = randomFrom(clusterService().state().getRoutingNodes().node(nonMasterNodeId).shardsWithState(ShardRoutingState.STARTED));
    ShardStateAction service = internalCluster().getInstance(ShardStateAction.class, nonMasterNode);
    CountDownLatch latch = new CountDownLatch(1);
    AtomicBoolean success = new AtomicBoolean();
    String isolatedNode = randomBoolean() ? masterNode : nonMasterNode;
    NetworkDisruption.TwoPartitions partitions = isolateNode(isolatedNode);
    // we cannot use the NetworkUnresponsive disruption type here as it will swallow the "shard failed" request, calling neither
    // onSuccess nor onFailure on the provided listener.
    NetworkLinkDisruptionType disruptionType = new NetworkDisruption.NetworkDisconnect();
    NetworkDisruption networkDisruption = new NetworkDisruption(partitions, disruptionType);
    setDisruptionScheme(networkDisruption);
    networkDisruption.startDisrupting();
    service.localShardFailed(failedShard, "simulated", new CorruptIndexException("simulated", (String) null), new ActionListener<>() {

        @Override
        public void onResponse(Void aVoid) {
            success.set(true);
            latch.countDown();
        }

        @Override
        public void onFailure(Exception e) {
            success.set(false);
            latch.countDown();
            assert false;
        }
    });
    if (isolatedNode.equals(nonMasterNode)) {
        assertNoMaster(nonMasterNode);
    } else {
        ensureStableCluster(2, nonMasterNode);
    }
    // heal the partition
    networkDisruption.removeAndEnsureHealthy(internalCluster());
    // the cluster should stabilize
    ensureStableCluster(3);
    latch.await();
    // the listener should be notified
    assertTrue(success.get());
    // the failed shard should be gone
    List<ShardRouting> shards = clusterService().state().getRoutingTable().allShards(toIndexName(sqlExecutor.getCurrentSchema(), "t", null));
    for (ShardRouting shard : shards) {
        assertThat(shard.allocationId(), not(equalTo(failedShard.allocationId())));
    }
}
Also used : ElasticsearchException(org.elasticsearch.ElasticsearchException) ShardRouting(org.elasticsearch.cluster.routing.ShardRouting) InternalTestCluster(org.elasticsearch.test.InternalTestCluster) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) ConcurrentCollections(org.elasticsearch.common.util.concurrent.ConcurrentCollections) Matchers.not(org.hamcrest.Matchers.not) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) SQLIntegrationTestCase(io.crate.integrationtests.SQLIntegrationTestCase) ShardRoutingState(org.elasticsearch.cluster.routing.ShardRoutingState) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) AtomicReference(java.util.concurrent.atomic.AtomicReference) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) ArrayList(java.util.ArrayList) ClusterState(org.elasticsearch.cluster.ClusterState) Matchers.everyItem(org.hamcrest.Matchers.everyItem) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IndicesService(org.elasticsearch.indices.IndicesService) Bridge(org.elasticsearch.test.disruption.NetworkDisruption.Bridge) ServiceDisruptionScheme(org.elasticsearch.test.disruption.ServiceDisruptionScheme) DuplicateKeyException(io.crate.exceptions.DuplicateKeyException) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) TestLogging(org.elasticsearch.test.junit.annotations.TestLogging) Semaphore(java.util.concurrent.Semaphore) IndexShard(org.elasticsearch.index.shard.IndexShard) NetworkLinkDisruptionType(org.elasticsearch.test.disruption.NetworkDisruption.NetworkLinkDisruptionType) Collection(java.util.Collection) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Plugin(org.elasticsearch.plugins.Plugin) Set(java.util.Set) NoShardAvailableActionException(org.elasticsearch.action.NoShardAvailableActionException) Test(org.junit.Test) Collectors(java.util.stream.Collectors) Murmur3HashFunction(org.elasticsearch.cluster.routing.Murmur3HashFunction) TimeUnit(java.util.concurrent.TimeUnit) InternalSettingsPlugin(org.elasticsearch.test.InternalSettingsPlugin) CountDownLatch(java.util.concurrent.CountDownLatch) IndexShardTestCase(org.elasticsearch.index.shard.IndexShardTestCase) List(java.util.List) IndexParts.toIndexName(io.crate.metadata.IndexParts.toIndexName) ESIntegTestCase(org.elasticsearch.test.ESIntegTestCase) ShardStateAction(org.elasticsearch.cluster.action.shard.ShardStateAction) Matchers.equalTo(org.hamcrest.Matchers.equalTo) TimeValue(io.crate.common.unit.TimeValue) Matchers.is(org.hamcrest.Matchers.is) Collections(java.util.Collections) Matchers.in(org.hamcrest.Matchers.in) ActionListener(org.elasticsearch.action.ActionListener) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList)

Example 23 with NetworkDisruption

Use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.

Class ClusterDisruptionIT, method testRejoinDocumentExistsInAllShardCopies.

/**
 *  Test that a document which is indexed on the majority side of a partition, is available from the minority side,
 *  once the partition is healed
 */
@Test
public void testRejoinDocumentExistsInAllShardCopies() throws Exception {
    List<String> nodes = startCluster(3);
    execute("create table t (id int primary key, x string) clustered into 1 shards with (number_of_replicas = 2, " + "\"write.wait_for_active_shards\" = 1)");
    ensureGreen();
    nodes = new ArrayList<>(nodes);
    Collections.shuffle(nodes, random());
    String isolatedNode = nodes.get(0);
    String notIsolatedNode = nodes.get(1);
    NetworkDisruption.TwoPartitions partitions = isolateNode(isolatedNode);
    NetworkDisruption scheme = addRandomDisruptionType(partitions);
    scheme.startDisrupting();
    ensureStableCluster(2, notIsolatedNode);
    String indexName = toIndexName(sqlExecutor.getCurrentSchema(), "t", null);
    assertFalse(client(notIsolatedNode).admin().cluster().prepareHealth(indexName).setWaitForYellowStatus().get().isTimedOut());
    execute("insert into t (id, x) values (1, 10)", null, notIsolatedNode);
    logger.info("Verifying if document exists via node[{}]", notIsolatedNode);
    execute("select * from t where id = '1'", null, notIsolatedNode);
    assertThat(response.rowCount(), is(1L));
    scheme.stopDisrupting();
    ensureStableCluster(3);
    ensureGreen();
    for (String node : nodes) {
        logger.info("Verifying if document exists after isolating node[{}] via node[{}]", isolatedNode, node);
        execute("select * from t where id = '1'", null, node);
        assertThat(response.rowCount(), is(1L));
    }
}
Also used : NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) Test(org.junit.Test)

Example 24 with NetworkDisruption

Use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.

Class AbstractDisruptionTestCase, method addRandomDisruptionType.

NetworkDisruption addRandomDisruptionType(TwoPartitions partitions) {
    final NetworkLinkDisruptionType disruptionType;
    if (randomBoolean()) {
        disruptionType = new NetworkDisruption.NetworkUnresponsive();
    } else {
        disruptionType = new NetworkDisconnect();
    }
    NetworkDisruption partition = new NetworkDisruption(partitions, disruptionType);
    setDisruptionScheme(partition);
    return partition;
}
Also used : NetworkLinkDisruptionType(org.elasticsearch.test.disruption.NetworkDisruption.NetworkLinkDisruptionType) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) NetworkDisconnect(org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect)

Example 25 with NetworkDisruption

Use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.

Class DiscoveryDisruptionIT, method testElectMasterWithLatestVersion.

@Test
public void testElectMasterWithLatestVersion() throws Exception {
    final Set<String> nodes = new HashSet<>(internalCluster().startNodes(3));
    ensureStableCluster(3);
    ServiceDisruptionScheme isolateAllNodes = new NetworkDisruption(new NetworkDisruption.IsolateAllNodes(nodes), new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election to make sure \"preferred\" master is elected");
    isolateAllNodes.startDisrupting();
    for (String node : nodes) {
        assertNoMaster(node);
    }
    internalCluster().clearDisruptionScheme();
    ensureStableCluster(3);
    final String preferredMasterName = internalCluster().getMasterName();
    final DiscoveryNode preferredMaster = internalCluster().clusterService(preferredMasterName).localNode();
    logger.info("--> preferred master is {}", preferredMaster);
    final Set<String> nonPreferredNodes = new HashSet<>(nodes);
    nonPreferredNodes.remove(preferredMasterName);
    final ServiceDisruptionScheme isolatePreferredMaster = new NetworkDisruption(new NetworkDisruption.TwoPartitions(Collections.singleton(preferredMasterName), nonPreferredNodes), new NetworkDisconnect());
    internalCluster().setDisruptionScheme(isolatePreferredMaster);
    isolatePreferredMaster.startDisrupting();
    execute("create table t (id int primary key, x string) clustered into 1 shards " + "with (number_of_replicas = 0)", null, randomFrom(nonPreferredNodes));
    internalCluster().clearDisruptionScheme(false);
    internalCluster().setDisruptionScheme(isolateAllNodes);
    logger.info("--> forcing a complete election again");
    isolateAllNodes.startDisrupting();
    for (String node : nodes) {
        assertNoMaster(node);
    }
    isolateAllNodes.stopDisrupting();
    final ClusterState state = client().admin().cluster().prepareState().get().getState();
    if (state.metadata().hasIndex(toIndexName(sqlExecutor.getCurrentSchema(), "t", null)) == false) {
        fail("index 'test' was lost. current cluster state: " + state);
    }
}
Also used : ClusterState(org.elasticsearch.cluster.ClusterState) DiscoveryNode(org.elasticsearch.cluster.node.DiscoveryNode) ServiceDisruptionScheme(org.elasticsearch.test.disruption.ServiceDisruptionScheme) NetworkDisruption(org.elasticsearch.test.disruption.NetworkDisruption) NetworkDisconnect(org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

NetworkDisruption (org.elasticsearch.test.disruption.NetworkDisruption): 27 usages
TwoPartitions (org.elasticsearch.test.disruption.NetworkDisruption.TwoPartitions): 17 usages
NetworkDisconnect (org.elasticsearch.test.disruption.NetworkDisruption.NetworkDisconnect): 14 usages
ClusterState (org.elasticsearch.cluster.ClusterState): 12 usages
HashSet (java.util.HashSet): 8 usages
Test (org.junit.Test): 8 usages
ServiceDisruptionScheme (org.elasticsearch.test.disruption.ServiceDisruptionScheme): 7 usages
Settings (org.elasticsearch.common.settings.Settings): 6 usages
NetworkLinkDisruptionType (org.elasticsearch.test.disruption.NetworkDisruption.NetworkLinkDisruptionType): 6 usages
TestLogging (org.elasticsearch.test.junit.annotations.TestLogging): 6 usages
ArrayList (java.util.ArrayList): 5 usages
TimeValue (io.crate.common.unit.TimeValue): 4 usages
CountDownLatch (java.util.concurrent.CountDownLatch): 4 usages
AtomicReference (java.util.concurrent.atomic.AtomicReference): 4 usages
NetworkUnresponsive (org.elasticsearch.test.disruption.NetworkDisruption.NetworkUnresponsive): 4 usages
Collection (java.util.Collection): 3 usages
List (java.util.List): 3 usages
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 3 usages
Semaphore (java.util.concurrent.Semaphore): 3 usages
TimeUnit (java.util.concurrent.TimeUnit): 3 usages
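
Common lifecycle

Distilled from the examples above, the NetworkDisruption lifecycle is the same everywhere: describe which links misbehave (TwoPartitions, Bridge, or IsolateAllNodes), pick a NetworkLinkDisruptionType (NetworkDisconnect or NetworkUnresponsive), register the scheme with the test cluster, start the disruption around the operations under test, then heal and wait for the cluster to stabilize. The following is a minimal sketch of that lifecycle and is not taken from the crate sources: it assumes an ESIntegTestCase-style base class (such as AbstractDisruptionTestCase above) that provides internalCluster(), setDisruptionScheme() and ensureStableCluster(), and the test method name is hypothetical.

// Minimal sketch of the NetworkDisruption lifecycle shared by the examples above.
// Assumes an ESIntegTestCase-derived test class providing internalCluster(),
// setDisruptionScheme() and ensureStableCluster(); relies on java.util.List,
// java.util.Collections and java.util.HashSet. The method name is hypothetical.
@Test
public void testNetworkDisruptionLifecycleSketch() throws Exception {
    List<String> nodes = internalCluster().startNodes(3);
    ensureStableCluster(3);
    // Partition the first node away from the other two.
    String isolatedNode = nodes.get(0);
    NetworkDisruption.TwoPartitions partitions = new NetworkDisruption.TwoPartitions(
        Collections.singleton(isolatedNode),
        new HashSet<>(nodes.subList(1, nodes.size())));
    // Choose how the disrupted links misbehave; NetworkUnresponsive or a Bridge
    // topology plug into the same constructor.
    NetworkDisruption disruption =
        new NetworkDisruption(partitions, new NetworkDisruption.NetworkDisconnect());
    setDisruptionScheme(disruption);
    disruption.startDisrupting();
    // ... exercise the cluster while the partition is active ...
    // Heal the partition and wait for all three nodes to re-form a cluster.
    disruption.removeAndEnsureHealthy(internalCluster());
    ensureStableCluster(3);
}

The addRandomDisruptionType helper in Example 24 wraps the same construction, randomly choosing between NetworkUnresponsive and NetworkDisconnect before registering the scheme.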