Search in sources :

Example 91 with ClusterState

use of ClusterState in project lucene-solr by apache.

the class ShardSplitTest method testSplitWithChaosMonkey.

/**
 * Test that issues an asynchronous shard-split request for the default collection while a
 * "monkey" collection-state watcher is set up to interfere, then checks that the control
 * client and the cloud client report the same number of documents and, if the sub-shards
 * are considered active, that the replicas of SHARD1_0 and SHARD1_1 are consistent.
 *
 * NOTE(review): this snippet is visibly truncated — closing braces, the bodies of several
 * loops, the logging call prefixes and some statements are missing — so it does not
 * compile as shown. The comments below describe only what the remaining lines show.
 */
public void testSplitWithChaosMonkey() throws Exception {
    List<StoppableIndexingThread> indexers = new ArrayList<>();
    try {
        // The loop bound is 1, so a single indexing thread object is created.
        // NOTE(review): the lines that add it to 'indexers' and start it are not visible here.
        for (int i = 0; i < 1; i++) {
            StoppableIndexingThread thread = new StoppableIndexingThread(controlClient, cloudClient, String.valueOf(i), true);
        // give the indexers some time to do their work
    } catch (Exception e) {
        log.error("Error in test", e);
    } finally {
        // NOTE(review): the body of this loop is missing from the snippet.
        for (StoppableIndexingThread indexer : indexers) {
    // 'stop' lets the watcher below remove itself; 'killed' guards the kill branch so it runs once.
    AtomicBoolean stop = new AtomicBoolean();
    AtomicBoolean killed = new AtomicBoolean(false);
    Runnable monkey = new Runnable() {

        public void run() {
            ZkStateReader zkStateReader = cloudClient.getZkStateReader();
            zkStateReader.registerCollectionStateWatcher(AbstractDistribZkTestBase.DEFAULT_COLLECTION, new CollectionStateWatcher() {

                public boolean onStateChanged(Set<String> liveNodes, DocCollection collectionState) {
                    if (stop.get()) {
                        // abort and remove the watch
                        return true;
                    // React once the SHARD1_0 sub-shard exists and has more than one replica.
                    Slice slice = collectionState.getSlice(SHARD1_0);
                    if (slice != null && slice.getReplicas().size() > 1) {
                        // ensure that only one watcher invocation thread can kill!
                        if (killed.compareAndSet(false, true)) {
                            // NOTE(review): the next line is the argument list of a logging call whose prefix was lost.
                  "Monkey thread found 2 replicas for {} {}", AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1);
                            CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);
                            try {
                                // Pause for 1000 ms plus a random extra below 500 (assuming random() yields a java.util.Random).
                                // NOTE(review): the statement that actually kills the jetty is not visible; only the
                                // error message in the catch block below indicates that a kill is attempted here.
                                Thread.sleep(1000 + random().nextInt(500));
                                return true;
                            } catch (Exception e) {
                                log.error("Monkey unable to kill jetty at port " + cjetty.jetty.getLocalPort(), e);
          // NOTE(review): the next line is the argument list of a logging call whose prefix was lost.
          "Monkey thread found only one replica for {} {}", AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1);
                    return false;
    Thread monkeyThread = null;
    // NOTE(review): the four lines below read like the text of a block comment whose delimiters were lost.
     somehow the cluster state object inside this zk state reader has static copy of the collection which is never updated
     so any call to waitForRecoveriesToFinish just keeps looping until timeout.
     We workaround by explicitly registering the collection as an interesting one so that it is watched by ZkStateReader
     see SOLR-9440. Todo remove this hack after SOLR-9440 is fixed.
    // NOTE(review): the thread is constructed here; the call that starts it is not visible.
    monkeyThread = new Thread(monkey);
    try {
        // Submit the split asynchronously and keep the request id so its status can be polled.
        // NOTE(review): no shard name is set on the request in the visible code.
        CollectionAdminRequest.SplitShard splitShard = CollectionAdminRequest.splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
        String asyncId = splitShard.processAsync(cloudClient);
        RequestStatusState splitStatus = null;
        try {
            // Wait for the request to reach a final state; 120 is presumably a timeout in seconds — TODO confirm.
            splitStatus = CollectionAdminRequest.requestStatus(asyncId).waitFor(cloudClient, 120);
        } catch (Exception e) {
            log.warn("Failed to get request status, maybe because the overseer node was shutdown by monkey", e);
        // we don't care if the split failed because we are injecting faults and it is likely
        // that the split has failed but in any case we want to assert that all docs that got
        // indexed are available in SolrCloud and if the split succeeded then all replicas of the sub-shard
        // must be consistent (i.e. have same numdocs)"Shard split request state is COMPLETED");
        Set<String> addFails = new HashSet<>();
        Set<String> deleteFails = new HashSet<>();
        // NOTE(review): the body of this loop (presumably collecting add/delete failures) is missing.
        for (StoppableIndexingThread indexer : indexers) {
        CloudJettyRunner cjetty = shardToLeaderJetty.get(SHARD1);"Starting shard1 leader jetty at port {}", cjetty.jetty.getLocalPort());
        cloudClient.getZkStateReader().forceUpdateCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);"Current collection state: {}", printClusterStateInfo(AbstractDistribZkTestBase.DEFAULT_COLLECTION));
        boolean replicaCreationsFailed = false;
        if (splitStatus == RequestStatusState.FAILED) {
            // either one or more replica creation failed (because it may have been created on the same parent shard leader node)
            // or the split may have failed while trying to soft-commit *after* all replicas have been created
            // the latter counts as a successful switch even if the API doesn't say so
            // so we must find a way to distinguish between the two
            // an easy way to do that is to look at the sub-shard replicas and check if the replica core actually exists
            // instead of existing solely inside the cluster state
            DocCollection collectionState = cloudClient.getZkStateReader().getClusterState().getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
            Slice slice10 = collectionState.getSlice(SHARD1_0);
            Slice slice11 = collectionState.getSlice(SHARD1_1);
            if (slice10 != null && slice11 != null) {
                // Any sub-shard replica without an existing core marks the replica creation as failed.
                for (Replica replica : slice10) {
                    if (!doesReplicaCoreExist(replica)) {
                        replicaCreationsFailed = true;
                for (Replica replica : slice11) {
                    if (!doesReplicaCoreExist(replica)) {
                        replicaCreationsFailed = true;
        // true if sub-shard states switch to 'active' eventually
        AtomicBoolean areSubShardsActive = new AtomicBoolean(false);
        if (!replicaCreationsFailed) {
            // all sub-shard replicas were created successfully so all cores must recover eventually
            waitForRecoveriesToFinish(AbstractDistribZkTestBase.DEFAULT_COLLECTION, true);
            // let's wait for the overseer to switch shard states
            CountDownLatch latch = new CountDownLatch(1);
            cloudClient.getZkStateReader().registerCollectionStateWatcher(AbstractDistribZkTestBase.DEFAULT_COLLECTION, new CollectionStateWatcher() {

                public boolean onStateChanged(Set<String> liveNodes, DocCollection collectionState) {
                    Slice parent = collectionState.getSlice(SHARD1);
                    Slice slice10 = collectionState.getSlice(SHARD1_0);
                    Slice slice11 = collectionState.getSlice(SHARD1_1);
                    // Two terminal outcomes are recognised: parent INACTIVE with both sub-shards ACTIVE,
                    // or parent still ACTIVE with both sub-shards RECOVERY_FAILED.
                    // NOTE(review): the statements that count down the latch and set areSubShardsActive
                    // are not visible in either branch.
                    if (slice10 != null && slice11 != null && parent.getState() == Slice.State.INACTIVE && slice10.getState() == Slice.State.ACTIVE && slice11.getState() == Slice.State.ACTIVE) {
                        // removes the watch
                        return true;
                    } else if (slice10 != null && slice11 != null && parent.getState() == Slice.State.ACTIVE && slice10.getState() == Slice.State.RECOVERY_FAILED && slice11.getState() == Slice.State.RECOVERY_FAILED) {
                        return true;
                    return false;
            // Block for at most two minutes; a non-zero count afterwards means no terminal state was observed.
            latch.await(2, TimeUnit.MINUTES);
            if (latch.getCount() != 0) {
                // sanity check
                fail("We think that split was successful but sub-shard states were not updated even after 2 minutes.");
        // for visibility of results on sub-shards
        checkShardConsistency(true, true, addFails, deleteFails);
        long ctrlDocs = controlClient.query(new SolrQuery("*:*")).getResults().getNumFound();
        // ensure we have added more than 0 docs
        long cloudClientDocs = cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound();
        assertTrue("Found " + ctrlDocs + " control docs", cloudClientDocs > 0);
        assertEquals("Found " + ctrlDocs + " control docs and " + cloudClientDocs + " cloud docs", ctrlDocs, cloudClientDocs);
        // handle new shards/replica so well.
        if (areSubShardsActive.get()) {
            // Each sub-shard is expected to have exactly two replicas covered by the consistency check.
            ClusterState clusterState = cloudClient.getZkStateReader().getClusterState();
            DocCollection collection = clusterState.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION);
            int numReplicasChecked = assertConsistentReplicas(collection.getSlice(SHARD1_0));
            assertEquals("We should have checked consistency for exactly 2 replicas of shard1_0", 2, numReplicasChecked);
            numReplicasChecked = assertConsistentReplicas(collection.getSlice(SHARD1_1));
            assertEquals("We should have checked consistency for exactly 2 replicas of shard1_1", 2, numReplicasChecked);
    // NOTE(review): the body of this finally block and the end of the method are missing from the snippet.
    } finally {
Also used : ArrayList(java.util.ArrayList) CollectionAdminRequest(org.apache.solr.client.solrj.request.CollectionAdminRequest) CollectionStateWatcher(org.apache.solr.common.cloud.CollectionStateWatcher) SolrQuery(org.apache.solr.client.solrj.SolrQuery) ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) DocCollection(org.apache.solr.common.cloud.DocCollection) HashSet(java.util.HashSet) ClusterState(org.apache.solr.common.cloud.ClusterState) CountDownLatch(java.util.concurrent.CountDownLatch) Replica(org.apache.solr.common.cloud.Replica) SolrServerException(org.apache.solr.client.solrj.SolrServerException) IOException(java.io.IOException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Slice(org.apache.solr.common.cloud.Slice) RequestStatusState(org.apache.solr.client.solrj.response.RequestStatusState) Test(org.junit.Test)

Example 92 with ClusterState

use of ClusterState in project lucene-solr by apache.

the class TestCloudDeleteByQuery method createMiniSolrCloudCluster.

private static void createMiniSolrCloudCluster() throws Exception {
    final String configName = "solrCloudCollectionConfig";
    final Path configDir = Paths.get(TEST_HOME(), "collection1", "conf");
    configureCluster(NUM_SERVERS).addConfig(configName, configDir).configure();
    Map<String, String> collectionProperties = new HashMap<>();
    collectionProperties.put("config", "solrconfig-tlog.xml");
    // string id for doc routing prefix
    collectionProperties.put("schema", "schema15.xml");
    CollectionAdminRequest.createCollection(COLLECTION_NAME, configName, NUM_SHARDS, REPLICATION_FACTOR).setProperties(collectionProperties).process(cluster.getSolrClient());
    CLOUD_CLIENT = cluster.getSolrClient();
    ZkStateReader zkStateReader = CLOUD_CLIENT.getZkStateReader();
    AbstractDistribZkTestBase.waitForRecoveriesToFinish(COLLECTION_NAME, zkStateReader, true, true, 330);
    // really hackish way to get a URL for specific nodes based on shard/replica hosting
    // inspired by TestMiniSolrCloudCluster
    HashMap<String, String> urlMap = new HashMap<>();
    for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
        URL jettyURL = jetty.getBaseUrl();
        String nodeKey = jettyURL.getHost() + ":" + jettyURL.getPort() + jettyURL.getPath().replace("/", "_");
        urlMap.put(nodeKey, jettyURL.toString());
    ClusterState clusterState = zkStateReader.getClusterState();
    for (Slice slice : clusterState.getSlices(COLLECTION_NAME)) {
        String shardName = slice.getName();
        Replica leader = slice.getLeader();
        assertNotNull("slice has null leader: " + slice.toString(), leader);
        assertNotNull("slice leader has null node name: " + slice.toString(), leader.getNodeName());
        String leaderUrl = urlMap.remove(leader.getNodeName());
        assertNotNull("could not find URL for " + shardName + " leader: " + leader.getNodeName(), leaderUrl);
        assertEquals("expected two total replicas for: " + slice.getName(), 2, slice.getReplicas().size());
        String passiveUrl = null;
        for (Replica replica : slice.getReplicas()) {
            if (!replica.equals(leader)) {
                passiveUrl = urlMap.remove(replica.getNodeName());
                assertNotNull("could not find URL for " + shardName + " replica: " + replica.getNodeName(), passiveUrl);
        assertNotNull("could not find URL for " + shardName + " replica", passiveUrl);
        if (shardName.equals("shard1")) {
            S_ONE_LEADER_CLIENT = getHttpSolrClient(leaderUrl + "/" + COLLECTION_NAME + "/");
            S_ONE_NON_LEADER_CLIENT = getHttpSolrClient(passiveUrl + "/" + COLLECTION_NAME + "/");
        } else if (shardName.equals("shard2")) {
            S_TWO_LEADER_CLIENT = getHttpSolrClient(leaderUrl + "/" + COLLECTION_NAME + "/");
            S_TWO_NON_LEADER_CLIENT = getHttpSolrClient(passiveUrl + "/" + COLLECTION_NAME + "/");
        } else {
            fail("unexpected shard: " + shardName);
    assertEquals("Should be exactly one server left (nost hosting either shard)", 1, urlMap.size());
    NO_COLLECTION_CLIENT = getHttpSolrClient(urlMap.values().iterator().next() + "/" + COLLECTION_NAME + "/");
    // sanity check that our S_ONE_PRE & S_TWO_PRE really do map to shard1 & shard2 with default routing
    assertEquals(0, CLOUD_CLIENT.add(doc(f("id", S_ONE_PRE + random().nextInt()), f("expected_shard_s", "shard1"))).getStatus());
    assertEquals(0, CLOUD_CLIENT.add(doc(f("id", S_TWO_PRE + random().nextInt()), f("expected_shard_s", "shard2"))).getStatus());
    assertEquals(0, CLOUD_CLIENT.commit().getStatus());
    SolrDocumentList docs = CLOUD_CLIENT.query(params("q", "*:*", "fl", "id,expected_shard_s,[shard]")).getResults();
    assertEquals(2, docs.getNumFound());
    assertEquals(2, docs.size());
    for (SolrDocument doc : docs) {
        String expected = COLLECTION_NAME + "_" + doc.getFirstValue("expected_shard_s") + "_replica";
        String docShard = doc.getFirstValue("[shard]").toString();
        assertTrue("shard routing prefixes don't seem to be aligned anymore, " + "did someone change the default routing rules? " + "and/or the the default core name rules? " + "and/or the numShards used by this test? ... " + "couldn't find " + expected + " as substring of [shard] == '" + docShard + "' ... for docId == " + doc.getFirstValue("id"), docShard.contains(expected));
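Also used : Path(java.nio.file.Path) ClusterState(org.apache.solr.common.cloud.ClusterState) HashMap(java.util.HashMap) JettySolrRunner(org.apache.solr.client.solrj.embedded.JettySolrRunner) SolrDocumentList(org.apache.solr.common.SolrDocumentList) Replica(org.apache.solr.common.cloud.Replica) URL(java.net.URL) ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) SolrDocument(org.apache.solr.common.SolrDocument) Slice(org.apache.solr.common.cloud.Slice) BeforeClass(org.junit.BeforeClass)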

Example 93 with ClusterState

use of ClusterState in project lucene-solr by apache.

the class PeerSyncReplicationTest method waitTillNodesActive.

/**
 * Polls the cluster state up to 60 times, looking at the replicas of "shard1" in
 * "collection1", and fails the test with a timeout message once the loop is exhausted.
 *
 * NOTE(review): this snippet is visibly truncated — closing braces, the stream sources on
 * two assignments, and whatever follows the final 'if' (presumably an early return and a
 * pause between polls) are missing — so it does not compile as shown.
 */
private void waitTillNodesActive() throws Exception {
    for (int i = 0; i < 60; i++) {
        // Re-read the cluster state on every iteration.
        ZkStateReader zkStateReader = cloudClient.getZkStateReader();
        ClusterState clusterState = zkStateReader.getClusterState();
        DocCollection collection1 = clusterState.getCollection("collection1");
        Slice slice = collection1.getSlice("shard1");
        Collection<Replica> replicas = slice.getReplicas();
        boolean allActive = true;
        // NOTE(review): the expression before "->" was lost on the next two lines. The first appears to map
        // some collection of down nodes to their coreNodeName; the second appears to filter replicas
        // (presumably the 'replicas' collection above) down to those whose name is not in that list.
        Collection<String> nodesDownNames = -> n.coreNodeName).collect(Collectors.toList());
        Collection<Replica> replicasToCheck = -> !nodesDownNames.contains(r.getName())).collect(Collectors.toList());
        for (Replica replica : replicasToCheck) {
            // A replica counts as not ready if its node is not live or its state is not ACTIVE.
            if (!clusterState.liveNodesContain(replica.getNodeName()) || replica.getState() != Replica.State.ACTIVE) {
                allActive = false;
        // NOTE(review): the body of this 'if' is missing from the snippet.
        if (allActive) {
    fail("timeout waiting to see all nodes active");
Also used : ZkStateReader( Arrays(java.util.Arrays) Slow(org.apache.lucene.util.LuceneTestCase.Slow) DocCollection( ClusterState( LoggerFactory(org.slf4j.LoggerFactory) ArrayList(java.util.ArrayList) Collections.singletonList(java.util.Collections.singletonList) HashSet(java.util.HashSet) SolrServerException(org.apache.solr.client.solrj.SolrServerException) Map(java.util.Map) Counter(com.codahale.metrics.Counter) RandomStringUtils(org.apache.commons.lang.RandomStringUtils) ZkStateReader( MetricRegistry(com.codahale.metrics.MetricRegistry) Slice( Logger(org.slf4j.Logger) Files(java.nio.file.Files) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) MethodHandles(java.lang.invoke.MethodHandles) Collection(java.util.Collection) Set(java.util.Set) Metric(com.codahale.metrics.Metric) IOException( Test(org.junit.Test) TimeOut(org.apache.solr.util.TimeOut) Collectors( Replica( BadApple(org.apache.lucene.util.LuceneTestCase.BadApple) List(java.util.List) Paths(java.nio.file.Paths) SolrQuery(org.apache.solr.client.solrj.SolrQuery) UpdateRequest(org.apache.solr.client.solrj.request.UpdateRequest) LimitViolationAction( Timer(com.codahale.metrics.Timer) SECONDS(java.util.concurrent.TimeUnit.SECONDS) SolrInputDocument(org.apache.solr.common.SolrInputDocument) ClusterState( Slice( DocCollection( Replica(

Example 94 with ClusterState

use of ClusterState in project lucene-solr by apache.

the class ShardSplitTest method incompleteOrOverlappingCustomRangeTest.

/**
 * Checks that splitting SHARD1 of the default collection with invalid custom hash ranges is
 * rejected: each splitShard call is followed by fail(...), and a
 * HttpSolrClient.RemoteSolrException is the expected outcome.
 *
 * NOTE(review): this snippet is visibly truncated — closing braces, the logging call
 * prefixes in the catch blocks, and the statements that populate 'subRanges' for the first
 * two cases are missing — so it does not compile as shown.
 */
private void incompleteOrOverlappingCustomRangeTest() throws Exception {
    ClusterState clusterState = cloudClient.getZkStateReader().getClusterState();
    final DocRouter router = clusterState.getCollection(AbstractDistribZkTestBase.DEFAULT_COLLECTION).getRouter();
    Slice shard1 = clusterState.getSlice(AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1);
    // Fall back to the router's full range when the shard has no explicit range.
    DocRouter.Range shard1Range = shard1.getRange() != null ? shard1.getRange() : router.fullRange();
    List<DocRouter.Range> subRanges = new ArrayList<>();
    // Partition the shard's range; index 3 is read below, so four partitions are presumably returned.
    List<DocRouter.Range> ranges = router.partitionRange(4, shard1Range);
    // test with only one range
    // NOTE(review): nothing visible adds a range to 'subRanges' before this call.
    try {
        splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1, subRanges, null);
        fail("Shard splitting with just one custom hash range should not succeed");
    } catch (HttpSolrClient.RemoteSolrException e) {"Expected exception:", e);
    // test with ranges with a hole in between them
    // order shouldn't matter
    // NOTE(review): the statements that reset and refill 'subRanges' for this case are not visible.
    try {
        splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1, subRanges, null);
        fail("Shard splitting with missing hashes in between given ranges should not succeed");
    } catch (HttpSolrClient.RemoteSolrException e) {"Expected exception:", e);
    // test with overlapping ranges
    // Adds a range that starts 15 below the minimum of the fourth partition, presumably overlapping the third.
    subRanges.add(new DocRouter.Range(ranges.get(3).min - 15, ranges.get(3).max));
    try {
        splitShard(AbstractDistribZkTestBase.DEFAULT_COLLECTION, SHARD1, subRanges, null);
        fail("Shard splitting with overlapping ranges should not succeed");
    } catch (HttpSolrClient.RemoteSolrException e) {"Expected exception:", e);
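Also used : HttpSolrClient(org.apache.solr.client.solrj.impl.HttpSolrClient) ClusterState(org.apache.solr.common.cloud.ClusterState) Slice(org.apache.solr.common.cloud.Slice) DocRouter(org.apache.solr.common.cloud.DocRouter) ArrayList(java.util.ArrayList)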

Example 95 with ClusterState

use of ClusterState in project lucene-solr by apache.

the class ReplicaPropertiesBase method verifyPropertyVal.

// The params are triplets,
// collection
// shard
// replica
public static void verifyPropertyVal(CloudSolrClient client, String collectionName, String replicaName, String property, String val) throws InterruptedException, KeeperException {
    Replica replica = null;
    ClusterState clusterState = null;
    for (int idx = 0; idx < 300; ++idx) {
        // Keep trying while Overseer writes the ZK state for up to 30 seconds.
        clusterState = client.getZkStateReader().getClusterState();
        replica = clusterState.getReplica(collectionName, replicaName);
        if (replica == null) {
            fail("Could not find collection/replica pair! " + collectionName + "/" + replicaName);
        if (StringUtils.equals(val, replica.getStr(property)))
    fail("Property '" + property + "' with value " + replica.getStr(property) + " not set correctly for collection/replica pair: " + collectionName + "/" + replicaName + " property map is " + replica.getProperties().toString() + ".");
Also used : ClusterState(org.apache.solr.common.cloud.ClusterState) Replica(org.apache.solr.common.cloud.Replica)


ClusterState ( Slice ( Replica ( ZkStateReader ( DocCollection ( HashMap (java.util.HashMap)41 ArrayList (java.util.ArrayList)36 Map (java.util.Map)24 IOException ( Test (org.junit.Test)18 HashSet (java.util.HashSet)17 SolrException (org.apache.solr.common.SolrException)16 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)15 SolrQuery (org.apache.solr.client.solrj.SolrQuery)13 JettySolrRunner (org.apache.solr.client.solrj.embedded.JettySolrRunner)13 ZkCoreNodeProps ( ZkNodeProps ( List (java.util.List)12 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)12 NamedList (org.apache.solr.common.util.NamedList)12