Search in sources :

Example 41 with Replica

use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.

the class PrepRecoveryOp method execute.

/**
 * Polls the cluster state once per second until the replica identified by the
 * {@code coreNodeName} request parameter is seen in the state given by the
 * {@link ZkStateReader#STATE_PROP} request parameter, then returns.
 *
 * <p>Optional request parameters read here: {@code nodeName}, {@code checkLive},
 * {@code onlyIfLeader} and {@code onlyIfLeaderActive}. When {@code checkLive} is present the
 * wait only ends once the liveness of {@code nodeName} equals it.
 *
 * @param it the call context; the request parameters and the core container are taken from it
 * @throws SolrException with {@code BAD_REQUEST} when the local core is still missing after
 *         {@code min(30, maxTries)} retries, when {@code onlyIfLeader} is set and the local core
 *         is not the leader, when the requested state is not observed within {@code maxTries}
 *         retries, or when the core container is shut down
 * @throws Exception anything else raised while waiting (including interruption of the sleep)
 */
@Override
public void execute(CallInfo it) throws Exception {
    assert TestInjection.injectPrepRecoveryOpPauseForever();
    final SolrParams params = it.req.getParams();
    String cname = params.get(CoreAdminParams.CORE);
    if (cname == null) {
        cname = "";
    }
    final String nodeName = params.get("nodeName");
    final String coreNodeName = params.get("coreNodeName");
    final Replica.State waitForState = Replica.State.getState(params.get(ZkStateReader.STATE_PROP));
    final Boolean checkLive = params.getBool("checkLive");
    final Boolean onlyIfLeader = params.getBool("onlyIfLeader");
    final Boolean onlyIfLeaderActive = params.getBool("onlyIfLeaderActive");
    // Null-safe views of the optional flags; an absent flag behaves like false.
    final boolean mustBeLeader = Boolean.TRUE.equals(onlyIfLeader);
    final boolean leaderMustBeActive = Boolean.TRUE.equals(onlyIfLeaderActive);
    final CoreContainer coreContainer = it.handler.coreContainer;
    // wait long enough for the leader conflict to work itself out plus a little extra
    final int conflictWaitMs = coreContainer.getZkController().getLeaderConflictResolveWait();
    // Integer division already truncates, so no rounding call is needed.
    final int maxTries = conflictWaitMs / 1000 + 3;
    log.info("Going to wait for coreNodeName: {}, state: {}, checkLive: {}, onlyIfLeader: {}, onlyIfLeaderActive: {}, maxTime: {} s", coreNodeName, waitForState, checkLive, onlyIfLeader, onlyIfLeaderActive, maxTries);
    Replica.State state = null;
    boolean live = false;
    int retry = 0;
    while (true) {
        try (SolrCore core = coreContainer.getCore(cname)) {
            if (core == null && retry == Math.min(30, maxTries)) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "core not found:" + cname);
            }
            if (core != null) {
                if (mustBeLeader && !core.getCoreDescriptor().getCloudDescriptor().isLeader()) {
                    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "We are not the leader");
                }
                // wait until we are sure the recovering node is ready
                // to accept updates
                CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor();
                String collectionName = cloudDescriptor.getCollectionName();
                if (retry % 15 == 0) {
                    if (retry > 0) {
                        log.info("After {} seconds, core {} ({} of {}) still does not have state: {}; forcing ClusterState update from ZooKeeper", retry, cname, cloudDescriptor.getShardId(), collectionName, waitForState);
                    }
                    // force a cluster state update
                    coreContainer.getZkController().getZkStateReader().forceUpdateCollection(collectionName);
                }
                ClusterState clusterState = coreContainer.getZkController().getClusterState();
                DocCollection collection = clusterState.getCollection(collectionName);
                Slice slice = collection.getSlice(cloudDescriptor.getShardId());
                if (slice != null) {
                    final Replica replica = slice.getReplicasMap().get(coreNodeName);
                    if (replica != null) {
                        state = replica.getState();
                        live = clusterState.liveNodesContain(nodeName);
                        final Replica.State localState = cloudDescriptor.getLastPublished();
                        // TODO: This is funky but I've seen this in testing where the replica asks the
                        // leader to be in recovery? Need to track down how that happens ... in the meantime,
                        // this is a safeguard
                        boolean leaderDoesNotNeedRecovery = (mustBeLeader && core.getName().equals(replica.getStr("core")) && waitForState == Replica.State.RECOVERING && localState == Replica.State.ACTIVE && state == Replica.State.ACTIVE);
                        if (leaderDoesNotNeedRecovery) {
                            log.warn("Leader {} ignoring request to be in the recovering state because it is live and active.", core.getName());
                        }
                        boolean onlyIfActiveCheckResult = leaderMustBeActive && localState != Replica.State.ACTIVE;
                        // Parameterized so the message is only assembled when INFO is enabled.
                        log.info("In WaitForState({}): collection={}, shard={}, thisCore={}, leaderDoesNotNeedRecovery={}, isLeader? {}, live={}, checkLive={}, currentState={}, localState={}, nodeName={}, coreNodeName={}, onlyIfActiveCheckResult={}, nodeProps: {}", waitForState, collectionName, slice.getName(), core.getName(), leaderDoesNotNeedRecovery, core.getCoreDescriptor().getCloudDescriptor().isLeader(), live, checkLive, state, localState, nodeName, coreNodeName, onlyIfActiveCheckResult, replica);
                        if (!onlyIfActiveCheckResult && (state == waitForState || leaderDoesNotNeedRecovery)) {
                            // No liveness requirement, or the observed liveness matches the requested one.
                            if (checkLive == null || checkLive == live) {
                                break;
                            }
                        }
                    }
                }
            }
            if (retry++ == maxTries) {
                String collection = null;
                String leaderInfo = null;
                String shardId = null;
                if (core == null) {
                    // Without a core there is no descriptor to look the leader up with; say so
                    // explicitly instead of reporting a NullPointerException.
                    leaderInfo = "Not available due to: core " + cname + " is not loaded";
                } else {
                    try {
                        CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor();
                        collection = cloudDescriptor.getCollectionName();
                        shardId = cloudDescriptor.getShardId();
                        leaderInfo = coreContainer.getZkController().getZkStateReader().getLeaderUrl(collection, shardId, 5000);
                    } catch (Exception exc) {
                        // Best effort only: the leader URL merely enriches the error message below.
                        leaderInfo = "Not available due to: " + exc;
                    }
                }
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "I was asked to wait on state " + waitForState + " for " + shardId + " in " + collection + " on " + nodeName + " but I still do not see the requested state. I see state: " + Objects.toString(state) + " live:" + live + " leader from ZK: " + leaderInfo);
            }
            if (coreContainer.isShutDown()) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Solr is shutting down");
            }
            // solrcloud_debug
            if (log.isDebugEnabled() && core != null) {
                try {
                    LocalSolrQueryRequest r = new LocalSolrQueryRequest(core, new ModifiableSolrParams());
                    CommitUpdateCommand commitCmd = new CommitUpdateCommand(r, false);
                    commitCmd.softCommit = true;
                    core.getUpdateHandler().commit(commitCmd);
                    RefCounted<SolrIndexSearcher> searchHolder = core.getNewestSearcher(false);
                    SolrIndexSearcher searcher = searchHolder.get();
                    try {
                        log.debug(core.getCoreContainer().getZkController().getNodeName() + " to replicate " + searcher.search(new MatchAllDocsQuery(), 1).totalHits + " gen:" + core.getDeletionPolicy().getLatestCommit().getGeneration() + " data:" + core.getDataDir());
                    } finally {
                        // Release the searcher reference taken by getNewestSearcher.
                        searchHolder.decref();
                    }
                } catch (Exception e) {
                    log.debug("Error in solrcloud_debug block", e);
                }
            }
        }
        // The core reference is closed before sleeping, so it is not held between polls.
        Thread.sleep(1000);
    }
    log.info("Waited coreNodeName: {}, state: {}, checkLive: {}, onlyIfLeader: {} for: {} seconds.", coreNodeName, waitForState, checkLive, onlyIfLeader, retry);
}
Also used : ClusterState(org.apache.solr.common.cloud.ClusterState) SolrCore(org.apache.solr.core.SolrCore) CommitUpdateCommand(org.apache.solr.update.CommitUpdateCommand) SolrIndexSearcher(org.apache.solr.search.SolrIndexSearcher) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) Replica(org.apache.solr.common.cloud.Replica) CloudDescriptor(org.apache.solr.cloud.CloudDescriptor) SolrException(org.apache.solr.common.SolrException) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) LocalSolrQueryRequest(org.apache.solr.request.LocalSolrQueryRequest) CoreContainer(org.apache.solr.core.CoreContainer) Slice(org.apache.solr.common.cloud.Slice) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) SolrParams(org.apache.solr.common.params.SolrParams) DocCollection(org.apache.solr.common.cloud.DocCollection) SolrException(org.apache.solr.common.SolrException)

Example 42 with Replica

use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.

the class RebalanceLeaders method ensurePreferredIsLeader.

/**
 * Handles the first replica of {@code slice} that carries
 * {@link SliceMutator#PREFERRED_LEADER_PROP} and then returns; replicas without that property
 * are skipped.
 *
 * <ul>
 *   <li>If that replica already has {@code LEADER_PROP}, a "success" entry is recorded under
 *       {@code alreadyLeaders} in {@code results}.</li>
 *   <li>If it is not {@link Replica.State#ACTIVE}, a "skipped" entry is recorded under
 *       {@code inactivePreferreds}.</li>
 *   <li>Otherwise the election queue is read; the preferred replica is made first watcher if it
 *       is not already, and the node at the head of the queue is asked to rejoin the
 *       election.</li>
 * </ul>
 *
 * @param results         response structure that receives the per-replica status entries
 * @param slice           the shard whose replicas are inspected
 * @param currentRequests not read by this method
 * @throws KeeperException      declared by this method; not raised directly in this body
 * @throws InterruptedException declared by this method; not raised directly in this body
 */
private void ensurePreferredIsLeader(NamedList<Object> results, Slice slice, Map<String, String> currentRequests) throws KeeperException, InterruptedException {
    final String inactivePreferreds = "inactivePreferreds";
    final String alreadyLeaders = "alreadyLeaders";
    String collectionName = req.getParams().get(COLLECTION_PROP);
    for (Replica replica : slice.getReplicas()) {
        // Tell the replica to become the leader if we're the preferred leader AND active AND not the leader already
        if (!replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false)) {
            continue;
        }
        // OK, we are the preferred leader, are we the actual leader?
        if (replica.getBool(LEADER_PROP, false)) {
            // We're a preferred leader, but we're _also_ the leader, don't need to do anything.
            NamedList<Object> res = new NamedList<>();
            res.add("status", "success");
            res.add("msg", "Already leader");
            res.add("shard", slice.getName());
            res.add("nodeName", replica.getNodeName());
            getOrCreateSubList(results, alreadyLeaders).add(replica.getName(), res);
            // already the leader, do nothing.
            return;
        }
        // We're the preferred leader, but someone else is leader. Only become leader if we're active.
        if (replica.getState() != Replica.State.ACTIVE) {
            NamedList<Object> res = new NamedList<>();
            res.add("status", "skipped");
            res.add("msg", "Node is a preferredLeader, but it's inactive. Skipping");
            res.add("shard", slice.getName());
            res.add("nodeName", replica.getNodeName());
            getOrCreateSubList(results, inactivePreferreds).add(replica.getName(), res);
            // Don't try to become the leader if we're not active!
            return;
        }
        // Replica is the preferred leader but not the actual leader, do something about that.
        // "Something" is
        // 1> if the preferred leader isn't first in line, tell it to re-queue itself.
        // 2> tell the actual leader to re-queue itself.
        ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
        List<String> electionNodes = OverseerTaskProcessor.getSortedElectionNodes(zkStateReader.getZkClient(), ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName()));
        if (electionNodes.size() < 2) {
            // if there's only one node in the queue, should already be leader and we shouldn't be here anyway.
            log.info("Rebalancing leaders and slice {} has less than two elements in the leader election queue, but replica {} doesn't think it's the leader.", slice.getName(), replica.getName());
            return;
        }
        // Ok, the sorting for election nodes is a bit strange. If the sequence numbers are the same, then the whole
        // string is used, but that sorts nodes with the same sequence number by their session IDs from ZK.
        // While this is determinate, it's not quite what we need, so re-queue nodes that aren't us and are
        // watching the leader node..
        final String currentLeaderNode = electionNodes.get(0);
        final String firstWatcher = electionNodes.get(1);
        if (!LeaderElector.getNodeName(firstWatcher).equals(replica.getName())) {
            makeReplicaFirstWatcher(collectionName, slice, replica);
        }
        String coreName = slice.getReplica(LeaderElector.getNodeName(currentLeaderNode)).getStr(CORE_NAME_PROP);
        rejoinElection(collectionName, slice, currentLeaderNode, coreName, false);
        waitForNodeChange(collectionName, slice, currentLeaderNode);
        // Done with this slice, skip the rest of the replicas.
        return;
    }
}

/** Returns the sub-list stored in {@code results} under {@code key}, adding an empty one first if absent. */
@SuppressWarnings("unchecked") // entries under these keys are only ever added by this helper
private static NamedList<Object> getOrCreateSubList(NamedList<Object> results, String key) {
    NamedList<Object> subList = (NamedList<Object>) results.get(key);
    if (subList == null) {
        subList = new NamedList<>();
        results.add(key, subList);
    }
    return subList;
}
Also used : ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) NamedList(org.apache.solr.common.util.NamedList) Replica(org.apache.solr.common.cloud.Replica)

Example 43 with Replica

use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.

the class ManagedIndexSchema method getActiveReplicaCoreUrls.

/**
 * Collects the core URLs of every replica in the active slices of {@code collection} that is
 * not the local one, is in state {@link Replica.State#ACTIVE}, and whose node is listed among
 * the cluster's live nodes.
 *
 * @param zkController      source of the cluster state
 * @param collection        name of the collection whose active slices are scanned
 * @param localCoreNodeName replica name to exclude from the result
 * @return a new list of core URLs; empty when there are no active slices or no replica qualifies
 */
protected static List<String> getActiveReplicaCoreUrls(ZkController zkController, String collection, String localCoreNodeName) {
    final List<String> urls = new ArrayList<>();
    final ZkStateReader reader = zkController.getZkStateReader();
    final ClusterState state = reader.getClusterState();
    final Set<String> live = state.getLiveNodes();
    final Collection<Slice> slices = state.getActiveSlices(collection);
    if (slices == null || slices.isEmpty()) {
        return urls;
    }
    for (Slice slice : slices) {
        final Map<String, Replica> replicasByName = slice.getReplicasMap();
        if (replicasByName == null) {
            continue;
        }
        for (Replica candidate : replicasByName.values()) {
            // Skip ourselves first, then anything not active, then anything on a dead node.
            if (localCoreNodeName.equals(candidate.getName())) {
                continue;
            }
            if (candidate.getState() != Replica.State.ACTIVE) {
                continue;
            }
            if (!live.contains(candidate.getNodeName())) {
                continue;
            }
            urls.add(new ZkCoreNodeProps(candidate).getCoreUrl());
        }
    }
    return urls;
}
Also used : ClusterState(org.apache.solr.common.cloud.ClusterState) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) ArrayList(java.util.ArrayList) Replica(org.apache.solr.common.cloud.Replica) ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) Slice(org.apache.solr.common.cloud.Slice) Map(java.util.Map) HashMap(java.util.HashMap)

Example 44 with Replica

use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.

the class HttpSolrCall method getCoreByCollection.

/**
 * Picks a core for {@code collectionName} from the replicas this node reports for it.
 *
 * <p>When {@code isPreferLeader} is set, the leader replicas for this node are tried first and
 * the full replica list is used only if that yields no core.
 *
 * @param collectionName the collection to look up in the current cluster state
 * @param isPreferLeader whether leader replicas should be tried before all replicas
 * @return the chosen core, or {@code null} when the collection is unknown; otherwise whatever
 *         {@code randomlyGetSolrCore} returns for the candidate replicas
 */
protected SolrCore getCoreByCollection(String collectionName, boolean isPreferLeader) {
    final ClusterState state = cores.getZkController().getZkStateReader().getClusterState();
    final DocCollection docCollection = state.getCollectionOrNull(collectionName);
    if (docCollection == null) {
        return null;
    }
    final Set<String> live = state.getLiveNodes();
    if (isPreferLeader) {
        final List<Replica> leaders = docCollection.getLeaderReplicas(cores.getZkController().getNodeName());
        final SolrCore leaderCore = randomlyGetSolrCore(live, leaders);
        if (leaderCore != null) {
            return leaderCore;
        }
    }
    final List<Replica> candidates = docCollection.getReplicas(cores.getZkController().getNodeName());
    return randomlyGetSolrCore(live, candidates);
}
Also used : ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) ClusterState(org.apache.solr.common.cloud.ClusterState) SolrCore(org.apache.solr.core.SolrCore) DocCollection(org.apache.solr.common.cloud.DocCollection) Replica(org.apache.solr.common.cloud.Replica)

Example 45 with Replica

use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.

the class CloudSolrClientTest method stateVersionParamTest.

/**
 * Exercises the {@link CloudSolrClient#STATE_VERSION} request parameter in two situations:
 * against a node hosting a replica of {@code COLLECTION} (a stale version must come back with
 * an extra response entry naming the collection), and against a live node that hosts no
 * replica of it (a stale version must fail with {@code INVALID_STATE}).
 */
@Test
public void stateVersionParamTest() throws Exception {
    final ZkStateReader zkStateReader = cluster.getSolrClient().getZkStateReader();
    DocCollection coll = zkStateReader.getClusterState().getCollection(COLLECTION);
    Replica r = coll.getSlices().iterator().next().getReplicas().iterator().next();
    SolrQuery q = new SolrQuery().setQuery("*:*");
    final String url = r.getStr(ZkStateReader.BASE_URL_PROP) + "/" + COLLECTION;
    try (HttpSolrClient solrClient = getHttpSolrClient(url)) {
        log.info("should work query, result {}", solrClient.query(q));
        //no problem
        q.setParam(CloudSolrClient.STATE_VERSION, COLLECTION + ":" + coll.getZNodeVersion());
        log.info("2nd query , result {}", solrClient.query(q));
        //no error yet good
        //an older version expect error
        q.setParam(CloudSolrClient.STATE_VERSION, COLLECTION + ":" + (coll.getZNodeVersion() - 1));
        QueryResponse rsp = solrClient.query(q);
        // The state-version details are looked up starting from the last response entry.
        Map<?, ?> m = (Map<?, ?>) rsp.getResponse().get(CloudSolrClient.STATE_VERSION, rsp.getResponse().size() - 1);
        assertNotNull("Expected an extra information from server with the list of invalid collection states", m);
        assertNotNull(m.get(COLLECTION));
    }
    //now send the request to another node that does not serve the collection
    Set<String> allNodesOfColl = new HashSet<>();
    for (Slice slice : coll.getSlices()) {
        for (Replica replica : slice.getReplicas()) {
            allNodesOfColl.add(replica.getStr(ZkStateReader.BASE_URL_PROP));
        }
    }
    String theNode = null;
    Set<String> liveNodes = zkStateReader.getClusterState().getLiveNodes();
    for (String s : liveNodes) {
        String n = zkStateReader.getBaseUrlForNodeName(s);
        if (!allNodesOfColl.contains(n)) {
            theNode = n;
            break;
        }
    }
    log.info("the node which does not serve this collection{} ", theNode);
    assertNotNull(theNode);
    final String solrClientUrl = theNode + "/" + COLLECTION;
    try (SolrClient solrClient = getHttpSolrClient(solrClientUrl)) {
        q.setParam(CloudSolrClient.STATE_VERSION, COLLECTION + ":" + (coll.getZNodeVersion() - 1));
        HttpSolrClient.RemoteSolrException sse = null;
        try {
            // Only the failure matters here; the response itself is not inspected.
            solrClient.query(q);
            log.info("error was expected");
        } catch (HttpSolrClient.RemoteSolrException e) {
            sse = e;
        }
        assertNotNull("Expected a RemoteSolrException from a node that does not serve the collection", sse);
        assertEquals(" Error code should be 510", SolrException.ErrorCode.INVALID_STATE.code, sse.code());
    }
}
Also used : SolrClient(org.apache.solr.client.solrj.SolrClient) Slice(org.apache.solr.common.cloud.Slice) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) DocCollection(org.apache.solr.common.cloud.DocCollection) Replica(org.apache.solr.common.cloud.Replica) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) Map(java.util.Map) HashMap(java.util.HashMap) SolrQuery(org.apache.solr.client.solrj.SolrQuery) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

Replica (org.apache.solr.common.cloud.Replica) 232, Slice (org.apache.solr.common.cloud.Slice) 140, DocCollection (org.apache.solr.common.cloud.DocCollection) 86, ArrayList (java.util.ArrayList) 81, ClusterState (org.apache.solr.common.cloud.ClusterState) 67, HashMap (java.util.HashMap) 60, SolrException (org.apache.solr.common.SolrException) 53, ZkStateReader (org.apache.solr.common.cloud.ZkStateReader) 50, Test (org.junit.Test) 50, Map (java.util.Map) 45, HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient) 37, ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams) 35, JettySolrRunner (org.apache.solr.client.solrj.embedded.JettySolrRunner) 29, NamedList (org.apache.solr.common.util.NamedList) 28, SolrQuery (org.apache.solr.client.solrj.SolrQuery) 26, IOException (java.io.IOException) 25, SolrInputDocument (org.apache.solr.common.SolrInputDocument) 25, ZkCoreNodeProps (org.apache.solr.common.cloud.ZkCoreNodeProps) 25, HashSet (java.util.HashSet) 24, List (java.util.List) 20