use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
the class PrepRecoveryOp method execute.
@Override
public void execute(CallInfo it) throws Exception {
assert TestInjection.injectPrepRecoveryOpPauseForever();
final SolrParams params = it.req.getParams();
String cname = params.get(CoreAdminParams.CORE);
if (cname == null) {
cname = "";
}
String nodeName = params.get("nodeName");
String coreNodeName = params.get("coreNodeName");
Replica.State waitForState = Replica.State.getState(params.get(ZkStateReader.STATE_PROP));
Boolean checkLive = params.getBool("checkLive");
Boolean onlyIfLeader = params.getBool("onlyIfLeader");
Boolean onlyIfLeaderActive = params.getBool("onlyIfLeaderActive");
CoreContainer coreContainer = it.handler.coreContainer;
// wait long enough for the leader conflict to work itself out plus a little extra
int conflictWaitMs = coreContainer.getZkController().getLeaderConflictResolveWait();
int maxTries = (int) Math.round(conflictWaitMs / 1000) + 3;
log.info("Going to wait for coreNodeName: {}, state: {}, checkLive: {}, onlyIfLeader: {}, onlyIfLeaderActive: {}, maxTime: {} s", coreNodeName, waitForState, checkLive, onlyIfLeader, onlyIfLeaderActive, maxTries);
Replica.State state = null;
boolean live = false;
int retry = 0;
while (true) {
try (SolrCore core = coreContainer.getCore(cname)) {
if (core == null && retry == Math.min(30, maxTries)) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "core not found:" + cname);
}
if (core != null) {
if (onlyIfLeader != null && onlyIfLeader) {
if (!core.getCoreDescriptor().getCloudDescriptor().isLeader()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "We are not the leader");
}
}
// wait until we are sure the recovering node is ready
// to accept updates
CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor();
String collectionName = cloudDescriptor.getCollectionName();
if (retry % 15 == 0) {
if (retry > 0 && log.isInfoEnabled())
log.info("After " + retry + " seconds, core " + cname + " (" + cloudDescriptor.getShardId() + " of " + cloudDescriptor.getCollectionName() + ") still does not have state: " + waitForState + "; forcing ClusterState update from ZooKeeper");
// force a cluster state update
coreContainer.getZkController().getZkStateReader().forceUpdateCollection(collectionName);
}
ClusterState clusterState = coreContainer.getZkController().getClusterState();
DocCollection collection = clusterState.getCollection(collectionName);
Slice slice = collection.getSlice(cloudDescriptor.getShardId());
if (slice != null) {
final Replica replica = slice.getReplicasMap().get(coreNodeName);
if (replica != null) {
state = replica.getState();
live = clusterState.liveNodesContain(nodeName);
final Replica.State localState = cloudDescriptor.getLastPublished();
// TODO: This is funky but I've seen this in testing where the replica asks the
// leader to be in recovery? Need to track down how that happens ... in the meantime,
// this is a safeguard
boolean leaderDoesNotNeedRecovery = (onlyIfLeader != null && onlyIfLeader && core.getName().equals(replica.getStr("core")) && waitForState == Replica.State.RECOVERING && localState == Replica.State.ACTIVE && state == Replica.State.ACTIVE);
if (leaderDoesNotNeedRecovery) {
log.warn("Leader " + core.getName() + " ignoring request to be in the recovering state because it is live and active.");
}
boolean onlyIfActiveCheckResult = onlyIfLeaderActive != null && onlyIfLeaderActive && localState != Replica.State.ACTIVE;
log.info("In WaitForState(" + waitForState + "): collection=" + collectionName + ", shard=" + slice.getName() + ", thisCore=" + core.getName() + ", leaderDoesNotNeedRecovery=" + leaderDoesNotNeedRecovery + ", isLeader? " + core.getCoreDescriptor().getCloudDescriptor().isLeader() + ", live=" + live + ", checkLive=" + checkLive + ", currentState=" + state.toString() + ", localState=" + localState + ", nodeName=" + nodeName + ", coreNodeName=" + coreNodeName + ", onlyIfActiveCheckResult=" + onlyIfActiveCheckResult + ", nodeProps: " + replica);
if (!onlyIfActiveCheckResult && replica != null && (state == waitForState || leaderDoesNotNeedRecovery)) {
if (checkLive == null) {
break;
} else if (checkLive && live) {
break;
} else if (!checkLive && !live) {
break;
}
}
}
}
}
if (retry++ == maxTries) {
String collection = null;
String leaderInfo = null;
String shardId = null;
try {
CloudDescriptor cloudDescriptor = core.getCoreDescriptor().getCloudDescriptor();
collection = cloudDescriptor.getCollectionName();
shardId = cloudDescriptor.getShardId();
leaderInfo = coreContainer.getZkController().getZkStateReader().getLeaderUrl(collection, shardId, 5000);
} catch (Exception exc) {
leaderInfo = "Not available due to: " + exc;
}
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "I was asked to wait on state " + waitForState + " for " + shardId + " in " + collection + " on " + nodeName + " but I still do not see the requested state. I see state: " + Objects.toString(state) + " live:" + live + " leader from ZK: " + leaderInfo);
}
if (coreContainer.isShutDown()) {
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Solr is shutting down");
}
// solrcloud_debug
if (log.isDebugEnabled() && core != null) {
try {
LocalSolrQueryRequest r = new LocalSolrQueryRequest(core, new ModifiableSolrParams());
CommitUpdateCommand commitCmd = new CommitUpdateCommand(r, false);
commitCmd.softCommit = true;
core.getUpdateHandler().commit(commitCmd);
RefCounted<SolrIndexSearcher> searchHolder = core.getNewestSearcher(false);
SolrIndexSearcher searcher = searchHolder.get();
try {
log.debug(core.getCoreContainer().getZkController().getNodeName() + " to replicate " + searcher.search(new MatchAllDocsQuery(), 1).totalHits + " gen:" + core.getDeletionPolicy().getLatestCommit().getGeneration() + " data:" + core.getDataDir());
} finally {
searchHolder.decref();
}
} catch (Exception e) {
log.debug("Error in solrcloud_debug block", e);
}
}
}
Thread.sleep(1000);
}
log.info("Waited coreNodeName: " + coreNodeName + ", state: " + waitForState + ", checkLive: " + checkLive + ", onlyIfLeader: " + onlyIfLeader + " for: " + retry + " seconds.");
}
use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
the class RebalanceLeaders method ensurePreferredIsLeader.
private void ensurePreferredIsLeader(NamedList<Object> results, Slice slice, Map<String, String> currentRequests) throws KeeperException, InterruptedException {
final String inactivePreferreds = "inactivePreferreds";
final String alreadyLeaders = "alreadyLeaders";
String collectionName = req.getParams().get(COLLECTION_PROP);
for (Replica replica : slice.getReplicas()) {
// Tell the replica to become the leader if we're the preferred leader AND active AND not the leader already
if (replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false) == false) {
continue;
}
// OK, we are the preferred leader, are we the actual leader?
if (replica.getBool(LEADER_PROP, false)) {
//We're a preferred leader, but we're _also_ the leader, don't need to do anything.
NamedList<Object> noops = (NamedList<Object>) results.get(alreadyLeaders);
if (noops == null) {
noops = new NamedList<>();
results.add(alreadyLeaders, noops);
}
NamedList<Object> res = new NamedList<>();
res.add("status", "success");
res.add("msg", "Already leader");
res.add("shard", slice.getName());
res.add("nodeName", replica.getNodeName());
noops.add(replica.getName(), res);
// already the leader, do nothing.
return;
}
// We're the preferred leader, but someone else is leader. Only become leader if we're active.
if (replica.getState() != Replica.State.ACTIVE) {
NamedList<Object> inactives = (NamedList<Object>) results.get(inactivePreferreds);
if (inactives == null) {
inactives = new NamedList<>();
results.add(inactivePreferreds, inactives);
}
NamedList<Object> res = new NamedList<>();
res.add("status", "skipped");
res.add("msg", "Node is a referredLeader, but it's inactive. Skipping");
res.add("shard", slice.getName());
res.add("nodeName", replica.getNodeName());
inactives.add(replica.getName(), res);
// Don't try to become the leader if we're not active!
return;
}
// Replica is the preferred leader but not the actual leader, do something about that.
// "Something" is
// 1> if the preferred leader isn't first in line, tell it to re-queue itself.
// 2> tell the actual leader to re-queue itself.
ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
List<String> electionNodes = OverseerTaskProcessor.getSortedElectionNodes(zkStateReader.getZkClient(), ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName()));
if (electionNodes.size() < 2) {
// if there's only one node in the queue, should already be leader and we shouldn't be here anyway.
log.info("Rebalancing leaders and slice " + slice.getName() + " has less than two elements in the leader " + "election queue, but replica " + replica.getName() + " doesn't think it's the leader.");
return;
}
// Ok, the sorting for election nodes is a bit strange. If the sequence numbers are the same, then the whole
// string is used, but that sorts nodes with the same sequence number by their session IDs from ZK.
// While this is determinate, it's not quite what we need, so re-queue nodes that aren't us and are
// watching the leader node..
String firstWatcher = electionNodes.get(1);
if (LeaderElector.getNodeName(firstWatcher).equals(replica.getName()) == false) {
makeReplicaFirstWatcher(collectionName, slice, replica);
}
String coreName = slice.getReplica(LeaderElector.getNodeName(electionNodes.get(0))).getStr(CORE_NAME_PROP);
rejoinElection(collectionName, slice, electionNodes.get(0), coreName, false);
waitForNodeChange(collectionName, slice, electionNodes.get(0));
// Done with this slice, skip the rest of the replicas.
return;
}
}
use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
the class ManagedIndexSchema method getActiveReplicaCoreUrls.
protected static List<String> getActiveReplicaCoreUrls(ZkController zkController, String collection, String localCoreNodeName) {
List<String> activeReplicaCoreUrls = new ArrayList<>();
ZkStateReader zkStateReader = zkController.getZkStateReader();
ClusterState clusterState = zkStateReader.getClusterState();
Set<String> liveNodes = clusterState.getLiveNodes();
Collection<Slice> activeSlices = clusterState.getActiveSlices(collection);
if (activeSlices != null && activeSlices.size() > 0) {
for (Slice next : activeSlices) {
Map<String, Replica> replicasMap = next.getReplicasMap();
if (replicasMap != null) {
for (Map.Entry<String, Replica> entry : replicasMap.entrySet()) {
Replica replica = entry.getValue();
if (!localCoreNodeName.equals(replica.getName()) && replica.getState() == Replica.State.ACTIVE && liveNodes.contains(replica.getNodeName())) {
ZkCoreNodeProps replicaCoreProps = new ZkCoreNodeProps(replica);
activeReplicaCoreUrls.add(replicaCoreProps.getCoreUrl());
}
}
}
}
}
return activeReplicaCoreUrls;
}
use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
the class HttpSolrCall method getCoreByCollection.
protected SolrCore getCoreByCollection(String collectionName, boolean isPreferLeader) {
ZkStateReader zkStateReader = cores.getZkController().getZkStateReader();
ClusterState clusterState = zkStateReader.getClusterState();
DocCollection collection = clusterState.getCollectionOrNull(collectionName);
if (collection == null) {
return null;
}
Set<String> liveNodes = clusterState.getLiveNodes();
if (isPreferLeader) {
List<Replica> leaderReplicas = collection.getLeaderReplicas(cores.getZkController().getNodeName());
SolrCore core = randomlyGetSolrCore(liveNodes, leaderReplicas);
if (core != null)
return core;
}
List<Replica> replicas = collection.getReplicas(cores.getZkController().getNodeName());
return randomlyGetSolrCore(liveNodes, replicas);
}
use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
the class CloudSolrClientTest method stateVersionParamTest.
@Test
public void stateVersionParamTest() throws Exception {
DocCollection coll = cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(COLLECTION);
Replica r = coll.getSlices().iterator().next().getReplicas().iterator().next();
SolrQuery q = new SolrQuery().setQuery("*:*");
HttpSolrClient.RemoteSolrException sse = null;
final String url = r.getStr(ZkStateReader.BASE_URL_PROP) + "/" + COLLECTION;
try (HttpSolrClient solrClient = getHttpSolrClient(url)) {
log.info("should work query, result {}", solrClient.query(q));
//no problem
q.setParam(CloudSolrClient.STATE_VERSION, COLLECTION + ":" + coll.getZNodeVersion());
log.info("2nd query , result {}", solrClient.query(q));
//no error yet good
//an older version expect error
q.setParam(CloudSolrClient.STATE_VERSION, COLLECTION + ":" + (coll.getZNodeVersion() - 1));
QueryResponse rsp = solrClient.query(q);
Map m = (Map) rsp.getResponse().get(CloudSolrClient.STATE_VERSION, rsp.getResponse().size() - 1);
assertNotNull("Expected an extra information from server with the list of invalid collection states", m);
assertNotNull(m.get(COLLECTION));
}
//now send the request to another node that does not serve the collection
Set<String> allNodesOfColl = new HashSet<>();
for (Slice slice : coll.getSlices()) {
for (Replica replica : slice.getReplicas()) {
allNodesOfColl.add(replica.getStr(ZkStateReader.BASE_URL_PROP));
}
}
String theNode = null;
Set<String> liveNodes = cluster.getSolrClient().getZkStateReader().getClusterState().getLiveNodes();
for (String s : liveNodes) {
String n = cluster.getSolrClient().getZkStateReader().getBaseUrlForNodeName(s);
if (!allNodesOfColl.contains(n)) {
theNode = n;
break;
}
}
log.info("the node which does not serve this collection{} ", theNode);
assertNotNull(theNode);
final String solrClientUrl = theNode + "/" + COLLECTION;
try (SolrClient solrClient = getHttpSolrClient(solrClientUrl)) {
q.setParam(CloudSolrClient.STATE_VERSION, COLLECTION + ":" + (coll.getZNodeVersion() - 1));
try {
QueryResponse rsp = solrClient.query(q);
log.info("error was expected");
} catch (HttpSolrClient.RemoteSolrException e) {
sse = e;
}
assertNotNull(sse);
assertEquals(" Error code should be 510", SolrException.ErrorCode.INVALID_STATE.code, sse.code());
}
}
Aggregations