use of org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState in project lucene-solr by apache.
the class RecoveryStrategy method sendPrepRecoveryCmd.
private final void sendPrepRecoveryCmd(String leaderBaseUrl, String leaderCoreName, Slice slice) throws SolrServerException, IOException, InterruptedException, ExecutionException {
WaitForState prepCmd = new WaitForState();
prepCmd.setCoreName(leaderCoreName);
prepCmd.setNodeName(zkController.getNodeName());
prepCmd.setCoreNodeName(coreZkNodeName);
prepCmd.setState(Replica.State.RECOVERING);
prepCmd.setCheckLive(true);
prepCmd.setOnlyIfLeader(true);
final Slice.State state = slice.getState();
if (state != Slice.State.CONSTRUCTION && state != Slice.State.RECOVERY && state != Slice.State.RECOVERY_FAILED) {
prepCmd.setOnlyIfLeaderActive(true);
}
final int maxTries = 30;
for (int numTries = 0; numTries < maxTries; numTries++) {
try {
sendPrepRecoveryCmd(leaderBaseUrl, prepCmd);
break;
} catch (ExecutionException e) {
if (e.getCause() instanceof SolrServerException) {
SolrServerException solrException = (SolrServerException) e.getCause();
if (solrException.getRootCause() instanceof SocketTimeoutException && numTries < maxTries) {
LOG.warn("Socket timeout on send prep recovery cmd, retrying.. ");
continue;
}
}
throw e;
}
}
}
use of org.apache.solr.client.solrj.request.CoreAdminRequest.WaitForState in project lucene-solr by apache.
the class ZkController method waitForLeaderToSeeDownState.
private ZkCoreNodeProps waitForLeaderToSeeDownState(CoreDescriptor descriptor, final String coreZkNodeName) {
// try not to wait too long here - if we are waiting too long, we should probably
// move along and join the election
CloudDescriptor cloudDesc = descriptor.getCloudDescriptor();
String collection = cloudDesc.getCollectionName();
String shard = cloudDesc.getShardId();
ZkCoreNodeProps leaderProps = null;
int retries = 2;
for (int i = 0; i < retries; i++) {
try {
if (isClosed) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "We have been closed");
}
// go straight to zk, not the cloud state - we want current info
leaderProps = getLeaderProps(collection, shard, 5000);
break;
} catch (Exception e) {
SolrException.log(log, "There was a problem finding the leader in zk", e);
try {
Thread.sleep(2000);
} catch (InterruptedException e1) {
Thread.currentThread().interrupt();
}
if (i == retries - 1) {
throw new SolrException(ErrorCode.SERVER_ERROR, "There was a problem finding the leader in zk");
}
}
}
String leaderBaseUrl = leaderProps.getBaseUrl();
String leaderCoreName = leaderProps.getCoreName();
String myCoreNodeName = cloudDesc.getCoreNodeName();
String myCoreName = descriptor.getName();
String ourUrl = ZkCoreNodeProps.getCoreUrl(getBaseUrl(), myCoreName);
boolean isLeader = leaderProps.getCoreUrl().equals(ourUrl);
if (!isLeader && !SKIP_AUTO_RECOVERY) {
// detect if this core is in leader-initiated recovery and if so,
// then we don't need the leader to wait on seeing the down state
Replica.State lirState = null;
try {
lirState = getLeaderInitiatedRecoveryState(collection, shard, myCoreNodeName);
} catch (Exception exc) {
log.error("Failed to determine if replica " + myCoreNodeName + " is in leader-initiated recovery due to: " + exc, exc);
}
if (lirState != null) {
log.debug("Replica " + myCoreNodeName + " is already in leader-initiated recovery, so not waiting for leader to see down state.");
} else {
log.info("Replica " + myCoreNodeName + " NOT in leader-initiated recovery, need to wait for leader to see down state.");
try (HttpSolrClient client = new Builder(leaderBaseUrl).build()) {
client.setConnectionTimeout(15000);
client.setSoTimeout(120000);
WaitForState prepCmd = new WaitForState();
prepCmd.setCoreName(leaderCoreName);
prepCmd.setNodeName(getNodeName());
prepCmd.setCoreNodeName(coreZkNodeName);
prepCmd.setState(Replica.State.DOWN);
// let's retry a couple times - perhaps the leader just went down,
// or perhaps he is just not quite ready for us yet
retries = 2;
for (int i = 0; i < retries; i++) {
if (isClosed) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "We have been closed");
}
try {
client.request(prepCmd);
break;
} catch (Exception e) {
// if the core container is shutdown, don't wait
if (cc.isShutDown()) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Core container is shutdown.");
}
Throwable rootCause = SolrException.getRootCause(e);
if (rootCause instanceof IOException) {
// if there was a communication error talking to the leader, see if the leader is even alive
if (!zkStateReader.getClusterState().liveNodesContain(leaderProps.getNodeName())) {
throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Node " + leaderProps.getNodeName() + " hosting leader for " + shard + " in " + collection + " is not live!");
}
}
SolrException.log(log, "There was a problem making a request to the leader", e);
try {
Thread.sleep(2000);
} catch (InterruptedException e1) {
Thread.currentThread().interrupt();
}
if (i == retries - 1) {
throw new SolrException(ErrorCode.SERVER_ERROR, "There was a problem making a request to the leader");
}
}
}
} catch (IOException e) {
SolrException.log(log, "Error closing HttpSolrClient", e);
}
}
}
return leaderProps;
}
Aggregations