Use of org.apache.solr.client.solrj.request.CoreAdminRequest.RequestRecovery in project lucene-solr by apache.
From the class SyncStrategy, method requestRecovery:
private void requestRecovery(final ZkNodeProps leaderProps, final String baseUrl, final String coreName) throws SolrServerException, IOException {
  Thread thread = new Thread() {
    {
      setDaemon(true);
    }

    @Override
    public void run() {
      RequestRecovery recoverRequestCmd = new RequestRecovery();
      recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
      recoverRequestCmd.setCoreName(coreName);
      try (HttpSolrClient client = new HttpSolrClient.Builder(baseUrl).withHttpClient(SyncStrategy.this.client).build()) {
        client.setConnectionTimeout(30000);
        client.setSoTimeout(120000);
        client.request(recoverRequestCmd);
      } catch (Throwable t) {
        SolrException.log(log, ZkCoreNodeProps.getCoreUrl(leaderProps) + ": Could not tell a replica to recover", t);
        if (t instanceof Error) {
          throw (Error) t;
        }
      }
    }
  };
  updateExecutor.execute(thread);
}
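For orientation, here is a minimal standalone sketch of issuing the same REQUESTRECOVERY core-admin command with SolrJ. The base URL http://localhost:8983/solr, the core name, and the class name are placeholders chosen for illustration, not values taken from SyncStrategy; the timeouts mirror the ones used above.

import java.io.IOException;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.HttpSolrClient;
import org.apache.solr.client.solrj.request.CoreAdminRequest.RequestRecovery;
import org.apache.solr.common.params.CoreAdminParams.CoreAdminAction;

public class RequestRecoveryExample {

  public static void main(String[] args) throws SolrServerException, IOException {
    // Build the same core-admin command used in SyncStrategy above.
    RequestRecovery recoverRequestCmd = new RequestRecovery();
    recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
    recoverRequestCmd.setCoreName("collection1_shard1_replica1"); // placeholder core name

    // Placeholder base URL; point this at the node hosting the core that should recover.
    try (HttpSolrClient client = new HttpSolrClient.Builder("http://localhost:8983/solr").build()) {
      client.setConnectionTimeout(30000); // same timeouts as SyncStrategy
      client.setSoTimeout(120000);
      client.request(recoverRequestCmd);  // sends /admin/cores?action=REQUESTRECOVERY&core=...
    }
  }
}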
Use of org.apache.solr.client.solrj.request.CoreAdminRequest.RequestRecovery in project lucene-solr by apache.
From the class TestCoreAdmin, method testInvalidRequestRecovery:
@Test
public void testInvalidRequestRecovery() throws SolrServerException, IOException {
  RequestRecovery recoverRequestCmd = new RequestRecovery();
  recoverRequestCmd.setCoreName("non_existing_core");
  expectThrows(SolrException.class, () -> recoverRequestCmd.process(getSolrAdmin()));
}
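For context, expectThrows is inherited from LuceneTestCase: it runs the lambda, asserts that a SolrException is thrown (the named core does not exist), and returns the caught exception. A hypothetical variant that keeps a reference to that exception is sketched below; the test name is invented and nothing is assumed about the server's exact error message.

@Test
public void testInvalidRequestRecoveryReturnsException() {
  RequestRecovery recoverRequestCmd = new RequestRecovery();
  recoverRequestCmd.setCoreName("non_existing_core");
  // expectThrows returns the caught exception, so the test could also inspect it,
  // e.g. via thrown.code() or thrown.getMessage(); the message text is server-defined.
  SolrException thrown = expectThrows(SolrException.class,
      () -> recoverRequestCmd.process(getSolrAdmin()));
  assertNotNull(thrown);
}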
Use of org.apache.solr.client.solrj.request.CoreAdminRequest.RequestRecovery in project lucene-solr by apache.
From the class LeaderInitiatedRecoveryThread, method sendRecoveryCommandWithRetry:
protected void sendRecoveryCommandWithRetry() throws Exception {
  int tries = 0;
  long waitBetweenTriesMs = 5000L;
  boolean continueTrying = true;
  String replicaCoreName = nodeProps.getCoreName();
  String recoveryUrl = nodeProps.getBaseUrl();
  String replicaNodeName = nodeProps.getNodeName();
  String coreNeedingRecovery = nodeProps.getCoreName();
  String replicaCoreNodeName = ((Replica) nodeProps.getNodeProps()).getName();
  String replicaUrl = nodeProps.getCoreUrl();
  log.info(getName() + " started running to send REQUESTRECOVERY command to " + replicaUrl + "; will try for a max of " + (maxTries * (waitBetweenTriesMs / 1000)) + " secs");
  RequestRecovery recoverRequestCmd = new RequestRecovery();
  recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
  recoverRequestCmd.setCoreName(coreNeedingRecovery);
  while (continueTrying && ++tries <= maxTries) {
    if (tries > 1) {
      log.warn("Asking core={} coreNodeName={} on " + recoveryUrl + " to recover; unsuccessful after " + tries + " of " + maxTries + " attempts so far ...", coreNeedingRecovery, replicaCoreNodeName);
    } else {
      log.info("Asking core={} coreNodeName={} on " + recoveryUrl + " to recover", coreNeedingRecovery, replicaCoreNodeName);
    }
    try (HttpSolrClient client = new HttpSolrClient.Builder(recoveryUrl).build()) {
      client.setSoTimeout(60000);
      client.setConnectionTimeout(15000);
      try {
        client.request(recoverRequestCmd);
        log.info("Successfully sent " + CoreAdminAction.REQUESTRECOVERY + " command to core={} coreNodeName={} on " + recoveryUrl, coreNeedingRecovery, replicaCoreNodeName);
        // succeeded, so stop looping
        continueTrying = false;
      } catch (Exception t) {
        Throwable rootCause = SolrException.getRootCause(t);
        boolean wasCommError = (rootCause instanceof ConnectException || rootCause instanceof ConnectTimeoutException || rootCause instanceof NoHttpResponseException || rootCause instanceof SocketException);
        SolrException.log(log, recoveryUrl + ": Could not tell a replica to recover", t);
        if (!wasCommError) {
          continueTrying = false;
        }
      }
    }
    // wait a few seconds
    if (continueTrying) {
      try {
        Thread.sleep(waitBetweenTriesMs);
      } catch (InterruptedException ignoreMe) {
        Thread.currentThread().interrupt();
      }
      if (coreContainer.isShutDown()) {
        log.warn("Stop trying to send recovery command to downed replica core={} coreNodeName={} on " + replicaNodeName + " because my core container is closed.", coreNeedingRecovery, replicaCoreNodeName);
        continueTrying = false;
        break;
      }
      // see if the replica's node is still live; if not, no need to keep doing this loop
      ZkStateReader zkStateReader = zkController.getZkStateReader();
      if (!zkStateReader.getClusterState().liveNodesContain(replicaNodeName)) {
        log.warn("Node " + replicaNodeName + " hosting core " + coreNeedingRecovery + " is no longer live. No need to keep trying to tell it to recover!");
        continueTrying = false;
        break;
      }
      String leaderCoreNodeName = leaderCd.getCloudDescriptor().getCoreNodeName();
      // stop trying if I'm no longer the leader
      if (leaderCoreNodeName != null && collection != null) {
        String leaderCoreNodeNameFromZk = null;
        try {
          leaderCoreNodeNameFromZk = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 1000).getName();
        } catch (Exception exc) {
          log.error("Failed to determine if " + leaderCoreNodeName + " is still the leader for " + collection + " " + shardId + " before starting leader-initiated recovery thread for " + replicaUrl + " due to: " + exc);
        }
        if (!leaderCoreNodeName.equals(leaderCoreNodeNameFromZk)) {
          log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery + ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because " + leaderCoreNodeName + " is no longer the leader! New leader is " + leaderCoreNodeNameFromZk);
          continueTrying = false;
          break;
        }
        if (!leaderCd.getCloudDescriptor().isLeader()) {
          log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery + ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because " + leaderCoreNodeName + " is no longer the leader!");
          continueTrying = false;
          break;
        }
      }
      // additional safeguard against the replica claiming to be active before acknowledging the leader-initiated recovery command
      if (collection != null && shardId != null) {
        try {
          // call out to ZooKeeper to get the leader-initiated recovery state
          final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId, replicaCoreNodeName);
          if (lirState == null) {
            log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery + ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because the znode no longer exists.");
            continueTrying = false;
            break;
          }
          if (lirState == Replica.State.RECOVERING) {
            // replica has ack'd leader-initiated recovery and entered the recovering state,
            // so we don't need to keep looping to send the command
            continueTrying = false;
            log.info("Replica " + coreNeedingRecovery + " on node " + replicaNodeName + " ack'd the leader initiated recovery state, " + "no need to keep trying to send recovery command");
          } else {
            String lcnn = zkStateReader.getLeaderRetry(collection, shardId, 5000).getName();
            List<ZkCoreNodeProps> replicaProps = zkStateReader.getReplicaProps(collection, shardId, lcnn);
            if (replicaProps != null && replicaProps.size() > 0) {
              for (ZkCoreNodeProps prop : replicaProps) {
                final Replica replica = (Replica) prop.getNodeProps();
                if (replicaCoreNodeName.equals(replica.getName())) {
                  if (replica.getState() == Replica.State.ACTIVE) {
                    // the replica published its state as "active", which is bad if lirState is still "down"
                    if (lirState == Replica.State.DOWN) {
                      // OK, so the replica thinks it is active, but it never ack'd the leader-initiated recovery,
                      // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
                      log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;" + " forcing it back to down state to re-run the leader-initiated recovery process; props: " + replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
                      publishDownState(replicaCoreName, replicaCoreNodeName, replicaNodeName, replicaUrl, true);
                    }
                  }
                  break;
                }
              }
            }
          }
        } catch (Exception ignoreMe) {
          log.warn("Failed to determine state of core={} coreNodeName={} due to: " + ignoreMe, coreNeedingRecovery, replicaCoreNodeName);
          // eventually this loop will exhaust max tries and stop, so we can just log this for now
        }
      }
    }
  }
  // replica is no longer in recovery on this node (may be handled on another node)
  zkController.removeReplicaFromLeaderInitiatedRecoveryHandling(replicaUrl);
  if (continueTrying) {
    // ugh! this means the loop timed out before the recovery command could be delivered
    // how exotic do we want to get here?
    log.error("Timed out after waiting for " + (tries * (waitBetweenTriesMs / 1000)) + " secs to send the recovery request to: " + replicaUrl + "; not much more we can do here?");
    // TODO: need to raise a JMX event to allow monitoring tools to take over from here
  }
}
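The retry policy above reduces to: retry only when the failure was a communication error, pause a fixed interval between attempts, and re-check the preconditions (container still open, replica node live, this node still the leader, LIR znode state) before each retry. The following stripped-down sketch isolates that skeleton; the sendCommand and shouldKeepTrying parameters are hypothetical stand-ins for the Solr-specific call and precondition checks, and only two of the four communication-error types are shown.

import java.net.ConnectException;
import java.net.SocketException;
import java.util.concurrent.Callable;
import java.util.function.BooleanSupplier;

import org.apache.solr.common.SolrException;

/** Stripped-down sketch of the retry skeleton used above; not the Solr implementation. */
public final class RetrySketch {

  static void sendWithRetry(Callable<?> sendCommand, BooleanSupplier shouldKeepTrying,
      int maxTries, long waitBetweenTriesMs) throws InterruptedException {
    int tries = 0;
    boolean continueTrying = true;
    while (continueTrying && ++tries <= maxTries) {
      try {
        sendCommand.call();                       // e.g. client.request(recoverRequestCmd)
        continueTrying = false;                   // delivered: stop looping
      } catch (Exception e) {
        Throwable root = SolrException.getRootCause(e);
        boolean commError = root instanceof ConnectException || root instanceof SocketException;
        if (!commError) {
          continueTrying = false;                 // non-communication failure: give up immediately
        }
      }
      if (continueTrying) {
        Thread.sleep(waitBetweenTriesMs);         // fixed pause between attempts
        if (!shouldKeepTrying.getAsBoolean()) {   // leadership / liveness / LIR-state checks
          break;
        }
      }
    }
  }
}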