Usage example of org.apache.solr.common.cloud.ZkNodeProps in the Apache lucene-solr project:
class RecoveryStrategy, method doReplicateOnlyRecovery.
/**
 * Recovery path for replicas that recover purely by replicating the leader's index
 * (no transaction log / PeerSync — see the note at the bottom; intended mainly for
 * PULL-type replicas, per the TODO below).
 *
 * Loops until recovery succeeds, the thread is interrupted, or this strategy is closed.
 * On success it restarts background replication from the leader and publishes ACTIVE;
 * on failure it retries with an exponential backoff capped at one minute.
 *
 * @param core the core being recovered
 * @throws InterruptedException if the recovery thread is interrupted while sleeping
 */
private final void doReplicateOnlyRecovery(SolrCore core) throws InterruptedException {
  boolean successfulRecovery = false;

  while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) {
    // don't use interruption or it will close channels though
    try {
      CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
      ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId());
      final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
      final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);
      String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName);
      String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
      //TODO: We can probably delete most of this code if we say this strategy can only be used for pull replicas
      boolean isLeader = leaderUrl.equals(ourUrl);
      if (isLeader && !cloudDesc.isLeader()) {
        // ZK says the leader is us but the local cloud descriptor disagrees — inconsistent state.
        throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
      }
      if (cloudDesc.isLeader()) {
        // A replicate-only (PULL) replica must never become leader.
        assert cloudDesc.getReplicaType() != Replica.Type.PULL;
        // we are now the leader - no one else must have been suitable
        LOG.warn("We have not yet recovered - but we are now the leader!");
        LOG.info("Finished recovery process.");
        zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
        return;
      }

      LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl, ourUrl);
      zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);

      if (isClosed()) {
        LOG.info("Recovery for core {} has been closed", core.getName());
        break;
      }
      LOG.info("Starting Replication Recovery.");

      try {
        // The periodic background replication must not race with the recovery replication.
        LOG.info("Stopping background replicate from leader process");
        zkController.stopReplicationFromLeader(coreName);
        replicate(zkController.getNodeName(), core, leaderprops);

        if (isClosed()) {
          LOG.info("Recovery for core {} has been closed", core.getName());
          break;
        }

        LOG.info("Replication Recovery was successful.");
        successfulRecovery = true;
      } catch (Exception e) {
        SolrException.log(LOG, "Error while trying to recover", e);
      }
    } catch (Exception e) {
      SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e);
    } finally {
      if (successfulRecovery) {
        // Fixed typo: "Restaring" -> "Restarting".
        LOG.info("Restarting background replicate from leader process");
        zkController.startReplicationFromLeader(coreName, false);
        LOG.info("Registering as Active after recovery.");
        try {
          zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
        } catch (Exception e) {
          // Fixed typo: "succesful" -> "successful".
          LOG.error("Could not publish as ACTIVE after successful recovery", e);
          // Publishing ACTIVE failed, so the recovery as a whole did not complete — retry.
          successfulRecovery = false;
        }

        if (successfulRecovery) {
          close = true;
          recoveryListener.recovered();
        }
      }
    }

    if (!successfulRecovery) {
      // Or do a fall off retry...
      try {
        if (isClosed()) {
          LOG.info("Recovery for core {} has been closed", core.getName());
          break;
        }

        // Parameterized logging instead of string concatenation.
        LOG.error("Recovery failed - trying again... ({})", retries);

        retries++;
        if (retries >= maxRetries) {
          SolrException.log(LOG, "Recovery failed - max retries exceeded (" + retries + ").");
          try {
            recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
          } catch (Exception e) {
            SolrException.log(LOG, "Could not publish that recovery failed", e);
          }
          break;
        }
      } catch (Exception e) {
        SolrException.log(LOG, "An error has occurred during recovery", e);
      }

      try {
        // Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
        // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result
        // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
        // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
        double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12;
        LOG.info("Wait [{}] seconds before trying to recover again (attempt={})", loopCount, retries);
        for (int i = 0; i < loopCount; i++) {
          if (isClosed()) {
            LOG.info("Recovery for core {} has been closed", core.getName());
            // check if someone closed us
            break;
          }
          Thread.sleep(startingRecoveryDelayMilliSeconds);
        }
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        LOG.warn("Recovery was interrupted.", e);
        close = true;
      }
    }
  }

  // We skip core.seedVersionBuckets(); We don't have a transaction log
  LOG.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
}
Usage example of org.apache.solr.common.cloud.ZkNodeProps in the Apache lucene-solr project:
class OverseerCollectionMessageHandler, method migrateStateFormat.
//TODO should we not remove in the next release ?
/**
 * Migrates the named collection to state format 2 by queueing a MIGRATESTATEFORMAT
 * command on the Overseer state-update queue, then polling cluster state (every 100 ms,
 * up to 30 seconds) until the collection reports format 2.
 *
 * @param state   the cluster state passed in by the caller (not consulted; fresh state is read)
 * @param message the command message; COLLECTION_PROP names the target collection
 * @param results receives a "success" entry once the migration is visible
 * @throws SolrException BAD_REQUEST if the collection is unknown, SERVER_ERROR on timeout
 */
private void migrateStateFormat(ClusterState state, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException {
  final String collectionName = message.getStr(COLLECTION_PROP);
  final TimeOut deadline = new TimeOut(30, TimeUnit.SECONDS);
  boolean migrationQueued = false;
  // wait for a while until the state format changes
  while (!deadline.hasTimedOut()) {
    final DocCollection coll = zkStateReader.getClusterState().getCollection(collectionName);
    if (coll == null) {
      throw new SolrException(ErrorCode.BAD_REQUEST, "Collection: " + collectionName + " not found");
    }
    if (coll.getStateFormat() == 2) {
      // Migration is visible — done.
      results.add("success", new SimpleOrderedMap<>());
      return;
    }
    if (!migrationQueued) {
      // Actually queue the migration command, once, on the first pass.
      migrationQueued = true;
      final ZkNodeProps cmd = new ZkNodeProps(Overseer.QUEUE_OPERATION, MIGRATESTATEFORMAT.toLower(), COLLECTION_PROP, collectionName);
      Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(cmd));
    }
    Thread.sleep(100);
  }
  throw new SolrException(ErrorCode.SERVER_ERROR, "Could not migrate state format for collection: " + collectionName);
}
Usage example of org.apache.solr.common.cloud.ZkNodeProps in the Apache lucene-solr project:
class OverseerNodePrioritizer, method prioritizeOverseerNodes.
/**
 * Ensures a designated overseer node (from the /roles "overseer" list in ZK) is next in
 * line to become Overseer: if the current leader is not a designate, the first live
 * designate is asked to rejoin the election at the head, the current second-in-line is
 * asked to rejoin behind it, and the current leader is told to QUIT so the designate
 * can take over. No-op if there are no designates, the leader already is one, or fewer
 * than two election nodes exist.
 *
 * @param overseerId id of the current overseer (used for logging only)
 */
public synchronized void prioritizeOverseerNodes(String overseerId) throws KeeperException, InterruptedException {
  SolrZkClient zk = zkStateReader.getZkClient();
  if (!zk.exists(ZkStateReader.ROLES, true)) {
    return;
  }
  // Parameterized wildcard types instead of raw Map/List (same runtime behavior).
  Map<?, ?> m = (Map<?, ?>) Utils.fromJSON(zk.getData(ZkStateReader.ROLES, null, new Stat(), true));
  List<?> overseerDesignates = (List<?>) m.get("overseer");
  if (overseerDesignates == null || overseerDesignates.isEmpty()) {
    return;
  }
  String ldr = OverseerTaskProcessor.getLeaderNode(zk);
  if (overseerDesignates.contains(ldr)) {
    // Current leader is already a designate — nothing to do.
    return;
  }
  log.info("prioritizing overseer nodes at {} overseer designates are {}", overseerId, overseerDesignates);
  List<String> electionNodes = OverseerTaskProcessor.getSortedElectionNodes(zk, Overseer.OVERSEER_ELECT + LeaderElector.ELECTION_NODE);
  if (electionNodes.size() < 2) {
    return;
  }
  log.info("sorted nodes {}", electionNodes);

  // Find the first election node that belongs to a designate.
  String designateNodeId = null;
  for (String electionNode : electionNodes) {
    if (overseerDesignates.contains(LeaderElector.getNodeName(electionNode))) {
      designateNodeId = electionNode;
      break;
    }
  }
  if (designateNodeId == null) {
    log.warn("No live overseer designate ");
    return;
  }
  if (!designateNodeId.equals(electionNodes.get(1))) {
    //checking if it is already at no:1
    log.info("asking node {} to come join election at head", designateNodeId);
    //ask designate to come first
    invokeOverseerOp(designateNodeId, "rejoinAtHead");
    log.info("asking the old first in line {} to rejoin election ", electionNodes.get(1));
    //ask second inline to go behind
    invokeOverseerOp(electionNodes.get(1), "rejoin");
  }
  //now ask the current leader to QUIT , so that the designate can takeover
  Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.QUIT.toLower(), ID, OverseerTaskProcessor.getLeaderId(zkStateReader.getZkClient()))));
}
Usage example of org.apache.solr.common.cloud.ZkNodeProps in the Apache lucene-solr project:
class OverseerCollectionMessageHandler, method modifyCollection.
/**
 * Applies property modifications to a collection: validates/links the config set if
 * COLL_CONF is present (reloading the collection), queues the modify message on the
 * Overseer state-update queue, then polls (every 100 ms, up to 30 seconds) until every
 * updated property is visible in cluster state.
 *
 * @param clusterState cluster state passed by the caller (fresh state is read while polling)
 * @param message      the modify message; COLLECTION_PROP names the target collection
 * @param results      receives reload results when a config change triggers a reload
 * @throws SolrException SERVER_ERROR if the changes are not visible before the timeout
 */
private void modifyCollection(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException {
  final String collectionName = message.getStr(ZkStateReader.COLLECTION_PROP);
  //the rest of the processing is based on writing cluster state properties
  //remove the property here to avoid any errors down the pipeline due to this property appearing
  String configName = (String) message.getProperties().remove(COLL_CONF);

  if (configName != null) {
    validateConfigOrThrowSolrException(configName);
    boolean isLegacyCloud = Overseer.isLegacy(zkStateReader);
    createConfNode(configName, collectionName, isLegacyCloud);
    reloadCollection(null, new ZkNodeProps(NAME, collectionName), results);
  }

  overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message));

  TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
  boolean areChangesVisible = true;
  while (!timeout.hasTimedOut()) {
    DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
    areChangesVisible = true;
    for (Map.Entry<String, Object> updateEntry : message.getProperties().entrySet()) {
      String updateKey = updateEntry.getKey();
      if (updateKey.equals(ZkStateReader.COLLECTION_PROP) || updateKey.equals(Overseer.QUEUE_OPERATION)) {
        continue; // bookkeeping keys, not collection properties
      }
      Object visibleValue = collection.get(updateKey);
      // Null-safe comparison: the original collection.get(updateKey).equals(...) threw
      // NPE for a newly added property that is not yet visible (get returns null), and
      // for a property deletion (null update value). Treat "both null" as visible.
      boolean matches = visibleValue == null
          ? updateEntry.getValue() == null
          : visibleValue.equals(updateEntry.getValue());
      if (!matches) {
        areChangesVisible = false;
        break;
      }
    }
    if (areChangesVisible) {
      break;
    }
    Thread.sleep(100);
  }
  if (!areChangesVisible) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not modify collection " + message);
  }
}
Usage example of org.apache.solr.common.cloud.ZkNodeProps in the Apache lucene-solr project:
class OverseerCollectionMessageHandler, method processReplicaAddPropertyCommand.
@SuppressWarnings("unchecked")
/**
 * Forwards an ADDREPLICAPROP command to the Overseer state-update queue after checking
 * that the message carries collection, shard, replica, property name and property value.
 *
 * @param clusterState current cluster state (unused here; queue processing reads fresh state)
 * @param message      the incoming command message; its properties are copied onto the queued op
 * @param results      response accumulator (not written to by this method)
 */
private void processReplicaAddPropertyCommand(ClusterState clusterState, ZkNodeProps message, NamedList results) throws Exception {
  checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP, PROPERTY_PROP, PROPERTY_VALUE_PROP);
  final SolrZkClient zkClient = zkStateReader.getZkClient();
  final DistributedQueue stateQueue = Overseer.getStateUpdateQueue(zkClient);
  // Seed the op name first; the message's own properties are layered on top.
  final Map<String, Object> props = new HashMap<>();
  props.put(Overseer.QUEUE_OPERATION, ADDREPLICAPROP.toLower());
  props.putAll(message.getProperties());
  stateQueue.offer(Utils.toJSON(new ZkNodeProps(props)));
}
Aggregations