use of org.apache.zookeeper.KeeperException.NoNodeException in project lucene-solr by apache.
the class OverseerElectionContext method startLeaderInitiatedRecoveryOnReplicas.
private void startLeaderInitiatedRecoveryOnReplicas(String coreName) throws Exception {
try (SolrCore core = cc.getCore(coreName)) {
CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
String coll = cloudDesc.getCollectionName();
String shardId = cloudDesc.getShardId();
String coreNodeName = cloudDesc.getCoreNodeName();
if (coll == null || shardId == null) {
log.error("Cannot start leader-initiated recovery on new leader (core=" + coreName + ",coreNodeName=" + coreNodeName + ") because collection and/or shard is null!");
return;
}
String znodePath = zkController.getLeaderInitiatedRecoveryZnodePath(coll, shardId);
List<String> replicas = null;
try {
replicas = zkClient.getChildren(znodePath, null, false);
} catch (NoNodeException nne) {
// this can be ignored
}
if (replicas != null && replicas.size() > 0) {
for (String replicaCoreNodeName : replicas) {
if (coreNodeName.equals(replicaCoreNodeName))
// added safe-guard so we don't mark this core as down
continue;
final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(coll, shardId, replicaCoreNodeName);
if (lirState == Replica.State.DOWN || lirState == Replica.State.RECOVERY_FAILED) {
log.info("After core={} coreNodeName={} was elected leader, a replica coreNodeName={} was found in state: " + lirState.toString() + " and needing recovery.", coreName, coreNodeName, replicaCoreNodeName);
List<ZkCoreNodeProps> replicaProps = zkController.getZkStateReader().getReplicaProps(collection, shardId, coreNodeName);
if (replicaProps != null && replicaProps.size() > 0) {
ZkCoreNodeProps coreNodeProps = null;
for (ZkCoreNodeProps p : replicaProps) {
if (((Replica) p.getNodeProps()).getName().equals(replicaCoreNodeName)) {
coreNodeProps = p;
break;
}
}
zkController.ensureReplicaInLeaderInitiatedRecovery(cc, collection, shardId, coreNodeProps, core.getCoreDescriptor(), false);
}
}
}
}
}
// core gets closed automagically
}
use of org.apache.zookeeper.KeeperException.NoNodeException in project lucene-solr by apache.
the class ZkController method registerAllCoresAsDown.
private void registerAllCoresAsDown(final CurrentCoreDescriptorProvider registerOnReconnect, boolean updateLastPublished) {
List<CoreDescriptor> descriptors = registerOnReconnect.getCurrentDescriptors();
if (isClosed)
return;
if (descriptors != null) {
// before registering as live, make sure everyone is in a
// down state
publishNodeAsDown(getNodeName());
for (CoreDescriptor descriptor : descriptors) {
// if it looks like we are going to be the leader, we don't
// want to wait for the following stuff
CloudDescriptor cloudDesc = descriptor.getCloudDescriptor();
String collection = cloudDesc.getCollectionName();
String slice = cloudDesc.getShardId();
try {
int children = zkStateReader.getZkClient().getChildren(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection + "/leader_elect/" + slice + "/election", null, true).size();
if (children == 0) {
log.debug("looks like we are going to be the leader for collection {} shard {}", collection, slice);
continue;
}
} catch (NoNodeException e) {
log.debug("looks like we are going to be the leader for collection {} shard {}", collection, slice);
continue;
} catch (InterruptedException e2) {
Thread.currentThread().interrupt();
} catch (KeeperException e) {
log.warn("", e);
Thread.currentThread().interrupt();
}
final String coreZkNodeName = descriptor.getCloudDescriptor().getCoreNodeName();
try {
log.debug("calling waitForLeaderToSeeDownState for coreZkNodeName={} collection={} shard={}", new Object[] { coreZkNodeName, collection, slice });
waitForLeaderToSeeDownState(descriptor, coreZkNodeName);
} catch (Exception e) {
SolrException.log(log, "", e);
if (isClosed) {
return;
}
}
}
}
}
use of org.apache.zookeeper.KeeperException.NoNodeException in project lucene-solr by apache.
the class ZkController method getLeaderInitiatedRecoveryStateObject.
public Map<String, Object> getLeaderInitiatedRecoveryStateObject(String collection, String shardId, String coreNodeName) {
if (collection == null || shardId == null || coreNodeName == null)
// if we don't have complete data about a core in cloud mode, return null
return null;
String znodePath = getLeaderInitiatedRecoveryZnodePath(collection, shardId, coreNodeName);
byte[] stateData = null;
try {
stateData = zkClient.getData(znodePath, null, new Stat(), false);
} catch (NoNodeException ignoreMe) {
// safe to ignore as this znode will only exist if the leader initiated recovery
} catch (ConnectionLossException | SessionExpiredException cle) {
// sort of safe to ignore ??? Usually these are seen when the core is going down
// or there are bigger issues to deal with than reading this znode
log.warn("Unable to read " + znodePath + " due to: " + cle);
} catch (Exception exc) {
log.error("Failed to read data from znode " + znodePath + " due to: " + exc);
if (exc instanceof SolrException) {
throw (SolrException) exc;
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to read data from znodePath: " + znodePath, exc);
}
}
Map<String, Object> stateObj = null;
if (stateData != null && stateData.length > 0) {
// TODO: Remove later ... this is for upgrading from 4.8.x to 4.10.3 (see: SOLR-6732)
if (stateData[0] == (byte) '{') {
Object parsedJson = Utils.fromJSON(stateData);
if (parsedJson instanceof Map) {
stateObj = (Map<String, Object>) parsedJson;
} else {
throw new SolrException(ErrorCode.SERVER_ERROR, "Leader-initiated recovery state data is invalid! " + parsedJson);
}
} else {
// old format still in ZK
stateObj = Utils.makeMap("state", new String(stateData, StandardCharsets.UTF_8));
}
}
return stateObj;
}
use of org.apache.zookeeper.KeeperException.NoNodeException in project lucene-solr by apache.
the class ZkController method linkConfSet.
public static void linkConfSet(SolrZkClient zkClient, String collection, String confSetName) throws KeeperException, InterruptedException {
String path = ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection;
log.debug("Load collection config from:" + path);
byte[] data;
try {
data = zkClient.getData(path, null, null, true);
} catch (NoNodeException e) {
// if there is no node, we will try and create it
// first try to make in case we are pre configuring
ZkNodeProps props = new ZkNodeProps(CONFIGNAME_PROP, confSetName);
try {
zkClient.makePath(path, Utils.toJSON(props), CreateMode.PERSISTENT, null, true);
} catch (KeeperException e2) {
// it's okay if the node already exists
if (e2.code() != KeeperException.Code.NODEEXISTS) {
throw e;
}
// if we fail creating, setdata
// TODO: we should consider using version
zkClient.setData(path, Utils.toJSON(props), true);
}
return;
}
// we found existing data, let's update it
ZkNodeProps props = null;
if (data != null) {
props = ZkNodeProps.load(data);
Map<String, Object> newProps = new HashMap<>();
newProps.putAll(props.getProperties());
newProps.put(CONFIGNAME_PROP, confSetName);
props = new ZkNodeProps(newProps);
} else {
props = new ZkNodeProps(CONFIGNAME_PROP, confSetName);
}
// TODO: we should consider using version
zkClient.setData(path, Utils.toJSON(props), true);
}
use of org.apache.zookeeper.KeeperException.NoNodeException in project lucene-solr by apache.
the class ZkController method checkOverseerDesignate.
public void checkOverseerDesignate() {
try {
byte[] data = zkClient.getData(ZkStateReader.ROLES, null, new Stat(), true);
if (data == null)
return;
Map roles = (Map) Utils.fromJSON(data);
if (roles == null)
return;
List nodeList = (List) roles.get("overseer");
if (nodeList == null)
return;
if (nodeList.contains(getNodeName())) {
ZkNodeProps props = new ZkNodeProps(Overseer.QUEUE_OPERATION, CollectionParams.CollectionAction.ADDROLE.toString().toLowerCase(Locale.ROOT), "node", getNodeName(), "role", "overseer");
log.info("Going to add role {} ", props);
getOverseerCollectionQueue().offer(Utils.toJSON(props));
}
} catch (NoNodeException nne) {
return;
} catch (Exception e) {
log.warn("could not read the overseer designate ", e);
}
}
Aggregations