Use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.
The class V2HttpCall, method getDocCollection.
protected DocCollection getDocCollection(String collectionName) {
  if (!cores.isZooKeeperAware()) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Solr not running in cloud mode ");
  }
  ZkStateReader zkStateReader = cores.getZkController().getZkStateReader();
  DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(collectionName);
  if (collection == null) {
    collectionName = corename = lookupAliases(collectionName);
    collection = zkStateReader.getClusterState().getCollectionOrNull(collectionName);
  }
  return collection;
}
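The same lookup pattern can be reused outside of V2HttpCall. A minimal sketch, assuming an already-initialized ZkStateReader is passed in and skipping the alias-resolution step that getDocCollection performs; the class name, helper name, and error message are illustrative, not from the Solr source:

import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.DocCollection;
import org.apache.solr.common.cloud.ZkStateReader;

public class CollectionLookupSketch {
  // Resolve a collection name against the cached cluster state; fail fast if it is unknown.
  public static DocCollection requireCollection(ZkStateReader zkStateReader, String collectionName) {
    DocCollection collection = zkStateReader.getClusterState().getCollectionOrNull(collectionName);
    if (collection == null) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Collection not found: " + collectionName);
    }
    return collection;
  }
}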
Use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.
The class OverseerStatusCmd, method call.
@Override
@SuppressWarnings("unchecked")
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
  ZkStateReader zkStateReader = ocmh.zkStateReader;
  String leaderNode = OverseerTaskProcessor.getLeaderNode(zkStateReader.getZkClient());
  results.add("leader", leaderNode);
  Stat stat = new Stat();
  zkStateReader.getZkClient().getData("/overseer/queue", null, stat, true);
  results.add("overseer_queue_size", stat.getNumChildren());
  stat = new Stat();
  zkStateReader.getZkClient().getData("/overseer/queue-work", null, stat, true);
  results.add("overseer_work_queue_size", stat.getNumChildren());
  stat = new Stat();
  zkStateReader.getZkClient().getData("/overseer/collection-queue-work", null, stat, true);
  results.add("overseer_collection_queue_size", stat.getNumChildren());
  NamedList overseerStats = new NamedList();
  NamedList collectionStats = new NamedList();
  NamedList stateUpdateQueueStats = new NamedList();
  NamedList workQueueStats = new NamedList();
  NamedList collectionQueueStats = new NamedList();
  Overseer.Stats stats = ocmh.stats;
  for (Map.Entry<String, Overseer.Stat> entry : stats.getStats().entrySet()) {
    String key = entry.getKey();
    NamedList<Object> lst = new SimpleOrderedMap<>();
    if (key.startsWith("collection_")) {
      collectionStats.add(key.substring(11), lst);
      int successes = stats.getSuccessCount(entry.getKey());
      int errors = stats.getErrorCount(entry.getKey());
      lst.add("requests", successes);
      lst.add("errors", errors);
      List<Overseer.FailedOp> failureDetails = stats.getFailureDetails(key);
      if (failureDetails != null) {
        List<SimpleOrderedMap<Object>> failures = new ArrayList<>();
        for (Overseer.FailedOp failedOp : failureDetails) {
          SimpleOrderedMap<Object> fail = new SimpleOrderedMap<>();
          fail.add("request", failedOp.req.getProperties());
          fail.add("response", failedOp.resp.getResponse());
          failures.add(fail);
        }
        lst.add("recent_failures", failures);
      }
    } else if (key.startsWith("/overseer/queue_")) {
      stateUpdateQueueStats.add(key.substring(16), lst);
    } else if (key.startsWith("/overseer/queue-work_")) {
      workQueueStats.add(key.substring(21), lst);
    } else if (key.startsWith("/overseer/collection-queue-work_")) {
      collectionQueueStats.add(key.substring(32), lst);
    } else {
      // overseer stats
      overseerStats.add(key, lst);
      int successes = stats.getSuccessCount(entry.getKey());
      int errors = stats.getErrorCount(entry.getKey());
      lst.add("requests", successes);
      lst.add("errors", errors);
    }
    Timer timer = entry.getValue().requestTime;
    MetricUtils.addMetrics(lst, timer);
  }
  results.add("overseer_operations", overseerStats);
  results.add("collection_operations", collectionStats);
  results.add("overseer_queue", stateUpdateQueueStats);
  results.add("overseer_internal_queue", workQueueStats);
  results.add("collection_queue", collectionQueueStats);
}
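The queue-size figures above come from the Stat object that ZooKeeper fills in on getData. A minimal sketch of that idea in isolation, assuming an already-connected SolrZkClient; the class and helper names are illustrative:

import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.data.Stat;

public class QueueSizeSketch {
  // Return the number of children of a ZooKeeper node, e.g. "/overseer/queue".
  public static int childCount(SolrZkClient zkClient, String path)
      throws KeeperException, InterruptedException {
    Stat stat = new Stat();
    zkClient.getData(path, null, stat, true); // the Stat is populated as a side effect
    return stat.getNumChildren();
  }
}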
Use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.
The class ReplaceNodeCmd, method call.
@Override
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
  ZkStateReader zkStateReader = ocmh.zkStateReader;
  ocmh.checkRequired(message, "source", "target");
  String source = message.getStr("source");
  String target = message.getStr("target");
  String async = message.getStr("async");
  boolean parallel = message.getBool("parallel", false);
  ClusterState clusterState = zkStateReader.getClusterState();
  if (!clusterState.liveNodesContain(source)) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + source + " is not live");
  }
  if (!clusterState.liveNodesContain(target)) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Target Node: " + target + " is not live");
  }
  List<ZkNodeProps> sourceReplicas = getReplicasOfNode(source, clusterState);
  List<ZkNodeProps> createdReplicas = new ArrayList<>();
  AtomicBoolean anyOneFailed = new AtomicBoolean(false);
  CountDownLatch countDownLatch = new CountDownLatch(sourceReplicas.size());
  for (ZkNodeProps sourceReplica : sourceReplicas) {
    NamedList nl = new NamedList();
    log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
    ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target);
    if (async != null)
      msg.getProperties().put(ASYNC, async);
    final ZkNodeProps addedReplica = ocmh.addReplica(clusterState, msg, nl, () -> {
      countDownLatch.countDown();
      if (nl.get("failure") != null) {
        String errorString = String.format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" + " on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
        log.warn(errorString);
        // and exit
        synchronized (results) {
          results.add("failure", errorString);
          anyOneFailed.set(true);
        }
      } else {
        log.debug("Successfully created replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
      }
    });
    if (addedReplica != null) {
      createdReplicas.add(addedReplica);
    }
  }
  log.debug("Waiting for replace node action to complete");
  countDownLatch.await(5, TimeUnit.MINUTES);
  log.debug("Finished waiting for replace node action to complete");
  if (anyOneFailed.get()) {
    log.info("Failed to create some replicas. Cleaning up all replicas on target node");
    CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size());
    for (ZkNodeProps createdReplica : createdReplicas) {
      NamedList deleteResult = new NamedList();
      try {
        ocmh.deleteReplica(zkStateReader.getClusterState(), createdReplica.plus("parallel", "true"), deleteResult, () -> {
          cleanupLatch.countDown();
          if (deleteResult.get("failure") != null) {
            synchronized (results) {
              results.add("failure", "Could not cleanup, because of : " + deleteResult.get("failure"));
            }
          }
        });
      } catch (KeeperException e) {
        cleanupLatch.countDown();
        log.warn("Error deleting replica ", e);
      } catch (Exception e) {
        log.warn("Error deleting replica ", e);
        cleanupLatch.countDown();
        throw e;
      }
    }
    cleanupLatch.await(5, TimeUnit.MINUTES);
  }
  // if we have reached this far, all replicas could be recreated;
  // now clean up the replicas on the source node
  DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh, source, async);
  results.add("success", "REPLACENODE action completed successfully from : " + source + " to : " + target);
}
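The live-node checks at the top of call are a common guard when working with ZkStateReader. A minimal sketch of that guard as a standalone helper, assuming the reader comes from the Overseer (ocmh.zkStateReader in the snippet); the class and helper names are illustrative:

import org.apache.solr.common.SolrException;
import org.apache.solr.common.cloud.ClusterState;
import org.apache.solr.common.cloud.ZkStateReader;

public class LiveNodeCheckSketch {
  // Throw the same BAD_REQUEST error ReplaceNodeCmd uses when a node is not live.
  public static void assertLive(ZkStateReader zkStateReader, String nodeName) {
    ClusterState clusterState = zkStateReader.getClusterState();
    if (!clusterState.liveNodesContain(nodeName)) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
          "Node: " + nodeName + " is not live");
    }
  }
}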
Use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.
The class OverseerRoleCmd, method call.
@Override
@SuppressWarnings("unchecked")
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
  ZkStateReader zkStateReader = ocmh.zkStateReader;
  SolrZkClient zkClient = zkStateReader.getZkClient();
  Map roles = null;
  String node = message.getStr("node");
  String roleName = message.getStr("role");
  boolean nodeExists = false;
  if (nodeExists = zkClient.exists(ZkStateReader.ROLES, true)) {
    roles = (Map) Utils.fromJSON(zkClient.getData(ZkStateReader.ROLES, null, new Stat(), true));
  } else {
    roles = new LinkedHashMap(1);
  }
  List nodeList = (List) roles.get(roleName);
  if (nodeList == null)
    roles.put(roleName, nodeList = new ArrayList());
  if (ADDROLE == operation) {
    log.info("Overseer role added to {}", node);
    if (!nodeList.contains(node))
      nodeList.add(node);
  } else if (REMOVEROLE == operation) {
    log.info("Overseer role removed from {}", node);
    nodeList.remove(node);
  }
  if (nodeExists) {
    zkClient.setData(ZkStateReader.ROLES, Utils.toJSON(roles), true);
  } else {
    zkClient.create(ZkStateReader.ROLES, Utils.toJSON(roles), CreateMode.PERSISTENT, true);
  }
  // if there are too many nodes this command may time out. And most likely dedicated
  // overseers are created when there are too many nodes. So, do this operation in a separate thread
  new Thread(() -> {
    try {
      overseerPrioritizer.prioritizeOverseerNodes(ocmh.myId);
    } catch (Exception e) {
      log.error("Error in prioritizing Overseer", e);
    }
  }).start();
}
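The roles map manipulated above is stored as JSON at the ZkStateReader.ROLES path. A minimal sketch of the read path only, assuming an already-connected SolrZkClient; the class and helper names are illustrative:

import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.util.Utils;
import org.apache.zookeeper.data.Stat;

public class RolesReadSketch {
  // Read the role -> node-list map stored at ZkStateReader.ROLES, or an empty map if absent.
  @SuppressWarnings("unchecked")
  public static Map<String, Object> readRoles(SolrZkClient zkClient) throws Exception {
    if (!zkClient.exists(ZkStateReader.ROLES, true)) {
      return new LinkedHashMap<>();
    }
    byte[] data = zkClient.getData(ZkStateReader.ROLES, null, new Stat(), true);
    return (Map<String, Object>) Utils.fromJSON(data);
  }
}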
Use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.
The class RebalanceLeaders, method ensurePreferredIsLeader.
private void ensurePreferredIsLeader(NamedList<Object> results, Slice slice, Map<String, String> currentRequests) throws KeeperException, InterruptedException {
  final String inactivePreferreds = "inactivePreferreds";
  final String alreadyLeaders = "alreadyLeaders";
  String collectionName = req.getParams().get(COLLECTION_PROP);
  for (Replica replica : slice.getReplicas()) {
    // Tell the replica to become the leader if we're the preferred leader AND active AND not the leader already
    if (replica.getBool(SliceMutator.PREFERRED_LEADER_PROP, false) == false) {
      continue;
    }
    // OK, we are the preferred leader, are we the actual leader?
    if (replica.getBool(LEADER_PROP, false)) {
      // We're a preferred leader, but we're _also_ the leader, don't need to do anything.
      NamedList<Object> noops = (NamedList<Object>) results.get(alreadyLeaders);
      if (noops == null) {
        noops = new NamedList<>();
        results.add(alreadyLeaders, noops);
      }
      NamedList<Object> res = new NamedList<>();
      res.add("status", "success");
      res.add("msg", "Already leader");
      res.add("shard", slice.getName());
      res.add("nodeName", replica.getNodeName());
      noops.add(replica.getName(), res);
      // already the leader, do nothing.
      return;
    }
    // We're the preferred leader, but someone else is leader. Only become leader if we're active.
    if (replica.getState() != Replica.State.ACTIVE) {
      NamedList<Object> inactives = (NamedList<Object>) results.get(inactivePreferreds);
      if (inactives == null) {
        inactives = new NamedList<>();
        results.add(inactivePreferreds, inactives);
      }
      NamedList<Object> res = new NamedList<>();
      res.add("status", "skipped");
res.add("msg", "Node is a referredLeader, but it's inactive. Skipping");
res.add("shard", slice.getName());
res.add("nodeName", replica.getNodeName());
inactives.add(replica.getName(), res);
// Don't try to become the leader if we're not active!
return;
}
// Replica is the preferred leader but not the actual leader, do something about that.
// "Something" is
// 1> if the preferred leader isn't first in line, tell it to re-queue itself.
// 2> tell the actual leader to re-queue itself.
ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
List<String> electionNodes = OverseerTaskProcessor.getSortedElectionNodes(zkStateReader.getZkClient(), ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName()));
if (electionNodes.size() < 2) {
// if there's only one node in the queue, should already be leader and we shouldn't be here anyway.
log.info("Rebalancing leaders and slice " + slice.getName() + " has less than two elements in the leader " + "election queue, but replica " + replica.getName() + " doesn't think it's the leader.");
return;
}
// Ok, the sorting for election nodes is a bit strange. If the sequence numbers are the same, then the whole
// string is used, but that sorts nodes with the same sequence number by their session IDs from ZK.
// While this is determinate, it's not quite what we need, so re-queue nodes that aren't us and are
// watching the leader node..
String firstWatcher = electionNodes.get(1);
if (LeaderElector.getNodeName(firstWatcher).equals(replica.getName()) == false) {
makeReplicaFirstWatcher(collectionName, slice, replica);
}
String coreName = slice.getReplica(LeaderElector.getNodeName(electionNodes.get(0))).getStr(CORE_NAME_PROP);
rejoinElection(collectionName, slice, electionNodes.get(0), coreName, false);
waitForNodeChange(collectionName, slice, electionNodes.get(0));
// Done with this slice, skip the rest of the replicas.
return;
}
}
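The election-queue inspection above can be isolated into a small helper. A minimal sketch, assuming the collection and shard names are already known; the class and helper names are illustrative:

import java.util.List;
import org.apache.solr.cloud.OverseerTaskProcessor;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.zookeeper.KeeperException;

public class ElectionQueueSketch {
  // List the leader-election entries for one shard, sorted by election sequence; index 0 is the head.
  public static List<String> electionQueue(ZkStateReader zkStateReader, String collection, String shard)
      throws KeeperException, InterruptedException {
    String electPath = ZkStateReader.getShardLeadersElectPath(collection, shard);
    return OverseerTaskProcessor.getSortedElectionNodes(zkStateReader.getZkClient(), electPath);
  }
}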