Search in sources :

Example 11 with ZkStateReader

use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.

From the class V2HttpCall, method getDocCollection:

/**
 * Resolves a collection from the current cluster state, retrying once through
 * {@code lookupAliases} when no collection is registered under the given name.
 *
 * <p>When the direct lookup misses, the value returned by {@code lookupAliases} is also
 * stored in the {@code corename} field before the second lookup is made.
 *
 * @param collectionName the collection name to look up (presumably it may also be an alias;
 *                       the alias semantics live in {@code lookupAliases} — verify there)
 * @return the matching collection, or {@code null} if neither lookup finds one
 * @throws SolrException with {@code BAD_REQUEST} if the core container is not ZooKeeper aware
 */
protected DocCollection getDocCollection(String collectionName) {
    final boolean zkAware = cores.isZooKeeperAware();
    if (!zkAware) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Solr not running in cloud mode ");
    }

    final ZkStateReader reader = cores.getZkController().getZkStateReader();

    // First attempt: the name is used as-is.
    final DocCollection direct = reader.getClusterState().getCollectionOrNull(collectionName);
    if (direct != null) {
        return direct;
    }

    // Second attempt: go through lookupAliases and remember the resolved name in corename.
    final String resolved = lookupAliases(collectionName);
    corename = resolved;
    return reader.getClusterState().getCollectionOrNull(resolved);
}
Also used : ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) DocCollection(org.apache.solr.common.cloud.DocCollection) SolrException(org.apache.solr.common.SolrException)

Example 12 with ZkStateReader

use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.

From the class OverseerStatusCmd, method call:

/**
 * Fills {@code results} with a status report for the overseer: the current leader node,
 * the child counts of the three overseer queue znodes, and per-operation statistics
 * grouped by the prefix of each statistic key.
 *
 * @param state   cluster state supplied by the caller; not read by this method
 * @param message command message supplied by the caller; not read by this method
 * @param results receives the report entries, in a fixed order
 * @throws Exception if reading from ZooKeeper fails
 */
@Override
@SuppressWarnings("unchecked")
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
    final ZkStateReader reader = ocmh.zkStateReader;

    results.add("leader", OverseerTaskProcessor.getLeaderNode(reader.getZkClient()));

    // Each row: { znode path, key under which its child count is reported }.
    final String[][] queueSizeEntries = {
        { "/overseer/queue", "overseer_queue_size" },
        { "/overseer/queue-work", "overseer_work_queue_size" },
        { "/overseer/collection-queue-work", "overseer_collection_queue_size" }
    };
    for (String[] queueSizeEntry : queueSizeEntries) {
        final Stat znodeStat = new Stat();
        reader.getZkClient().getData(queueSizeEntry[0], null, znodeStat, true);
        results.add(queueSizeEntry[1], znodeStat.getNumChildren());
    }

    final NamedList overseerOps = new NamedList();
    final NamedList collectionOps = new NamedList();
    final NamedList stateQueueOps = new NamedList();
    final NamedList workQueueOps = new NamedList();
    final NamedList collectionQueueOps = new NamedList();

    // Key prefixes that decide which group a statistic is reported under; the prefix itself
    // is stripped from the reported name.
    final String collectionPrefix = "collection_";
    final String stateQueuePrefix = "/overseer/queue_";
    final String workQueuePrefix = "/overseer/queue-work_";
    final String collectionQueuePrefix = "/overseer/collection-queue-work_";

    final Overseer.Stats stats = ocmh.stats;
    for (Map.Entry<String, Overseer.Stat> statEntry : stats.getStats().entrySet()) {
        final String name = statEntry.getKey();
        final NamedList<Object> details = new SimpleOrderedMap<>();

        // Request/error counts are reported for collection and plain overseer operations only;
        // recent failures only for collection operations.
        boolean reportCounts = false;
        boolean reportFailures = false;

        if (name.startsWith(collectionPrefix)) {
            collectionOps.add(name.substring(collectionPrefix.length()), details);
            reportCounts = true;
            reportFailures = true;
        } else if (name.startsWith(stateQueuePrefix)) {
            stateQueueOps.add(name.substring(stateQueuePrefix.length()), details);
        } else if (name.startsWith(workQueuePrefix)) {
            workQueueOps.add(name.substring(workQueuePrefix.length()), details);
        } else if (name.startsWith(collectionQueuePrefix)) {
            collectionQueueOps.add(name.substring(collectionQueuePrefix.length()), details);
        } else {
            // Anything without a known prefix is a plain overseer operation.
            overseerOps.add(name, details);
            reportCounts = true;
        }

        if (reportCounts) {
            int successCount = stats.getSuccessCount(name);
            int errorCount = stats.getErrorCount(name);
            details.add("requests", successCount);
            details.add("errors", errorCount);
        }

        if (reportFailures) {
            List<Overseer.FailedOp> failedOps = stats.getFailureDetails(name);
            if (failedOps != null) {
                List<SimpleOrderedMap<Object>> recentFailures = new ArrayList<>();
                for (Overseer.FailedOp op : failedOps) {
                    SimpleOrderedMap<Object> failure = new SimpleOrderedMap<>();
                    failure.add("request", op.req.getProperties());
                    failure.add("response", op.resp.getResponse());
                    recentFailures.add(failure);
                }
                details.add("recent_failures", recentFailures);
            }
        }

        // Timing metrics are appended for every statistic, whatever its group.
        Timer requestTimer = statEntry.getValue().requestTime;
        MetricUtils.addMetrics(details, requestTimer);
    }

    results.add("overseer_operations", overseerOps);
    results.add("collection_operations", collectionOps);
    results.add("overseer_queue", stateQueueOps);
    results.add("overseer_internal_queue", workQueueOps);
    results.add("collection_queue", collectionQueueOps);
}
Also used : NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) Stat(org.apache.zookeeper.data.Stat) Timer(com.codahale.metrics.Timer) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) Map(java.util.Map)

Example 13 with ZkStateReader

use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.

From the class ReplaceNodeCmd, method call:

/**
 * Re-creates every replica hosted on the {@code source} node on the {@code target} node and,
 * only if all of them were created, cleans up the replicas on the source node.
 *
 * <p>If any replica cannot be created, or the creations do not all finish within five minutes,
 * the replicas already created on the target are deleted again and the method returns
 * <em>without</em> touching the source node and without adding a {@code "success"} entry.
 * Failure descriptions are added to {@code results} under the key {@code "failure"}.
 *
 * @param state   cluster state handed to {@code DeleteNodeCmd.cleanupReplicas} for the final
 *                source-node cleanup
 * @param message command properties; {@code "source"} and {@code "target"} are required,
 *                {@code "async"} and {@code "parallel"} are optional
 * @param results receives {@code "failure"} entries or a final {@code "success"} entry
 * @throws SolrException with {@code BAD_REQUEST} if the source or the target node is not live
 * @throws Exception     if creating or deleting a replica fails with anything other than a
 *                       {@code KeeperException} during rollback, or the wait is interrupted
 */
@Override
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
    ZkStateReader zkStateReader = ocmh.zkStateReader;
    ocmh.checkRequired(message, "source", "target");
    String source = message.getStr("source");
    String target = message.getStr("target");
    String async = message.getStr("async");
    boolean parallel = message.getBool("parallel", false);
    ClusterState clusterState = zkStateReader.getClusterState();
    if (!clusterState.liveNodesContain(source)) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Source Node: " + source + " is not live");
    }
    if (!clusterState.liveNodesContain(target)) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Target Node: " + target + " is not live");
    }
    List<ZkNodeProps> sourceReplicas = getReplicasOfNode(source, clusterState);
    List<ZkNodeProps> createdReplicas = new ArrayList<>();
    AtomicBoolean anyOneFailed = new AtomicBoolean(false);
    // One count per source replica; each add-replica completion callback counts down once.
    CountDownLatch countDownLatch = new CountDownLatch(sourceReplicas.size());
    for (ZkNodeProps sourceReplica : sourceReplicas) {
        NamedList nl = new NamedList();
        log.info("Going to create replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
        ZkNodeProps msg = sourceReplica.plus("parallel", String.valueOf(parallel)).plus(CoreAdminParams.NODE, target);
        if (async != null) {
            msg.getProperties().put(ASYNC, async);
        }
        final ZkNodeProps addedReplica = ocmh.addReplica(clusterState, msg, nl, () -> {
            countDownLatch.countDown();
            if (nl.get("failure") != null) {
                String errorString = String.format(Locale.ROOT, "Failed to create replica for collection=%s shard=%s" + " on node=%s", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
                log.warn(errorString);
                // results is shared between the completion callbacks, hence the lock
                synchronized (results) {
                    results.add("failure", errorString);
                    anyOneFailed.set(true);
                }
            } else {
                log.debug("Successfully created replica for collection={} shard={} on node={}", sourceReplica.getStr(COLLECTION_PROP), sourceReplica.getStr(SHARD_ID_PROP), target);
            }
        });
        if (addedReplica != null) {
            createdReplicas.add(addedReplica);
        }
    }
    log.debug("Waiting for replace node action to complete");
    // await() returns false on timeout. In that case not every replica is known to exist on the
    // target yet, so the operation must be treated as failed rather than silently continued.
    if (!countDownLatch.await(5, TimeUnit.MINUTES)) {
        String timeoutMessage = "Timed out waiting for replicas to be created on node=" + target;
        log.warn(timeoutMessage);
        synchronized (results) {
            results.add("failure", timeoutMessage);
        }
        anyOneFailed.set(true);
    }
    log.debug("Finished waiting for replace node action to complete");
    if (anyOneFailed.get()) {
        log.info("Failed to create some replicas. Cleaning up all replicas on target node");
        CountDownLatch cleanupLatch = new CountDownLatch(createdReplicas.size());
        for (ZkNodeProps createdReplica : createdReplicas) {
            NamedList deleteResult = new NamedList();
            try {
                ocmh.deleteReplica(zkStateReader.getClusterState(), createdReplica.plus("parallel", "true"), deleteResult, () -> {
                    cleanupLatch.countDown();
                    if (deleteResult.get("failure") != null) {
                        synchronized (results) {
                            results.add("failure", "Could not cleanup, because of : " + deleteResult.get("failure"));
                        }
                    }
                });
            } catch (KeeperException e) {
                // Best effort: keep rolling back the remaining replicas.
                cleanupLatch.countDown();
                log.warn("Error deleting replica ", e);
            } catch (Exception e) {
                log.warn("Error deleting replica ", e);
                cleanupLatch.countDown();
                throw e;
            }
        }
        cleanupLatch.await(5, TimeUnit.MINUTES);
        // The replacement failed and was rolled back: the source node still holds the only
        // copies, so it must not be cleaned up and no success may be reported.
        return;
    }
    // Every replica was re-created on the target; now clean up the replicas on the source node.
    DeleteNodeCmd.cleanupReplicas(results, state, sourceReplicas, ocmh, source, async);
    results.add("success", "REPLACENODE action completed successfully from  : " + source + " to : " + target);
}
Also used : ClusterState(org.apache.solr.common.cloud.ClusterState) NamedList(org.apache.solr.common.util.NamedList) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) ArrayList(java.util.ArrayList) CountDownLatch(java.util.concurrent.CountDownLatch) KeeperException(org.apache.zookeeper.KeeperException) SolrException(org.apache.solr.common.SolrException) ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) SolrException(org.apache.solr.common.SolrException) KeeperException(org.apache.zookeeper.KeeperException)

Example 14 with ZkStateReader

use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.

From the class OverseerRoleCmd, method call:

/**
 * Adds a node to, or removes a node from, the list stored for a role in the ZooKeeper roles
 * znode ({@code ZkStateReader.ROLES}), then triggers overseer prioritization on a new thread.
 *
 * <p>The roles znode is created if it does not exist yet, otherwise it is overwritten with the
 * updated map. Which of the two mutations is applied depends on the {@code operation} field.
 *
 * @param state   cluster state supplied by the caller; not read by this method
 * @param message command properties; {@code "node"} and {@code "role"} are read
 * @param results not written by this method
 * @throws Exception if a ZooKeeper read or write fails
 */
@Override
@SuppressWarnings("unchecked")
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
    final SolrZkClient zkClient = ocmh.zkStateReader.getZkClient();
    final String node = message.getStr("node");
    final String roleName = message.getStr("role");

    // Load the current role assignments, or start from an empty map if none were stored yet.
    final boolean rolesNodeExists = zkClient.exists(ZkStateReader.ROLES, true);
    final Map roles = rolesNodeExists
        ? (Map) Utils.fromJSON(zkClient.getData(ZkStateReader.ROLES, null, new Stat(), true))
        : new LinkedHashMap(1);

    List nodeList = (List) roles.get(roleName);
    if (nodeList == null) {
        nodeList = new ArrayList();
        roles.put(roleName, nodeList);
    }

    if (operation == ADDROLE) {
        log.info("Overseer role added to {}", node);
        if (!nodeList.contains(node)) {
            nodeList.add(node);
        }
    } else if (operation == REMOVEROLE) {
        log.info("Overseer role removed from {}", node);
        nodeList.remove(node);
    }

    if (rolesNodeExists) {
        zkClient.setData(ZkStateReader.ROLES, Utils.toJSON(roles), true);
    } else {
        zkClient.create(ZkStateReader.ROLES, Utils.toJSON(roles), CreateMode.PERSISTENT, true);
    }

    // With many nodes this command may time out, and dedicated overseers are most likely set up
    // exactly when there are many nodes, so the prioritization runs on a separate thread.
    final Runnable prioritizeTask = () -> {
        try {
            overseerPrioritizer.prioritizeOverseerNodes(ocmh.myId);
        } catch (Exception e) {
            log.error("Error in prioritizing Overseer", e);
        }
    };
    new Thread(prioritizeTask).start();
}
Also used : ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) Stat(org.apache.zookeeper.data.Stat) ArrayList(java.util.ArrayList) NamedList(org.apache.solr.common.util.NamedList) ArrayList(java.util.ArrayList) List(java.util.List) SolrZkClient(org.apache.solr.common.cloud.SolrZkClient) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap)

Example 15 with ZkStateReader

use of org.apache.solr.common.cloud.ZkStateReader in project lucene-solr by apache.

From the class RebalanceLeaders, method ensurePreferredIsLeader:

/**
 * Finds the replica of {@code slice} flagged as preferred leader and, if it is active but not
 * currently the leader, re-queues election nodes so that it can take over leadership.
 *
 * <p>Only the first replica carrying the preferred-leader property is handled; the method
 * returns after dealing with it. Outcomes that need no action are recorded in {@code results}:
 * under {@code "alreadyLeaders"} when the replica already leads, and under
 * {@code "inactivePreferreds"} when it is not in the {@code ACTIVE} state.
 *
 * @param results         receives the no-op / skipped reports described above
 * @param slice           the shard whose replicas are inspected
 * @param currentRequests not read by this method
 * @throws KeeperException      if a ZooKeeper operation fails
 * @throws InterruptedException if the thread is interrupted while waiting
 */
private void ensurePreferredIsLeader(NamedList<Object> results, Slice slice, Map<String, String> currentRequests) throws KeeperException, InterruptedException {
    final String inactivePreferreds = "inactivePreferreds";
    final String alreadyLeaders = "alreadyLeaders";
    String collectionName = req.getParams().get(COLLECTION_PROP);

    for (Replica candidate : slice.getReplicas()) {
        // Only a replica that is the preferred leader AND active AND not yet leader gets promoted.
        if (!candidate.getBool(SliceMutator.PREFERRED_LEADER_PROP, false)) {
            continue;
        }

        // Preferred leader that already leads: report it and stop, nothing to change.
        if (candidate.getBool(LEADER_PROP, false)) {
            NamedList<Object> report = new NamedList<>();
            report.add("status", "success");
            report.add("msg", "Already leader");
            report.add("shard", slice.getName());
            report.add("nodeName", candidate.getNodeName());

            NamedList<Object> alreadyLeading = (NamedList<Object>) results.get(alreadyLeaders);
            if (alreadyLeading == null) {
                alreadyLeading = new NamedList<>();
                results.add(alreadyLeaders, alreadyLeading);
            }
            alreadyLeading.add(candidate.getName(), report);
            return;
        }

        // Preferred leader that is not active must not try to become leader: report and stop.
        if (candidate.getState() != Replica.State.ACTIVE) {
            NamedList<Object> report = new NamedList<>();
            report.add("status", "skipped");
            report.add("msg", "Node is a referredLeader, but it's inactive. Skipping");
            report.add("shard", slice.getName());
            report.add("nodeName", candidate.getNodeName());

            NamedList<Object> skipped = (NamedList<Object>) results.get(inactivePreferreds);
            if (skipped == null) {
                skipped = new NamedList<>();
                results.add(inactivePreferreds, skipped);
            }
            skipped.add(candidate.getName(), report);
            return;
        }

        // The candidate is preferred and active, but another replica leads. The remedy is:
        //   1> if the candidate isn't first in line behind the leader, make it re-queue itself;
        //   2> make the current leader re-queue itself.
        ZkStateReader zkStateReader = coreContainer.getZkController().getZkStateReader();
        String electionPath = ZkStateReader.getShardLeadersElectPath(collectionName, slice.getName());
        List<String> electionNodes = OverseerTaskProcessor.getSortedElectionNodes(zkStateReader.getZkClient(), electionPath);
        if (electionNodes.size() < 2) {
            // A single queued node should already be the leader, so we shouldn't get here at all.
            log.info("Rebalancing leaders and slice " + slice.getName() + " has less than two elements in the leader " + "election queue, but replica " + candidate.getName() + " doesn't think it's the leader.");
            return;
        }

        // Election nodes with equal sequence numbers are ordered by the whole string, i.e. by
        // their ZK session IDs. That is deterministic but not what is wanted here, so a node
        // other than the candidate that is watching the leader node gets re-queued.
        String firstWatcher = electionNodes.get(1);
        boolean candidateIsFirstWatcher = LeaderElector.getNodeName(firstWatcher).equals(candidate.getName());
        if (!candidateIsFirstWatcher) {
            makeReplicaFirstWatcher(collectionName, slice, candidate);
        }

        String leaderElectionNode = electionNodes.get(0);
        String coreName = slice.getReplica(LeaderElector.getNodeName(leaderElectionNode)).getStr(CORE_NAME_PROP);
        rejoinElection(collectionName, slice, leaderElectionNode, coreName, false);
        waitForNodeChange(collectionName, slice, leaderElectionNode);

        // This slice is done; the remaining replicas are not examined.
        return;
    }
}
Also used : ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) NamedList(org.apache.solr.common.util.NamedList) Replica(org.apache.solr.common.cloud.Replica)

Aggregations

ZkStateReader (org.apache.solr.common.cloud.ZkStateReader)133 ClusterState (org.apache.solr.common.cloud.ClusterState)58 Replica (org.apache.solr.common.cloud.Replica)48 Slice (org.apache.solr.common.cloud.Slice)48 HashMap (java.util.HashMap)34 SolrZkClient (org.apache.solr.common.cloud.SolrZkClient)33 ArrayList (java.util.ArrayList)32 DocCollection (org.apache.solr.common.cloud.DocCollection)31 Test (org.junit.Test)26 SolrException (org.apache.solr.common.SolrException)25 CloudSolrClient (org.apache.solr.client.solrj.impl.CloudSolrClient)22 ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps)20 IOException (java.io.IOException)19 Map (java.util.Map)19 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)18 KeeperException (org.apache.zookeeper.KeeperException)16 SolrQuery (org.apache.solr.client.solrj.SolrQuery)15 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)15 SolrServerException (org.apache.solr.client.solrj.SolrServerException)13 JettySolrRunner (org.apache.solr.client.solrj.embedded.JettySolrRunner)12