Search in sources :

Example 36 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class TextLogitStream method getShardUrls.

/**
 * Selects, for every slice of {@code this.collection}, one replica at random among those that are
 * {@link Replica.State#ACTIVE} and hosted on a live node, and returns the core URL of each pick.
 *
 * @return one core URL per slice, in the iteration order of the slices
 * @throws IOException if a slice has no active replica on a live node, or wrapping any other
 *         failure raised while the cluster state is being read
 */
protected List<String> getShardUrls() throws IOException {
    try {
        ZkStateReader zkStateReader = cloudSolrClient.getZkStateReader();
        Collection<Slice> slices = CloudSolrStream.getSlices(this.collection, zkStateReader, false);
        Set<String> liveNodes = zkStateReader.getClusterState().getLiveNodes();
        Random random = new Random();
        List<String> baseUrls = new ArrayList<>();
        for (Slice slice : slices) {
            List<Replica> candidates = new ArrayList<>();
            for (Replica replica : slice.getReplicas()) {
                if (replica.getState() == Replica.State.ACTIVE && liveNodes.contains(replica.getNodeName())) {
                    candidates.add(replica);
                }
            }
            // Without this guard an empty candidate list surfaced as a bare IndexOutOfBoundsException
            // that said nothing about which slice was unavailable.
            if (candidates.isEmpty()) {
                throw new IOException("No active replica on a live node for slice " + slice.getName() + " of collection " + this.collection);
            }
            // Only one replica is needed, so pick a random index instead of shuffling the whole list.
            Replica rep = candidates.get(random.nextInt(candidates.size()));
            baseUrls.add(new ZkCoreNodeProps(rep).getCoreUrl());
        }
        return baseUrls;
    } catch (IOException e) {
        // Already the declared exception type; do not wrap it a second time.
        throw e;
    } catch (Exception e) {
        throw new IOException(e);
    }
}
Also used : ClusterState(org.apache.solr.common.cloud.ClusterState) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) ArrayList(java.util.ArrayList) IOException(java.io.IOException) Replica(org.apache.solr.common.cloud.Replica) ZkStateReader(org.apache.solr.common.cloud.ZkStateReader) Random(java.util.Random) Slice(org.apache.solr.common.cloud.Slice)

Example 37 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class HttpPartitionTest method getHttpSolrClient.

/**
 * Creates an HTTP client whose URL is the base URL of the given replica followed by
 * {@code "/"} and the collection name.
 *
 * @param replica replica whose base URL is used
 * @param coll    collection name appended to the base URL
 * @return the client produced by the single-argument {@code getHttpSolrClient(String)} overload
 * @throws Exception if the delegate overload fails
 */
protected HttpSolrClient getHttpSolrClient(Replica replica, String coll) throws Exception {
    final String nodeBaseUrl = new ZkCoreNodeProps(replica).getBaseUrl();
    return getHttpSolrClient(nodeBaseUrl + "/" + coll);
}
Also used : ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps)

Example 38 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class LeaderElectionTest method getLeaderUrl.

/**
 * Reads the shard-leader znode for the given collection and slice and returns the leader's core URL.
 * The read is attempted up to 60 times, sleeping 500 ms after each attempt that fails because the
 * node is missing or the session has expired.
 *
 * @param collection collection name used to build the leader path
 * @param slice      slice name used to build the leader path
 * @return the core URL stored in the leader properties
 * @throws KeeperException      for ZooKeeper errors other than the two retried ones
 * @throws InterruptedException if interrupted while reading or sleeping
 * @throws RuntimeException     if all attempts are used up without reading the leader properties
 */
private String getLeaderUrl(final String collection, final String slice) throws KeeperException, InterruptedException {
    for (int attemptsLeft = 60; attemptsLeft > 0; attemptsLeft--) {
        try {
            final String leaderPath = ZkStateReader.getShardLeadersPath(collection, slice);
            final byte[] raw = zkClient.getData(leaderPath, null, null, true);
            final ZkCoreNodeProps leader = new ZkCoreNodeProps(ZkNodeProps.load(raw));
            return leader.getCoreUrl();
        } catch (NoNodeException | SessionExpiredException retryable) {
            // leader not published yet (or session lost): pause, then try again
            Thread.sleep(500);
        }
    }
    // out of attempts: dump the ZooKeeper layout to help diagnose, then fail
    zkClient.printLayoutToStdOut();
    throw new RuntimeException("Could not get leader props");
}
Also used : ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) NoNodeException(org.apache.zookeeper.KeeperException.NoNodeException) SessionExpiredException(org.apache.zookeeper.KeeperException.SessionExpiredException)

Example 39 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class LeaderInitiatedRecoveryThread method sendRecoveryCommandWithRetry.

/**
 * Sends a {@code REQUESTRECOVERY} core-admin command to the replica described by {@code nodeProps},
 * retrying up to {@code maxTries} times with a 5 second pause between attempts.
 * <p>
 * A failed attempt is retried only when the root cause is a communication error
 * ({@code ConnectException}, {@code ConnectTimeoutException}, {@code NoHttpResponseException} or
 * {@code SocketException}); any other failure ends the loop. Between attempts the loop also stops when
 * the core container is shut down, the replica's node is no longer live, this core is no longer the
 * shard leader, the leader-initiated-recovery (LIR) state is gone from ZooKeeper, or the LIR state has
 * become {@code RECOVERING}.
 * <p>
 * After the loop the replica is always removed from leader-initiated-recovery handling; if the loop
 * ended because the attempts were exhausted, an error is logged.
 * <p>
 * Relies on fields declared outside this block ({@code nodeProps}, {@code maxTries},
 * {@code zkController}, {@code coreContainer}, {@code leaderCd}, {@code collection}, {@code shardId}).
 *
 * @throws Exception declared by the signature; most failures inside the loop are caught and logged
 */
protected void sendRecoveryCommandWithRetry() throws Exception {
    int tries = 0;
    long waitBetweenTriesMs = 5000L;
    boolean continueTrying = true;
    String replicaCoreName = nodeProps.getCoreName();
    String recoveryUrl = nodeProps.getBaseUrl();
    String replicaNodeName = nodeProps.getNodeName();
    // same value as replicaCoreName; used for the request and in log messages
    String coreNeedingRecovery = nodeProps.getCoreName();
    // the cast assumes nodeProps wraps a Replica
    String replicaCoreNodeName = ((Replica) nodeProps.getNodeProps()).getName();
    String replicaUrl = nodeProps.getCoreUrl();
    log.info(getName() + " started running to send REQUESTRECOVERY command to " + replicaUrl + "; will try for a max of " + (maxTries * (waitBetweenTriesMs / 1000)) + " secs");
    RequestRecovery recoverRequestCmd = new RequestRecovery();
    recoverRequestCmd.setAction(CoreAdminAction.REQUESTRECOVERY);
    recoverRequestCmd.setCoreName(coreNeedingRecovery);
    while (continueTrying && ++tries <= maxTries) {
        if (tries > 1) {
            log.warn("Asking core={} coreNodeName={} on " + recoveryUrl + " to recover; unsuccessful after " + tries + " of " + maxTries + " attempts so far ...", coreNeedingRecovery, replicaCoreNodeName);
        } else {
            log.info("Asking core={} coreNodeName={} on " + recoveryUrl + " to recover", coreNeedingRecovery, replicaCoreNodeName);
        }
        // a fresh client is built (and closed) for every attempt
        try (HttpSolrClient client = new HttpSolrClient.Builder(recoveryUrl).build()) {
            client.setSoTimeout(60000);
            client.setConnectionTimeout(15000);
            try {
                client.request(recoverRequestCmd);
                log.info("Successfully sent " + CoreAdminAction.REQUESTRECOVERY + " command to core={} coreNodeName={} on " + recoveryUrl, coreNeedingRecovery, replicaCoreNodeName);
                // succeeded, so stop looping
                continueTrying = false;
            } catch (Exception t) {
                Throwable rootCause = SolrException.getRootCause(t);
                // only these communication-level root causes are worth another attempt
                boolean wasCommError = (rootCause instanceof ConnectException || rootCause instanceof ConnectTimeoutException || rootCause instanceof NoHttpResponseException || rootCause instanceof SocketException);
                SolrException.log(log, recoveryUrl + ": Could not tell a replica to recover", t);
                if (!wasCommError) {
                    continueTrying = false;
                }
            }
        }
        // wait a few seconds
        if (continueTrying) {
            try {
                Thread.sleep(waitBetweenTriesMs);
            } catch (InterruptedException ignoreMe) {
                // restore the interrupt flag; the checks below still run for this iteration
                Thread.currentThread().interrupt();
            }
            if (coreContainer.isShutDown()) {
                log.warn("Stop trying to send recovery command to downed replica core={} coreNodeName={} on " + replicaNodeName + " because my core container is closed.", coreNeedingRecovery, replicaCoreNodeName);
                continueTrying = false;
                break;
            }
            // see if the replica's node is still live, if not, no need to keep doing this loop
            ZkStateReader zkStateReader = zkController.getZkStateReader();
            if (!zkStateReader.getClusterState().liveNodesContain(replicaNodeName)) {
                log.warn("Node " + replicaNodeName + " hosting core " + coreNeedingRecovery + " is no longer live. No need to keep trying to tell it to recover!");
                continueTrying = false;
                break;
            }
            String leaderCoreNodeName = leaderCd.getCloudDescriptor().getCoreNodeName();
            // stop trying if I'm no longer the leader
            if (leaderCoreNodeName != null && collection != null) {
                String leaderCoreNodeNameFromZk = null;
                try {
                    leaderCoreNodeNameFromZk = zkController.getZkStateReader().getLeaderRetry(collection, shardId, 1000).getName();
                } catch (Exception exc) {
                    log.error("Failed to determine if " + leaderCoreNodeName + " is still the leader for " + collection + " " + shardId + " before starting leader-initiated recovery thread for " + replicaUrl + " due to: " + exc);
                }
                // if the lookup above failed, leaderCoreNodeNameFromZk is still null and this check stops the loop
                if (!leaderCoreNodeName.equals(leaderCoreNodeNameFromZk)) {
                    log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery + ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because " + leaderCoreNodeName + " is no longer the leader! New leader is " + leaderCoreNodeNameFromZk);
                    continueTrying = false;
                    break;
                }
                if (!leaderCd.getCloudDescriptor().isLeader()) {
                    log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery + ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because " + leaderCoreNodeName + " is no longer the leader!");
                    continueTrying = false;
                    break;
                }
            }
            // additional safeguard: check whether the replica reports itself as active
            // before acknowledging the leader initiated recovery command
            if (collection != null && shardId != null) {
                try {
                    // call out to ZooKeeper to get the leader-initiated recovery state
                    final Replica.State lirState = zkController.getLeaderInitiatedRecoveryState(collection, shardId, replicaCoreNodeName);
                    if (lirState == null) {
                        log.warn("Stop trying to send recovery command to downed replica core=" + coreNeedingRecovery + ",coreNodeName=" + replicaCoreNodeName + " on " + replicaNodeName + " because the znode no longer exists.");
                        continueTrying = false;
                        break;
                    }
                    if (lirState == Replica.State.RECOVERING) {
                        // replica has ack'd leader initiated recovery and entered the recovering state
                        // so we don't need to keep looping to send the command
                        continueTrying = false;
                        log.info("Replica " + coreNeedingRecovery + " on node " + replicaNodeName + " ack'd the leader initiated recovery state, " + "no need to keep trying to send recovery command");
                    } else {
                        // look up the current leader, then the replica props for this shard
                        // (presumably the leader's core node name is passed so it is excluded; confirm against getReplicaProps)
                        String lcnn = zkStateReader.getLeaderRetry(collection, shardId, 5000).getName();
                        List<ZkCoreNodeProps> replicaProps = zkStateReader.getReplicaProps(collection, shardId, lcnn);
                        if (replicaProps != null && replicaProps.size() > 0) {
                            for (ZkCoreNodeProps prop : replicaProps) {
                                final Replica replica = (Replica) prop.getNodeProps();
                                if (replicaCoreNodeName.equals(replica.getName())) {
                                    if (replica.getState() == Replica.State.ACTIVE) {
                                        // replica's published state is "active",
                                        // which is bad if lirState is still "down"
                                        if (lirState == Replica.State.DOWN) {
                                            // OK, so the replica thinks it is active, but it never ack'd the leader initiated recovery
                                            // so its state cannot be trusted and it needs to be told to recover again ... and we keep looping here
                                            // NOTE(review): the message logs replicaProps.get(0), which is not necessarily the matching `prop`
                                            log.warn("Replica core={} coreNodeName={} set to active but the leader thinks it should be in recovery;" + " forcing it back to down state to re-run the leader-initiated recovery process; props: " + replicaProps.get(0), coreNeedingRecovery, replicaCoreNodeName);
                                            publishDownState(replicaCoreName, replicaCoreNodeName, replicaNodeName, replicaUrl, true);
                                        }
                                    }
                                    // the matching replica was found; the remaining entries need no inspection
                                    break;
                                }
                            }
                        }
                    }
                } catch (Exception ignoreMe) {
                    log.warn("Failed to determine state of core={} coreNodeName={} due to: " + ignoreMe, coreNeedingRecovery, replicaCoreNodeName);
                    // eventually this loop will exhaust max tries and stop so we can just log this for now
                }
            }
        }
    }
    // replica is no longer in recovery on this node (may be handled on another node)
    zkController.removeReplicaFromLeaderInitiatedRecoveryHandling(replicaUrl);
    // continueTrying is still true here only when the loop ended because tries exceeded maxTries
    if (continueTrying) {
        // ugh! this means the loop timed out before the recovery command could be delivered
        // how exotic do we want to get here?
        log.error("Timed out after waiting for " + (tries * (waitBetweenTriesMs / 1000)) + " secs to send the recovery request to: " + replicaUrl + "; not much more we can do here?");
        // TODO: need to raise a JMX event to allow monitoring tools to take over from here
    }
}
Also used : NoHttpResponseException(org.apache.http.NoHttpResponseException) SocketException(java.net.SocketException) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) RequestRecovery(org.apache.solr.client.solrj.request.CoreAdminRequest.RequestRecovery) Replica(org.apache.solr.common.cloud.Replica) KeeperException(org.apache.zookeeper.KeeperException) SolrException(org.apache.solr.common.SolrException) ConnectTimeoutException(org.apache.http.conn.ConnectTimeoutException) ConnectException(java.net.ConnectException) HttpSolrClient(org.apache.solr.client.solrj.impl.HttpSolrClient) ZkStateReader(org.apache.solr.common.cloud.ZkStateReader)

Example 40 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class CdcrRequestHandler method handleCollectionCheckpointAction.

/**
 * Computes the collection-wide update checkpoint; this action is normally run on the target cluster.
 * The source cluster uses the returned checkpoint to set up the
 * {@link org.apache.solr.update.CdcrUpdateLog.CdcrLogReader} of a shard leader. <br/>
 * One {@link org.apache.solr.handler.CdcrParams.CdcrAction#SHARDCHECKPOINT} request is issued per shard
 * leader, all in parallel, and the smallest version returned is kept as the checkpoint. Taking the
 * minimum over all shards guarantees the checkpoint is never ahead of the source cluster, which could
 * otherwise happen when other shard leaders keep sending updates to the target cluster while the
 * {@link org.apache.solr.update.CdcrUpdateLog.CdcrLogReader} is being instantiated.
 * This approach is only valid when the source and target clusters have identical topologies.
 */
private void handleCollectionCheckpointAction(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException, SolrServerException {
    final ZkController controller = core.getCoreContainer().getZkController();
    try {
        controller.getZkStateReader().forceUpdateCollection(collection);
    } catch (Exception e) {
        // best effort: continue with whatever cluster state is currently known
        log.warn("Error when updating cluster state", e);
    }
    final ClusterState clusterState = controller.getClusterState();
    final Collection<Slice> activeShards = clusterState.getActiveSlices(collection);
    final ExecutorService pool = ExecutorUtil.newMDCAwareCachedThreadPool(new DefaultSolrThreadFactory("parallelCdcrExecutor"));
    long lowestVersion = Long.MAX_VALUE;
    try {
        // one checkpoint task per shard leader
        final List<Callable<Long>> tasks = new ArrayList<>();
        for (Slice shard : activeShards) {
            final ZkNodeProps leader = controller.getZkStateReader().getLeaderRetry(collection, shard.getName());
            final String leaderCoreUrl = new ZkCoreNodeProps(leader).getCoreUrl();
            tasks.add(new SliceCheckpointCallable(leaderCoreUrl, path));
        }
        // run them all and keep the smallest version reported by any shard
        for (final Future<Long> pending : pool.invokeAll(tasks)) {
            final long version = pending.get();
            lowestVersion = Math.min(lowestVersion, version);
        }
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error while requesting shard's checkpoints", e);
    } catch (ExecutionException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error while requesting shard's checkpoints", e);
    } finally {
        pool.shutdown();
    }
    rsp.add(CdcrParams.CHECKPOINT, lowestVersion);
}
Also used : ClusterState(org.apache.solr.common.cloud.ClusterState) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) ArrayList(java.util.ArrayList) DefaultSolrThreadFactory(org.apache.solr.util.DefaultSolrThreadFactory) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrException(org.apache.solr.common.SolrException) CancellationException(java.util.concurrent.CancellationException) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) Callable(java.util.concurrent.Callable) ZkController(org.apache.solr.cloud.ZkController) Slice(org.apache.solr.common.cloud.Slice) ExecutorService(java.util.concurrent.ExecutorService)

Aggregations

ZkCoreNodeProps (org.apache.solr.common.cloud.ZkCoreNodeProps) 47, Replica (org.apache.solr.common.cloud.Replica) 24, ArrayList (java.util.ArrayList) 22, Slice (org.apache.solr.common.cloud.Slice) 20, HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient) 16, SolrException (org.apache.solr.common.SolrException) 13, ClusterState (org.apache.solr.common.cloud.ClusterState) 13, IOException (java.io.IOException) 12, ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams) 12, RetryNode (org.apache.solr.update.SolrCmdDistributor.RetryNode) 12, StdNode (org.apache.solr.update.SolrCmdDistributor.StdNode) 12, Node (org.apache.solr.update.SolrCmdDistributor.Node) 11, ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps) 10, ZkStateReader (org.apache.solr.common.cloud.ZkStateReader) 10, SolrQuery (org.apache.solr.client.solrj.SolrQuery) 9, ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException) 8, KeeperException (org.apache.zookeeper.KeeperException) 8, SolrServerException (org.apache.solr.client.solrj.SolrServerException) 7, Random (java.util.Random) 6, NamedList (org.apache.solr.common.util.NamedList) 6