Search in sources :

Example 41 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class DistributedUpdateProcessor method doFinish.

// TODO: optionally fail if n replicas are not reached...
/**
 * Completes the distributed part of an update: waits for the command distributor to
 * finish, then inspects every error it collected.
 *
 * <ul>
 *   <li>Errors whose target node is a {@code RetryNode} (a forwarded request) are
 *       collected and reported to the client.</li>
 *   <li>Other errors are logged. When this request was sent in the
 *       {@code FROMLEADER} phase, is not a commit end point, and the failing node is a
 *       {@code StdNode}, the method checks that this core is still the shard leader
 *       and that the failing node is one of its replicas before asking
 *       {@code zkController} to put that replica into leader-initiated recovery.</li>
 *   <li>A failure whose {@code SolrException} metadata carries the cause
 *       {@code "LeaderChanged"} is reported to the client instead.</li>
 * </ul>
 *
 * After the loop the achieved and minimum replication factors are added to the
 * response header when a {@code replicationTracker} is present, and the tracker
 * reference is cleared.
 *
 * @throws DistributedUpdatesAsyncException if at least one error has to be reported to
 *         the client
 */
private void doFinish() {
    // TODO: if not a forward and replication req is not specified, we could
    // send in a background thread
    cmdDistrib.finish();
    List<Error> errors = cmdDistrib.getErrors();
    // TODO - we may need to tell about more than one error...
    // Errors that must be surfaced to the caller; everything else is only logged.
    List<Error> errorsForClient = new ArrayList<>(errors.size());
    for (final SolrCmdDistributor.Error error : errors) {
        if (error.req.node instanceof RetryNode) {
            // if it's a forward, any fail is a problem -
            // otherwise we assume things are fine if we got it locally
            // until we start allowing min replication param
            errorsForClient.add(error);
            continue;
        }
        // Not a forwarded request: the failure is logged here and, further down,
        // the failing replica may be asked to recover.
        if (log.isWarnEnabled()) {
            log.warn("Error sending update to " + error.req.node.getBaseUrl(), error.e);
        }
        // Since it is not a forward request, for each fail, try to tell them to
        // recover - the doc was already added locally, so it should have been
        // legit
        DistribPhase phase = DistribPhase.parseParam(error.req.uReq.getParams().get(DISTRIB_UPDATE_PARAM));
        if (phase != DistribPhase.FROMLEADER)
            // don't have non-leaders try to recover other nodes
            continue;
        // we don't want to run recovery on a node which missed a commit command
        if (error.req.uReq.getParams().get(COMMIT_END_POINT) != null)
            continue;
        final String replicaUrl = error.req.node.getUrl();
        // if the remote replica failed the request because of leader change (SOLR-6511), then fail the request
        String cause = (error.e instanceof SolrException) ? ((SolrException) error.e).getMetadata("cause") : null;
        if ("LeaderChanged".equals(cause)) {
            // let's just fail this request and let the client retry? or just call processAdd again?
            log.error("On " + cloudDesc.getCoreNodeName() + ", replica " + replicaUrl + " now thinks it is the leader! Failing the request to let the client retry! " + error.e);
            errorsForClient.add(error);
            continue;
        }
        String collection = null;
        String shardId = null;
        // Only StdNode targets carry the collection/shard needed for the recovery request.
        if (error.req.node instanceof StdNode) {
            StdNode stdNode = (StdNode) error.req.node;
            collection = stdNode.getCollection();
            shardId = stdNode.getShardId();
            // before we go setting other replicas to down, make sure we're still the leader!
            String leaderCoreNodeName = null;
            Exception getLeaderExc = null;
            Replica leaderProps = null;
            try {
                leaderProps = zkController.getZkStateReader().getLeader(collection, shardId);
                if (leaderProps != null) {
                    leaderCoreNodeName = leaderProps.getName();
                }
            } catch (Exception exc) {
                // Remembered only so it can be attached to the warning below.
                getLeaderExc = exc;
            }
            if (leaderCoreNodeName == null) {
                // The trailing getLeaderExc argument has no placeholder, so the logger treats it as the throwable.
                log.warn("Failed to determine if {} is still the leader for collection={} shardId={} " + "before putting {} into leader-initiated recovery", cloudDesc.getCoreNodeName(), collection, shardId, replicaUrl, getLeaderExc);
            }
            // Check whether the failing node is among the replicas returned for this core's shard.
            List<ZkCoreNodeProps> myReplicas = zkController.getZkStateReader().getReplicaProps(collection, cloudDesc.getShardId(), cloudDesc.getCoreNodeName());
            boolean foundErrorNodeInReplicaList = false;
            if (myReplicas != null) {
                for (ZkCoreNodeProps replicaProp : myReplicas) {
                    if (((Replica) replicaProp.getNodeProps()).getName().equals(((Replica) stdNode.getNodeProps().getNodeProps()).getName())) {
                        foundErrorNodeInReplicaList = true;
                        break;
                    }
                }
            }
            // If the client specified minRf and we didn't achieve the minRf, don't send recovery and let client retry
            if (replicationTracker != null && replicationTracker.getAchievedRf() < replicationTracker.minRf) {
                continue;
            }
            // Recovery is requested only when all of the following hold: the leader lookup
            // succeeded and still names this core, the failing node is one of our replicas,
            // and the failing node's core URL is not the leader's own URL.
            if (// we are still same leader
            leaderCoreNodeName != null && cloudDesc.getCoreNodeName().equals(leaderCoreNodeName) && // we found an error for one of replicas
            foundErrorNodeInReplicaList && !stdNode.getNodeProps().getCoreUrl().equals(leaderProps.getCoreUrl())) {
                // (the last condition above ensures we do not put ourself into LIR)
                try {
                    // NOTE(review): the original comment here mentions a "false" result meaning
                    // the node is probably not live, but the return value of the call below is
                    // not inspected in this method - confirm against ZkController.
                    Throwable rootCause = SolrException.getRootCause(error.e);
                    log.error("Setting up to try to start recovery on replica {}", replicaUrl, rootCause);
                    zkController.ensureReplicaInLeaderInitiatedRecovery(req.getCore().getCoreContainer(), collection, shardId, stdNode.getNodeProps(), req.getCore().getCoreDescriptor(), false);
                } catch (Exception exc) {
                    // Best effort: a failure to mark the replica is logged, not propagated.
                    Throwable setLirZnodeFailedCause = SolrException.getRootCause(exc);
                    log.error("Leader failed to set replica " + error.req.node.getUrl() + " state to DOWN due to: " + setLirZnodeFailedCause, setLirZnodeFailedCause);
                }
            } else {
                // not the leader anymore maybe or the error'd node is not my replica?
                if (!foundErrorNodeInReplicaList) {
                    log.warn("Core " + cloudDesc.getCoreNodeName() + " belonging to " + collection + " " + shardId + ", does not have error'd node " + stdNode.getNodeProps().getCoreUrl() + " as a replica. " + "No request recovery command will be sent!");
                } else {
                    log.warn("Core " + cloudDesc.getCoreNodeName() + " is no longer the leader for " + collection + " " + shardId + " or we tried to put ourself into LIR, no request recovery command will be sent!");
                }
            }
        }
    }
    // Report the replication factor figures once, then drop the tracker.
    if (replicationTracker != null) {
        rsp.getResponseHeader().add(UpdateRequest.REPFACT, replicationTracker.getAchievedRf());
        rsp.getResponseHeader().add(UpdateRequest.MIN_REPFACT, replicationTracker.minRf);
        replicationTracker = null;
    }
    if (0 < errorsForClient.size()) {
        throw new DistributedUpdatesAsyncException(errorsForClient);
    }
}
Also used : RetryNode(org.apache.solr.update.SolrCmdDistributor.RetryNode) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) Error(org.apache.solr.update.SolrCmdDistributor.Error) ArrayList(java.util.ArrayList) Error(org.apache.solr.update.SolrCmdDistributor.Error) Replica(org.apache.solr.common.cloud.Replica) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrException(org.apache.solr.common.SolrException) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) SolrCmdDistributor(org.apache.solr.update.SolrCmdDistributor) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) SolrException(org.apache.solr.common.SolrException)

Example 42 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class DistributedUpdateProcessor method getNodesByRoutingRules.

/**
 * Looks up the nodes in other collections that an update must additionally be sent to
 * because of routing rules registered on this core's slice.
 *
 * <p>Only collections using a {@code CompositeIdRouter} are considered. With a
 * {@code null} id (the delete-by-query case) the leader of the first active slice of
 * every rule's target collection is returned. With a non-null id, the rule registered
 * under the id's route key is consulted: if it has not expired and one of its ranges
 * includes the document hash, the leader of the matching target slice is returned; if
 * it has expired, a request to remove the rule is offered to the overseer state update
 * queue (at most one thread at a time, guarded by the core's rule-expiry lock).
 *
 * @param cstate cluster state used to resolve target collections
 * @param coll   the collection this core belongs to
 * @param id     document id, or {@code null} for delete-by-query
 * @param doc    the document being routed; passed through to the router's hash function
 * @return the target nodes, or {@code null} when no routing rule applies
 * @throws SolrException if a matching rule points at a collection with no active slice
 *         serving the id
 */
private List<Node> getNodesByRoutingRules(ClusterState cstate, DocCollection coll, String id, SolrInputDocument doc) {
    final DocRouter docRouter = coll.getRouter();
    if (!(docRouter instanceof CompositeIdRouter)) {
        return null;
    }
    final CompositeIdRouter hashingRouter = (CompositeIdRouter) docRouter;
    final String localShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
    final Slice localSlice = coll.getSlice(localShardId);
    final Map<String, RoutingRule> rulesByRouteKey = localSlice.getRoutingRules();
    if (rulesByRouteKey == null) {
        return null;
    }

    // Delete-by-query: no id to hash, so address every rule's target collection.
    if (id == null) {
        List<Node> targetLeaders = null;
        for (RoutingRule candidate : rulesByRouteKey.values()) {
            final Collection<Slice> targetSlices = cstate.getActiveSlices(candidate.getTargetCollectionName());
            if (targetSlices == null || targetSlices.isEmpty()) {
                continue;
            }
            final Slice firstSlice = targetSlices.iterator().next();
            if (targetLeaders == null) {
                targetLeaders = new ArrayList<>();
            }
            targetLeaders.add(new StdNode(new ZkCoreNodeProps(firstSlice.getLeader())));
        }
        return targetLeaders;
    }

    final String routeKey = SolrIndexSplitter.getRouteKey(id);
    if (routeKey == null) {
        return null;
    }
    final RoutingRule rule = rulesByRouteKey.get(routeKey + "!");
    if (rule == null) {
        return null;
    }

    if (rule.isExpired()) {
        // Ask the overseer to drop the stale rule; skip if another thread is already doing so.
        final ReentrantLock expiryLock = req.getCore().getRuleExpiryLock();
        if (expiryLock.isLocked()) {
            return null;
        }
        try {
            if (expiryLock.tryLock(10, TimeUnit.MILLISECONDS)) {
                log.info("Going to expire routing rule");
                try {
                    final Map<String, Object> removeRuleMessage = Utils.makeMap(Overseer.QUEUE_OPERATION, OverseerAction.REMOVEROUTINGRULE.toLower(), ZkStateReader.COLLECTION_PROP, collection, ZkStateReader.SHARD_ID_PROP, localShardId, "routeKey", routeKey + "!");
                    final SolrZkClient zk = req.getCore().getCoreContainer().getZkController().getZkClient();
                    final DistributedQueue stateUpdateQueue = Overseer.getStateUpdateQueue(zk);
                    stateUpdateQueue.offer(Utils.toJSON(removeRuleMessage));
                } catch (KeeperException e) {
                    log.warn("Exception while removing routing rule for route key: " + routeKey, e);
                } catch (Exception e) {
                    log.error("Exception while removing routing rule for route key: " + routeKey, e);
                } finally {
                    expiryLock.unlock();
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
        return null;
    }

    final List<DocRouter.Range> routeRanges = rule.getRouteRanges();
    if (routeRanges == null || routeRanges.isEmpty()) {
        return null;
    }
    final int docHash = hashingRouter.sliceHash(id, doc, null, coll);
    for (DocRouter.Range candidateRange : routeRanges) {
        if (!candidateRange.includes(docHash)) {
            continue;
        }
        // First matching range wins: forward to the leader of the slice serving this id.
        final DocCollection targetColl = cstate.getCollection(rule.getTargetCollectionName());
        final Collection<Slice> servingSlices = targetColl.getRouter().getSearchSlicesSingle(id, null, targetColl);
        if (servingSlices == null || servingSlices.isEmpty()) {
            throw new SolrException(ErrorCode.SERVER_ERROR, "No active slices serving " + id + " found for target collection: " + rule.getTargetCollectionName());
        }
        final Replica targetLeader = targetColl.getLeader(servingSlices.iterator().next().getName());
        final List<Node> single = new ArrayList<>(1);
        single.add(new StdNode(new ZkCoreNodeProps(targetLeader)));
        return single;
    }
    return null;
}
Also used : ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) RetryNode(org.apache.solr.update.SolrCmdDistributor.RetryNode) Node(org.apache.solr.update.SolrCmdDistributor.Node) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) ArrayList(java.util.ArrayList) DocRouter(org.apache.solr.common.cloud.DocRouter) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) DocCollection(org.apache.solr.common.cloud.DocCollection) RoutingRule(org.apache.solr.common.cloud.RoutingRule) SolrException(org.apache.solr.common.SolrException) ReentrantLock(java.util.concurrent.locks.ReentrantLock) Replica(org.apache.solr.common.cloud.Replica) SolrZkClient(org.apache.solr.common.cloud.SolrZkClient) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrException(org.apache.solr.common.SolrException) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) CompositeIdRouter(org.apache.solr.common.cloud.CompositeIdRouter) Slice(org.apache.solr.common.cloud.Slice) DistributedQueue(org.apache.solr.cloud.DistributedQueue) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) KeeperException(org.apache.zookeeper.KeeperException)

Example 43 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class DistributedUpdateProcessor method doDeleteByQuery.

/**
 * Applies a delete-by-query locally and distributes it according to the distrib phase
 * carried in the request parameters.
 *
 * <ul>
 *   <li>{@code NONE}: this node is the first to receive the command; it is forwarded to
 *       the leader of every slice selected by the router. If this core leads one of
 *       those slices, processing continues as {@code TOLEADER}; otherwise the method
 *       returns after forwarding.</li>
 *   <li>{@code TOLEADER}: this core acts as leader; the command is versioned, applied
 *       locally and forwarded to replicas, sub-shard leaders and routing-rule
 *       targets.</li>
 *   <li>{@code FROMLEADER}: this core acts as a replica and only applies the command
 *       locally.</li>
 * </ul>
 *
 * @param cmd the delete-by-query command to process
 * @throws IOException declared for the underlying update processing and distribution calls
 * @throws SolrException with {@code SERVICE_UNAVAILABLE} if the thread is interrupted
 *         while waiting for a shard leader (the interrupt flag is restored first)
 * @throws ZooKeeperException if the thread is interrupted while resolving this shard's
 *         leader for sub-shard forwarding
 */
public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException {
    // even in non zk mode, tests simulate updates from a leader
    if (!zkEnabled) {
        isLeader = getNonZkLeaderAssumption(req);
    } else {
        zkCheck();
    }
    // NONE: we are the first to receive this deleteByQuery
    //       - it must be forwarded to the leader of every shard
    // TO:   we are a leader receiving a forwarded deleteByQuery... we must:
    //       - block all updates (use VersionInfo)
    //       - flush *all* updates going to our replicas
    //       - forward the DBQ to our replicas and wait for the response
    //       - log + execute the local DBQ
    // FROM: we are a replica receiving a DBQ from our leader
    //       - log + execute the local DBQ
    DistribPhase phase = DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM));
    // coll is only dereferenced on zkEnabled paths below, so null is safe in non-zk mode.
    DocCollection coll = zkEnabled ? zkController.getClusterState().getCollection(collection) : null;
    if (zkEnabled && DistribPhase.NONE == phase) {
        // start off by assuming we are not a leader for any shard
        boolean leaderForAnyShard = false;
        ModifiableSolrParams outParams = new ModifiableSolrParams(filterParams(req.getParams()));
        outParams.set(DISTRIB_UPDATE_PARAM, DistribPhase.TOLEADER.toString());
        outParams.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
        SolrParams params = req.getParams();
        String route = params.get(ShardParams._ROUTE_);
        Collection<Slice> slices = coll.getRouter().getSearchSlices(route, params, coll);
        List<Node> leaders = new ArrayList<>(slices.size());
        for (Slice slice : slices) {
            String sliceName = slice.getName();
            Replica leader;
            try {
                leader = zkController.getZkStateReader().getLeaderRetry(collection, sliceName);
            } catch (InterruptedException e) {
                // Restore the interrupt flag before converting to an unchecked exception so
                // callers further up the stack can still observe the interruption.
                Thread.currentThread().interrupt();
                throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + sliceName, e);
            }
            // TODO: What if leaders changed in the meantime?
            // should we send out slice-at-a-time and if a node returns "hey, I'm not a leader" (or we get an error because it went down) then look up the new leader?
            // Am I the leader for this slice?
            ZkCoreNodeProps coreLeaderProps = new ZkCoreNodeProps(leader);
            String leaderCoreNodeName = leader.getName();
            String coreNodeName = req.getCore().getCoreDescriptor().getCloudDescriptor().getCoreNodeName();
            isLeader = coreNodeName.equals(leaderCoreNodeName);
            if (isLeader) {
                // don't forward to ourself
                leaderForAnyShard = true;
            } else {
                leaders.add(new RetryNode(coreLeaderProps, zkController.getZkStateReader(), collection, sliceName));
            }
        }
        // this will be distributed from the local commit
        outParams.remove("commit");
        cmdDistrib.distribDelete(cmd, leaders, outParams);
        if (!leaderForAnyShard) {
            return;
        }
        // change the phase to TOLEADER so we look up and forward to our own replicas (if any)
        phase = DistribPhase.TOLEADER;
    }
    List<Node> replicas = null;
    if (zkEnabled && DistribPhase.TOLEADER == phase) {
        // This core should be a leader
        isLeader = true;
        replicas = setupRequestForDBQ();
    } else if (DistribPhase.FROMLEADER == phase) {
        isLeader = false;
    }
    // Without version info there is nothing to version or distribute here; delegate and stop.
    if (vinfo == null) {
        super.processDelete(cmd);
        return;
    }
    // at this point, there is an update we need to try and apply.
    // we may or may not be the leader.
    boolean isReplayOrPeersync = (cmd.getFlags() & (UpdateCommand.REPLAY | UpdateCommand.PEER_SYNC)) != 0;
    boolean leaderLogic = isLeader && !isReplayOrPeersync;
    versionDeleteByQuery(cmd);
    if (zkEnabled) {
        // forward to all replicas
        ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
        params.set(CommonParams.VERSION_FIELD, Long.toString(cmd.getVersion()));
        params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
        params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
        // Set once any forward that needs waiting on has been issued.
        boolean someReplicas = false;
        boolean subShardLeader = false;
        try {
            subShardLeader = amISubShardLeader(coll, null, null, null);
            if (subShardLeader) {
                String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
                Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, myShardId);
                // DBQ forwarded to NRT and TLOG replicas
                List<ZkCoreNodeProps> replicaProps = zkController.getZkStateReader().getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
                if (replicaProps != null) {
                    final List<Node> myReplicas = new ArrayList<>(replicaProps.size());
                    for (ZkCoreNodeProps replicaProp : replicaProps) {
                        myReplicas.add(new StdNode(replicaProp, collection, myShardId));
                    }
                    cmdDistrib.distribDelete(cmd, myReplicas, params);
                    someReplicas = true;
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new ZooKeeperException(ErrorCode.SERVER_ERROR, "", e);
        }
        if (leaderLogic) {
            List<Node> subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), null, null);
            if (subShardLeaders != null) {
                cmdDistrib.distribDelete(cmd, subShardLeaders, params, true);
            }
            final List<Node> nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, null, null);
            if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) {
                // Fresh parameter set for routing-rule targets: the version field set above is
                // not carried over, and the originating collection and shard are added.
                params = new ModifiableSolrParams(filterParams(req.getParams()));
                params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
                params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
                params.set(DISTRIB_FROM_COLLECTION, req.getCore().getCoreDescriptor().getCloudDescriptor().getCollectionName());
                params.set(DISTRIB_FROM_SHARD, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
                cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true);
            }
            if (replicas != null) {
                // NOTE(review): if the routing-rule branch above ran, params no longer carries
                // the version field when forwarding to replicas here - confirm this is intended.
                cmdDistrib.distribDelete(cmd, replicas, params);
                someReplicas = true;
            }
        }
        if (someReplicas) {
            cmdDistrib.blockAndDoRetries();
        }
    }
    // Optionally echo the version assigned to this query back to the client.
    if (returnVersions && rsp != null) {
        if (deleteByQueryResponse == null) {
            deleteByQueryResponse = new NamedList<String>(1);
            rsp.add("deleteByQuery", deleteByQueryResponse);
        }
        deleteByQueryResponse.add(cmd.getQuery(), cmd.getVersion());
    }
}
Also used : RetryNode(org.apache.solr.update.SolrCmdDistributor.RetryNode) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) RetryNode(org.apache.solr.update.SolrCmdDistributor.RetryNode) Node(org.apache.solr.update.SolrCmdDistributor.Node) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) ArrayList(java.util.ArrayList) Replica(org.apache.solr.common.cloud.Replica) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) Slice(org.apache.solr.common.cloud.Slice) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) SolrParams(org.apache.solr.common.params.SolrParams) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) DocCollection(org.apache.solr.common.cloud.DocCollection) SolrException(org.apache.solr.common.SolrException)

Example 44 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class TestLeaderInitiatedRecoveryThread method testPublishDownState.

/**
 * Exercises {@code LeaderInitiatedRecoveryThread.publishDownState} (and {@code run}) against
 * a running cluster through a sequence of scenarios that share state: a non-leader replica of
 * SHARD1 is picked, stopped, restarted, and repeatedly targeted for leader-initiated recovery
 * (LIR) while {@code updateLIRState} is overridden to simulate different failures. The
 * overseer queue cversion is used to detect whether a down state was published.
 *
 * <p>The scenarios depend on each other's side effects, so their order matters.
 */
public void testPublishDownState() throws Exception {
    waitForRecoveriesToFinish(true);
    final String leaderCoreNodeName = shardToLeaderJetty.get(SHARD1).coreNodeName;
    final CloudJettyRunner leaderRunner = shardToLeaderJetty.get(SHARD1);
    CoreContainer coreContainer = leaderRunner.jetty.getCoreContainer();
    ZkController zkController = coreContainer.getZkController();
    // Pick any SHARD1 jetty that is not the leader to act as the failing replica.
    CloudJettyRunner notLeader = null;
    for (CloudJettyRunner cloudJettyRunner : shardToJetty.get(SHARD1)) {
        if (cloudJettyRunner != leaderRunner) {
            notLeader = cloudJettyRunner;
            break;
        }
    }
    assertNotNull(notLeader);
    Replica replica = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, notLeader.coreNodeName);
    ZkCoreNodeProps replicaCoreNodeProps = new ZkCoreNodeProps(replica);
    // Core descriptor stub whose cloud descriptor reports the leader's core node name
    // and always claims to be the leader.
    MockCoreDescriptor cd = new MockCoreDescriptor() {

        public CloudDescriptor getCloudDescriptor() {
            return new CloudDescriptor(shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NAME_PROP), new Properties(), this) {

                @Override
                public String getCoreNodeName() {
                    return shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NODE_NAME_PROP);
                }

                @Override
                public boolean isLeader() {
                    return true;
                }
            };
        }
    };
    /*
     1. Test that running the thread throws an exception when zkController.isReplicaInRecoveryHandling == false
      */
    try {
        LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
        assertFalse(zkController.isReplicaInRecoveryHandling(replicaCoreNodeProps.getCoreUrl()));
        thread.run();
        fail("publishDownState should not have succeeded because replica url is not marked in leader initiated recovery in ZkController");
    } catch (SolrException e) {
        assertTrue(e.code() == SolrException.ErrorCode.INVALID_STATE.code);
    }
    /*
     2. Test that a non-live replica cannot be put into LIR or down state
      */
    LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
    // kill the replica
    // Record the live node count first so its drop can be detected after the stop.
    int children = cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size();
    ChaosMonkey.stop(notLeader.jetty);
    // Poll for up to 60 seconds until /live_nodes has fewer children than before.
    TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS);
    while (!timeOut.hasTimedOut()) {
        if (children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size()) {
            break;
        }
        Thread.sleep(500);
    }
    assertTrue(children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size());
    int cversion = getOverseerCversion();
    // Thread should not publish LIR and down state for node which is not live, regardless of whether forcePublish is true or false
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    // let's assert that we did not publish anything to overseer queue, simplest way is to assert that cversion of overseer queue zk node is still the same
    assertEquals(cversion, getOverseerCversion());
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    // let's assert that we did not publish anything to overseer queue
    assertEquals(cversion, getOverseerCversion());
    /*
    3. Test that on ZK connection loss the thread does not attempt to publish down state even if forcePublish=true
     */
    ChaosMonkey.start(notLeader.jetty);
    waitForRecoveriesToFinish(true);
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.ConnectionLossException());
        }
    };
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    /*
     4. Test that on ZK session expiry the thread does not attempt to publish down state even if forcePublish=true
      */
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.SessionExpiredException());
        }
    };
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    /*
     5. Test that any exception other than ZK connection loss or session expired should publish down state only if forcePublish=true
      */
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "bogus exception");
        }
    };
    // the following should return true because regardless of the bogus exception in setting LIR state, we still want recovery commands to be sent,
    // however the following will not publish a down state
    cversion = getOverseerCversion();
    assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    // let's assert that we did not publish anything to overseer queue, simplest way is to assert that cversion of overseer queue zk node is still the same
    assertEquals(cversion, getOverseerCversion());
    assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    // this should have published a down state so assert that cversion has incremented
    assertTrue(getOverseerCversion() > cversion);
    // Poll for up to 30 seconds until the cluster state shows the replica as DOWN.
    timeOut = new TimeOut(30, TimeUnit.SECONDS);
    while (!timeOut.hasTimedOut()) {
        Replica r = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName());
        if (r.getState() == Replica.State.DOWN) {
            break;
        }
        Thread.sleep(500);
    }
    assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    assertEquals(Replica.State.DOWN, cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName()).getState());
    /*
    6. Test that non-leader cannot set LIR nodes
     */
    // Switch to the non-leader's container and controller, using one of its real core descriptors.
    coreContainer = notLeader.jetty.getCoreContainer();
    zkController = coreContainer.getZkController();
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor()) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            try {
                super.updateLIRState(replicaCoreNodeName);
            } catch (Exception e) {
                // Any failure here must be the not-leader rejection; rethrow so the thread handles it.
                assertTrue(e instanceof ZkController.NotLeaderException);
                throw e;
            }
        }
    };
    cversion = getOverseerCversion();
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    assertEquals(cversion, getOverseerCversion());
    /*
     7. assert that we can write a LIR state if everything else is fine
      */
    // reset the zkcontroller to the one from the leader
    coreContainer = leaderRunner.jetty.getCoreContainer();
    zkController = coreContainer.getZkController();
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor());
    thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false);
    // Poll for up to 30 seconds until the LIR state for the replica reads DOWN.
    timeOut = new TimeOut(30, TimeUnit.SECONDS);
    while (!timeOut.hasTimedOut()) {
        Replica.State state = zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName());
        if (state == Replica.State.DOWN) {
            break;
        }
        Thread.sleep(500);
    }
    assertNotNull(zkController.getLeaderInitiatedRecoveryStateObject(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    assertEquals(Replica.State.DOWN, zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
/*
    TODO: a further scenario was started here (it was labelled with a second "7.") but never written.
     */
}
Also used : ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) TimeOut(org.apache.solr.util.TimeOut) Properties(java.util.Properties) Replica(org.apache.solr.common.cloud.Replica) SolrException(org.apache.solr.common.SolrException) KeeperException(org.apache.zookeeper.KeeperException) CoreContainer(org.apache.solr.core.CoreContainer) MockCoreDescriptor(org.apache.solr.util.MockCoreContainer.MockCoreDescriptor) SolrException(org.apache.solr.common.SolrException)

Example 45 with ZkCoreNodeProps

use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.

the class SyncStrategy method syncWithReplicas.

/**
 * Runs a peer sync between the given core and the replicas that the ZK state reader
 * returns for its shard.
 *
 * @param zkController           source of the ZK state reader used to look up replicas
 * @param core                   the local core to sync
 * @param props                  not read by this method
 * @param collection             collection the shard belongs to
 * @param shardId                shard whose replicas are synced with
 * @param peerSyncOnlyWithActive passed through to {@code PeerSync}
 * @return a success result when the replica lookup yields {@code null}; otherwise the
 *         result of {@code PeerSync.sync()}
 */
private PeerSync.PeerSyncResult syncWithReplicas(ZkController zkController, SolrCore core, ZkNodeProps props, String collection, String shardId, boolean peerSyncOnlyWithActive) {
    final List<ZkCoreNodeProps> replicas = zkController.getZkStateReader().getReplicaProps(collection, shardId, core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
    if (replicas == null) {
        // Nobody to sync with: trivially successful.
        return PeerSync.PeerSyncResult.success();
    }

    // Collect the core URL of every replica we are going to sync against.
    final List<String> peerUrls = new ArrayList<>(replicas.size());
    for (ZkCoreNodeProps replica : replicas) {
        final String coreUrl = replica.getCoreUrl();
        peerUrls.add(coreUrl);
    }

    // An unreachable replica does not fail the overall sync.
    // TODO: as an assurance, we should still try and tell the sync nodes that we couldn't reach
    // to recover once more?
    // Fingerprinting is off here because we currently rely on at least one node returning "true";
    // if replicas are out of sync, one still has to be picked as leader. A follow-up sync from the
    // replica to the new leader (with fingerprinting on) should then fail and initiate
    // recovery-by-replication.
    final int recordsToKeep = core.getUpdateHandler().getUpdateLog().getNumRecordsToKeep();
    final PeerSync syncer = new PeerSync(core, peerUrls, recordsToKeep, true, true, peerSyncOnlyWithActive, false);
    return syncer.sync();
}
Also used : ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) PeerSync(org.apache.solr.update.PeerSync) ArrayList(java.util.ArrayList)

Aggregations

ZkCoreNodeProps (org.apache.solr.common.cloud.ZkCoreNodeProps): 47; Replica (org.apache.solr.common.cloud.Replica): 24; ArrayList (java.util.ArrayList): 22; Slice (org.apache.solr.common.cloud.Slice): 20; HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient): 16; SolrException (org.apache.solr.common.SolrException): 13; ClusterState (org.apache.solr.common.cloud.ClusterState): 13; IOException (java.io.IOException): 12; ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams): 12; RetryNode (org.apache.solr.update.SolrCmdDistributor.RetryNode): 12; StdNode (org.apache.solr.update.SolrCmdDistributor.StdNode): 12; Node (org.apache.solr.update.SolrCmdDistributor.Node): 11; ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps): 10; ZkStateReader (org.apache.solr.common.cloud.ZkStateReader): 10; SolrQuery (org.apache.solr.client.solrj.SolrQuery): 9; ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException): 8; KeeperException (org.apache.zookeeper.KeeperException): 8; SolrServerException (org.apache.solr.client.solrj.SolrServerException): 7; Random (java.util.Random): 6; NamedList (org.apache.solr.common.util.NamedList): 6