Search in sources :

Example 6 with RetryNode

use of org.apache.solr.update.SolrCmdDistributor.RetryNode in project lucene-solr by apache.

The class DistributedUpdateProcessor, method doFinish.

// TODO: optionally fail if n replicas are not reached...
/**
 * Finishes the distributed part of an update: waits on the command distributor, then walks the
 * errors it collected and decides, per error, whether to report it to the client, ignore it, or
 * ask the failing replica to go into leader-initiated recovery (LIR).
 *
 * <p>Errors on a {@code RetryNode} (a forward) and errors whose cause metadata is
 * {@code "LeaderChanged"} are collected and thrown to the caller at the end. All other errors
 * are only logged and, when the guards below allow it, trigger a recovery request for the
 * failing replica.
 *
 * <p>If a {@code replicationTracker} is set, the achieved and minimum replication factors are
 * written to the response header and the tracker is cleared.
 *
 * @throws DistributedUpdatesAsyncException if at least one error must be reported to the client
 */
private void doFinish() {
    // TODO: if not a forward and replication req is not specified, we could
    // send in a background thread
    cmdDistrib.finish();
    List<Error> errors = cmdDistrib.getErrors();
    // TODO - we may need to tell about more than one error...
    // Only the errors gathered here are surfaced to the caller (thrown at the end of the method).
    List<Error> errorsForClient = new ArrayList<>(errors.size());
    for (final SolrCmdDistributor.Error error : errors) {
        if (error.req.node instanceof RetryNode) {
            // if it's a forward, any fail is a problem -
            // otherwise we assume things are fine if we got it locally
            // until we start allowing min replication param
            errorsForClient.add(error);
            continue;
        }
        // Not a forward: the failure is logged here and is not reported to the client unless a
        // later check in this iteration adds it to errorsForClient.
        if (log.isWarnEnabled()) {
            log.warn("Error sending update to " + error.req.node.getBaseUrl(), error.e);
        }
        // Since it is not a forward request, for each fail, try to tell them to
        // recover - the doc was already added locally, so it should have been
        // legit
        DistribPhase phase = DistribPhase.parseParam(error.req.uReq.getParams().get(DISTRIB_UPDATE_PARAM));
        if (phase != DistribPhase.FROMLEADER)
            // don't have non-leaders try to recover other nodes
            continue;
        // we don't want to run recovery on a node which missed a commit command
        if (error.req.uReq.getParams().get(COMMIT_END_POINT) != null)
            continue;
        final String replicaUrl = error.req.node.getUrl();
        // if the remote replica failed the request because of leader change (SOLR-6511), then fail the request
        String cause = (error.e instanceof SolrException) ? ((SolrException) error.e).getMetadata("cause") : null;
        if ("LeaderChanged".equals(cause)) {
            // let's just fail this request and let the client retry? or just call processAdd again?
            log.error("On " + cloudDesc.getCoreNodeName() + ", replica " + replicaUrl + " now thinks it is the leader! Failing the request to let the client retry! " + error.e);
            errorsForClient.add(error);
            continue;
        }
        String collection = null;
        String shardId = null;
        // Recovery handling below only applies to StdNode targets; any other node type is skipped.
        if (error.req.node instanceof StdNode) {
            StdNode stdNode = (StdNode) error.req.node;
            collection = stdNode.getCollection();
            shardId = stdNode.getShardId();
            // before we go setting other replicas to down, make sure we're still the leader!
            String leaderCoreNodeName = null;
            Exception getLeaderExc = null;
            Replica leaderProps = null;
            try {
                leaderProps = zkController.getZkStateReader().getLeader(collection, shardId);
                if (leaderProps != null) {
                    leaderCoreNodeName = leaderProps.getName();
                }
            } catch (Exception exc) {
                // Remembered only so it can be attached to the warning below; a failed lookup
                // leaves leaderCoreNodeName null, which later prevents the recovery request.
                getLeaderExc = exc;
            }
            if (leaderCoreNodeName == null) {
                log.warn("Failed to determine if {} is still the leader for collection={} shardId={} " + "before putting {} into leader-initiated recovery", cloudDesc.getCoreNodeName(), collection, shardId, replicaUrl, getLeaderExc);
            }
            // Note: the replica list is looked up with this core's own shard id
            // (cloudDesc.getShardId()), not the shardId taken from the failed node.
            List<ZkCoreNodeProps> myReplicas = zkController.getZkStateReader().getReplicaProps(collection, cloudDesc.getShardId(), cloudDesc.getCoreNodeName());
            // Check, by replica name, whether the failed node is one of this core's replicas.
            boolean foundErrorNodeInReplicaList = false;
            if (myReplicas != null) {
                for (ZkCoreNodeProps replicaProp : myReplicas) {
                    if (((Replica) replicaProp.getNodeProps()).getName().equals(((Replica) stdNode.getNodeProps().getNodeProps()).getName())) {
                        foundErrorNodeInReplicaList = true;
                        break;
                    }
                }
            }
            // If the client specified minRf and we didn't achieve the minRf, don't send recovery and let client retry
            if (replicationTracker != null && replicationTracker.getAchievedRf() < replicationTracker.minRf) {
                continue;
            }
            // leaderProps is non-null whenever leaderCoreNodeName is non-null (both are set in the
            // same branch above), so the getCoreUrl() call in the last clause is guarded by the first.
            if (// we are still same leader
            leaderCoreNodeName != null && cloudDesc.getCoreNodeName().equals(leaderCoreNodeName) && // we found an error for one of replicas
            foundErrorNodeInReplicaList && !stdNode.getNodeProps().getCoreUrl().equals(leaderProps.getCoreUrl())) {
                // we do not want to put ourself into LIR
                try {
                    // if false, then the node is probably not "live" anymore
                    // and we do not need to send a recovery message
                    // NOTE(review): the remark above talks about a boolean result, but no return
                    // value is captured from the call below - confirm whether it is still relevant.
                    Throwable rootCause = SolrException.getRootCause(error.e);
                    log.error("Setting up to try to start recovery on replica {}", replicaUrl, rootCause);
                    zkController.ensureReplicaInLeaderInitiatedRecovery(req.getCore().getCoreContainer(), collection, shardId, stdNode.getNodeProps(), req.getCore().getCoreDescriptor(), false);
                } catch (Exception exc) {
                    // Best effort: a failure to mark the replica is logged and not propagated.
                    Throwable setLirZnodeFailedCause = SolrException.getRootCause(exc);
                    log.error("Leader failed to set replica " + error.req.node.getUrl() + " state to DOWN due to: " + setLirZnodeFailedCause, setLirZnodeFailedCause);
                }
            } else {
                // not the leader anymore maybe or the error'd node is not my replica?
                if (!foundErrorNodeInReplicaList) {
                    log.warn("Core " + cloudDesc.getCoreNodeName() + " belonging to " + collection + " " + shardId + ", does not have error'd node " + stdNode.getNodeProps().getCoreUrl() + " as a replica. " + "No request recovery command will be sent!");
                } else {
                    log.warn("Core " + cloudDesc.getCoreNodeName() + " is no longer the leader for " + collection + " " + shardId + " or we tried to put ourself into LIR, no request recovery command will be sent!");
                }
            }
        }
    }
    // Report achieved vs. requested minimum replication factor in the response header, then
    // drop the tracker so it is not reused.
    if (replicationTracker != null) {
        rsp.getResponseHeader().add(UpdateRequest.REPFACT, replicationTracker.getAchievedRf());
        rsp.getResponseHeader().add(UpdateRequest.MIN_REPFACT, replicationTracker.minRf);
        replicationTracker = null;
    }
    // Surface all collected client-facing errors as a single exception.
    if (0 < errorsForClient.size()) {
        throw new DistributedUpdatesAsyncException(errorsForClient);
    }
}
Also used : RetryNode(org.apache.solr.update.SolrCmdDistributor.RetryNode) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) Error(org.apache.solr.update.SolrCmdDistributor.Error) ArrayList(java.util.ArrayList) Error(org.apache.solr.update.SolrCmdDistributor.Error) Replica(org.apache.solr.common.cloud.Replica) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrException(org.apache.solr.common.SolrException) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) SolrCmdDistributor(org.apache.solr.update.SolrCmdDistributor) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) SolrException(org.apache.solr.common.SolrException)

Example 7 with RetryNode

use of org.apache.solr.update.SolrCmdDistributor.RetryNode in project lucene-solr by apache.

The class DistributedUpdateProcessor, method doDeleteByQuery.

/**
 * Processes a delete-by-query (DBQ) command, distributing it according to the
 * {@code DISTRIB_UPDATE_PARAM} phase of the incoming request.
 *
 * <ul>
 *   <li>{@code NONE} (ZooKeeper mode): the DBQ is forwarded to the leader of every slice
 *       selected by the router; if this core leads none of them the method returns after
 *       forwarding, otherwise it continues as {@code TOLEADER}.</li>
 *   <li>{@code TOLEADER} (ZooKeeper mode): this core is treated as leader and the replica list
 *       is obtained from {@code setupRequestForDBQ()}.</li>
 *   <li>{@code FROMLEADER}: this core is treated as a non-leader.</li>
 * </ul>
 *
 * <p>When no version info is available the delete is delegated to the superclass. Otherwise
 * the delete is versioned locally, forwarded to sub-shard replicas, sub-shard leaders, nodes
 * selected by routing rules and this core's replicas as applicable, and, if
 * {@code returnVersions} is set, the query and its version are added to the response.
 *
 * @param cmd the delete-by-query command to process
 * @throws IOException declared by the signature; propagated from the calls made here
 * @throws SolrException with {@code SERVICE_UNAVAILABLE} if the thread is interrupted while
 *     looking up a shard leader (the interrupt flag is restored before throwing)
 * @throws ZooKeeperException if the thread is interrupted while forwarding as a sub-shard
 *     leader (the interrupt flag is restored before throwing)
 */
public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException {
    // even in non zk mode, tests simulate updates from a leader
    if (!zkEnabled) {
        isLeader = getNonZkLeaderAssumption(req);
    } else {
        zkCheck();
    }
    // NONE: we are the first to receive this deleteByQuery
    //       - it must be forwarded to the leader of every shard
    // TO:   we are a leader receiving a forwarded deleteByQuery... we must:
    //       - block all updates (use VersionInfo)
    //       - flush *all* updates going to our replicas
    //       - forward the DBQ to our replicas and wait for the response
    //       - log + execute the local DBQ
    // FROM: we are a replica receiving a DBQ from our leader
    //       - log + execute the local DBQ
    DistribPhase phase = DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM));
    // coll is null outside ZooKeeper mode; every use below is guarded by zkEnabled.
    DocCollection coll = zkEnabled ? zkController.getClusterState().getCollection(collection) : null;
    if (zkEnabled && DistribPhase.NONE == phase) {
        // start off by assuming we are not a leader for any shard
        boolean leaderForAnyShard = false;
        ModifiableSolrParams outParams = new ModifiableSolrParams(filterParams(req.getParams()));
        outParams.set(DISTRIB_UPDATE_PARAM, DistribPhase.TOLEADER.toString());
        outParams.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
        SolrParams params = req.getParams();
        String route = params.get(ShardParams._ROUTE_);
        Collection<Slice> slices = coll.getRouter().getSearchSlices(route, params, coll);
        List<Node> leaders = new ArrayList<>(slices.size());
        for (Slice slice : slices) {
            String sliceName = slice.getName();
            Replica leader;
            try {
                leader = zkController.getZkStateReader().getLeaderRetry(collection, sliceName);
            } catch (InterruptedException e) {
                // Restore the interrupt flag so callers further up the stack can still observe
                // the interruption; wrapping the exception alone would silently clear it.
                Thread.currentThread().interrupt();
                throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + sliceName, e);
            }
            // TODO: What if leaders changed in the meantime?
            // should we send out slice-at-a-time and if a node returns "hey, I'm not a leader" (or we get an error because it went down) then look up the new leader?
            // Am I the leader for this slice?
            ZkCoreNodeProps coreLeaderProps = new ZkCoreNodeProps(leader);
            String leaderCoreNodeName = leader.getName();
            String coreNodeName = req.getCore().getCoreDescriptor().getCloudDescriptor().getCoreNodeName();
            isLeader = coreNodeName.equals(leaderCoreNodeName);
            if (isLeader) {
                // don't forward to ourself
                leaderForAnyShard = true;
            } else {
                leaders.add(new RetryNode(coreLeaderProps, zkController.getZkStateReader(), collection, sliceName));
            }
        }
        // this will be distributed from the local commit
        outParams.remove("commit");
        cmdDistrib.distribDelete(cmd, leaders, outParams);
        if (!leaderForAnyShard) {
            // Nothing to apply locally: every target slice is led by another core.
            return;
        }
        // change the phase to TOLEADER so we look up and forward to our own replicas (if any)
        phase = DistribPhase.TOLEADER;
    }
    List<Node> replicas = null;
    if (zkEnabled && DistribPhase.TOLEADER == phase) {
        // This core should be a leader
        isLeader = true;
        replicas = setupRequestForDBQ();
    } else if (DistribPhase.FROMLEADER == phase) {
        isLeader = false;
    }
    if (vinfo == null) {
        // No version info available: fall back to the superclass delete handling.
        super.processDelete(cmd);
        return;
    }
    // at this point, there is an update we need to try and apply.
    // we may or may not be the leader.
    boolean isReplayOrPeersync = (cmd.getFlags() & (UpdateCommand.REPLAY | UpdateCommand.PEER_SYNC)) != 0;
    boolean leaderLogic = isLeader && !isReplayOrPeersync;
    versionDeleteByQuery(cmd);
    if (zkEnabled) {
        // forward to all replicas
        ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
        params.set(CommonParams.VERSION_FIELD, Long.toString(cmd.getVersion()));
        params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
        params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
        // Tracks whether anything was sent that blockAndDoRetries() must wait on.
        boolean someReplicas = false;
        boolean subShardLeader = false;
        try {
            subShardLeader = amISubShardLeader(coll, null, null, null);
            if (subShardLeader) {
                String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
                Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, myShardId);
                // DBQ forwarded to NRT and TLOG replicas
                List<ZkCoreNodeProps> replicaProps = zkController.getZkStateReader().getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
                if (replicaProps != null) {
                    final List<Node> myReplicas = new ArrayList<>(replicaProps.size());
                    for (ZkCoreNodeProps replicaProp : replicaProps) {
                        myReplicas.add(new StdNode(replicaProp, collection, myShardId));
                    }
                    cmdDistrib.distribDelete(cmd, myReplicas, params);
                    someReplicas = true;
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            throw new ZooKeeperException(ErrorCode.SERVER_ERROR, "", e);
        }
        if (leaderLogic) {
            List<Node> subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), null, null);
            if (subShardLeaders != null) {
                cmdDistrib.distribDelete(cmd, subShardLeaders, params, true);
            }
            final List<Node> nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, null, null);
            if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) {
                // Routing-rule targets get a fresh parameter set that also names the source
                // collection and shard.
                params = new ModifiableSolrParams(filterParams(req.getParams()));
                params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
                params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
                params.set(DISTRIB_FROM_COLLECTION, req.getCore().getCoreDescriptor().getCloudDescriptor().getCollectionName());
                params.set(DISTRIB_FROM_SHARD, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
                cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true);
            }
            if (replicas != null) {
                // NOTE(review): if the routing-rules branch above ran, params is the reassigned
                // set (no VERSION_FIELD, with DISTRIB_FROM_COLLECTION/SHARD) - confirm that is
                // what replicas are meant to receive.
                cmdDistrib.distribDelete(cmd, replicas, params);
                someReplicas = true;
            }
        }
        if (someReplicas) {
            cmdDistrib.blockAndDoRetries();
        }
    }
    if (returnVersions && rsp != null) {
        // The per-request DBQ response list is created once and registered with the response.
        if (deleteByQueryResponse == null) {
            deleteByQueryResponse = new NamedList<String>(1);
            rsp.add("deleteByQuery", deleteByQueryResponse);
        }
        deleteByQueryResponse.add(cmd.getQuery(), cmd.getVersion());
    }
}
Also used : RetryNode(org.apache.solr.update.SolrCmdDistributor.RetryNode) ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) RetryNode(org.apache.solr.update.SolrCmdDistributor.RetryNode) Node(org.apache.solr.update.SolrCmdDistributor.Node) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) ArrayList(java.util.ArrayList) Replica(org.apache.solr.common.cloud.Replica) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) Slice(org.apache.solr.common.cloud.Slice) StdNode(org.apache.solr.update.SolrCmdDistributor.StdNode) SolrParams(org.apache.solr.common.params.SolrParams) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) DocCollection(org.apache.solr.common.cloud.DocCollection) SolrException(org.apache.solr.common.SolrException)

Aggregations

ArrayList (java.util.ArrayList)7 ZkCoreNodeProps (org.apache.solr.common.cloud.ZkCoreNodeProps)7 RetryNode (org.apache.solr.update.SolrCmdDistributor.RetryNode)7 StdNode (org.apache.solr.update.SolrCmdDistributor.StdNode)7 Node (org.apache.solr.update.SolrCmdDistributor.Node)6 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)5 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)4 ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps)4 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)3 SolrQuery (org.apache.solr.client.solrj.SolrQuery)3 SolrException (org.apache.solr.common.SolrException)3 Replica (org.apache.solr.common.cloud.Replica)3 ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException)3 DocCollection (org.apache.solr.common.cloud.DocCollection)2 Slice (org.apache.solr.common.cloud.Slice)2 IOException (java.io.IOException)1 EnumSet (java.util.EnumSet)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Set (java.util.Set)1