
Example 71 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class RecoveryStrategy method doReplicateOnlyRecovery.

private final void doReplicateOnlyRecovery(SolrCore core) throws InterruptedException {
    boolean successfulRecovery = false;
    while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) {
        // don't use interruption or it will close channels though
        try {
            CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
            ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId());
            final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
            final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);
            String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName);
            String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
            //TODO: We can probably delete most of this code if we say this strategy can only be used for pull replicas
            boolean isLeader = leaderUrl.equals(ourUrl);
            if (isLeader && !cloudDesc.isLeader()) {
                throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
            }
            if (cloudDesc.isLeader()) {
                assert cloudDesc.getReplicaType() != Replica.Type.PULL;
                // we are now the leader - no one else must have been suitable
                LOG.warn("We have not yet recovered - but we are now the leader!");
                LOG.info("Finished recovery process.");
                zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
                return;
            }
            LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl, ourUrl);
            zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
            if (isClosed()) {
                LOG.info("Recovery for core {} has been closed", core.getName());
                break;
            }
            LOG.info("Starting Replication Recovery.");
            try {
                LOG.info("Stopping background replicate from leader process");
                zkController.stopReplicationFromLeader(coreName);
                replicate(zkController.getNodeName(), core, leaderprops);
                if (isClosed()) {
                    LOG.info("Recovery for core {} has been closed", core.getName());
                    break;
                }
                LOG.info("Replication Recovery was successful.");
                successfulRecovery = true;
            } catch (Exception e) {
                SolrException.log(LOG, "Error while trying to recover", e);
            }
        } catch (Exception e) {
            SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e);
        } finally {
            if (successfulRecovery) {
                LOG.info("Restaring background replicate from leader process");
                zkController.startReplicationFromLeader(coreName, false);
                LOG.info("Registering as Active after recovery.");
                try {
                    zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
                } catch (Exception e) {
                    LOG.error("Could not publish as ACTIVE after succesful recovery", e);
                    successfulRecovery = false;
                }
                if (successfulRecovery) {
                    close = true;
                    recoveryListener.recovered();
                }
            }
        }
        if (!successfulRecovery) {
            // Or do a fall off retry...
            try {
                if (isClosed()) {
                    LOG.info("Recovery for core {} has been closed", core.getName());
                    break;
                }
                LOG.error("Recovery failed - trying again... (" + retries + ")");
                retries++;
                if (retries >= maxRetries) {
                    SolrException.log(LOG, "Recovery failed - max retries exceeded (" + retries + ").");
                    try {
                        recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
                    } catch (Exception e) {
                        SolrException.log(LOG, "Could not publish that recovery failed", e);
                    }
                    break;
                }
            } catch (Exception e) {
                SolrException.log(LOG, "An error has occurred during recovery", e);
            }
            try {
                // Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
                // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result 
                // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
                // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
                double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12;
                LOG.info("Wait [{}] seconds before trying to recover again (attempt={})", loopCount, retries);
                for (int i = 0; i < loopCount; i++) {
                    if (isClosed()) {
                        LOG.info("Recovery for core {} has been closed", core.getName());
                        // check if someone closed us
                        break;
                    }
                    Thread.sleep(startingRecoveryDelayMilliSeconds);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                LOG.warn("Recovery was interrupted.", e);
                close = true;
            }
        }
    }
    // We skip core.seedVersionBuckets(); We don't have a transaction log
    LOG.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
}
Also used : ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) SolrException(org.apache.solr.common.SolrException) SolrServerException(org.apache.solr.client.solrj.SolrServerException) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) SocketTimeoutException(java.net.SocketTimeoutException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException)
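
For reference, a minimal sketch (not taken from the Solr source) of how a leader's ZkNodeProps is typically turned into a core URL, mirroring the getStr/getCoreUrl calls in the method above. The property values are hypothetical, and the usual ZkNodeProps, ZkStateReader and ZkCoreNodeProps imports are assumed:

private static String leaderCoreUrl(ZkNodeProps leaderProps) {
    // read the two well-known properties the recovery code relies on
    final String baseUrl = leaderProps.getStr(ZkStateReader.BASE_URL_PROP);
    final String coreName = leaderProps.getStr(ZkStateReader.CORE_NAME_PROP);
    // combine them the same way doReplicateOnlyRecovery does
    return ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
}

// hypothetical usage:
// ZkNodeProps props = new ZkNodeProps(ZkStateReader.BASE_URL_PROP, "http://127.0.0.1:8983/solr",
//         ZkStateReader.CORE_NAME_PROP, "collection1_shard1_replica_n1");
// leaderCoreUrl(props) returns a core URL such as "http://127.0.0.1:8983/solr/collection1_shard1_replica_n1/"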

Example 72 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerCollectionMessageHandler method migrateStateFormat.

//TODO should we not remove in the next release ?
private void migrateStateFormat(ClusterState state, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException {
    final String collectionName = message.getStr(COLLECTION_PROP);
    boolean firstLoop = true;
    // wait for a while until the state format changes
    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
    while (!timeout.hasTimedOut()) {
        DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
        if (collection == null) {
            throw new SolrException(ErrorCode.BAD_REQUEST, "Collection: " + collectionName + " not found");
        }
        if (collection.getStateFormat() == 2) {
            // Done.
            results.add("success", new SimpleOrderedMap<>());
            return;
        }
        if (firstLoop) {
            // Actually queue the migration command.
            firstLoop = false;
            ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, MIGRATESTATEFORMAT.toLower(), COLLECTION_PROP, collectionName);
            Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
        }
        Thread.sleep(100);
    }
    throw new SolrException(ErrorCode.SERVER_ERROR, "Could not migrate state format for collection: " + collectionName);
}
Also used : TimeOut(org.apache.solr.util.TimeOut) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) DocCollection(org.apache.solr.common.cloud.DocCollection) RemoteSolrException(org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException) SolrException(org.apache.solr.common.SolrException)
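
A small sketch, not quoted from Solr, of the serialization round trip behind the queue offer above: a ZkNodeProps built with the varargs key/value constructor is written with Utils.toJSON and can be read back with ZkNodeProps.load. The collection name is hypothetical and "migratestateformat" stands in for MIGRATESTATEFORMAT.toLower():

// build the message the same way migrateStateFormat does (varargs key/value constructor)
ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, "migratestateformat",
        ZkStateReader.COLLECTION_PROP, "myCollection");   // hypothetical collection name
// the byte[] that gets offered to the state update queue
byte[] json = Utils.toJSON(m);
// how a consumer can decode it again
ZkNodeProps readBack = ZkNodeProps.load(json);
assert "myCollection".equals(readBack.getStr(ZkStateReader.COLLECTION_PROP));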

Example 73 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerNodePrioritizer method prioritizeOverseerNodes.

public synchronized void prioritizeOverseerNodes(String overseerId) throws KeeperException, InterruptedException {
    SolrZkClient zk = zkStateReader.getZkClient();
    if (!zk.exists(ZkStateReader.ROLES, true))
        return;
    Map m = (Map) Utils.fromJSON(zk.getData(ZkStateReader.ROLES, null, new Stat(), true));
    List overseerDesignates = (List) m.get("overseer");
    if (overseerDesignates == null || overseerDesignates.isEmpty())
        return;
    String ldr = OverseerTaskProcessor.getLeaderNode(zk);
    if (overseerDesignates.contains(ldr))
        return;
    log.info("prioritizing overseer nodes at {} overseer designates are {}", overseerId, overseerDesignates);
    List<String> electionNodes = OverseerTaskProcessor.getSortedElectionNodes(zk, Overseer.OVERSEER_ELECT + LeaderElector.ELECTION_NODE);
    if (electionNodes.size() < 2)
        return;
    log.info("sorted nodes {}", electionNodes);
    String designateNodeId = null;
    for (String electionNode : electionNodes) {
        if (overseerDesignates.contains(LeaderElector.getNodeName(electionNode))) {
            designateNodeId = electionNode;
            break;
        }
    }
    if (designateNodeId == null) {
        log.warn("No live overseer designate ");
        return;
    }
    if (!designateNodeId.equals(electionNodes.get(1))) {
        //checking if it is already at no:1
        log.info("asking node {} to come join election at head", designateNodeId);
        //ask designate to come first
        invokeOverseerOp(designateNodeId, "rejoinAtHead");
        log.info("asking the old first in line {} to rejoin election  ", electionNodes.get(1));
        //ask second inline to go behind
        invokeOverseerOp(electionNodes.get(1), "rejoin");
    }
    //now ask the current leader to QUIT , so that the designate can takeover
    Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(new ZkNodeProps(Overseer.QUEUE_OPERATION, OverseerAction.QUIT.toLower(), ID, OverseerTaskProcessor.getLeaderId(zkStateReader.getZkClient()))));
}
Also used : Stat(org.apache.zookeeper.data.Stat) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) List(java.util.List) SolrZkClient(org.apache.solr.common.cloud.SolrZkClient) Map(java.util.Map)
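
On the consuming side, the Overseer turns such queue entries back into a ZkNodeProps. Below is an illustrative sketch, not code from Overseer, of how a QUIT-style message could be inspected once its bytes are in hand ('data' and 'log' are assumed to be a queue entry and the class's SLF4J logger):

// 'data' is a byte[] taken from the state update queue (hypothetical variable)
ZkNodeProps message = ZkNodeProps.load(data);
String operation = message.getStr(Overseer.QUEUE_OPERATION);
if ("quit".equals(operation)) {
    // "quit" is OverseerAction.QUIT.toLower(); "id" matches the ID property offered above
    log.info("overseer {} was asked to quit", message.getStr("id"));
}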

Example 74 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerCollectionMessageHandler method modifyCollection.

private void modifyCollection(ClusterState clusterState, ZkNodeProps message, NamedList results) throws KeeperException, InterruptedException {
    final String collectionName = message.getStr(ZkStateReader.COLLECTION_PROP);
    //the rest of the processing is based on writing cluster state properties
    //remove the property here to avoid any errors down the pipeline due to this property appearing
    String configName = (String) message.getProperties().remove(COLL_CONF);
    if (configName != null) {
        validateConfigOrThrowSolrException(configName);
        boolean isLegacyCloud = Overseer.isLegacy(zkStateReader);
        createConfNode(configName, collectionName, isLegacyCloud);
        reloadCollection(null, new ZkNodeProps(NAME, collectionName), results);
    }
    overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message));
    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
    boolean areChangesVisible = true;
    while (!timeout.hasTimedOut()) {
        DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
        areChangesVisible = true;
        for (Map.Entry<String, Object> updateEntry : message.getProperties().entrySet()) {
            String updateKey = updateEntry.getKey();
            if (!updateKey.equals(ZkStateReader.COLLECTION_PROP) && !updateKey.equals(Overseer.QUEUE_OPERATION) && !collection.get(updateKey).equals(updateEntry.getValue())) {
                areChangesVisible = false;
                break;
            }
        }
        if (areChangesVisible)
            break;
        Thread.sleep(100);
    }
    if (!areChangesVisible)
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not modify collection " + message);
}
Also used : TimeOut(org.apache.solr.util.TimeOut) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) DocCollection(org.apache.solr.common.cloud.DocCollection) Map(java.util.Map) Utils.makeMap(org.apache.solr.common.util.Utils.makeMap) SimpleOrderedMap(org.apache.solr.common.util.SimpleOrderedMap) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) RemoteSolrException(org.apache.solr.client.solrj.impl.HttpSolrClient.RemoteSolrException) SolrException(org.apache.solr.common.SolrException)
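
The visibility check inside the loop above can be read as a small helper. The sketch below is under the same assumptions (DocCollection extends ZkNodeProps, so collection.get(key) returns the stored property) and is not code from the handler:

private static boolean changesVisible(DocCollection collection, ZkNodeProps message) {
    for (Map.Entry<String, Object> e : message.getProperties().entrySet()) {
        String key = e.getKey();
        // skip the routing keys that are never written into the collection state
        if (key.equals(ZkStateReader.COLLECTION_PROP) || key.equals(Overseer.QUEUE_OPERATION)) {
            continue;
        }
        if (!e.getValue().equals(collection.get(key))) {
            return false;
        }
    }
    return true;
}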

Example 75 with ZkNodeProps

use of org.apache.solr.common.cloud.ZkNodeProps in project lucene-solr by apache.

the class OverseerCollectionMessageHandler method processReplicaAddPropertyCommand.

@SuppressWarnings("unchecked")
private void processReplicaAddPropertyCommand(ClusterState clusterState, ZkNodeProps message, NamedList results) throws Exception {
    checkRequired(message, COLLECTION_PROP, SHARD_ID_PROP, REPLICA_PROP, PROPERTY_PROP, PROPERTY_VALUE_PROP);
    SolrZkClient zkClient = zkStateReader.getZkClient();
    DistributedQueue inQueue = Overseer.getStateUpdateQueue(zkClient);
    Map<String, Object> propMap = new HashMap<>();
    propMap.put(Overseer.QUEUE_OPERATION, ADDREPLICAPROP.toLower());
    propMap.putAll(message.getProperties());
    ZkNodeProps m = new ZkNodeProps(propMap);
    inQueue.offer(Utils.toJSON(m));
}
Also used : HashMap(java.util.HashMap) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) SolrZkClient(org.apache.solr.common.cloud.SolrZkClient)
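
A short sketch, not from the handler itself, of two further ways a ZkNodeProps is commonly assembled: from an existing Map, as processReplicaAddPropertyCommand does above, or extended with plus(), which returns a new instance. Property names come from ZkStateReader; the values are hypothetical:

Map<String, Object> propMap = new HashMap<>();
propMap.put(Overseer.QUEUE_OPERATION, "addreplicaprop");     // ADDREPLICAPROP.toLower()
propMap.put(ZkStateReader.COLLECTION_PROP, "myCollection");  // hypothetical values
propMap.put(ZkStateReader.SHARD_ID_PROP, "shard1");
ZkNodeProps m = new ZkNodeProps(propMap);                    // Map-based constructor used above

// plus() does not mutate m; it returns a new ZkNodeProps that also carries the extra key
ZkNodeProps withReplica = m.plus(ZkStateReader.REPLICA_PROP, "core_node3");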

Aggregations

ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps): 91
SolrException (org.apache.solr.common.SolrException): 35
HashMap (java.util.HashMap): 28
Replica (org.apache.solr.common.cloud.Replica): 22
ZkStateReader (org.apache.solr.common.cloud.ZkStateReader): 20
ArrayList (java.util.ArrayList): 19
Slice (org.apache.solr.common.cloud.Slice): 19
KeeperException (org.apache.zookeeper.KeeperException): 19
ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams): 16
Test (org.junit.Test): 16
DocCollection (org.apache.solr.common.cloud.DocCollection): 15
SolrZkClient (org.apache.solr.common.cloud.SolrZkClient): 14
Map (java.util.Map): 13
ClusterState (org.apache.solr.common.cloud.ClusterState): 13
IOException (java.io.IOException): 10
ZkCoreNodeProps (org.apache.solr.common.cloud.ZkCoreNodeProps): 10
ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException): 10
NamedList (org.apache.solr.common.util.NamedList): 10
HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient): 9
SolrCore (org.apache.solr.core.SolrCore): 8