Search in sources :

Example 1 with PeerSync

use of org.apache.solr.update.PeerSync in project lucene-solr by apache.

the class RealTimeGetComponent method processSync.

/**
 * Handles a "sync" request by running a {@link PeerSync} against the given replicas and
 * recording the outcome in the response under the key {@code "sync"}.
 *
 * <p>When the request parameter {@code onlyIfActive} is true and the last state published by
 * this core is not {@code ACTIVE}, no sync is attempted and the string {@code "false"} is added
 * to the response. Otherwise the boolean result of the sync is added.
 *
 * @param rb        response builder giving access to the request, its core and the response
 * @param nVersions number of versions handed to {@link PeerSync}
 * @param sync      comma-separated list of replicas to sync with
 */
public void processSync(ResponseBuilder rb, int nVersions, String sync) {
    final boolean requireActive = rb.req.getParams().getBool("onlyIfActive", false);
    // Guard clause: refuse to sync when the caller demands an ACTIVE core and we are not one.
    if (requireActive
            && rb.req.getCore().getCoreDescriptor().getCloudDescriptor().getLastPublished() != Replica.State.ACTIVE) {
        log.info("Last published state was not ACTIVE, cannot sync.");
        // Note: this branch reports the String "false", not a Boolean.
        rb.rsp.add("sync", "false");
        return;
    }

    final List<String> peerUrls = StrUtils.splitSmart(sync, ",", true);
    final boolean unreachableCountsAsSuccess = rb.req.getParams().getBool("cantReachIsSuccess", false);
    final PeerSync syncer = new PeerSync(rb.req.getCore(), peerUrls, nVersions, unreachableCountsAsSuccess, true);

    // TODO: more complex response?
    final boolean synced = syncer.sync().isSuccess();
    rb.rsp.add("sync", synced);
}
Also used : PeerSync(org.apache.solr.update.PeerSync)

Example 2 with PeerSync

use of org.apache.solr.update.PeerSync in project lucene-solr by apache.

the class RecoveryStrategy method doSyncOrReplicateRecovery.

// TODO: perhaps make this grab a new core each time through the loop to handle core reloads?
/**
 * Attempts to bring {@code core} back in sync with its shard leader, looping until recovery
 * succeeds, the current thread is interrupted, this strategy is closed, or {@code retries}
 * reaches {@code maxRetries}.
 *
 * <p>Each pass through the loop looks up the leader, starts buffering updates, publishes the
 * core as RECOVERING, sends the prep-recovery command, pauses, and then:
 * <ol>
 *   <li>on the first eligible pass only, tries a {@link PeerSync} against the leader and, if it
 *       succeeds, commits, replays buffered updates and returns;</li>
 *   <li>otherwise calls {@code replicate(...)} followed by {@code replay(core)}.</li>
 * </ol>
 * On success the {@code finally} block publishes the core as ACTIVE and notifies
 * {@code recoveryListener}.
 *
 * <p>PeerSync is skipped for TLOG replicas, and when the update log's starting operation has
 * {@code UpdateLog.FLAG_GAP} set or cannot be read.
 *
 * <p>If the core has no update log, recovery is reported as failed and the method returns
 * immediately.
 *
 * @param core the core being recovered; its update log and descriptors are read throughout
 * @throws KeeperException declared by the signature; exceptions raised inside the retry loop are
 *         caught and logged there, so this presumably can only escape from calls made outside
 *         that try block (e.g. the early {@code recoveryFailed} call) - TODO confirm
 * @throws InterruptedException declared by the signature; interrupts seen while sleeping or
 *         replicating are caught inside the loop and converted into re-interrupt + close
 */
public final void doSyncOrReplicateRecovery(SolrCore core) throws KeeperException, InterruptedException {
    boolean replayed = false;
    boolean successfulRecovery = false;
    UpdateLog ulog;
    ulog = core.getUpdateHandler().getUpdateLog();
    if (ulog == null) {
        // Without an update log there is nothing to sync or replay against: report failure and stop.
        SolrException.log(LOG, "No UpdateLog found - cannot recover.");
        recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
        return;
    }
    // we temporarily ignore peersync for tlog replicas
    // (firstTime doubles as the "PeerSync still allowed" flag; it is cleared after the first attempt)
    boolean firstTime = replicaType != Replica.Type.TLOG;
    List<Long> recentVersions;
    try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
        recentVersions = recentUpdates.getVersions(ulog.getNumRecordsToKeep());
    } catch (Exception e) {
        // An unreadable tlog is treated as "no recent versions" rather than a fatal error.
        SolrException.log(LOG, "Corrupt tlog - ignoring.", e);
        recentVersions = new ArrayList<>(0);
    }
    List<Long> startingVersions = ulog.getStartingVersions();
    if (startingVersions != null && recoveringAfterStartup) {
        // Diagnostic only: work out how many versions were added since startup and log them.
        // Nothing computed in this block other than the catch-path reset feeds later logic.
        try {
            // index of the start of the old list in the current list
            int oldIdx = 0;
            long firstStartingVersion = startingVersions.size() > 0 ? startingVersions.get(0) : 0;
            for (; oldIdx < recentVersions.size(); oldIdx++) {
                // Long element is unboxed for the comparison against the primitive long
                if (recentVersions.get(oldIdx) == firstStartingVersion)
                    break;
            }
            if (oldIdx > 0) {
                LOG.info("####### Found new versions added after startup: num=[{}]", oldIdx);
                LOG.info("###### currentVersions=[{}]", recentVersions);
            }
            LOG.info("###### startupVersions=[{}]", startingVersions);
        } catch (Exception e) {
            SolrException.log(LOG, "Error getting recent versions.", e);
            recentVersions = new ArrayList<>(0);
        }
    }
    if (recoveringAfterStartup) {
        // if we're recovering after startup (i.e. we have been down), then we need to know what the last versions were
        // when we went down.  We may have received updates since then.
        // NOTE(review): startingVersions is only null-checked in the logging block above, so
        // recentVersions may become null here and is later handed to PeerSync.setStartingVersions.
        recentVersions = startingVersions;
        try {
            if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) {
                // last operation at the time of startup had the GAP flag set...
                // this means we were previously doing a full index replication
                // that probably didn't complete and buffering updates in the
                // meantime.
                LOG.info("Looks like a previous replication recovery did not complete - skipping peer sync.");
                // skip peersync
                firstTime = false;
            }
        } catch (Exception e) {
            SolrException.log(LOG, "Error trying to get ulog starting operation.", e);
            // skip peersync
            firstTime = false;
        }
    }
    if (replicaType == Replica.Type.TLOG) {
        // Replication from the leader is stopped for the duration of recovery; it is restarted
        // in the finally block below once recovery has succeeded.
        zkController.stopReplicationFromLeader(coreName);
    }
    // Only assigned on the replication path (result of replay(core)); checked after the loop.
    Future<RecoveryInfo> replayFuture = null;
    // Retry loop: one iteration per recovery attempt.
    while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) {
        // don't use interruption or it will close channels though
        try {
            CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
            ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId());
            final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
            final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);
            String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName);
            String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
            // We are the leader according to the leader lookup when the two URLs match.
            boolean isLeader = leaderUrl.equals(ourUrl);
            if (isLeader && !cloudDesc.isLeader()) {
                // The leader lookup and our own cloud descriptor disagree; throwing lands in the
                // outer catch below, which logs and lets the loop retry.
                throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
            }
            if (cloudDesc.isLeader()) {
                // we are now the leader - no one else must have been suitable
                LOG.warn("We have not yet recovered - but we are now the leader!");
                LOG.info("Finished recovery process.");
                zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
                return;
            }
            LOG.info("Begin buffering updates. core=[{}]", coreName);
            ulog.bufferUpdates();
            // Reset per attempt: the finally block uses this to report whether replay was started.
            replayed = false;
            LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl, ourUrl);
            zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
            final Slice slice = zkStateReader.getClusterState().getSlice(cloudDesc.getCollectionName(), cloudDesc.getShardId());
            // Abort any prep-recovery request left over from an earlier attempt.
            try {
                prevSendPreRecoveryHttpUriRequest.abort();
            } catch (NullPointerException e) {
            // okay - presumably the field is null when there is no earlier request to abort
            }
            if (isClosed()) {
                LOG.info("RecoveryStrategy has been closed");
                break;
            }
            sendPrepRecoveryCmd(leaderBaseUrl, leaderCoreName, slice);
            if (isClosed()) {
                LOG.info("RecoveryStrategy has been closed");
                break;
            }
            // Pause before syncing. NOTE(review): the original comment here is truncated (it ends
            // "...discussion around current value)"); judging by the field name the pause presumably
            // lets updates sent by the leader while it still had stale state finish - confirm.
            try {
                Thread.sleep(waitForUpdatesWithStaleStatePauseMilliSeconds);
            } catch (InterruptedException e) {
                // Restore the interrupt flag; the loop condition will observe it.
                Thread.currentThread().interrupt();
            }
            // first thing we just try to sync
            if (firstTime) {
                // only try sync the first time through the loop
                firstTime = false;
                LOG.info("Attempting to PeerSync from [{}] - recoveringAfterStartup=[{}]", leaderUrl, recoveringAfterStartup);
                // System.out.println("Attempting to PeerSync from " + leaderUrl
                // + " i am:" + zkController.getNodeName());
                PeerSync peerSync = new PeerSync(core, Collections.singletonList(leaderUrl), ulog.getNumRecordsToKeep(), false, false);
                peerSync.setStartingVersions(recentVersions);
                boolean syncSuccess = peerSync.sync().isSuccess();
                if (syncSuccess) {
                    SolrQueryRequest req = new LocalSolrQueryRequest(core, new ModifiableSolrParams());
                    // force open a new searcher
                    core.getUpdateHandler().commit(new CommitUpdateCommand(req, false));
                    LOG.info("PeerSync stage of recovery was successful.");
                    // solrcloud_debug
                    cloudDebugLog(core, "synced");
                    LOG.info("Replaying updates buffered during PeerSync.");
                    replay(core);
                    replayed = true;
                    // sync success
                    successfulRecovery = true;
                    // The finally block still runs before this return takes effect (publishing
                    // ACTIVE); the code after the while loop is not reached on this path.
                    return;
                }
                LOG.info("PeerSync Recovery was not successful - trying replication.");
            }
            if (isClosed()) {
                LOG.info("RecoveryStrategy has been closed");
                break;
            }
            // Fallback path: replication from the leader followed by replay of buffered updates.
            LOG.info("Starting Replication Recovery.");
            try {
                replicate(zkController.getNodeName(), core, leaderprops);
                if (isClosed()) {
                    LOG.info("RecoveryStrategy has been closed");
                    break;
                }
                replayFuture = replay(core);
                replayed = true;
                if (isClosed()) {
                    LOG.info("RecoveryStrategy has been closed");
                    break;
                }
                LOG.info("Replication Recovery was successful.");
                successfulRecovery = true;
            } catch (InterruptedException e) {
                // Re-assert the interrupt and mark this strategy closed so the loop stops.
                Thread.currentThread().interrupt();
                LOG.warn("Recovery was interrupted", e);
                close = true;
            } catch (Exception e) {
                // Logged only; successfulRecovery stays false so the retry logic below kicks in.
                SolrException.log(LOG, "Error while trying to recover", e);
            }
        } catch (Exception e) {
            SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e);
        } finally {
            // Runs on every exit from the attempt, including the break and return statements above.
            if (!replayed) {
                // dropBufferedUpdate()s currently only supports returning to ACTIVE state, which risks additional updates
                // being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date.
                // For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will
                // reset our starting point for playback.
                LOG.info("Replay not started, or was not successful... still buffering updates.");
            /** this prev code is retained in case we want to switch strategies.
          try {
            ulog.dropBufferedUpdates();
          } catch (Exception e) {
            SolrException.log(log, "", e);
          }
          **/
            }
            if (successfulRecovery) {
                LOG.info("Registering as Active after recovery.");
                try {
                    if (replicaType == Replica.Type.TLOG) {
                        zkController.startReplicationFromLeader(coreName, true);
                    }
                    zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
                } catch (Exception e) {
                    // Failing to publish ACTIVE downgrades the attempt to a failure so it is retried.
                    LOG.error("Could not publish as ACTIVE after succesful recovery", e);
                    successfulRecovery = false;
                }
                if (successfulRecovery) {
                    close = true;
                    recoveryListener.recovered();
                }
            }
        }
        if (!successfulRecovery) {
            // Or do a fall off retry...
            try {
                if (isClosed()) {
                    LOG.info("RecoveryStrategy has been closed");
                    break;
                }
                LOG.error("Recovery failed - trying again... (" + retries + ")");
                retries++;
                if (retries >= maxRetries) {
                    // Give up: report the failure and leave the loop.
                    SolrException.log(LOG, "Recovery failed - max retries exceeded (" + retries + ").");
                    try {
                        recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
                    } catch (Exception e) {
                        SolrException.log(LOG, "Could not publish that recovery failed", e);
                    }
                    break;
                }
            } catch (Exception e) {
                SolrException.log(LOG, "An error has occurred during recovery", e);
            }
            try {
                // Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
                // If we're at attempt >= 4, there's no point computing pow(2, retries) because the result 
                // will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
                // order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
                // NOTE(review): the value logged below as "seconds" is the number of sleep intervals
                // (each startingRecoveryDelayMilliSeconds long), not a duration in seconds.
                double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12;
                LOG.info("Wait [{}] seconds before trying to recover again (attempt={})", loopCount, retries);
                for (int i = 0; i < loopCount; i++) {
                    if (isClosed()) {
                        LOG.info("RecoveryStrategy has been closed");
                        // check if someone closed us
                        break;
                    }
                    Thread.sleep(startingRecoveryDelayMilliSeconds);
                }
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                LOG.warn("Recovery was interrupted.", e);
                close = true;
            }
        }
    }
    // NOTE(review): the original comment here is truncated ("then we still need to update version
    // bucket seeds after recovery"). As coded: when recovery succeeded but replayFuture is null
    // (presumably replay(core) had nothing to replay - confirm), the version buckets are re-seeded.
    if (successfulRecovery && replayFuture == null) {
        LOG.info("Updating version bucket highest from index after successful recovery.");
        core.seedVersionBuckets();
    }
    LOG.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
}
Also used : ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) PeerSync(org.apache.solr.update.PeerSync) ArrayList(java.util.ArrayList) RecoveryInfo(org.apache.solr.update.UpdateLog.RecoveryInfo) CommitUpdateCommand(org.apache.solr.update.CommitUpdateCommand) SolrServerException(org.apache.solr.client.solrj.SolrServerException) SolrException(org.apache.solr.common.SolrException) ZooKeeperException(org.apache.solr.common.cloud.ZooKeeperException) SocketTimeoutException(java.net.SocketTimeoutException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) ModifiableSolrParams(org.apache.solr.common.params.ModifiableSolrParams) LocalSolrQueryRequest(org.apache.solr.request.LocalSolrQueryRequest) SolrQueryRequest(org.apache.solr.request.SolrQueryRequest) LocalSolrQueryRequest(org.apache.solr.request.LocalSolrQueryRequest) Slice(org.apache.solr.common.cloud.Slice) UpdateLog(org.apache.solr.update.UpdateLog) SolrException(org.apache.solr.common.SolrException)

Example 3 with PeerSync

use of org.apache.solr.update.PeerSync in project lucene-solr by apache.

the class SyncStrategy method syncWithReplicas.

/**
 * Runs a {@link PeerSync} from {@code core} against every other replica of the given shard.
 *
 * <p>The replica list is obtained from the ZooKeeper state reader, excluding this core's own
 * core node name. When that lookup returns {@code null} there is nobody to sync with and a
 * successful result is returned immediately.
 *
 * @param zkController           controller used to reach the ZooKeeper state reader
 * @param core                   the local core that performs the sync
 * @param props                  not read by this method
 * @param collection             collection the shard belongs to
 * @param shardId                shard whose replicas are synced with
 * @param peerSyncOnlyWithActive passed through to the {@link PeerSync} constructor
 * @return the result of the sync, or a success result when there are no other replicas
 */
private PeerSync.PeerSyncResult syncWithReplicas(ZkController zkController, SolrCore core, ZkNodeProps props, String collection, String shardId, boolean peerSyncOnlyWithActive) {
    final List<ZkCoreNodeProps> otherReplicas = zkController.getZkStateReader()
            .getReplicaProps(collection, shardId, core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());

    // I have no replicas
    if (otherReplicas == null) {
        return PeerSync.PeerSyncResult.success();
    }

    // Collect the core URL of each replica we should sync with.
    final List<String> replicaUrls = new ArrayList<>(otherReplicas.size());
    for (ZkCoreNodeProps replica : otherReplicas) {
        replicaUrls.add(replica.getCoreUrl());
    }

    // if we can't reach a replica for sync, we still consider the overall sync a success
    // TODO: as an assurance, we should still try and tell the sync nodes that we couldn't reach
    // to recover once more?
    // Fingerprinting here is off because we currently rely on having at least one of the nodes
    // return "true", and if replicas are out-of-sync we still need to pick one as leader.
    // A followup sync from the replica to the new leader (with fingerprinting on) should then
    // fail and initiate recovery-by-replication.
    final int versionsToKeep = core.getUpdateHandler().getUpdateLog().getNumRecordsToKeep();
    return new PeerSync(core, replicaUrls, versionsToKeep, true, true, peerSyncOnlyWithActive, false).sync();
}
Also used : ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) PeerSync(org.apache.solr.update.PeerSync) ArrayList(java.util.ArrayList)

Aggregations

PeerSync (org.apache.solr.update.PeerSync)3 ArrayList (java.util.ArrayList)2 IOException (java.io.IOException)1 SocketTimeoutException (java.net.SocketTimeoutException)1 ExecutionException (java.util.concurrent.ExecutionException)1 SolrServerException (org.apache.solr.client.solrj.SolrServerException)1 SolrException (org.apache.solr.common.SolrException)1 Slice (org.apache.solr.common.cloud.Slice)1 ZkCoreNodeProps (org.apache.solr.common.cloud.ZkCoreNodeProps)1 ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps)1 ZooKeeperException (org.apache.solr.common.cloud.ZooKeeperException)1 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)1 LocalSolrQueryRequest (org.apache.solr.request.LocalSolrQueryRequest)1 SolrQueryRequest (org.apache.solr.request.SolrQueryRequest)1 CommitUpdateCommand (org.apache.solr.update.CommitUpdateCommand)1 UpdateLog (org.apache.solr.update.UpdateLog)1 RecoveryInfo (org.apache.solr.update.UpdateLog.RecoveryInfo)1 KeeperException (org.apache.zookeeper.KeeperException)1