Use of org.apache.solr.update.UpdateLog.RecoveryInfo in project lucene-solr by apache.
The class RecoveryStrategy, method replay: replays the updates that were buffered in the update log during recovery; for TLOG replicas it only rolls the buffered updates over into a new tlog and returns null.
private final Future<RecoveryInfo> replay(SolrCore core) throws InterruptedException, ExecutionException {
if (testing_beforeReplayBufferingUpdates != null) {
testing_beforeReplayBufferingUpdates.run();
}
if (replicaType == Replica.Type.TLOG) {
// roll over all updates during buffering to new tlog, make RTG available
SolrQueryRequest req = new LocalSolrQueryRequest(core, new ModifiableSolrParams());
core.getUpdateHandler().getUpdateLog().copyOverBufferingUpdates(new CommitUpdateCommand(req, false));
return null;
}
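// kick off replay of the updates buffered while recovery was in progress; a null future means nothing was buffered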
Future<RecoveryInfo> future = core.getUpdateHandler().getUpdateLog().applyBufferedUpdates();
if (future == null) {
// no replay needed
LOG.info("No replay needed.");
} else {
LOG.info("Replaying buffered documents.");
// wait for replay
RecoveryInfo report = future.get();
if (report.failed) {
SolrException.log(LOG, "Replay failed");
throw new SolrException(ErrorCode.SERVER_ERROR, "Replay failed");
}
}
// solrcloud_debug
cloudDebugLog(core, "replayed");
return future;
}
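For context, a caller might consume the future returned by this method as sketched below; the RecoveryInfo counters read here (adds, deletes, errors, failed) are assumptions based on this version of UpdateLog.RecoveryInfo and are shown for illustration only.
Future<RecoveryInfo> replayFuture = replay(core);
if (replayFuture != null) {
  // replay() has already waited on the future internally, so this get() returns immediately
  RecoveryInfo info = replayFuture.get();
  // the counters summarize what was replayed from the buffered tlog entries
  LOG.info("Replay finished: adds=[{}] deletes=[{}] errors=[{}] failed=[{}]",
      info.adds, info.deletes, info.errors, info.failed);
}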
Use of org.apache.solr.update.UpdateLog.RecoveryInfo in project lucene-solr by apache.
The class RecoveryStrategy, method doSyncOrReplicateRecovery: first attempts a PeerSync from the shard leader and falls back to full index replication if that fails, retrying with a capped exponential backoff until recovery succeeds or the strategy is closed.
// TODO: perhaps make this grab a new core each time through the loop to handle core reloads?
public final void doSyncOrReplicateRecovery(SolrCore core) throws KeeperException, InterruptedException {
boolean replayed = false;
boolean successfulRecovery = false;
UpdateLog ulog;
ulog = core.getUpdateHandler().getUpdateLog();
if (ulog == null) {
SolrException.log(LOG, "No UpdateLog found - cannot recover.");
recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
return;
}
// we temporarily ignore peersync for tlog replicas
boolean firstTime = replicaType != Replica.Type.TLOG;
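// collect the versions of the most recent updates in the tlog; these drive the startup-gap check and seed PeerSync below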
List<Long> recentVersions;
try (UpdateLog.RecentUpdates recentUpdates = ulog.getRecentUpdates()) {
recentVersions = recentUpdates.getVersions(ulog.getNumRecordsToKeep());
} catch (Exception e) {
SolrException.log(LOG, "Corrupt tlog - ignoring.", e);
recentVersions = new ArrayList<>(0);
}
List<Long> startingVersions = ulog.getStartingVersions();
if (startingVersions != null && recoveringAfterStartup) {
try {
// index of the start of the old list in the current list
int oldIdx = 0;
long firstStartingVersion = startingVersions.size() > 0 ? startingVersions.get(0) : 0;
for (; oldIdx < recentVersions.size(); oldIdx++) {
if (recentVersions.get(oldIdx) == firstStartingVersion)
break;
}
if (oldIdx > 0) {
LOG.info("####### Found new versions added after startup: num=[{}]", oldIdx);
LOG.info("###### currentVersions=[{}]", recentVersions);
}
LOG.info("###### startupVersions=[{}]", startingVersions);
} catch (Exception e) {
SolrException.log(LOG, "Error getting recent versions.", e);
recentVersions = new ArrayList<>(0);
}
}
if (recoveringAfterStartup) {
// if we're recovering after startup (i.e. we have been down), then we need to know what the last versions were
// when we went down. We may have received updates since then.
recentVersions = startingVersions;
try {
if ((ulog.getStartingOperation() & UpdateLog.FLAG_GAP) != 0) {
// last operation at the time of startup had the GAP flag set...
// this means we were previously doing a full index replication
// that probably didn't complete and buffering updates in the
// meantime.
LOG.info("Looks like a previous replication recovery did not complete - skipping peer sync.");
// skip peersync
firstTime = false;
}
} catch (Exception e) {
SolrException.log(LOG, "Error trying to get ulog starting operation.", e);
// skip peersync
firstTime = false;
}
}
if (replicaType == Replica.Type.TLOG) {
zkController.stopReplicationFromLeader(coreName);
}
Future<RecoveryInfo> replayFuture = null;
while (!successfulRecovery && !Thread.currentThread().isInterrupted() && !isClosed()) {
// don't use interruption or it will close channels though
try {
CloudDescriptor cloudDesc = core.getCoreDescriptor().getCloudDescriptor();
ZkNodeProps leaderprops = zkStateReader.getLeaderRetry(cloudDesc.getCollectionName(), cloudDesc.getShardId());
final String leaderBaseUrl = leaderprops.getStr(ZkStateReader.BASE_URL_PROP);
final String leaderCoreName = leaderprops.getStr(ZkStateReader.CORE_NAME_PROP);
String leaderUrl = ZkCoreNodeProps.getCoreUrl(leaderBaseUrl, leaderCoreName);
String ourUrl = ZkCoreNodeProps.getCoreUrl(baseUrl, coreName);
boolean isLeader = leaderUrl.equals(ourUrl);
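// cluster state still names us leader while the local cloud descriptor disagrees - inconsistent, so fail this attempt and retry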
if (isLeader && !cloudDesc.isLeader()) {
throw new SolrException(ErrorCode.SERVER_ERROR, "Cloud state still says we are leader.");
}
if (cloudDesc.isLeader()) {
// we are now the leader - no one else must have been suitable
LOG.warn("We have not yet recovered - but we are now the leader!");
LOG.info("Finished recovery process.");
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
return;
}
LOG.info("Begin buffering updates. core=[{}]", coreName);
ulog.bufferUpdates();
replayed = false;
LOG.info("Publishing state of core [{}] as recovering, leader is [{}] and I am [{}]", core.getName(), leaderUrl, ourUrl);
zkController.publish(core.getCoreDescriptor(), Replica.State.RECOVERING);
final Slice slice = zkStateReader.getClusterState().getSlice(cloudDesc.getCollectionName(), cloudDesc.getShardId());
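// abort any prep-recovery request left over from a previous attempt (an NPE just means there was none)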
try {
prevSendPreRecoveryHttpUriRequest.abort();
} catch (NullPointerException e) {
// okay
}
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
break;
}
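// ask the leader to wait until it sees this replica in the RECOVERING state before we sync from it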
sendPrepRecoveryCmd(leaderBaseUrl, leaderCoreName, slice);
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
break;
}
// wait a bit so that any updates on the leader that started before it saw our recovering state
// are sure to have finished (see the upstream discussion around the current value)
try {
Thread.sleep(waitForUpdatesWithStaleStatePauseMilliSeconds);
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
// first thing we just try to sync
if (firstTime) {
// only try sync the first time through the loop
firstTime = false;
LOG.info("Attempting to PeerSync from [{}] - recoveringAfterStartup=[{}]", leaderUrl, recoveringAfterStartup);
// System.out.println("Attempting to PeerSync from " + leaderUrl
// + " i am:" + zkController.getNodeName());
PeerSync peerSync = new PeerSync(core, Collections.singletonList(leaderUrl), ulog.getNumRecordsToKeep(), false, false);
peerSync.setStartingVersions(recentVersions);
boolean syncSuccess = peerSync.sync().isSuccess();
if (syncSuccess) {
SolrQueryRequest req = new LocalSolrQueryRequest(core, new ModifiableSolrParams());
// force open a new searcher
core.getUpdateHandler().commit(new CommitUpdateCommand(req, false));
LOG.info("PeerSync stage of recovery was successful.");
// solrcloud_debug
cloudDebugLog(core, "synced");
LOG.info("Replaying updates buffered during PeerSync.");
replay(core);
replayed = true;
// sync success
successfulRecovery = true;
return;
}
LOG.info("PeerSync Recovery was not successful - trying replication.");
}
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
break;
}
LOG.info("Starting Replication Recovery.");
try {
replicate(zkController.getNodeName(), core, leaderprops);
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
break;
}
replayFuture = replay(core);
replayed = true;
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
break;
}
LOG.info("Replication Recovery was successful.");
successfulRecovery = true;
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
LOG.warn("Recovery was interrupted", e);
close = true;
} catch (Exception e) {
SolrException.log(LOG, "Error while trying to recover", e);
}
} catch (Exception e) {
SolrException.log(LOG, "Error while trying to recover. core=" + coreName, e);
} finally {
if (!replayed) {
// dropBufferedUpdates() currently only supports returning to ACTIVE state, which risks additional updates
// being added w/o UpdateLog.FLAG_GAP, hence losing the info on restart that we are not up-to-date.
// For now, ulog will simply remain in BUFFERING state, and an additional call to bufferUpdates() will
// reset our starting point for playback.
LOG.info("Replay not started, or was not successful... still buffering updates.");
/** This previous code is retained in case we want to switch strategies.
try {
ulog.dropBufferedUpdates();
} catch (Exception e) {
SolrException.log(log, "", e);
}
**/
}
if (successfulRecovery) {
LOG.info("Registering as Active after recovery.");
try {
if (replicaType == Replica.Type.TLOG) {
zkController.startReplicationFromLeader(coreName, true);
}
zkController.publish(core.getCoreDescriptor(), Replica.State.ACTIVE);
} catch (Exception e) {
LOG.error("Could not publish as ACTIVE after succesful recovery", e);
successfulRecovery = false;
}
if (successfulRecovery) {
close = true;
recoveryListener.recovered();
}
}
}
if (!successfulRecovery) {
// Or do a fall off retry...
try {
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
break;
}
LOG.error("Recovery failed - trying again... (" + retries + ")");
retries++;
if (retries >= maxRetries) {
SolrException.log(LOG, "Recovery failed - max retries exceeded (" + retries + ").");
try {
recoveryFailed(core, zkController, baseUrl, coreZkNodeName, core.getCoreDescriptor());
} catch (Exception e) {
SolrException.log(LOG, "Could not publish that recovery failed", e);
}
break;
}
} catch (Exception e) {
SolrException.log(LOG, "An error has occurred during recovery", e);
}
try {
// Wait an exponential interval between retries, start at 5 seconds and work up to a minute.
// If we're at attempt >= 4, there's no point computing pow(2, retries) because the result
// will always be the minimum of the two (12). Since we sleep at 5 seconds sub-intervals in
// order to check if we were closed, 12 is chosen as the maximum loopCount (5s * 12 = 1m).
double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12;
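// e.g. retries=0 -> 1 loop (5s), retries=2 -> 4 loops (20s), retries >= 4 -> 12 loops (60s)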
LOG.info("Wait [{}] seconds before trying to recover again (attempt={})", loopCount, retries);
for (int i = 0; i < loopCount; i++) {
if (isClosed()) {
LOG.info("RecoveryStrategy has been closed");
// check if someone closed us
break;
}
Thread.sleep(startingRecoveryDelayMilliSeconds);
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
LOG.warn("Recovery was interrupted.", e);
close = true;
}
}
}
// if replay was skipped (e.g. because a full index was pulled from the leader),
// then we still need to update version bucket seeds after recovery
if (successfulRecovery && replayFuture == null) {
LOG.info("Updating version bucket highest from index after successful recovery.");
core.seedVersionBuckets();
}
LOG.info("Finished recovery process, successful=[{}]", Boolean.toString(successfulRecovery));
}
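The retry backoff used above can be seen in isolation with the standalone sketch below; the 5-second sub-interval is an assumption taken from the in-code comment (startingRecoveryDelayMilliSeconds), and the class name is purely illustrative.
// Illustrative only: reproduces the wait schedule between recovery attempts.
public class RecoveryBackoffDemo {
  public static void main(String[] args) {
    final int subIntervalSeconds = 5; // assumed: startingRecoveryDelayMilliSeconds / 1000
    for (int retries = 0; retries <= 5; retries++) {
      // sleep min(2^retries, 12) sub-intervals; the cap keeps the total wait at one minute
      double loopCount = retries < 4 ? Math.min(Math.pow(2, retries), 12) : 12;
      System.out.printf("attempt=%d -> wait %.0f s%n", retries, loopCount * subIntervalSeconds);
    }
  }
}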