use of org.elasticsearch.index.engine.RecoveryEngineException in project crate by crate.
the class BlobRecoverySourceHandler method recoverToTarget.
/**
 * Performs the recovery from the local engine to the target.
 *
 * <p>The steps, in order:
 * <ol>
 *   <li>open a translog view on the engine's translog;</li>
 *   <li>snapshot the index and hand the snapshot plus the translog view to {@code phase1};</li>
 *   <li>snapshot the translog view and hand that snapshot to {@code phase2};</li>
 *   <li>call {@code finalizeRecovery()}.</li>
 * </ol>
 * The translog view is held open (try-with-resources) for the whole sequence.
 *
 * @return the {@code response} field of this handler
 * @throws RecoveryEngineException wrapping any {@link Throwable} raised while snapshotting the
 *         index or running phase1 (reported as phase {@code 1}), or while snapshotting the
 *         translog or running phase2 (reported as phase {@code 2})
 */
public RecoveryResponse recoverToTarget() {
    final Engine engine = shard.engine();
    assert engine.getTranslog() != null : "translog must not be null";
    try (Translog.View view = engine.getTranslog().newView()) {
        logger.trace("captured translog id [{}] for recovery", view.minTranslogGeneration());

        // Take the index snapshot that phase1 works from.
        final SnapshotIndexCommit indexCommit;
        try {
            indexCommit = shard.snapshotIndex(false);
        } catch (Throwable failure) {
            // NOTE(review): the enclosing try-with-resources also closes the view, so this
            // explicit close looks redundant - confirm before removing.
            IOUtils.closeWhileHandlingException(view);
            throw new RecoveryEngineException(shard.shardId(), 1, "Snapshot failed", failure);
        }

        // phase1; the index snapshot is released whether or not phase1 succeeds.
        try {
            phase1(indexCommit, view);
        } catch (Throwable failure) {
            throw new RecoveryEngineException(shard.shardId(), 1, "phase1 failed", failure);
        } finally {
            Releasables.closeWhileHandlingException(indexCommit);
        }

        // phase2 works from a snapshot of the translog view.
        logger.trace("snapshot translog for recovery. current size is [{}]", view.totalOperations());
        try (Translog.Snapshot translogSnapshot = view.snapshot()) {
            phase2(translogSnapshot);
        } catch (Throwable failure) {
            throw new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", failure);
        }

        finalizeRecovery();
    }
    return response;
}
use of org.elasticsearch.index.engine.RecoveryEngineException in project elasticsearch by elastic.
the class PeerRecoveryTargetService method doRecovery.
/**
 * Runs the recovery registered under {@code recoveryId}: prepares the local shard, sends the
 * start-recovery request to the source node, and then either marks the recovery as done or
 * decides, based on the failure, whether to retry or fail it.
 *
 * <p>If no ongoing recovery is found for the id the method logs at trace level and returns.
 *
 * @param recoveryId id used to look the recovery up in {@code onGoingRecoveries}
 */
private void doRecovery(final long recoveryId) {
    final StartRecoveryRequest request;
    final CancellableThreads cancellableThreads;
    final RecoveryState.Timer timer;

    // Step 1: prepare the shard while holding a reference to the recovery.
    try (RecoveryRef ref = onGoingRecoveries.getRecovery(recoveryId)) {
        if (ref == null) {
            logger.trace("not running recovery with id [{}] - can not find it (probably finished)", recoveryId);
            return;
        }
        final RecoveryTarget target = ref.target();
        cancellableThreads = target.cancellableThreads();
        timer = target.state().getTimer();
        try {
            assert target.sourceNode() != null : "can not do a recovery without a source node";
            request = getStartRecoveryRequest(target);
            logger.trace("{} preparing shard for peer recovery", target.shardId());
            target.indexShard().prepareForIndexRecovery();
        } catch (final Exception e) {
            // this will be logged as warning later on...
            logger.trace("unexpected error while preparing shard for peer recovery, failing recovery", e);
            onGoingRecoveries.failRecovery(
                recoveryId,
                new RecoveryFailedException(target.state(), "failed to prepare shard for recovery", e),
                true);
            return;
        }
    }

    // Step 2: ask the source node to start the recovery and wait for its response.
    try {
        logger.trace("{} starting recovery from {}", request.shardId(), request.sourceNode());
        final AtomicReference<RecoveryResponse> responseRef = new AtomicReference<>();
        cancellableThreads.execute(() -> responseRef.set(
            transportService.submitRequest(
                request.sourceNode(),
                PeerRecoverySourceService.Actions.START_RECOVERY,
                request,
                new FutureTransportResponseHandler<RecoveryResponse>() {
                    @Override
                    public RecoveryResponse newInstance() {
                        return new RecoveryResponse();
                    }
                }).txGet()));
        final RecoveryResponse result = responseRef.get();
        final TimeValue elapsed = new TimeValue(timer.time());
        // do this through ongoing recoveries to remove it from the collection
        onGoingRecoveries.markRecoveryAsDone(recoveryId);

        if (logger.isTraceEnabled()) {
            // Detailed multi-line summary, built only when trace logging is on.
            final StringBuilder summary = new StringBuilder();
            summary.append('[').append(request.shardId().getIndex().getName()).append(']');
            summary.append('[').append(request.shardId().id()).append("] ");
            summary.append("recovery completed from ").append(request.sourceNode());
            summary.append(", took[").append(elapsed).append("]\n");

            summary.append(" phase1: recovered_files [").append(result.phase1FileNames.size()).append("]");
            summary.append(" with " + "total_size of [").append(new ByteSizeValue(result.phase1TotalSize)).append("]");
            summary.append(", took [").append(timeValueMillis(result.phase1Time));
            summary.append("], throttling_wait [").append(timeValueMillis(result.phase1ThrottlingWaitTime)).append(']');
            summary.append("\n");

            summary.append(" : reusing_files [").append(result.phase1ExistingFileNames.size());
            summary.append("] with " + "total_size of [").append(new ByteSizeValue(result.phase1ExistingTotalSize)).append("]\n");

            summary.append(" phase2: start took [").append(timeValueMillis(result.startTime)).append("]\n");

            summary.append(" : recovered [").append(result.phase2Operations).append("]");
            summary.append(" transaction log " + "operations");
            summary.append(", took [").append(timeValueMillis(result.phase2Time)).append("]");
            summary.append("\n");
            logger.trace("{}", summary);
        } else {
            logger.debug("{} recovery done from [{}], took [{}]", request.shardId(), request.sourceNode(), elapsed);
        }
    } catch (CancellableThreads.ExecutionCancelledException e) {
        logger.trace("recovery cancelled", e);
    } catch (Exception e) {
        if (logger.isTraceEnabled()) {
            logger.trace(
                (Supplier<?>) () -> new ParameterizedMessage(
                    "[{}][{}] Got exception on recovery",
                    request.shardId().getIndex().getName(),
                    request.shardId().id()),
                e);
        }

        Throwable rootCause = ExceptionsHelper.unwrapCause(e);
        if (rootCause instanceof CancellableThreads.ExecutionCancelledException) {
            // this can also come from the source wrapped in a RemoteTransportException
            onGoingRecoveries.failRecovery(
                recoveryId,
                new RecoveryFailedException(request, "source has canceled the recovery", rootCause),
                false);
            return;
        }

        // Peel off the recovery wrapper, then unwrap once more in case of a double
        // transport exception, then peel off a second recovery wrapper if present.
        if (rootCause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            rootCause = rootCause.getCause();
        }
        rootCause = ExceptionsHelper.unwrapCause(rootCause);
        if (rootCause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            rootCause = rootCause.getCause();
        }

        final boolean remoteShardNotReady = rootCause instanceof IllegalIndexShardStateException
            || rootCause instanceof IndexNotFoundException
            || rootCause instanceof ShardNotFoundException;

        if (remoteShardNotReady) {
            // if the target is not ready yet, retry
            retryRecovery(recoveryId, "remote shard not ready", recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
        } else if (rootCause instanceof DelayRecoveryException) {
            retryRecovery(recoveryId, rootCause, recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
        } else if (rootCause instanceof ConnectTransportException) {
            logger.debug(
                "delaying recovery of {} for [{}] due to networking error [{}]",
                request.shardId(),
                recoverySettings.retryDelayNetwork(),
                rootCause.getMessage());
            retryRecovery(recoveryId, rootCause.getMessage(), recoverySettings.retryDelayNetwork(), recoverySettings.activityTimeout());
        } else if (rootCause instanceof AlreadyClosedException) {
            onGoingRecoveries.failRecovery(
                recoveryId,
                new RecoveryFailedException(request, "source shard is closed", rootCause),
                false);
        } else {
            // Anything else: fail the recovery with the original (still wrapped) exception.
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, e), true);
        }
    }
}
use of org.elasticsearch.index.engine.RecoveryEngineException in project elasticsearch by elastic.
the class RecoverySourceHandler method recoverToTarget.
/**
 * Performs the recovery from the local engine to the target.
 *
 * <p>A translog view is acquired and held for the whole operation. If the request carries an
 * assigned starting sequence number and {@code isTranslogReadyForSequenceNumberBasedRecovery}
 * accepts the view, phase1 is skipped; otherwise an index commit is acquired, passed to
 * {@code phase1} together with the view, and released afterwards. The target is then prepared
 * for translog operations, {@code phase2} is run on a snapshot of the view, and the recovery
 * is finalized.
 *
 * @return the {@code response} field of this handler
 * @throws IOException declared by the method signature
 * @throws RecoveryEngineException wrapping any {@link Exception} from acquiring the index
 *         commit, phase1 or preparing the target (phase {@code 1}), or from phase2
 *         (phase {@code 2})
 * @throws IndexShardRelocatedException if the shard state is {@code RELOCATED} after the target
 *         has been prepared for translog operations
 */
public RecoveryResponse recoverToTarget() throws IOException {
    try (Translog.View view = shard.acquireTranslogView()) {
        logger.trace("captured translog id [{}] for recovery", view.minTranslogGeneration());

        final boolean seqNoBasedRecovery =
            request.startingSeqNo() != SequenceNumbersService.UNASSIGNED_SEQ_NO
                && isTranslogReadyForSequenceNumberBasedRecovery(view);

        if (seqNoBasedRecovery == false) {
            final IndexCommit indexCommit;
            try {
                indexCommit = shard.acquireIndexCommit(false);
            } catch (final Exception e) {
                // NOTE(review): the enclosing try-with-resources also closes the view, so this
                // explicit close looks redundant - confirm before removing.
                IOUtils.closeWhileHandlingException(view);
                throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
            }

            try {
                phase1(indexCommit, view);
            } catch (final Exception e) {
                throw new RecoveryEngineException(shard.shardId(), 1, "phase1 failed", e);
            } finally {
                // Releasing the commit is best effort: a failure is logged, not propagated.
                try {
                    shard.releaseIndexCommit(indexCommit);
                } catch (final IOException ex) {
                    logger.warn("releasing snapshot caused exception", ex);
                }
            }
        } else {
            logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
        }

        try {
            final int totalTranslogOps = view.totalOperations();
            final long maxUnsafeAutoIdTimestamp = shard.segmentStats(false).getMaxUnsafeAutoIdTimestamp();
            prepareTargetForTranslog(totalTranslogOps, maxUnsafeAutoIdTimestamp);
        } catch (final Exception e) {
            throw new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e);
        }

        // engine was just started at the end of phase1
        if (shard.state() == IndexShardState.RELOCATED) {
            assert request.isPrimaryRelocation() == false : "recovery target should not retry primary relocation if previous attempt made it past finalization step";
            /*
             * The primary shard has been relocated while we copied files. We can therefore no longer guarantee that every
             * operation replicated during the file copy (while the target engine was not yet opened) is present in the local
             * translog and will be resent in phase2: an operation replicated by the target primary is sent to the recovery
             * target and to the local shard (the old primary) concurrently, so it may have reached the recovery target before
             * its engine was opened while still being in flight on the local shard.
             *
             * Checking the relocated status here, after the engine on the target has been opened, is safe because primary
             * relocation waits for all ongoing operations to complete and be fully replicated. All future operations by the
             * new primary are therefore guaranteed to reach the target shard while its engine is open.
             */
            throw new IndexShardRelocatedException(request.shardId());
        }

        logger.trace("snapshot translog for recovery; current size is [{}]", view.totalOperations());
        try {
            // Resolved inside the try so that any failure here is still reported as a phase2 failure.
            final long startingSeqNo = seqNoBasedRecovery
                ? request.startingSeqNo()
                : SequenceNumbersService.UNASSIGNED_SEQ_NO;
            phase2(startingSeqNo, view.snapshot());
        } catch (Exception e) {
            throw new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", e);
        }

        finalizeRecovery();
    }
    return response;
}
use of org.elasticsearch.index.engine.RecoveryEngineException in project elasticsearch by elastic.
the class ExceptionSerializationTests method testRecoveryEngineException.
/**
 * Passes {@link RecoveryEngineException} through {@code serialize} and checks that the shard
 * id, phase, message and cause are as expected afterwards, both with a shard id and with a
 * {@code null} shard id.
 */
public void testRecoveryEngineException() throws IOException {
    final ShardId shardId = new ShardId("foo", "_na_", 1);

    // Case 1: shard id present, positive phase; the message is prefixed with the phase.
    final RecoveryEngineException withShard =
        serialize(new RecoveryEngineException(shardId, 10, "total failure", new NullPointerException()));
    assertEquals(shardId, withShard.getShardId());
    assertEquals("Phase[10] total failure", withShard.getMessage());
    assertEquals(10, withShard.phase());

    // Case 2: null shard id, negative phase; the cause is still a NullPointerException.
    final RecoveryEngineException withoutShard =
        serialize(new RecoveryEngineException(null, -1, "total failure", new NullPointerException()));
    assertNull(withoutShard.getShardId());
    assertEquals(-1, withoutShard.phase());
    assertTrue(withoutShard.getCause() instanceof NullPointerException);
}
Aggregations