Search in sources :

Example 1 with RecoveryEngineException

use of org.elasticsearch.index.engine.RecoveryEngineException in project crate by crate.

the class BlobRecoverySourceHandler method recoverToTarget.

/**
 * Performs the recovery from the local engine to the target.
 * <p>
 * A translog view is held open for the whole operation. Phase 1 runs against an index snapshot obtained
 * from {@code shard.snapshotIndex(false)}, phase 2 runs against a snapshot of the translog view, and the
 * recovery is then finalized.
 *
 * @return {@code response}
 * @throws RecoveryEngineException if taking the index snapshot or phase 1 fails (reported as phase {@code 1}),
 *         or if phase 2 fails (reported as phase {@code 2}); the original failure is kept as the cause
 */
public RecoveryResponse recoverToTarget() {
    final Engine engine = shard.engine();
    assert engine.getTranslog() != null : "translog must not be null";
    try (Translog.View translogView = engine.getTranslog().newView()) {
        logger.trace("captured translog id [{}] for recovery", translogView.minTranslogGeneration());
        final SnapshotIndexCommit phase1Snapshot;
        try {
            phase1Snapshot = shard.snapshotIndex(false);
        } catch (Throwable e) {
            // The enclosing try-with-resources closes translogView when this exception propagates,
            // so it must not be closed explicitly here (that would close the view twice).
            throw new RecoveryEngineException(shard.shardId(), 1, "Snapshot failed", e);
        }
        try {
            phase1(phase1Snapshot, translogView);
        } catch (Throwable e) {
            throw new RecoveryEngineException(shard.shardId(), 1, "phase1 failed", e);
        } finally {
            // release the index snapshot whether or not phase1 succeeded
            Releasables.closeWhileHandlingException(phase1Snapshot);
        }
        logger.trace("snapshot translog for recovery. current size is [{}]", translogView.totalOperations());
        try (Translog.Snapshot phase2Snapshot = translogView.snapshot()) {
            phase2(phase2Snapshot);
        } catch (Throwable e) {
            throw new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", e);
        }
        finalizeRecovery();
    }
    return response;
}
Also used : RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) Engine(org.elasticsearch.index.engine.Engine) SnapshotIndexCommit(org.elasticsearch.index.deletionpolicy.SnapshotIndexCommit) Translog(org.elasticsearch.index.translog.Translog)

Example 2 with RecoveryEngineException

use of org.elasticsearch.index.engine.RecoveryEngineException in project elasticsearch by elastic.

the class PeerRecoveryTargetService method doRecovery.

/**
 * Runs the peer recovery registered under {@code recoveryId}.
 * <p>
 * The method first looks the recovery up in {@code onGoingRecoveries} and prepares the local shard, then sends
 * a start-recovery request to the source node and waits for the response. On success the recovery is marked as
 * done; on failure the (unwrapped) cause decides whether the recovery is retried after a delay or failed.
 *
 * @param recoveryId id used to look up, retry, fail or complete the recovery in {@code onGoingRecoveries}
 */
private void doRecovery(final long recoveryId) {
    final StartRecoveryRequest request;
    final CancellableThreads cancellableThreads;
    final RecoveryState.Timer timer;
    // The recovery reference is only held for the preparation step; the try-with-resources closes it
    // before the request is sent to the source node below.
    try (RecoveryRef recoveryRef = onGoingRecoveries.getRecovery(recoveryId)) {
        if (recoveryRef == null) {
            logger.trace("not running recovery with id [{}] - can not find it (probably finished)", recoveryId);
            return;
        }
        final RecoveryTarget recoveryTarget = recoveryRef.target();
        cancellableThreads = recoveryTarget.cancellableThreads();
        timer = recoveryTarget.state().getTimer();
        try {
            assert recoveryTarget.sourceNode() != null : "can not do a recovery without a source node";
            request = getStartRecoveryRequest(recoveryTarget);
            logger.trace("{} preparing shard for peer recovery", recoveryTarget.shardId());
            recoveryTarget.indexShard().prepareForIndexRecovery();
        } catch (final Exception e) {
            // this will be logged as warning later on...
            logger.trace("unexpected error while preparing shard for peer recovery, failing recovery", e);
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(recoveryTarget.state(), "failed to prepare shard for recovery", e), true);
            return;
        }
    }
    try {
        logger.trace("{} starting recovery from {}", request.shardId(), request.sourceNode());
        // The response is handed out of the lambda through a holder because the request runs inside
        // cancellableThreads.execute(...); presumably txGet() blocks until the source node answers - confirm.
        final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>();
        cancellableThreads.execute(() -> responseHolder.set(transportService.submitRequest(request.sourceNode(), PeerRecoverySourceService.Actions.START_RECOVERY, request, new FutureTransportResponseHandler<RecoveryResponse>() {

            @Override
            public RecoveryResponse newInstance() {
                return new RecoveryResponse();
            }
        }).txGet()));
        final RecoveryResponse recoveryResponse = responseHolder.get();
        final TimeValue recoveryTime = new TimeValue(timer.time());
        // do this through ongoing recoveries to remove it from the collection
        onGoingRecoveries.markRecoveryAsDone(recoveryId);
        // At trace level a multi-line per-phase summary is logged; otherwise a single debug line.
        if (logger.isTraceEnabled()) {
            StringBuilder sb = new StringBuilder();
            sb.append('[').append(request.shardId().getIndex().getName()).append(']').append('[').append(request.shardId().id()).append("] ");
            sb.append("recovery completed from ").append(request.sourceNode()).append(", took[").append(recoveryTime).append("]\n");
            sb.append("   phase1: recovered_files [").append(recoveryResponse.phase1FileNames.size()).append("]").append(" with " + "total_size of [").append(new ByteSizeValue(recoveryResponse.phase1TotalSize)).append("]").append(", took [").append(timeValueMillis(recoveryResponse.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)).append(']').append("\n");
            sb.append("         : reusing_files   [").append(recoveryResponse.phase1ExistingFileNames.size()).append("] with " + "total_size of [").append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)).append("]\n");
            sb.append("   phase2: start took [").append(timeValueMillis(recoveryResponse.startTime)).append("]\n");
            sb.append("         : recovered [").append(recoveryResponse.phase2Operations).append("]").append(" transaction log " + "operations").append(", took [").append(timeValueMillis(recoveryResponse.phase2Time)).append("]").append("\n");
            logger.trace("{}", sb);
        } else {
            logger.debug("{} recovery done from [{}], took [{}]", request.shardId(), request.sourceNode(), recoveryTime);
        }
    } catch (CancellableThreads.ExecutionCancelledException e) {
        // local cancellation: only logged, the recovery is neither failed nor retried here
        logger.trace("recovery cancelled", e);
    } catch (Exception e) {
        if (logger.isTraceEnabled()) {
            logger.trace((Supplier<?>) () -> new ParameterizedMessage("[{}][{}] Got exception on recovery", request.shardId().getIndex().getName(), request.shardId().id()), e);
        }
        // The checks below are order-sensitive: the cause is unwrapped step by step and the first
        // matching category decides between retrying and failing the recovery.
        Throwable cause = ExceptionsHelper.unwrapCause(e);
        if (cause instanceof CancellableThreads.ExecutionCancelledException) {
            // this can also come from the source wrapped in a RemoteTransportException
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source has canceled the recovery", cause), false);
            return;
        }
        if (cause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            cause = cause.getCause();
        }
        // do it twice, in case we have double transport exception
        cause = ExceptionsHelper.unwrapCause(cause);
        if (cause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            cause = cause.getCause();
        }
        if (cause instanceof IllegalIndexShardStateException || cause instanceof IndexNotFoundException || cause instanceof ShardNotFoundException) {
            // if the target is not ready yet, retry
            retryRecovery(recoveryId, "remote shard not ready", recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof DelayRecoveryException) {
            retryRecovery(recoveryId, cause, recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof ConnectTransportException) {
            // networking errors use the network retry delay instead of the state-sync delay
            logger.debug("delaying recovery of {} for [{}] due to networking error [{}]", request.shardId(), recoverySettings.retryDelayNetwork(), cause.getMessage());
            retryRecovery(recoveryId, cause.getMessage(), recoverySettings.retryDelayNetwork(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof AlreadyClosedException) {
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source shard is closed", cause), false);
            return;
        }
        // Fallback: fail the recovery with the original exception e (not the unwrapped cause).
        // NOTE(review): the trailing boolean is true here and for the prepare failure above, false for the
        // cancelled / closed-source paths - presumably it controls whether the failure is reported further;
        // confirm against failRecovery.
        onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, e), true);
    }
}
Also used : ByteSizeValue(org.elasticsearch.common.unit.ByteSizeValue) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) RecoveryRef(org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryRef) Supplier(org.apache.logging.log4j.util.Supplier) TimeValue(org.elasticsearch.common.unit.TimeValue) FutureTransportResponseHandler(org.elasticsearch.transport.FutureTransportResponseHandler) CancellableThreads(org.elasticsearch.common.util.CancellableThreads) AtomicReference(java.util.concurrent.atomic.AtomicReference) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) ElasticsearchException(org.elasticsearch.ElasticsearchException) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) NodeClosedException(org.elasticsearch.node.NodeClosedException) ShardNotFoundException(org.elasticsearch.index.shard.ShardNotFoundException) ElasticsearchTimeoutException(org.elasticsearch.ElasticsearchTimeoutException) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) IndexNotFoundException(org.elasticsearch.index.IndexNotFoundException) IOException(java.io.IOException) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) MapperException(org.elasticsearch.index.mapper.MapperException) ShardNotFoundException(org.elasticsearch.index.shard.ShardNotFoundException) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) IndexNotFoundException(org.elasticsearch.index.IndexNotFoundException) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage)

Example 3 with RecoveryEngineException

use of org.elasticsearch.index.engine.RecoveryEngineException in project elasticsearch by elastic.

the class RecoverySourceHandler method recoverToTarget.

/**
 * Performs the recovery from the local engine to the target.
 * <p>
 * A translog view is held open for the whole operation. If the request carries a starting sequence number and
 * {@code isTranslogReadyForSequenceNumberBasedRecovery} accepts the view, the file-copy step (phase 1) is
 * skipped; otherwise phase 1 runs against an index commit acquired from the shard, which is released again
 * afterwards. In both cases the target is then prepared for translog operations, phase 2 replays a snapshot of
 * the translog view, and the recovery is finalized.
 *
 * @return {@code response}
 * @throws RecoveryEngineException if acquiring the index commit, phase 1 or preparing the target fails
 *         (reported as phase {@code 1}), or if phase 2 fails (reported as phase {@code 2})
 * @throws IndexShardRelocatedException if the shard state is {@code RELOCATED} after the target was prepared
 * @throws IOException declared by the signature; presumably from acquiring or closing the translog view - confirm
 */
public RecoveryResponse recoverToTarget() throws IOException {
    try (Translog.View translogView = shard.acquireTranslogView()) {
        logger.trace("captured translog id [{}] for recovery", translogView.minTranslogGeneration());
        boolean isSequenceNumberBasedRecoveryPossible = request.startingSeqNo() != SequenceNumbersService.UNASSIGNED_SEQ_NO && isTranslogReadyForSequenceNumberBasedRecovery(translogView);
        if (isSequenceNumberBasedRecoveryPossible) {
            logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
        } else {
            final IndexCommit phase1Snapshot;
            try {
                phase1Snapshot = shard.acquireIndexCommit(false);
            } catch (final Exception e) {
                // The enclosing try-with-resources closes translogView when this exception propagates,
                // so it must not be closed explicitly here (that would close the view twice).
                throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
            }
            try {
                phase1(phase1Snapshot, translogView);
            } catch (final Exception e) {
                throw new RecoveryEngineException(shard.shardId(), 1, "phase1 failed", e);
            } finally {
                // best effort: a failure to release the commit is logged and does not mask a phase1 failure
                try {
                    shard.releaseIndexCommit(phase1Snapshot);
                } catch (final IOException ex) {
                    logger.warn("releasing snapshot caused exception", ex);
                }
            }
        }
        try {
            prepareTargetForTranslog(translogView.totalOperations(), shard.segmentStats(false).getMaxUnsafeAutoIdTimestamp());
        } catch (final Exception e) {
            throw new RecoveryEngineException(shard.shardId(), 1, "prepare target for translog failed", e);
        }
        // engine was just started at the end of phase1
        if (shard.state() == IndexShardState.RELOCATED) {
            assert request.isPrimaryRelocation() == false : "recovery target should not retry primary relocation if previous attempt made it past finalization step";
            /*
             * The primary shard has been relocated while we copied files. This means that we can't guarantee any more that all
             * operations that were replicated during the file copy (when the target engine was not yet opened) will be present in the
             * local translog and thus will be resent on phase2. The reason is that an operation replicated by the target primary is
             * sent to the recovery target and the local shard (old primary) concurrently, meaning it may have arrived at the recovery
             * target before we opened the engine and is still in-flight on the local shard.
             *
             * Checking the relocated status here, after we opened the engine on the target, is safe because primary relocation waits
             * for all ongoing operations to complete and be fully replicated. Therefore all future operation by the new primary are
             * guaranteed to reach the target shard when its engine is open.
             */
            throw new IndexShardRelocatedException(request.shardId());
        }
        logger.trace("snapshot translog for recovery; current size is [{}]", translogView.totalOperations());
        try {
            // without sequence-number based recovery, phase2 is started with UNASSIGNED_SEQ_NO
            phase2(isSequenceNumberBasedRecoveryPossible ? request.startingSeqNo() : SequenceNumbersService.UNASSIGNED_SEQ_NO, translogView.snapshot());
        } catch (Exception e) {
            throw new RecoveryEngineException(shard.shardId(), 2, "phase2 failed", e);
        }
        finalizeRecovery();
    }
    return response;
}
Also used : IndexShardRelocatedException(org.elasticsearch.index.shard.IndexShardRelocatedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) IOException(java.io.IOException) IndexCommit(org.apache.lucene.index.IndexCommit) IndexShardClosedException(org.elasticsearch.index.shard.IndexShardClosedException) IndexFormatTooNewException(org.apache.lucene.index.IndexFormatTooNewException) IndexShardRelocatedException(org.elasticsearch.index.shard.IndexShardRelocatedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) IOException(java.io.IOException) IndexFormatTooOldException(org.apache.lucene.index.IndexFormatTooOldException) RemoteTransportException(org.elasticsearch.transport.RemoteTransportException) Translog(org.elasticsearch.index.translog.Translog)

Example 4 with RecoveryEngineException

use of org.elasticsearch.index.engine.RecoveryEngineException in project elasticsearch by elastic.

the class ExceptionSerializationTests method testRecoveryEngineException.

/**
 * Round-trips {@code RecoveryEngineException} through {@code serialize} in two scenarios: with a concrete
 * shard id (shard id, message and phase are checked) and with a {@code null} shard id (shard id, phase and
 * cause type are checked).
 */
public void testRecoveryEngineException() throws IOException {
    // scenario 1: concrete shard id
    final ShardId shardId = new ShardId("foo", "_na_", 1);
    final RecoveryEngineException withShard = new RecoveryEngineException(shardId, 10, "total failure", new NullPointerException());
    final RecoveryEngineException withShardCopy = serialize(withShard);
    assertEquals(shardId, withShardCopy.getShardId());
    assertEquals("Phase[10] total failure", withShardCopy.getMessage());
    assertEquals(10, withShardCopy.phase());

    // scenario 2: no shard id and a negative phase
    final RecoveryEngineException withoutShard = new RecoveryEngineException(null, -1, "total failure", new NullPointerException());
    final RecoveryEngineException withoutShardCopy = serialize(withoutShard);
    assertNull(withoutShardCopy.getShardId());
    assertEquals(-1, withoutShardCopy.phase());
    assertTrue(withoutShardCopy.getCause() instanceof NullPointerException);
}
Also used : ShardId(org.elasticsearch.index.shard.ShardId) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException)

Aggregations

RecoveryEngineException (org.elasticsearch.index.engine.RecoveryEngineException): 4; IOException (java.io.IOException): 2; Translog (org.elasticsearch.index.translog.Translog): 2; AtomicReference (java.util.concurrent.atomic.AtomicReference): 1; ParameterizedMessage (org.apache.logging.log4j.message.ParameterizedMessage): 1; Supplier (org.apache.logging.log4j.util.Supplier): 1; CorruptIndexException (org.apache.lucene.index.CorruptIndexException): 1; IndexCommit (org.apache.lucene.index.IndexCommit): 1; IndexFormatTooNewException (org.apache.lucene.index.IndexFormatTooNewException): 1; IndexFormatTooOldException (org.apache.lucene.index.IndexFormatTooOldException): 1; AlreadyClosedException (org.apache.lucene.store.AlreadyClosedException): 1; ElasticsearchException (org.elasticsearch.ElasticsearchException): 1; ElasticsearchTimeoutException (org.elasticsearch.ElasticsearchTimeoutException): 1; ByteSizeValue (org.elasticsearch.common.unit.ByteSizeValue): 1; TimeValue (org.elasticsearch.common.unit.TimeValue): 1; CancellableThreads (org.elasticsearch.common.util.CancellableThreads): 1; IndexNotFoundException (org.elasticsearch.index.IndexNotFoundException): 1; SnapshotIndexCommit (org.elasticsearch.index.deletionpolicy.SnapshotIndexCommit): 1; Engine (org.elasticsearch.index.engine.Engine): 1; MapperException (org.elasticsearch.index.mapper.MapperException): 1