Search in sources :

Example 1 with RecoveryRef

use of org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryRef in project elasticsearch by elastic.

the class PeerRecoveryTargetService method doRecovery.

private void doRecovery(final long recoveryId) {
    final StartRecoveryRequest request;
    final CancellableThreads cancellableThreads;
    final RecoveryState.Timer timer;
    try (RecoveryRef recoveryRef = onGoingRecoveries.getRecovery(recoveryId)) {
        if (recoveryRef == null) {
            logger.trace("not running recovery with id [{}] - can not find it (probably finished)", recoveryId);
            return;
        }
        final RecoveryTarget recoveryTarget = recoveryRef.target();
        cancellableThreads = recoveryTarget.cancellableThreads();
        timer = recoveryTarget.state().getTimer();
        try {
            assert recoveryTarget.sourceNode() != null : "can not do a recovery without a source node";
            request = getStartRecoveryRequest(recoveryTarget);
            logger.trace("{} preparing shard for peer recovery", recoveryTarget.shardId());
            recoveryTarget.indexShard().prepareForIndexRecovery();
        } catch (final Exception e) {
            // this will be logged as warning later on...
            logger.trace("unexpected error while preparing shard for peer recovery, failing recovery", e);
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(recoveryTarget.state(), "failed to prepare shard for recovery", e), true);
            return;
        }
    }
    try {
        logger.trace("{} starting recovery from {}", request.shardId(), request.sourceNode());
        final AtomicReference<RecoveryResponse> responseHolder = new AtomicReference<>();
        cancellableThreads.execute(() -> responseHolder.set(transportService.submitRequest(request.sourceNode(), PeerRecoverySourceService.Actions.START_RECOVERY, request, new FutureTransportResponseHandler<RecoveryResponse>() {

            @Override
            public RecoveryResponse newInstance() {
                return new RecoveryResponse();
            }
        }).txGet()));
        final RecoveryResponse recoveryResponse = responseHolder.get();
        final TimeValue recoveryTime = new TimeValue(timer.time());
        // do this through ongoing recoveries to remove it from the collection
        onGoingRecoveries.markRecoveryAsDone(recoveryId);
        if (logger.isTraceEnabled()) {
            StringBuilder sb = new StringBuilder();
            sb.append('[').append(request.shardId().getIndex().getName()).append(']').append('[').append(request.shardId().id()).append("] ");
            sb.append("recovery completed from ").append(request.sourceNode()).append(", took[").append(recoveryTime).append("]\n");
            sb.append("   phase1: recovered_files [").append(recoveryResponse.phase1FileNames.size()).append("]").append(" with " + "total_size of [").append(new ByteSizeValue(recoveryResponse.phase1TotalSize)).append("]").append(", took [").append(timeValueMillis(recoveryResponse.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)).append(']').append("\n");
            sb.append("         : reusing_files   [").append(recoveryResponse.phase1ExistingFileNames.size()).append("] with " + "total_size of [").append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)).append("]\n");
            sb.append("   phase2: start took [").append(timeValueMillis(recoveryResponse.startTime)).append("]\n");
            sb.append("         : recovered [").append(recoveryResponse.phase2Operations).append("]").append(" transaction log " + "operations").append(", took [").append(timeValueMillis(recoveryResponse.phase2Time)).append("]").append("\n");
            logger.trace("{}", sb);
        } else {
            logger.debug("{} recovery done from [{}], took [{}]", request.shardId(), request.sourceNode(), recoveryTime);
        }
    } catch (CancellableThreads.ExecutionCancelledException e) {
        logger.trace("recovery cancelled", e);
    } catch (Exception e) {
        if (logger.isTraceEnabled()) {
            logger.trace((Supplier<?>) () -> new ParameterizedMessage("[{}][{}] Got exception on recovery", request.shardId().getIndex().getName(), request.shardId().id()), e);
        }
        Throwable cause = ExceptionsHelper.unwrapCause(e);
        if (cause instanceof CancellableThreads.ExecutionCancelledException) {
            // this can also come from the source wrapped in a RemoteTransportException
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source has canceled the recovery", cause), false);
            return;
        }
        if (cause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            cause = cause.getCause();
        }
        // do it twice, in case we have double transport exception
        cause = ExceptionsHelper.unwrapCause(cause);
        if (cause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            cause = cause.getCause();
        }
        if (cause instanceof IllegalIndexShardStateException || cause instanceof IndexNotFoundException || cause instanceof ShardNotFoundException) {
            // if the target is not ready yet, retry
            retryRecovery(recoveryId, "remote shard not ready", recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof DelayRecoveryException) {
            retryRecovery(recoveryId, cause, recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof ConnectTransportException) {
            logger.debug("delaying recovery of {} for [{}] due to networking error [{}]", request.shardId(), recoverySettings.retryDelayNetwork(), cause.getMessage());
            retryRecovery(recoveryId, cause.getMessage(), recoverySettings.retryDelayNetwork(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof AlreadyClosedException) {
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source shard is closed", cause), false);
            return;
        }
        onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, e), true);
    }
}
Also used : ByteSizeValue(org.elasticsearch.common.unit.ByteSizeValue) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) RecoveryRef(org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryRef) Supplier(org.apache.logging.log4j.util.Supplier) TimeValue(org.elasticsearch.common.unit.TimeValue) FutureTransportResponseHandler(org.elasticsearch.transport.FutureTransportResponseHandler) CancellableThreads(org.elasticsearch.common.util.CancellableThreads) AtomicReference(java.util.concurrent.atomic.AtomicReference) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) ElasticsearchException(org.elasticsearch.ElasticsearchException) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) NodeClosedException(org.elasticsearch.node.NodeClosedException) ShardNotFoundException(org.elasticsearch.index.shard.ShardNotFoundException) ElasticsearchTimeoutException(org.elasticsearch.ElasticsearchTimeoutException) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) IndexNotFoundException(org.elasticsearch.index.IndexNotFoundException) IOException(java.io.IOException) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) MapperException(org.elasticsearch.index.mapper.MapperException) ShardNotFoundException(org.elasticsearch.index.shard.ShardNotFoundException) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) IndexNotFoundException(org.elasticsearch.index.IndexNotFoundException) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage)

Example 2 with RecoveryRef

use of org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryRef in project crate by crate.

the class PeerRecoveryTargetService method doRecovery.

private void doRecovery(final long recoveryId) {
    final StartRecoveryRequest request;
    final RecoveryState.Timer timer;
    CancellableThreads cancellableThreads;
    try (RecoveryRef recoveryRef = onGoingRecoveries.getRecovery(recoveryId)) {
        if (recoveryRef == null) {
            LOGGER.trace("not running recovery with id [{}] - can not find it (probably finished)", recoveryId);
            return;
        }
        final RecoveryTarget recoveryTarget = recoveryRef.target();
        timer = recoveryTarget.state().getTimer();
        cancellableThreads = recoveryTarget.cancellableThreads();
        try {
            assert recoveryTarget.sourceNode() != null : "can not do a recovery without a source node";
            LOGGER.trace("{} preparing shard for peer recovery", recoveryTarget.shardId());
            recoveryTarget.indexShard().prepareForIndexRecovery();
            final long startingSeqNo = recoveryTarget.indexShard().recoverLocallyUpToGlobalCheckpoint();
            assert startingSeqNo == UNASSIGNED_SEQ_NO || recoveryTarget.state().getStage() == RecoveryState.Stage.TRANSLOG : "unexpected recovery stage [" + recoveryTarget.state().getStage() + "] starting seqno [ " + startingSeqNo + "]";
            request = getStartRecoveryRequest(LOGGER, clusterService.localNode(), recoveryTarget, startingSeqNo);
        } catch (final Exception e) {
            // this will be logged as warning later on...
            LOGGER.trace("unexpected error while preparing shard for peer recovery, failing recovery", e);
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(recoveryTarget.state(), "failed to prepare shard for recovery", e), true);
            return;
        }
    }
    Consumer<Exception> handleException = e -> {
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace(() -> new ParameterizedMessage("[{}][{}] Got exception on recovery", request.shardId().getIndex().getName(), request.shardId().id()), e);
        }
        Throwable cause = SQLExceptions.unwrap(e);
        if (cause instanceof CancellableThreads.ExecutionCancelledException) {
            // this can also come from the source wrapped in a RemoteTransportException
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source has canceled the recovery", cause), false);
            return;
        }
        if (cause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            cause = cause.getCause();
        }
        // do it twice, in case we have double transport exception
        cause = SQLExceptions.unwrap(cause);
        if (cause instanceof RecoveryEngineException) {
            // unwrap an exception that was thrown as part of the recovery
            cause = cause.getCause();
        }
        if (cause instanceof IllegalIndexShardStateException || cause instanceof IndexNotFoundException || cause instanceof ShardNotFoundException) {
            // if the target is not ready yet, retry
            retryRecovery(recoveryId, "remote shard not ready", recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof DelayRecoveryException) {
            retryRecovery(recoveryId, cause, recoverySettings.retryDelayStateSync(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof ConnectTransportException) {
            LOGGER.debug("delaying recovery of {} for [{}] due to networking error [{}]", request.shardId(), recoverySettings.retryDelayNetwork(), cause.getMessage());
            retryRecovery(recoveryId, cause.getMessage(), recoverySettings.retryDelayNetwork(), recoverySettings.activityTimeout());
            return;
        }
        if (cause instanceof AlreadyClosedException) {
            onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, "source shard is closed", cause), false);
            return;
        }
        onGoingRecoveries.failRecovery(recoveryId, new RecoveryFailedException(request, e), true);
    };
    try {
        LOGGER.trace("{} starting recovery from {}", request.shardId(), request.sourceNode());
        cancellableThreads.executeIO(() -> transportService.sendRequest(request.sourceNode(), PeerRecoverySourceService.Actions.START_RECOVERY, request, new TransportResponseHandler<RecoveryResponse>() {

            @Override
            public void handleResponse(RecoveryResponse recoveryResponse) {
                final TimeValue recoveryTime = new TimeValue(timer.time());
                // do this through ongoing recoveries to remove it from the collection
                onGoingRecoveries.markRecoveryAsDone(recoveryId);
                if (LOGGER.isTraceEnabled()) {
                    StringBuilder sb = new StringBuilder();
                    sb.append('[').append(request.shardId().getIndex().getName()).append(']').append('[').append(request.shardId().id()).append("] ");
                    sb.append("recovery completed from ").append(request.sourceNode()).append(", took[").append(recoveryTime).append("]\n");
                    sb.append("   phase1: recovered_files [").append(recoveryResponse.phase1FileNames.size()).append("]").append(" with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1TotalSize)).append("]").append(", took [").append(timeValueMillis(recoveryResponse.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)).append(']').append("\n");
                    sb.append("         : reusing_files   [").append(recoveryResponse.phase1ExistingFileNames.size()).append("] with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)).append("]\n");
                    sb.append("   phase2: start took [").append(timeValueMillis(recoveryResponse.startTime)).append("]\n");
                    sb.append("         : recovered [").append(recoveryResponse.phase2Operations).append("]").append(" transaction log operations").append(", took [").append(timeValueMillis(recoveryResponse.phase2Time)).append("]").append("\n");
                    LOGGER.trace("{}", sb);
                } else {
                    LOGGER.debug("{} recovery done from [{}], took [{}]", request.shardId(), request.sourceNode(), recoveryTime);
                }
            }

            @Override
            public void handleException(TransportException e) {
                handleException.accept(e);
            }

            @Override
            public String executor() {
                // we do some heavy work like refreshes in the response so fork off to the generic threadpool
                return ThreadPool.Names.GENERIC;
            }

            @Override
            public RecoveryResponse read(StreamInput in) throws IOException {
                return new RecoveryResponse(in);
            }
        }));
    } catch (CancellableThreads.ExecutionCancelledException e) {
        LOGGER.trace("recovery cancelled", e);
    } catch (Exception e) {
        handleException.accept(e);
    }
}
Also used : ElasticsearchException(org.elasticsearch.ElasticsearchException) CancellableThreads(org.elasticsearch.common.util.CancellableThreads) ShardId(org.elasticsearch.index.shard.ShardId) TransportChannel(org.elasticsearch.transport.TransportChannel) IndexMetadata(org.elasticsearch.cluster.metadata.IndexMetadata) ClusterService(org.elasticsearch.cluster.service.ClusterService) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) ShardNotFoundException(org.elasticsearch.index.shard.ShardNotFoundException) TranslogCorruptedException(org.elasticsearch.index.translog.TranslogCorruptedException) ElasticsearchTimeoutException(org.elasticsearch.ElasticsearchTimeoutException) ClusterState(org.elasticsearch.cluster.ClusterState) DiscoveryNode(org.elasticsearch.cluster.node.DiscoveryNode) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) Settings(org.elasticsearch.common.settings.Settings) IndexNotFoundException(org.elasticsearch.index.IndexNotFoundException) Store(org.elasticsearch.index.store.Store) ChannelActionListener(org.elasticsearch.action.support.ChannelActionListener) ThreadPool(org.elasticsearch.threadpool.ThreadPool) TransportResponse(org.elasticsearch.transport.TransportResponse) TransportService(org.elasticsearch.transport.TransportService) ClusterStateObserver(org.elasticsearch.cluster.ClusterStateObserver) Nullable(javax.annotation.Nullable) ByteSizeValue(org.elasticsearch.common.unit.ByteSizeValue) RecoveryRef(org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryRef) IndexEventListener(org.elasticsearch.index.shard.IndexEventListener) IndexShard(org.elasticsearch.index.shard.IndexShard) UNASSIGNED_SEQ_NO(org.elasticsearch.index.seqno.SequenceNumbers.UNASSIGNED_SEQ_NO) IOException(java.io.IOException) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) TransportRequestHandler(org.elasticsearch.transport.TransportRequestHandler) Consumer(java.util.function.Consumer) AtomicLong(java.util.concurrent.atomic.AtomicLong) AbstractRunnable(org.elasticsearch.common.util.concurrent.AbstractRunnable) Logger(org.apache.logging.log4j.Logger) TimeValue.timeValueMillis(io.crate.common.unit.TimeValue.timeValueMillis) StreamInput(org.elasticsearch.common.io.stream.StreamInput) TimeValue(io.crate.common.unit.TimeValue) Translog(org.elasticsearch.index.translog.Translog) TransportResponseHandler(org.elasticsearch.transport.TransportResponseHandler) SQLExceptions(io.crate.exceptions.SQLExceptions) LogManager(org.apache.logging.log4j.LogManager) TransportException(org.elasticsearch.transport.TransportException) RateLimiter(org.apache.lucene.store.RateLimiter) ActionListener(org.elasticsearch.action.ActionListener) MapperException(org.elasticsearch.index.mapper.MapperException) ByteSizeValue(org.elasticsearch.common.unit.ByteSizeValue) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) RecoveryRef(org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryRef) TimeValue(io.crate.common.unit.TimeValue) CancellableThreads(org.elasticsearch.common.util.CancellableThreads) TransportResponseHandler(org.elasticsearch.transport.TransportResponseHandler) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) TransportException(org.elasticsearch.transport.TransportException) ElasticsearchException(org.elasticsearch.ElasticsearchException) AlreadyClosedException(org.apache.lucene.store.AlreadyClosedException) RecoveryEngineException(org.elasticsearch.index.engine.RecoveryEngineException) ShardNotFoundException(org.elasticsearch.index.shard.ShardNotFoundException) TranslogCorruptedException(org.elasticsearch.index.translog.TranslogCorruptedException) ElasticsearchTimeoutException(org.elasticsearch.ElasticsearchTimeoutException) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) IndexNotFoundException(org.elasticsearch.index.IndexNotFoundException) IOException(java.io.IOException) IllegalIndexShardStateException(org.elasticsearch.index.shard.IllegalIndexShardStateException) TransportException(org.elasticsearch.transport.TransportException) MapperException(org.elasticsearch.index.mapper.MapperException) ShardNotFoundException(org.elasticsearch.index.shard.ShardNotFoundException) ConnectTransportException(org.elasticsearch.transport.ConnectTransportException) StreamInput(org.elasticsearch.common.io.stream.StreamInput) IndexNotFoundException(org.elasticsearch.index.IndexNotFoundException) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage)

Aggregations

IOException (java.io.IOException)2 ParameterizedMessage (org.apache.logging.log4j.message.ParameterizedMessage)2 AlreadyClosedException (org.apache.lucene.store.AlreadyClosedException)2 ElasticsearchException (org.elasticsearch.ElasticsearchException)2 ElasticsearchTimeoutException (org.elasticsearch.ElasticsearchTimeoutException)2 ByteSizeValue (org.elasticsearch.common.unit.ByteSizeValue)2 CancellableThreads (org.elasticsearch.common.util.CancellableThreads)2 IndexNotFoundException (org.elasticsearch.index.IndexNotFoundException)2 RecoveryEngineException (org.elasticsearch.index.engine.RecoveryEngineException)2 MapperException (org.elasticsearch.index.mapper.MapperException)2 IllegalIndexShardStateException (org.elasticsearch.index.shard.IllegalIndexShardStateException)2 ShardNotFoundException (org.elasticsearch.index.shard.ShardNotFoundException)2 RecoveryRef (org.elasticsearch.indices.recovery.RecoveriesCollection.RecoveryRef)2 TimeValue (io.crate.common.unit.TimeValue)1 TimeValue.timeValueMillis (io.crate.common.unit.TimeValue.timeValueMillis)1 SQLExceptions (io.crate.exceptions.SQLExceptions)1 AtomicLong (java.util.concurrent.atomic.AtomicLong)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 Consumer (java.util.function.Consumer)1 Nullable (javax.annotation.Nullable)1