Search in sources :

Example 1 with RateLimiter

use of org.apache.lucene.store.RateLimiter in project elasticsearch by elastic.

the class RemoteRecoveryTargetHandler method writeFileChunk.

@Override
public void writeFileChunk(StoreFileMetaData fileMetaData, long position, BytesReference content, boolean lastChunk, int totalTranslogOps) throws IOException {
    // Pause using the rate limiter, if desired, to throttle the recovery
    final long throttleTimeInNanos;
    // always fetch the ratelimiter - it might be updated in real-time on the recovery settings
    final RateLimiter rl = recoverySettings.rateLimiter();
    if (rl != null) {
        long bytes = bytesSinceLastPause.addAndGet(content.length());
        if (bytes > rl.getMinPauseCheckBytes()) {
            // Time to pause
            bytesSinceLastPause.addAndGet(-bytes);
            try {
                throttleTimeInNanos = rl.pause(bytes);
                onSourceThrottle.accept(throttleTimeInNanos);
            } catch (IOException e) {
                throw new ElasticsearchException("failed to pause recovery", e);
            }
        } else {
            throttleTimeInNanos = 0;
        }
    } else {
        throttleTimeInNanos = 0;
    }
    transportService.submitRequest(targetNode, PeerRecoveryTargetService.Actions.FILE_CHUNK, new RecoveryFileChunkRequest(recoveryId, shardId, fileMetaData, position, content, lastChunk, totalTranslogOps, /* we send totalOperations with every request since we collect stats on the target and that way we can
                                 * see how many translog ops we accumulate while copying files across the network. A future optimization
                                 * would be in to restart file copy again (new deltas) if we have too many translog ops are piling up.
                                 */
    throttleTimeInNanos), fileChunkRequestOptions, EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
}
Also used : IOException(java.io.IOException) ElasticsearchException(org.elasticsearch.ElasticsearchException) RateLimiter(org.apache.lucene.store.RateLimiter)

Example 2 with RateLimiter

use of org.apache.lucene.store.RateLimiter in project crate by crate.

the class BlobRecoverySourceHandler method phase1.

/**
     * Perform phase1 of the recovery operations. Once this {@link SnapshotIndexCommit}
     * snapshot has been performed no commit operations (files being fsync'd)
     * are effectively allowed on this index until all recovery phases are done
     * <p/>
     * Phase1 examines the segment files on the target node and copies over the
     * segments that are missing. Only segments that have the same size and
     * checksum can be reused
     */
public void phase1(final SnapshotIndexCommit snapshot, final Translog.View translogView) {
    cancellableThreads.checkForCancel();
    // Total size of segment files that are recovered
    long totalSize = 0;
    // Total size of segment files that were able to be re-used
    long existingTotalSize = 0;
    final Store store = shard.store();
    store.incRef();
    try {
        // CRATE CHANGE
        if (blobRecoveryHandler != null) {
            blobRecoveryHandler.phase1();
        }
        StopWatch stopWatch = new StopWatch().start();
        final Store.MetadataSnapshot recoverySourceMetadata;
        try {
            recoverySourceMetadata = store.getMetadata(snapshot);
        } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
            shard.engine().failEngine("recovery", ex);
            throw ex;
        }
        for (String name : snapshot.getFiles()) {
            final StoreFileMetaData md = recoverySourceMetadata.get(name);
            if (md == null) {
                logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap());
                throw new CorruptIndexException("Snapshot differs from actual index - maybe index was removed metadata has " + recoverySourceMetadata.asMap().size() + " files", name);
            }
        }
        // Generate a "diff" of all the identical, different, and missing
        // segment files on the target node, using the existing files on
        // the source node
        String recoverySourceSyncId = recoverySourceMetadata.getSyncId();
        String recoveryTargetSyncId = request.metadataSnapshot().getSyncId();
        final boolean recoverWithSyncId = recoverySourceSyncId != null && recoverySourceSyncId.equals(recoveryTargetSyncId);
        if (recoverWithSyncId) {
            final long numDocsTarget = request.metadataSnapshot().getNumDocs();
            final long numDocsSource = recoverySourceMetadata.getNumDocs();
            if (numDocsTarget != numDocsSource) {
                throw new IllegalStateException("try to recover " + request.shardId() + " from primary shard with sync id but number of docs differ: " + numDocsTarget + " (" + request.sourceNode().getName() + ", primary) vs " + numDocsSource + "(" + request.targetNode().getName() + ")");
            }
            // we shortcut recovery here because we have nothing to copy. but we must still start the engine on the target.
            // so we don't return here
            logger.trace("[{}][{}] skipping [phase1] to {} - identical sync id [{}] found on both source and target", indexName, shardId, request.targetNode(), recoverySourceSyncId);
        } else {
            final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
            for (StoreFileMetaData md : diff.identical) {
                response.phase1ExistingFileNames.add(md.name());
                response.phase1ExistingFileSizes.add(md.length());
                existingTotalSize += md.length();
                if (logger.isTraceEnabled()) {
                    logger.trace("[{}][{}] recovery [phase1] to {}: not recovering [{}], exists in local store and has checksum [{}], size [{}]", indexName, shardId, request.targetNode(), md.name(), md.checksum(), md.length());
                }
                totalSize += md.length();
            }
            for (StoreFileMetaData md : Iterables.concat(diff.different, diff.missing)) {
                if (request.metadataSnapshot().asMap().containsKey(md.name())) {
                    logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], exists in local store, but is different: remote [{}], local [{}]", indexName, shardId, request.targetNode(), md.name(), request.metadataSnapshot().asMap().get(md.name()), md);
                } else {
                    logger.trace("[{}][{}] recovery [phase1] to {}: recovering [{}], does not exists in remote", indexName, shardId, request.targetNode(), md.name());
                }
                response.phase1FileNames.add(md.name());
                response.phase1FileSizes.add(md.length());
                totalSize += md.length();
            }
            response.phase1TotalSize = totalSize;
            response.phase1ExistingTotalSize = existingTotalSize;
            logger.trace("[{}][{}] recovery [phase1] to {}: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]", indexName, shardId, request.targetNode(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), response.phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSize));
            cancellableThreads.execute(new Interruptable() {

                @Override
                public void run() throws InterruptedException {
                    RecoveryFilesInfoRequest recoveryInfoFilesRequest = new RecoveryFilesInfoRequest(request.recoveryId(), request.shardId(), response.phase1FileNames, response.phase1FileSizes, response.phase1ExistingFileNames, response.phase1ExistingFileSizes, translogView.totalOperations());
                    transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILES_INFO, recoveryInfoFilesRequest, TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(), EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
                }
            });
            // This latch will be used to wait until all files have been transferred to the target node
            final CountDownLatch latch = new CountDownLatch(response.phase1FileNames.size());
            final CopyOnWriteArrayList<Throwable> exceptions = new CopyOnWriteArrayList<>();
            final AtomicReference<Throwable> corruptedEngine = new AtomicReference<>();
            int fileIndex = 0;
            ThreadPoolExecutor pool;
            // How many bytes we've copied since we last called RateLimiter.pause
            final AtomicLong bytesSinceLastPause = new AtomicLong();
            for (final String name : response.phase1FileNames) {
                long fileSize = response.phase1FileSizes.get(fileIndex);
                // separately.
                if (fileSize > RecoverySettings.SMALL_FILE_CUTOFF_BYTES) {
                    pool = recoverySettings.concurrentStreamPool();
                } else {
                    pool = recoverySettings.concurrentSmallFileStreamPool();
                }
                pool.execute(new AbstractRunnable() {

                    @Override
                    public void onFailure(Throwable t) {
                        // we either got rejected or the store can't be incremented / we are canceled
                        logger.debug("Failed to transfer file [" + name + "] on recovery");
                    }

                    @Override
                    public void onAfter() {
                        // Signify this file has completed by decrementing the latch
                        latch.countDown();
                    }

                    @Override
                    protected void doRun() {
                        cancellableThreads.checkForCancel();
                        store.incRef();
                        final StoreFileMetaData md = recoverySourceMetadata.get(name);
                        try (final IndexInput indexInput = store.directory().openInput(name, IOContext.READONCE)) {
                            // at least one!
                            final int BUFFER_SIZE = (int) Math.max(1, recoverySettings.fileChunkSize().getBytes());
                            final byte[] buf = new byte[BUFFER_SIZE];
                            boolean shouldCompressRequest = recoverySettings.compress();
                            if (CompressorFactory.isCompressed(indexInput)) {
                                shouldCompressRequest = false;
                            }
                            final long len = indexInput.length();
                            long readCount = 0;
                            final TransportRequestOptions requestOptions = TransportRequestOptions.builder().withCompress(shouldCompressRequest).withType(TransportRequestOptions.Type.RECOVERY).withTimeout(recoverySettings.internalActionTimeout()).build();
                            while (readCount < len) {
                                if (shard.state() == IndexShardState.CLOSED) {
                                    // check if the shard got closed on us
                                    throw new IndexShardClosedException(shard.shardId());
                                }
                                int toRead = readCount + BUFFER_SIZE > len ? (int) (len - readCount) : BUFFER_SIZE;
                                final long position = indexInput.getFilePointer();
                                // Pause using the rate limiter, if desired, to throttle the recovery
                                RateLimiter rl = recoverySettings.rateLimiter();
                                long throttleTimeInNanos = 0;
                                if (rl != null) {
                                    long bytes = bytesSinceLastPause.addAndGet(toRead);
                                    if (bytes > rl.getMinPauseCheckBytes()) {
                                        // Time to pause
                                        bytesSinceLastPause.addAndGet(-bytes);
                                        throttleTimeInNanos = rl.pause(bytes);
                                        shard.recoveryStats().addThrottleTime(throttleTimeInNanos);
                                    }
                                }
                                indexInput.readBytes(buf, 0, toRead, false);
                                final BytesArray content = new BytesArray(buf, 0, toRead);
                                readCount += toRead;
                                final boolean lastChunk = readCount == len;
                                final RecoveryFileChunkRequest fileChunkRequest = new RecoveryFileChunkRequest(request.recoveryId(), request.shardId(), md, position, content, lastChunk, translogView.totalOperations(), throttleTimeInNanos);
                                cancellableThreads.execute(new Interruptable() {

                                    @Override
                                    public void run() throws InterruptedException {
                                        // Actually send the file chunk to the target node, waiting for it to complete
                                        transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.FILE_CHUNK, fileChunkRequest, requestOptions, EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
                                    }
                                });
                            }
                        } catch (Throwable e) {
                            final Throwable corruptIndexException;
                            if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(e)) != null) {
                                if (store.checkIntegrityNoException(md) == false) {
                                    // we are corrupted on the primary -- fail!
                                    logger.warn("{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
                                    if (corruptedEngine.compareAndSet(null, corruptIndexException) == false) {
                                        // if we are not the first exception, add ourselves as suppressed to the main one:
                                        corruptedEngine.get().addSuppressed(e);
                                    }
                                } else {
                                    // corruption has happened on the way to replica
                                    RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but checksums are ok", null);
                                    exception.addSuppressed(e);
                                    // last exception first
                                    exceptions.add(0, exception);
                                    logger.warn("{} Remote file corruption on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode(), md);
                                }
                            } else {
                                // last exceptions first
                                exceptions.add(0, e);
                            }
                        } finally {
                            store.decRef();
                        }
                    }
                });
                fileIndex++;
            }
            cancellableThreads.execute(new Interruptable() {

                @Override
                public void run() throws InterruptedException {
                    // Wait for all files that need to be transferred to finish transferring
                    latch.await();
                }
            });
            if (corruptedEngine.get() != null) {
                shard.engine().failEngine("recovery", corruptedEngine.get());
                throw corruptedEngine.get();
            } else {
                ExceptionsHelper.rethrowAndSuppress(exceptions);
            }
            cancellableThreads.execute(new Interruptable() {

                @Override
                public void run() throws InterruptedException {
                    // are deleted
                    try {
                        transportService.submitRequest(request.targetNode(), RecoveryTarget.Actions.CLEAN_FILES, new RecoveryCleanFilesRequest(request.recoveryId(), shard.shardId(), recoverySourceMetadata, translogView.totalOperations()), TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionTimeout()).build(), EmptyTransportResponseHandler.INSTANCE_SAME).txGet();
                    } catch (RemoteTransportException remoteException) {
                        final IOException corruptIndexException;
                        //   - maybe due to old segments without checksums or length only checks
                        if ((corruptIndexException = ExceptionsHelper.unwrapCorruption(remoteException)) != null) {
                            try {
                                final Store.MetadataSnapshot recoverySourceMetadata = store.getMetadata(snapshot);
                                StoreFileMetaData[] metadata = Iterables.toArray(recoverySourceMetadata, StoreFileMetaData.class);
                                ArrayUtil.timSort(metadata, new Comparator<StoreFileMetaData>() {

                                    @Override
                                    public int compare(StoreFileMetaData o1, StoreFileMetaData o2) {
                                        // check small files first
                                        return Long.compare(o1.length(), o2.length());
                                    }
                                });
                                for (StoreFileMetaData md : metadata) {
                                    logger.debug("{} checking integrity for file {} after remove corruption exception", shard.shardId(), md);
                                    if (store.checkIntegrityNoException(md) == false) {
                                        // we are corrupted on the primary -- fail!
                                        shard.engine().failEngine("recovery", corruptIndexException);
                                        logger.warn("{} Corrupted file detected {} checksum mismatch", shard.shardId(), md);
                                        throw corruptIndexException;
                                    }
                                }
                            } catch (IOException ex) {
                                remoteException.addSuppressed(ex);
                                throw remoteException;
                            }
                            // corruption has happened on the way to replica
                            RemoteTransportException exception = new RemoteTransportException("File corruption occurred on recovery but checksums are ok", null);
                            exception.addSuppressed(remoteException);
                            logger.warn("{} Remote file corruption during finalization on node {}, recovering {}. local checksum OK", corruptIndexException, shard.shardId(), request.targetNode());
                            throw exception;
                        } else {
                            throw remoteException;
                        }
                    }
                }
            });
        }
        prepareTargetForTranslog(translogView);
        logger.trace("[{}][{}] recovery [phase1] to {}: took [{}]", indexName, shardId, request.targetNode(), stopWatch.totalTime());
        response.phase1Time = stopWatch.totalTime().millis();
    } catch (Throwable e) {
        throw new RecoverFilesRecoveryException(request.shardId(), response.phase1FileNames.size(), new ByteSizeValue(totalSize), e);
    } finally {
        store.decRef();
    }
}
Also used : AbstractRunnable(org.elasticsearch.common.util.concurrent.AbstractRunnable) ByteSizeValue(org.elasticsearch.common.unit.ByteSizeValue) Store(org.elasticsearch.index.store.Store) IndexFormatTooOldException(org.apache.lucene.index.IndexFormatTooOldException) StoreFileMetaData(org.elasticsearch.index.store.StoreFileMetaData) IndexInput(org.apache.lucene.store.IndexInput) TransportRequestOptions(org.elasticsearch.transport.TransportRequestOptions) RemoteTransportException(org.elasticsearch.transport.RemoteTransportException) BytesArray(org.elasticsearch.common.bytes.BytesArray) CorruptIndexException(org.apache.lucene.index.CorruptIndexException) Interruptable(org.elasticsearch.common.util.CancellableThreads.Interruptable) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CountDownLatch(java.util.concurrent.CountDownLatch) RateLimiter(org.apache.lucene.store.RateLimiter) StopWatch(org.elasticsearch.common.StopWatch) AtomicLong(java.util.concurrent.atomic.AtomicLong) IndexShardClosedException(org.elasticsearch.index.shard.IndexShardClosedException) IndexFormatTooNewException(org.apache.lucene.index.IndexFormatTooNewException) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList)

Example 3 with RateLimiter

use of org.apache.lucene.store.RateLimiter in project crate by crate.

the class RemoteRecoveryTargetHandler method writeFileChunk.

@Override
public void writeFileChunk(StoreFileMetadata fileMetadata, long position, BytesReference content, boolean lastChunk, int totalTranslogOps, ActionListener<Void> listener) {
    // Pause using the rate limiter, if desired, to throttle the recovery
    final long throttleTimeInNanos;
    // always fetch the ratelimiter - it might be updated in real-time on the recovery settings
    final RateLimiter rl = recoverySettings.rateLimiter();
    if (rl != null) {
        long bytes = bytesSinceLastPause.addAndGet(content.length());
        if (bytes > rl.getMinPauseCheckBytes()) {
            // Time to pause
            bytesSinceLastPause.addAndGet(-bytes);
            try {
                throttleTimeInNanos = rl.pause(bytes);
                onSourceThrottle.accept(throttleTimeInNanos);
            } catch (IOException e) {
                throw new ElasticsearchException("failed to pause recovery", e);
            }
        } else {
            throttleTimeInNanos = 0;
        }
    } else {
        throttleTimeInNanos = 0;
    }
    final String action = PeerRecoveryTargetService.Actions.FILE_CHUNK;
    /* we send estimateTotalOperations with every request since we collect stats on the target and that way we can
         * see how many translog ops we accumulate while copying files across the network. A future optimization
         * would be in to restart file copy again (new deltas) if we have too many translog ops are piling up.
         */
    final RecoveryFileChunkRequest request = new RecoveryFileChunkRequest(recoveryId, shardId, fileMetadata, position, content, lastChunk, totalTranslogOps, throttleTimeInNanos);
    final Writeable.Reader<TransportResponse.Empty> reader = in -> TransportResponse.Empty.INSTANCE;
    executeRetryableAction(action, request, fileChunkRequestOptions, ActionListener.map(listener, r -> null), reader);
}
Also used : ElasticsearchException(org.elasticsearch.ElasticsearchException) CancellableThreads(org.elasticsearch.common.util.CancellableThreads) ShardId(org.elasticsearch.index.shard.ShardId) TransportRequest(org.elasticsearch.transport.TransportRequest) ConcurrentCollections(org.elasticsearch.common.util.concurrent.ConcurrentCollections) StoreFileMetadata(org.elasticsearch.index.store.StoreFileMetadata) RetentionLeases(org.elasticsearch.index.seqno.RetentionLeases) DiscoveryNode(org.elasticsearch.cluster.node.DiscoveryNode) ActionListenerResponseHandler(org.elasticsearch.action.ActionListenerResponseHandler) Store(org.elasticsearch.index.store.Store) Map(java.util.Map) ThreadPool(org.elasticsearch.threadpool.ThreadPool) TransportResponse(org.elasticsearch.transport.TransportResponse) TransportService(org.elasticsearch.transport.TransportService) IOException(java.io.IOException) BytesReference(org.elasticsearch.common.bytes.BytesReference) Consumer(java.util.function.Consumer) AtomicLong(java.util.concurrent.atomic.AtomicLong) RemoteTransportException(org.elasticsearch.transport.RemoteTransportException) List(java.util.List) Logger(org.apache.logging.log4j.Logger) CircuitBreakingException(org.elasticsearch.common.breaker.CircuitBreakingException) EsRejectedExecutionException(org.elasticsearch.common.util.concurrent.EsRejectedExecutionException) TransportFuture(org.elasticsearch.transport.TransportFuture) TimeValue(io.crate.common.unit.TimeValue) Translog(org.elasticsearch.index.translog.Translog) EmptyTransportResponseHandler(org.elasticsearch.transport.EmptyTransportResponseHandler) RetryableAction(org.elasticsearch.action.support.RetryableAction) ReplicationTracker(org.elasticsearch.index.seqno.ReplicationTracker) TransportRequestOptions(org.elasticsearch.transport.TransportRequestOptions) SQLExceptions(io.crate.exceptions.SQLExceptions) Writeable(org.elasticsearch.common.io.stream.Writeable) LogManager(org.apache.logging.log4j.LogManager) RateLimiter(org.apache.lucene.store.RateLimiter) ActionListener(org.elasticsearch.action.ActionListener) IOException(java.io.IOException) ElasticsearchException(org.elasticsearch.ElasticsearchException) Writeable(org.elasticsearch.common.io.stream.Writeable) RateLimiter(org.apache.lucene.store.RateLimiter)

Example 4 with RateLimiter

use of org.apache.lucene.store.RateLimiter in project lucene-solr by apache.

the class ConcurrentMergeScheduler method wrapForMerge.

@Override
public Directory wrapForMerge(OneMerge merge, Directory in) {
    Thread mergeThread = Thread.currentThread();
    if (!MergeThread.class.isInstance(mergeThread)) {
        throw new AssertionError("wrapForMerge should be called from MergeThread. Current thread: " + mergeThread);
    }
    // Return a wrapped Directory which has rate-limited output.
    RateLimiter rateLimiter = ((MergeThread) mergeThread).rateLimiter;
    return new FilterDirectory(in) {

        @Override
        public IndexOutput createOutput(String name, IOContext context) throws IOException {
            ensureOpen();
            // somewhere that is failing to pass down the right IOContext:
            assert context.context == IOContext.Context.MERGE : "got context=" + context.context;
            // always be called from that context. Verify this.
            assert mergeThread == Thread.currentThread() : "Not the same merge thread, current=" + Thread.currentThread() + ", expected=" + mergeThread;
            return new RateLimitedIndexOutput(rateLimiter, in.createOutput(name, context));
        }
    };
}
Also used : RateLimitedIndexOutput(org.apache.lucene.store.RateLimitedIndexOutput) FilterDirectory(org.apache.lucene.store.FilterDirectory) IOContext(org.apache.lucene.store.IOContext) RateLimiter(org.apache.lucene.store.RateLimiter)

Aggregations

RateLimiter (org.apache.lucene.store.RateLimiter)4 IOException (java.io.IOException)3 AtomicLong (java.util.concurrent.atomic.AtomicLong)2 ElasticsearchException (org.elasticsearch.ElasticsearchException)2 Store (org.elasticsearch.index.store.Store)2 RemoteTransportException (org.elasticsearch.transport.RemoteTransportException)2 TransportRequestOptions (org.elasticsearch.transport.TransportRequestOptions)2 TimeValue (io.crate.common.unit.TimeValue)1 SQLExceptions (io.crate.exceptions.SQLExceptions)1 List (java.util.List)1 Map (java.util.Map)1 CopyOnWriteArrayList (java.util.concurrent.CopyOnWriteArrayList)1 CountDownLatch (java.util.concurrent.CountDownLatch)1 ThreadPoolExecutor (java.util.concurrent.ThreadPoolExecutor)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 Consumer (java.util.function.Consumer)1 LogManager (org.apache.logging.log4j.LogManager)1 Logger (org.apache.logging.log4j.Logger)1 CorruptIndexException (org.apache.lucene.index.CorruptIndexException)1 IndexFormatTooNewException (org.apache.lucene.index.IndexFormatTooNewException)1