Example 6 with StopWatch

Use of org.opensearch.common.StopWatch in the OpenSearch project (opensearch-project/OpenSearch).

From the class RecoverySourceHandler, method phase1:

/**
 * Perform phase1 of the recovery operations. Once this {@link IndexCommit}
 * snapshot has been performed, no commit operations (files being fsync'd)
 * are effectively allowed on this index until all recovery phases are done.
 * <p>
 * Phase1 examines the segment files on the target node and copies over the
 * segments that are missing. Only segments that have the same size and
 * checksum can be reused.
 */
void phase1(IndexCommit snapshot, long startingSeqNo, IntSupplier translogOps, ActionListener<SendFileResult> listener) {
    cancellableThreads.checkForCancel();
    final Store store = shard.store();
    try {
        StopWatch stopWatch = new StopWatch().start();
        final Store.MetadataSnapshot recoverySourceMetadata;
        try {
            recoverySourceMetadata = store.getMetadata(snapshot);
        } catch (CorruptIndexException | IndexFormatTooOldException | IndexFormatTooNewException ex) {
            shard.failShard("recovery", ex);
            throw ex;
        }
        for (String name : snapshot.getFileNames()) {
            final StoreFileMetadata md = recoverySourceMetadata.get(name);
            if (md == null) {
                logger.info("Snapshot differs from actual index for file: {} meta: {}", name, recoverySourceMetadata.asMap());
                throw new CorruptIndexException("Snapshot differs from actual index - maybe index was removed metadata has " + recoverySourceMetadata.asMap().size() + " files", name);
            }
        }
        if (canSkipPhase1(recoverySourceMetadata, request.metadataSnapshot()) == false) {
            final List<String> phase1FileNames = new ArrayList<>();
            final List<Long> phase1FileSizes = new ArrayList<>();
            final List<String> phase1ExistingFileNames = new ArrayList<>();
            final List<Long> phase1ExistingFileSizes = new ArrayList<>();
            // Total size of segment files that are recovered
            long totalSizeInBytes = 0;
            // Total size of segment files that were able to be re-used
            long existingTotalSizeInBytes = 0;
            // Generate a "diff" of all the identical, different, and missing
            // segment files on the target node, using the existing files on
            // the source node
            final Store.RecoveryDiff diff = recoverySourceMetadata.recoveryDiff(request.metadataSnapshot());
            for (StoreFileMetadata md : diff.identical) {
                phase1ExistingFileNames.add(md.name());
                phase1ExistingFileSizes.add(md.length());
                existingTotalSizeInBytes += md.length();
                if (logger.isTraceEnabled()) {
                    logger.trace("recovery [phase1]: not recovering [{}], exist in local store and has checksum [{}]," + " size [{}]", md.name(), md.checksum(), md.length());
                }
                totalSizeInBytes += md.length();
            }
            List<StoreFileMetadata> phase1Files = new ArrayList<>(diff.different.size() + diff.missing.size());
            phase1Files.addAll(diff.different);
            phase1Files.addAll(diff.missing);
            for (StoreFileMetadata md : phase1Files) {
                if (request.metadataSnapshot().asMap().containsKey(md.name())) {
                    logger.trace("recovery [phase1]: recovering [{}], exists in local store, but is different: remote [{}], local [{}]", md.name(), request.metadataSnapshot().asMap().get(md.name()), md);
                } else {
                    logger.trace("recovery [phase1]: recovering [{}], does not exist in remote", md.name());
                }
                phase1FileNames.add(md.name());
                phase1FileSizes.add(md.length());
                totalSizeInBytes += md.length();
            }
            logger.trace("recovery [phase1]: recovering_files [{}] with total_size [{}], reusing_files [{}] with total_size [{}]", phase1FileNames.size(), new ByteSizeValue(totalSizeInBytes), phase1ExistingFileNames.size(), new ByteSizeValue(existingTotalSizeInBytes));
            final StepListener<Void> sendFileInfoStep = new StepListener<>();
            final StepListener<Void> sendFilesStep = new StepListener<>();
            final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
            final StepListener<Void> cleanFilesStep = new StepListener<>();
            cancellableThreads.checkForCancel();
            recoveryTarget.receiveFileInfo(phase1FileNames, phase1FileSizes, phase1ExistingFileNames, phase1ExistingFileSizes, translogOps.getAsInt(), sendFileInfoStep);
            sendFileInfoStep.whenComplete(r -> sendFiles(store, phase1Files.toArray(new StoreFileMetadata[0]), translogOps, sendFilesStep), listener::onFailure);
            sendFilesStep.whenComplete(r -> createRetentionLease(startingSeqNo, createRetentionLeaseStep), listener::onFailure);
            createRetentionLeaseStep.whenComplete(retentionLease -> {
                final long lastKnownGlobalCheckpoint = shard.getLastKnownGlobalCheckpoint();
                assert retentionLease == null || retentionLease.retainingSequenceNumber() - 1 <= lastKnownGlobalCheckpoint : retentionLease + " vs " + lastKnownGlobalCheckpoint;
                // Establishes new empty translog on the replica with global checkpoint set to lastKnownGlobalCheckpoint. We want
                // the commit we just copied to be a safe commit on the replica, so why not set the global checkpoint on the replica
                // to the max seqno of this commit? Because (in rare corner cases) this commit might not be a safe commit here on
                // the primary, and in these cases the max seqno would be too high to be valid as a global checkpoint.
                cleanFiles(store, recoverySourceMetadata, translogOps, lastKnownGlobalCheckpoint, cleanFilesStep);
            }, listener::onFailure);
            final long totalSize = totalSizeInBytes;
            final long existingTotalSize = existingTotalSizeInBytes;
            cleanFilesStep.whenComplete(r -> {
                final TimeValue took = stopWatch.totalTime();
                logger.trace("recovery [phase1]: took [{}]", took);
                listener.onResponse(new SendFileResult(phase1FileNames, phase1FileSizes, totalSize, phase1ExistingFileNames, phase1ExistingFileSizes, existingTotalSize, took));
            }, listener::onFailure);
        } else {
            logger.trace("skipping [phase1] since source and target have identical sync id [{}]", recoverySourceMetadata.getSyncId());
            // but we must still create a retention lease
            final StepListener<RetentionLease> createRetentionLeaseStep = new StepListener<>();
            createRetentionLease(startingSeqNo, createRetentionLeaseStep);
            createRetentionLeaseStep.whenComplete(retentionLease -> {
                final TimeValue took = stopWatch.totalTime();
                logger.trace("recovery [phase1]: took [{}]", took);
                listener.onResponse(new SendFileResult(Collections.emptyList(), Collections.emptyList(), 0L, Collections.emptyList(), Collections.emptyList(), 0L, took));
            }, listener::onFailure);
        }
    } catch (Exception e) {
        throw new RecoverFilesRecoveryException(request.shardId(), 0, new ByteSizeValue(0L), e);
    }
}
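
The timing pattern in phase1 is a single unnamed task: the StopWatch is started before the work begins, and totalTime() is read once the asynchronous step chain completes. Below is a minimal, self-contained sketch of that single-task pattern, assuming the OpenSearch core artifact is on the classpath; the class name SingleTaskTimingSketch and the Thread.sleep stand-in for the file-copy work are illustrative, not part of the original.

import org.opensearch.common.StopWatch;
import org.opensearch.common.unit.TimeValue;

public final class SingleTaskTimingSketch {
    public static void main(String[] args) throws Exception {
        // Start timing up front, as phase1 does with new StopWatch().start()
        StopWatch stopWatch = new StopWatch().start();

        // Hypothetical stand-in for the segment-file copy work
        Thread.sleep(250);

        // Stop the running task so its elapsed time is recorded, then read the total
        stopWatch.stop();
        TimeValue took = stopWatch.totalTime();
        System.out.println("recovery [phase1]-style timing: took [" + took + "]");
    }
}

Note that the sketch stops the watch explicitly before reading the total; in phase1 itself the read happens inside the cleanFilesStep callback, so the measurement spans the whole asynchronous chain rather than a single thread.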

Example 7 with StopWatch

Use of org.opensearch.common.StopWatch in the OpenSearch project (opensearch-project/OpenSearch).

From the class Node, method close:

// During concurrent close() calls we want to make sure that all of them return after the node has completed its shutdown cycle.
// If not, the hook that is added in Bootstrap#setup() will be useless:
// close() might not be executed, in case another call to close() (for example, via the API) has already set some lifecycles to stopped.
// In this case the process will be terminated even if the first call to close() has not finished yet.
@Override
public synchronized void close() throws IOException {
    synchronized (lifecycle) {
        if (lifecycle.started()) {
            stop();
        }
        if (!lifecycle.moveToClosed()) {
            return;
        }
    }
    logger.info("closing ...");
    List<Closeable> toClose = new ArrayList<>();
    StopWatch stopWatch = new StopWatch("node_close");
    toClose.add(() -> stopWatch.start("node_service"));
    toClose.add(nodeService);
    toClose.add(() -> stopWatch.stop().start("http"));
    toClose.add(injector.getInstance(HttpServerTransport.class));
    toClose.add(() -> stopWatch.stop().start("snapshot_service"));
    toClose.add(injector.getInstance(SnapshotsService.class));
    toClose.add(injector.getInstance(SnapshotShardsService.class));
    toClose.add(injector.getInstance(RepositoriesService.class));
    toClose.add(() -> stopWatch.stop().start("client"));
    Releasables.close(injector.getInstance(Client.class));
    toClose.add(() -> stopWatch.stop().start("indices_cluster"));
    toClose.add(injector.getInstance(IndicesClusterStateService.class));
    toClose.add(() -> stopWatch.stop().start("indices"));
    toClose.add(injector.getInstance(IndicesService.class));
    // close filter/fielddata caches after indices
    toClose.add(injector.getInstance(IndicesStore.class));
    toClose.add(injector.getInstance(PeerRecoverySourceService.class));
    toClose.add(() -> stopWatch.stop().start("cluster"));
    toClose.add(injector.getInstance(ClusterService.class));
    toClose.add(() -> stopWatch.stop().start("node_connections_service"));
    toClose.add(injector.getInstance(NodeConnectionsService.class));
    toClose.add(() -> stopWatch.stop().start("discovery"));
    toClose.add(injector.getInstance(Discovery.class));
    toClose.add(() -> stopWatch.stop().start("monitor"));
    toClose.add(nodeService.getMonitorService());
    toClose.add(() -> stopWatch.stop().start("fsHealth"));
    toClose.add(injector.getInstance(FsHealthService.class));
    toClose.add(() -> stopWatch.stop().start("gateway"));
    toClose.add(injector.getInstance(GatewayService.class));
    toClose.add(() -> stopWatch.stop().start("search"));
    toClose.add(injector.getInstance(SearchService.class));
    toClose.add(() -> stopWatch.stop().start("transport"));
    toClose.add(injector.getInstance(TransportService.class));
    for (LifecycleComponent plugin : pluginLifecycleComponents) {
        toClose.add(() -> stopWatch.stop().start("plugin(" + plugin.getClass().getName() + ")"));
        toClose.add(plugin);
    }
    toClose.addAll(pluginsService.filterPlugins(Plugin.class));
    toClose.add(() -> stopWatch.stop().start("script"));
    toClose.add(injector.getInstance(ScriptService.class));
    toClose.add(() -> stopWatch.stop().start("thread_pool"));
    toClose.add(() -> injector.getInstance(ThreadPool.class).shutdown());
    // Don't call shutdownNow here, it might break ongoing operations on Lucene indices.
    // See https://issues.apache.org/jira/browse/LUCENE-7248. We call shutdownNow in
    // awaitClose if the node doesn't finish closing within the specified time.
    toClose.add(() -> stopWatch.stop().start("gateway_meta_state"));
    toClose.add(injector.getInstance(GatewayMetaState.class));
    toClose.add(() -> stopWatch.stop().start("node_environment"));
    toClose.add(injector.getInstance(NodeEnvironment.class));
    toClose.add(stopWatch::stop);
    if (logger.isTraceEnabled()) {
        toClose.add(() -> logger.trace("Close times for each service:\n{}", stopWatch.prettyPrint()));
    }
    IOUtils.close(toClose);
    logger.info("closed");
}
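
Node#close uses the richer, named-task form of StopWatch: a watch constructed with an id, one named task per service, chained stop().start(...) transitions queued as Closeables between the services, and a prettyPrint() breakdown at the end. A minimal sketch of that multi-task pattern, with hypothetical task names standing in for the real service phases, might look like this:

import org.opensearch.common.StopWatch;

public final class NamedTaskTimingSketch {
    public static void main(String[] args) throws Exception {
        // A named watch, as in new StopWatch("node_close") above
        StopWatch stopWatch = new StopWatch("shutdown_sketch");

        stopWatch.start("first_phase");          // hypothetical task name
        Thread.sleep(100);                       // stand-in for closing a service
        stopWatch.stop().start("second_phase");  // chained transition, as in Node#close
        Thread.sleep(100);
        stopWatch.stop();

        // prettyPrint() renders a per-task timing breakdown, which Node#close logs at TRACE
        System.out.println(stopWatch.prettyPrint());
    }
}

Queuing the stop()/start() transitions as Closeables interleaved with the services, as Node#close does, lets a single IOUtils.close(toClose) pass drive both the shutdown sequence and the per-phase timing.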