use of org.elasticsearch.action.StepListener in project crate by crate.
the class BlobStoreRepository method doDeleteShardSnapshots.
/**
* After updating the {@link RepositoryData} each of the shards directories is individually first moved to the next shard generation
* and then has all now unreferenced blobs in it deleted.
*
* @param snapshotId SnapshotId to delete
* @param repositoryStateId Expected repository state id
* @param foundIndices All indices folders found in the repository before executing any writes to the repository during this
* delete operation
* @param rootBlobs All blobs found at the root of the repository before executing any writes to the repository during this
* delete operation
* @param repositoryData RepositoryData found the in the repository before executing this delete
* @param listener Listener to invoke once finished
*/
private void doDeleteShardSnapshots(SnapshotId snapshotId, long repositoryStateId, Map<String, BlobContainer> foundIndices, Map<String, BlobMetadata> rootBlobs, RepositoryData repositoryData, boolean writeShardGens, ActionListener<Void> listener) {
if (writeShardGens) {
// First write the new shard state metadata (with the removed snapshot) and compute deletion targets
final StepListener<Collection<ShardSnapshotMetaDeleteResult>> writeShardMetadataAndComputeDeletesStep = new StepListener<>();
writeUpdatedShardMetadataAndComputeDeletes(snapshotId, repositoryData, true, writeShardMetadataAndComputeDeletesStep);
// Once we have put the new shard-level metadata into place, we can update the repository metadata as follows:
// 1. Remove the snapshot from the list of existing snapshots
// 2. Update the index shard generations of all updated shard folders
//
// Note: If we fail updating any of the individual shard paths, none of them are changed since the newly created
// index-${gen_uuid} will not be referenced by the existing RepositoryData and new RepositoryData is only
// written if all shard paths have been successfully updated.
final StepListener<RepositoryData> writeUpdatedRepoDataStep = new StepListener<>();
writeShardMetadataAndComputeDeletesStep.whenComplete(deleteResults -> {
final ShardGenerations.Builder builder = ShardGenerations.builder();
for (ShardSnapshotMetaDeleteResult newGen : deleteResults) {
builder.put(newGen.indexId, newGen.shardId, newGen.newGeneration);
}
final RepositoryData updatedRepoData = repositoryData.removeSnapshot(snapshotId, builder.build());
writeIndexGen(updatedRepoData, repositoryStateId, true, ActionListener.wrap(v -> writeUpdatedRepoDataStep.onResponse(updatedRepoData), listener::onFailure));
}, listener::onFailure);
// Once we have updated the repository, run the clean-ups
writeUpdatedRepoDataStep.whenComplete(updatedRepoData -> {
// Run unreferenced blobs cleanup in parallel to shard-level snapshot deletion
final ActionListener<Void> afterCleanupsListener = new GroupedActionListener<>(ActionListener.wrap(() -> listener.onResponse(null)), 2);
asyncCleanupUnlinkedRootAndIndicesBlobs(foundIndices, rootBlobs, updatedRepoData, afterCleanupsListener);
asyncCleanupUnlinkedShardLevelBlobs(snapshotId, writeShardMetadataAndComputeDeletesStep.result(), afterCleanupsListener);
}, listener::onFailure);
} else {
// Write the new repository data first (with the removed snapshot), using no shard generations
final RepositoryData updatedRepoData = repositoryData.removeSnapshot(snapshotId, ShardGenerations.EMPTY);
writeIndexGen(updatedRepoData, repositoryStateId, false, ActionListener.wrap(v -> {
// Run unreferenced blobs cleanup in parallel to shard-level snapshot deletion
final ActionListener<Void> afterCleanupsListener = new GroupedActionListener<>(ActionListener.wrap(() -> listener.onResponse(null)), 2);
asyncCleanupUnlinkedRootAndIndicesBlobs(foundIndices, rootBlobs, updatedRepoData, afterCleanupsListener);
final StepListener<Collection<ShardSnapshotMetaDeleteResult>> writeMetaAndComputeDeletesStep = new StepListener<>();
writeUpdatedShardMetadataAndComputeDeletes(snapshotId, repositoryData, false, writeMetaAndComputeDeletesStep);
writeMetaAndComputeDeletesStep.whenComplete(deleteResults -> asyncCleanupUnlinkedShardLevelBlobs(snapshotId, deleteResults, afterCleanupsListener), afterCleanupsListener::onFailure);
}, listener::onFailure));
}
}
use of org.elasticsearch.action.StepListener in project crate by crate.
the class BlobStoreRepository method snapshotShard.
@Override
public void snapshotShard(Store store, MapperService mapperService, SnapshotId snapshotId, IndexId indexId, IndexCommit snapshotIndexCommit, IndexShardSnapshotStatus snapshotStatus, boolean writeShardGens, ActionListener<String> listener) {
final ShardId shardId = store.shardId();
final long startTime = threadPool.absoluteTimeInMillis();
try {
final String generation = snapshotStatus.generation();
LOGGER.debug("[{}] [{}] snapshot to [{}] [{}] ...", shardId, snapshotId, metadata.name(), generation);
final BlobContainer shardContainer = shardContainer(indexId, shardId);
final Set<String> blobs;
if (generation == null) {
try {
blobs = shardContainer.listBlobsByPrefix(INDEX_FILE_PREFIX).keySet();
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(shardId, "failed to list blobs", e);
}
} else {
blobs = Collections.singleton(INDEX_FILE_PREFIX + generation);
}
Tuple<BlobStoreIndexShardSnapshots, String> tuple = buildBlobStoreIndexShardSnapshots(blobs, shardContainer, generation);
BlobStoreIndexShardSnapshots snapshots = tuple.v1();
final String fileListGeneration = tuple.v2();
if (snapshots.snapshots().stream().anyMatch(sf -> sf.snapshot().equals(snapshotId.getName()))) {
throw new IndexShardSnapshotFailedException(shardId, "Duplicate snapshot name [" + snapshotId.getName() + "] detected, aborting");
}
final List<BlobStoreIndexShardSnapshot.FileInfo> indexCommitPointFiles = new ArrayList<>();
final BlockingQueue<BlobStoreIndexShardSnapshot.FileInfo> filesToSnapshot = new LinkedBlockingQueue<>();
store.incRef();
final Collection<String> fileNames;
final Store.MetadataSnapshot metadataFromStore;
try {
// TODO apparently we don't use the MetadataSnapshot#.recoveryDiff(...) here but we should
try {
LOGGER.trace("[{}] [{}] Loading store metadata using index commit [{}]", shardId, snapshotId, snapshotIndexCommit);
metadataFromStore = store.getMetadata(snapshotIndexCommit);
fileNames = snapshotIndexCommit.getFileNames();
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(shardId, "Failed to get store file metadata", e);
}
} finally {
store.decRef();
}
int indexIncrementalFileCount = 0;
int indexTotalNumberOfFiles = 0;
long indexIncrementalSize = 0;
long indexTotalFileCount = 0;
for (String fileName : fileNames) {
if (snapshotStatus.isAborted()) {
LOGGER.debug("[{}] [{}] Aborted on the file [{}], exiting", shardId, snapshotId, fileName);
throw new IndexShardSnapshotFailedException(shardId, "Aborted");
}
LOGGER.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName);
final StoreFileMetadata md = metadataFromStore.get(fileName);
BlobStoreIndexShardSnapshot.FileInfo existingFileInfo = null;
List<BlobStoreIndexShardSnapshot.FileInfo> filesInfo = snapshots.findPhysicalIndexFiles(fileName);
if (filesInfo != null) {
for (BlobStoreIndexShardSnapshot.FileInfo fileInfo : filesInfo) {
if (fileInfo.isSame(md)) {
// a commit point file with the same name, size and checksum was already copied to repository
// we will reuse it for this snapshot
existingFileInfo = fileInfo;
break;
}
}
}
indexTotalFileCount += md.length();
indexTotalNumberOfFiles++;
if (existingFileInfo == null) {
indexIncrementalFileCount++;
indexIncrementalSize += md.length();
// create a new FileInfo
BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo = new BlobStoreIndexShardSnapshot.FileInfo(DATA_BLOB_PREFIX + UUIDs.randomBase64UUID(), md, chunkSize());
indexCommitPointFiles.add(snapshotFileInfo);
filesToSnapshot.add(snapshotFileInfo);
} else {
indexCommitPointFiles.add(existingFileInfo);
}
}
snapshotStatus.moveToStarted(startTime, indexIncrementalFileCount, indexTotalNumberOfFiles, indexIncrementalSize, indexTotalFileCount);
assert indexIncrementalFileCount == filesToSnapshot.size();
final StepListener<Collection<Void>> allFilesUploadedListener = new StepListener<>();
allFilesUploadedListener.whenComplete(v -> {
final IndexShardSnapshotStatus.Copy lastSnapshotStatus = snapshotStatus.moveToFinalize(snapshotIndexCommit.getGeneration());
// now create and write the commit point
final BlobStoreIndexShardSnapshot snapshot = new BlobStoreIndexShardSnapshot(snapshotId.getName(), lastSnapshotStatus.getIndexVersion(), indexCommitPointFiles, lastSnapshotStatus.getStartTime(), threadPool.absoluteTimeInMillis() - lastSnapshotStatus.getStartTime(), lastSnapshotStatus.getIncrementalFileCount(), lastSnapshotStatus.getIncrementalSize());
LOGGER.trace("[{}] [{}] writing shard snapshot file", shardId, snapshotId);
try {
indexShardSnapshotFormat.write(snapshot, shardContainer, snapshotId.getUUID(), false);
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(shardId, "Failed to write commit point", e);
}
// build a new BlobStoreIndexShardSnapshot, that includes this one and all the saved ones
List<SnapshotFiles> newSnapshotsList = new ArrayList<>();
newSnapshotsList.add(new SnapshotFiles(snapshot.snapshot(), snapshot.indexFiles()));
for (SnapshotFiles point : snapshots) {
newSnapshotsList.add(point);
}
final List<String> blobsToDelete;
final String indexGeneration;
if (writeShardGens) {
indexGeneration = UUIDs.randomBase64UUID();
blobsToDelete = Collections.emptyList();
} else {
indexGeneration = Long.toString(Long.parseLong(fileListGeneration) + 1);
// Delete all previous index-N blobs
blobsToDelete = blobs.stream().filter(blob -> blob.startsWith(SNAPSHOT_INDEX_PREFIX)).collect(Collectors.toList());
assert blobsToDelete.stream().mapToLong(b -> Long.parseLong(b.replaceFirst(SNAPSHOT_INDEX_PREFIX, ""))).max().orElse(-1L) < Long.parseLong(indexGeneration) : "Tried to delete an index-N blob newer than the current generation [" + indexGeneration + "] when deleting index-N blobs " + blobsToDelete;
}
try {
writeShardIndexBlob(shardContainer, indexGeneration, new BlobStoreIndexShardSnapshots(newSnapshotsList));
} catch (IOException e) {
throw new IndexShardSnapshotFailedException(shardId, "Failed to finalize snapshot creation [" + snapshotId + "] with shard index [" + indexShardSnapshotsFormat.blobName(indexGeneration) + "]", e);
}
if (writeShardGens == false) {
try {
shardContainer.deleteBlobsIgnoringIfNotExists(blobsToDelete);
} catch (IOException e) {
LOGGER.warn(() -> new ParameterizedMessage("[{}][{}] failed to delete old index-N blobs during finalization", snapshotId, shardId), e);
}
}
snapshotStatus.moveToDone(threadPool.absoluteTimeInMillis(), indexGeneration);
listener.onResponse(indexGeneration);
}, listener::onFailure);
if (indexIncrementalFileCount == 0) {
allFilesUploadedListener.onResponse(Collections.emptyList());
return;
}
final Executor executor = threadPool.executor(ThreadPool.Names.SNAPSHOT);
int maximumPoolSize = executor instanceof ThreadPoolExecutor ? ((ThreadPoolExecutor) executor).getMaximumPoolSize() : 1;
// Start as many workers as fit into the snapshot pool at once at the most
final int workers = Math.min(maximumPoolSize, indexIncrementalFileCount);
final ActionListener<Void> filesListener = fileQueueListener(filesToSnapshot, workers, allFilesUploadedListener);
for (int i = 0; i < workers; ++i) {
executor.execute(ActionRunnable.run(filesListener, () -> {
BlobStoreIndexShardSnapshot.FileInfo snapshotFileInfo = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS);
if (snapshotFileInfo != null) {
store.incRef();
try {
do {
snapshotFile(snapshotFileInfo, indexId, shardId, snapshotId, snapshotStatus, store);
snapshotFileInfo = filesToSnapshot.poll(0L, TimeUnit.MILLISECONDS);
} while (snapshotFileInfo != null);
} finally {
store.decRef();
}
}
}));
}
} catch (Exception e) {
listener.onFailure(e);
}
}
use of org.elasticsearch.action.StepListener in project crate by crate.
the class SnapshotsService method beginSnapshot.
/**
* Starts snapshot.
* <p>
* Creates snapshot in repository and updates snapshot metadata record with list of shards that needs to be processed.
*
* @param clusterState cluster state
* @param snapshot snapshot meta data
* @param partial allow partial snapshots
* @param userCreateSnapshotListener listener
*/
private void beginSnapshot(final ClusterState clusterState, final SnapshotsInProgress.Entry snapshot, final boolean partial, final ActionListener<Snapshot> userCreateSnapshotListener) {
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(new AbstractRunnable() {
boolean snapshotCreated;
boolean hadAbortedInitializations;
@Override
protected void doRun() {
assert initializingSnapshots.contains(snapshot.snapshot());
Repository repository = repositoriesService.repository(snapshot.snapshot().getRepository());
if (repository.isReadOnly()) {
throw new RepositoryException(repository.getMetadata().name(), "cannot create snapshot in a readonly repository");
}
final String snapshotName = snapshot.snapshot().getSnapshotId().getName();
final StepListener<RepositoryData> repositoryDataListener = new StepListener<>();
repository.getRepositoryData(repositoryDataListener);
repositoryDataListener.whenComplete(repositoryData -> {
// check if the snapshot name already exists in the repository
if (repositoryData.getSnapshotIds().stream().anyMatch(s -> s.getName().equals(snapshotName))) {
throw new InvalidSnapshotNameException(repository.getMetadata().name(), snapshotName, "snapshot with the same name already exists");
}
snapshotCreated = true;
LOGGER.info("snapshot [{}] started", snapshot.snapshot());
if (snapshot.indices().isEmpty()) {
// No indices in this snapshot - we are done
userCreateSnapshotListener.onResponse(snapshot.snapshot());
endSnapshot(snapshot, clusterState.metadata());
return;
}
clusterService.submitStateUpdateTask("update_snapshot [" + snapshot.snapshot() + "]", new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
List<SnapshotsInProgress.Entry> entries = new ArrayList<>();
for (SnapshotsInProgress.Entry entry : snapshots.entries()) {
if (entry.snapshot().equals(snapshot.snapshot()) == false) {
entries.add(entry);
continue;
}
if (entry.state() == State.ABORTED) {
entries.add(entry);
assert entry.shards().isEmpty();
hadAbortedInitializations = true;
} else {
// Replace the snapshot that was just initialized
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards = shards(currentState, entry, repositoryData);
if (!partial) {
Tuple<Set<String>, Set<String>> indicesWithMissingShards = indicesWithMissingShards(shards, currentState.metadata());
Set<String> missing = indicesWithMissingShards.v1();
Set<String> closed = indicesWithMissingShards.v2();
if (missing.isEmpty() == false || closed.isEmpty() == false) {
final StringBuilder failureMessage = new StringBuilder();
if (missing.isEmpty() == false) {
failureMessage.append("Indices don't have primary shards ");
failureMessage.append(missing);
}
if (closed.isEmpty() == false) {
if (failureMessage.length() > 0) {
failureMessage.append("; ");
}
failureMessage.append("Indices are closed ");
failureMessage.append(closed);
}
entries.add(new SnapshotsInProgress.Entry(entry, State.FAILED, shards, failureMessage.toString()));
continue;
}
}
entries.add(new SnapshotsInProgress.Entry(entry, State.STARTED, shards));
}
}
return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, new SnapshotsInProgress(unmodifiableList(entries))).build();
}
@Override
public void onFailure(String source, Exception e) {
LOGGER.warn(() -> new ParameterizedMessage("[{}] failed to create snapshot", snapshot.snapshot().getSnapshotId()), e);
removeSnapshotFromClusterState(snapshot.snapshot(), null, e, new CleanupAfterErrorListener(snapshot, true, userCreateSnapshotListener, e));
}
@Override
public void onNoLongerMaster(String source) {
// We are not longer a master - we shouldn't try to do any cleanup
// The new master will take care of it
LOGGER.warn("[{}] failed to create snapshot - no longer a master", snapshot.snapshot().getSnapshotId());
userCreateSnapshotListener.onFailure(new SnapshotException(snapshot.snapshot(), "master changed during snapshot initialization"));
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
// The userCreateSnapshotListener.onResponse() notifies caller that the snapshot was accepted
// for processing. If client wants to wait for the snapshot completion, it can register snapshot
// completion listener in this method. For the snapshot completion to work properly, the snapshot
// should still exist when listener is registered.
userCreateSnapshotListener.onResponse(snapshot.snapshot());
if (hadAbortedInitializations) {
final SnapshotsInProgress snapshotsInProgress = newState.custom(SnapshotsInProgress.TYPE);
assert snapshotsInProgress != null;
final SnapshotsInProgress.Entry entry = snapshotsInProgress.snapshot(snapshot.snapshot());
assert entry != null;
endSnapshot(entry, newState.metadata());
}
}
});
}, this::onFailure);
}
@Override
public void onFailure(Exception e) {
LOGGER.warn(() -> new ParameterizedMessage("failed to create snapshot [{}]", snapshot.snapshot().getSnapshotId()), e);
removeSnapshotFromClusterState(snapshot.snapshot(), null, e, new CleanupAfterErrorListener(snapshot, snapshotCreated, userCreateSnapshotListener, e));
}
});
}
use of org.elasticsearch.action.StepListener in project crate by crate.
the class RecoverySourceHandlerTests method testCancelRecoveryDuringPhase1.
@Test
public void testCancelRecoveryDuringPhase1() throws Exception {
Store store = newStore(createTempDir("source"), false);
IndexShard shard = mock(IndexShard.class);
when(shard.store()).thenReturn(store);
Directory dir = store.directory();
RandomIndexWriter writer = new RandomIndexWriter(random(), dir, newIndexWriterConfig());
int numDocs = randomIntBetween(10, 100);
for (int i = 0; i < numDocs; i++) {
Document document = new Document();
document.add(new StringField("id", Integer.toString(i), Field.Store.YES));
document.add(newField("field", randomUnicodeOfCodepointLengthBetween(1, 10), TextField.TYPE_STORED));
writer.addDocument(document);
}
writer.commit();
writer.close();
AtomicBoolean wasCancelled = new AtomicBoolean();
SetOnce<Runnable> cancelRecovery = new SetOnce<>();
final TestRecoveryTargetHandler recoveryTarget = new TestRecoveryTargetHandler() {
@Override
public void receiveFileInfo(List<String> phase1FileNames, List<Long> phase1FileSizes, List<String> phase1ExistingFileNames, List<Long> phase1ExistingFileSizes, int totalTranslogOps, ActionListener<Void> listener) {
recoveryExecutor.execute(() -> listener.onResponse(null));
if (randomBoolean()) {
wasCancelled.set(true);
cancelRecovery.get().run();
}
}
@Override
public void writeFileChunk(StoreFileMetadata md, long position, BytesReference content, boolean lastChunk, int totalTranslogOps, ActionListener<Void> listener) {
recoveryExecutor.execute(() -> listener.onResponse(null));
if (rarely()) {
wasCancelled.set(true);
cancelRecovery.get().run();
}
}
@Override
public void cleanFiles(int totalTranslogOps, long globalCheckpoint, Store.MetadataSnapshot sourceMetadata, ActionListener<Void> listener) {
recoveryExecutor.execute(() -> listener.onResponse(null));
if (randomBoolean()) {
wasCancelled.set(true);
cancelRecovery.get().run();
}
}
};
StartRecoveryRequest startRecoveryRequest = getStartRecoveryRequest();
final RecoverySourceHandler handler = new RecoverySourceHandler(shard, recoveryTarget, threadPool, startRecoveryRequest, between(1, 16), between(1, 4), between(1, 4)) {
@Override
void createRetentionLease(long startingSeqNo, ActionListener<RetentionLease> listener) {
final String leaseId = ReplicationTracker.getPeerRecoveryRetentionLeaseId(startRecoveryRequest.targetNode().getId());
listener.onResponse(new RetentionLease(leaseId, startingSeqNo, threadPool.absoluteTimeInMillis(), ReplicationTracker.PEER_RECOVERY_RETENTION_LEASE_SOURCE));
}
};
cancelRecovery.set(() -> handler.cancel("test"));
final StepListener<RecoverySourceHandler.SendFileResult> phase1Listener = new StepListener<>();
try {
final CountDownLatch latch = new CountDownLatch(1);
handler.phase1(DirectoryReader.listCommits(dir).get(0), 0, () -> 0, new LatchedActionListener<>(phase1Listener, latch));
latch.await();
phase1Listener.result();
} catch (Exception e) {
assertTrue(wasCancelled.get());
assertNotNull(ExceptionsHelper.unwrap(e, CancellableThreads.ExecutionCancelledException.class));
}
store.close();
}
use of org.elasticsearch.action.StepListener in project crate by crate.
the class RecoverySourceHandler method recoverToTarget.
/**
* performs the recovery from the local engine to the target
*/
public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
final Closeable releaseResources = () -> IOUtils.close(resources);
final ActionListener<RecoveryResponse> wrappedListener = ActionListener.notifyOnce(listener);
try {
cancellableThreads.setOnCancel((reason, beforeCancelEx) -> {
final RuntimeException e;
if (shard.state() == IndexShardState.CLOSED) {
// check if the shard got closed on us
e = new IndexShardClosedException(shard.shardId(), "shard is closed and recovery was canceled reason [" + reason + "]");
} else {
e = new CancellableThreads.ExecutionCancelledException("recovery was canceled reason [" + reason + "]");
}
if (beforeCancelEx != null) {
e.addSuppressed(beforeCancelEx);
}
IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
throw e;
});
final Consumer<Exception> onFailure = e -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[onFailure]");
IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
};
final boolean softDeletesEnabled = shard.indexSettings().isSoftDeleteEnabled();
final SetOnce<RetentionLease> retentionLeaseRef = new SetOnce<>();
runUnderPrimaryPermit(() -> {
final IndexShardRoutingTable routingTable = shard.getReplicationGroup().getRoutingTable();
ShardRouting targetShardRouting = routingTable.getByAllocationId(request.targetAllocationId());
if (targetShardRouting == null) {
logger.debug("delaying recovery of {} as it is not listed as assigned to target node {}", request.shardId(), request.targetNode());
throw new DelayRecoveryException("source node does not have the shard listed in its state as allocated on the node");
}
assert targetShardRouting.initializing() : "expected recovery target to be initializing but was " + targetShardRouting;
retentionLeaseRef.set(shard.getRetentionLeases().get(ReplicationTracker.getPeerRecoveryRetentionLeaseId(targetShardRouting)));
}, shardId + " validating recovery target [" + request.targetAllocationId() + "] registered ", shard, cancellableThreads, logger);
final Engine.HistorySource historySource;
if (softDeletesEnabled && (shard.useRetentionLeasesInPeerRecovery() || retentionLeaseRef.get() != null)) {
historySource = Engine.HistorySource.INDEX;
} else {
historySource = Engine.HistorySource.TRANSLOG;
}
final Closeable retentionLock = shard.acquireHistoryRetentionLock(historySource);
resources.add(retentionLock);
final long startingSeqNo;
final boolean isSequenceNumberBasedRecovery = request.startingSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO && isTargetSameHistory() && shard.hasCompleteHistoryOperations("peer-recovery", historySource, request.startingSeqNo()) && (historySource == Engine.HistorySource.TRANSLOG || (retentionLeaseRef.get() != null && retentionLeaseRef.get().retainingSequenceNumber() <= request.startingSeqNo()));
if (isSequenceNumberBasedRecovery && softDeletesEnabled && retentionLeaseRef.get() != null) {
// all the history we need is retained by an existing retention lease, so we do not need a separate retention lock
retentionLock.close();
logger.trace("history is retained by {}", retentionLeaseRef.get());
} else {
// all the history we need is retained by the retention lock, obtained before calling shard.hasCompleteHistoryOperations()
// and before acquiring the safe commit we'll be using, so we can be certain that all operations after the safe commit's
// local checkpoint will be retained for the duration of this recovery.
logger.trace("history is retained by retention lock");
}
final StepListener<SendFileResult> sendFileStep = new StepListener<>();
final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
final StepListener<Void> finalizeStep = new StepListener<>();
if (isSequenceNumberBasedRecovery) {
logger.trace("performing sequence numbers based recovery. starting at [{}]", request.startingSeqNo());
startingSeqNo = request.startingSeqNo();
if (retentionLeaseRef.get() == null) {
createRetentionLease(startingSeqNo, ActionListener.map(sendFileStep, ignored -> SendFileResult.EMPTY));
} else {
sendFileStep.onResponse(SendFileResult.EMPTY);
}
} else {
final Engine.IndexCommitRef safeCommitRef;
try {
safeCommitRef = shard.acquireSafeIndexCommit();
resources.add(safeCommitRef);
} catch (final Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "snapshot failed", e);
}
// Try and copy enough operations to the recovering peer so that if it is promoted to primary then it has a chance of being
// able to recover other replicas using operations-based recoveries. If we are not using retention leases then we
// conservatively copy all available operations. If we are using retention leases then "enough operations" is just the
// operations from the local checkpoint of the safe commit onwards, because when using soft deletes the safe commit retains
// at least as much history as anything else. The safe commit will often contain all the history retained by the current set
// of retention leases, but this is not guaranteed: an earlier peer recovery from a different primary might have created a
// retention lease for some history that this primary already discarded, since we discard history when the global checkpoint
// advances and not when creating a new safe commit. In any case this is a best-effort thing since future recoveries can
// always fall back to file-based ones, and only really presents a problem if this primary fails before things have settled
// down.
startingSeqNo = softDeletesEnabled ? Long.parseLong(safeCommitRef.getIndexCommit().getUserData().get(SequenceNumbers.LOCAL_CHECKPOINT_KEY)) + 1L : 0;
logger.trace("performing file-based recovery followed by history replay starting at [{}]", startingSeqNo);
try {
final int estimateNumOps = shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo);
final Releasable releaseStore = acquireStore(shard.store());
resources.add(releaseStore);
sendFileStep.whenComplete(r -> IOUtils.close(safeCommitRef, releaseStore), e -> {
try {
IOUtils.close(safeCommitRef, releaseStore);
} catch (final IOException ex) {
logger.warn("releasing snapshot caused exception", ex);
}
});
final StepListener<ReplicationResponse> deleteRetentionLeaseStep = new StepListener<>();
runUnderPrimaryPermit(() -> {
try {
// If the target previously had a copy of this shard then a file-based recovery might move its global
// checkpoint backwards. We must therefore remove any existing retention lease so that we can create a
// new one later on in the recovery.
shard.removePeerRecoveryRetentionLease(request.targetNode().getId(), new ThreadedActionListener<>(logger, shard.getThreadPool(), ThreadPool.Names.GENERIC, deleteRetentionLeaseStep, false));
} catch (RetentionLeaseNotFoundException e) {
logger.debug("no peer-recovery retention lease for " + request.targetAllocationId());
deleteRetentionLeaseStep.onResponse(null);
}
}, shardId + " removing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);
deleteRetentionLeaseStep.whenComplete(ignored -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase1]");
phase1(safeCommitRef.getIndexCommit(), startingSeqNo, () -> estimateNumOps, sendFileStep);
}, onFailure);
} catch (final Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "sendFileStep failed", e);
}
}
assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;
sendFileStep.whenComplete(r -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[prepareTargetForTranslog]");
// For a sequence based recovery, the target can keep its local translog
prepareTargetForTranslog(shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo), prepareEngineStep);
}, onFailure);
prepareEngineStep.whenComplete(prepareEngineTime -> {
assert Transports.assertNotTransportThread(RecoverySourceHandler.this + "[phase2]");
/*
* add shard to replication group (shard will receive replication requests from this point on)
* now that engine is open. This means that any document indexed into the primary after
* this will be replicated to this replica as well make sure to do this before sampling
* the max sequence number in the next step, to ensure that we send all documents up to
* maxSeqNo in phase2.
*/
runUnderPrimaryPermit(() -> shard.initiateTracking(request.targetAllocationId()), shardId + " initiating tracking of " + request.targetAllocationId(), shard, cancellableThreads, logger);
final long endingSeqNo = shard.seqNoStats().getMaxSeqNo();
// CRATE_PATCH
try {
blobRecoveryHook();
} catch (Exception e) {
throw new RecoveryEngineException(shard.shardId(), 1, "blobRecoveryHook failed", e);
}
if (logger.isTraceEnabled()) {
logger.trace("snapshot translog for recovery; current size is [{}]", shard.estimateNumberOfHistoryOperations("peer-recovery", historySource, startingSeqNo));
}
final Translog.Snapshot phase2Snapshot = shard.getHistoryOperations("peer-recovery", historySource, startingSeqNo);
resources.add(phase2Snapshot);
retentionLock.close();
// we have to capture the max_seen_auto_id_timestamp and the max_seq_no_of_updates to make sure that these values
// are at least as high as the corresponding values on the primary when any of these operations were executed on it.
final long maxSeenAutoIdTimestamp = shard.getMaxSeenAutoIdTimestamp();
final long maxSeqNoOfUpdatesOrDeletes = shard.getMaxSeqNoOfUpdatesOrDeletes();
final RetentionLeases retentionLeases = shard.getRetentionLeases();
final long mappingVersionOnPrimary = shard.indexSettings().getIndexMetadata().getMappingVersion();
phase2(startingSeqNo, endingSeqNo, phase2Snapshot, maxSeenAutoIdTimestamp, maxSeqNoOfUpdatesOrDeletes, retentionLeases, mappingVersionOnPrimary, sendSnapshotStep);
}, onFailure);
// Recovery target can trim all operations >= startingSeqNo as we have sent all these operations in the phase 2
final long trimAboveSeqNo = startingSeqNo - 1;
sendSnapshotStep.whenComplete(r -> finalizeRecovery(r.targetLocalCheckpoint, trimAboveSeqNo, finalizeStep), onFailure);
finalizeStep.whenComplete(r -> {
// TODO: return the actual throttle time
final long phase1ThrottlingWaitTime = 0L;
final SendSnapshotResult sendSnapshotResult = sendSnapshotStep.result();
final SendFileResult sendFileResult = sendFileStep.result();
final RecoveryResponse response = new RecoveryResponse(sendFileResult.phase1FileNames, sendFileResult.phase1FileSizes, sendFileResult.phase1ExistingFileNames, sendFileResult.phase1ExistingFileSizes, sendFileResult.totalSize, sendFileResult.existingTotalSize, sendFileResult.took.millis(), phase1ThrottlingWaitTime, prepareEngineStep.result().millis(), sendSnapshotResult.sentOperations, sendSnapshotResult.tookTime.millis());
try {
wrappedListener.onResponse(response);
} finally {
IOUtils.close(resources);
}
}, onFailure);
} catch (Exception e) {
IOUtils.closeWhileHandlingException(releaseResources, () -> wrappedListener.onFailure(e));
}
}
Aggregations