Use of org.opensearch.repositories.RepositoryException in project OpenSearch by opensearch-project.
From the class ConcurrentSnapshotsIT, method testMasterFailOverWithQueuedDeletes.
/**
 * Integration test for a master fail-over that happens while snapshot deletes are queued behind snapshots that are still
 * in progress. After the current master is stopped and the data nodes are unblocked, both delete requests must either be
 * acknowledged or fail in one of the tolerated ways, the third snapshot must fail, and neither snapshots nor
 * snapshot/deletion entries may remain.
 */
public void testMasterFailOverWithQueuedDeletes() throws Exception {
// Cluster layout: three master-only nodes plus one data node (a second data node is added further down).
internalCluster().startMasterOnlyNodes(3);
final String dataNode = internalCluster().startDataOnlyNode();
final String repoName = "test-repo";
createRepository(repoName, "mock");
final String firstIndex = "index-one";
createIndexWithContent(firstIndex);
final String firstSnapshot = "snapshot-one";
// Block the first data node before starting the first snapshot; the snapshot is asserted to still be running further down.
blockDataNode(repoName, dataNode);
final ActionFuture<CreateSnapshotResponse> firstSnapshotResponse = startFullSnapshotFromNonMasterClient(repoName, firstSnapshot);
waitForBlock(dataNode, repoName, TimeValue.timeValueSeconds(30L));
// Add a second data node; the cluster now has five nodes in total.
final String dataNode2 = internalCluster().startDataOnlyNode();
ensureStableCluster(5);
final String secondIndex = "index-two";
// NOTE(review): the node arguments presumably control shard placement for the second index - confirm against the helper.
createIndexWithContent(secondIndex, dataNode2, dataNode);
final String secondSnapshot = "snapshot-two";
final ActionFuture<CreateSnapshotResponse> secondSnapshotResponse = startFullSnapshot(repoName, secondSnapshot);
logger.info("--> wait for snapshot on second data node to finish");
// Wait until both snapshots are tracked in the cluster state and the second one reports a completed shard.
awaitClusterState(state -> {
final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
return snapshotsInProgress.entries().size() == 2 && snapshotHasCompletedShard(secondSnapshot, snapshotsInProgress);
});
// First delete request: targets the first snapshot only; wait until one deletion is tracked as in progress.
final ActionFuture<AcknowledgedResponse> firstDeleteFuture = startDeleteFromNonMasterClient(repoName, firstSnapshot);
awaitNDeletionsInProgress(1);
// Block the second data node as well, then start a third snapshot and wait for that block to be hit.
blockNodeOnAnyFiles(repoName, dataNode2);
final ActionFuture<CreateSnapshotResponse> snapshotThreeFuture = startFullSnapshotFromNonMasterClient(repoName, "snapshot-three");
waitForBlock(dataNode2, repoName, TimeValue.timeValueSeconds(30L));
// Neither of the first two snapshot requests may have completed at this point.
assertThat(firstSnapshotResponse.isDone(), is(false));
assertThat(secondSnapshotResponse.isDone(), is(false));
logger.info("--> waiting for all three snapshots to show up as in-progress");
assertBusy(() -> assertThat(currentSnapshots(repoName), hasSize(3)), 30L, TimeUnit.SECONDS);
// Second delete request: wildcard delete of every snapshot in the repository.
final ActionFuture<AcknowledgedResponse> deleteAllSnapshots = startDeleteFromNonMasterClient(repoName, "*");
logger.info("--> wait for delete to be enqueued in cluster state");
// The cluster state is expected to hold a single deletion entry that covers all three snapshots.
awaitClusterState(state -> {
final SnapshotDeletionsInProgress deletionsInProgress = state.custom(SnapshotDeletionsInProgress.TYPE);
return deletionsInProgress.getEntries().size() == 1 && deletionsInProgress.getEntries().get(0).getSnapshots().size() == 3;
});
logger.info("--> waiting for second snapshot to finish and the other two snapshots to become aborted");
// Exactly two snapshots must remain in progress; each must be ABORTED and neither may be the second snapshot.
assertBusy(() -> {
assertThat(currentSnapshots(repoName), hasSize(2));
for (SnapshotsInProgress.Entry entry : clusterService().state().custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY).entries()) {
assertThat(entry.state(), is(SnapshotsInProgress.State.ABORTED));
assertThat(entry.snapshot().getSnapshotId().getName(), not(secondSnapshot));
}
}, 30L, TimeUnit.SECONDS);
logger.info("--> stopping current master node");
// Trigger the fail-over first and only then release both blocked data nodes.
internalCluster().stopCurrentMasterNode();
unblockNode(repoName, dataNode);
unblockNode(repoName, dataNode2);
// Each delete request must either be acknowledged or fail with one of the two tolerated exceptions below.
for (ActionFuture<AcknowledgedResponse> deleteFuture : Arrays.asList(firstDeleteFuture, deleteAllSnapshots)) {
try {
assertAcked(deleteFuture.actionGet());
} catch (RepositoryException rex) {
// rarely the master node fails over twice when shutting down the initial master and fails the transport listener
assertThat(rex.repository(), is("_all"));
assertThat(rex.getMessage(), endsWith("Failed to update cluster state during repository operation"));
} catch (SnapshotMissingException sme) {
// very rarely a master node fail-over happens at such a time that the client on the data-node sees a disconnect exception
// after the master has already started the delete, leading to the delete retry to run into a situation where the
// snapshot has already been deleted potentially
assertThat(sme.getSnapshotName(), is(firstSnapshot));
}
}
// The third snapshot request is expected to fail with a SnapshotException.
expectThrows(SnapshotException.class, snapshotThreeFuture::actionGet);
logger.info("--> verify that all snapshots are gone and no more work is left in the cluster state");
// Final state: no snapshots listed for the repository, and no in-progress snapshot or deletion entries in the cluster state.
assertBusy(() -> {
assertThat(client().admin().cluster().prepareGetSnapshots(repoName).get().getSnapshots(), empty());
final ClusterState state = clusterService().state();
final SnapshotsInProgress snapshotsInProgress = state.custom(SnapshotsInProgress.TYPE);
assertThat(snapshotsInProgress.entries(), empty());
final SnapshotDeletionsInProgress snapshotDeletionsInProgress = state.custom(SnapshotDeletionsInProgress.TYPE);
assertThat(snapshotDeletionsInProgress.getEntries(), empty());
}, 30L, TimeUnit.SECONDS);
}
Use of org.opensearch.repositories.RepositoryException in project OpenSearch by opensearch-project.
From the class SnapshotsService, method beginSnapshot.
/**
* Starts snapshot.
* <p>
* Creates snapshot in repository and updates snapshot metadata record with list of shards that needs to be processed.
* Note: This method is only used in clusters that contain a node older than {@link #NO_REPO_INITIALIZE_VERSION} to ensure a backwards
* compatible path for initializing the snapshot in the repository is executed.
* <p>
* The work runs asynchronously on the {@link ThreadPool.Names#SNAPSHOT} executor: the repository data is loaded, the snapshot
* name is checked for uniqueness, and a cluster state update task then replaces the initializing entry with a started or
* failed one. Failures along the way remove the snapshot from the cluster state and are reported to the listener.
*
* @param clusterState cluster state
* @param snapshot snapshot meta data
* @param partial allow partial snapshots
* @param indices indices passed to {@code RepositoryData#resolveNewIndices}; when empty the snapshot is ended immediately
* @param repository repository to create the snapshot in; a read-only repository is rejected with a {@link RepositoryException}
* @param userCreateSnapshotListener listener
*/
private void beginSnapshot(final ClusterState clusterState, final SnapshotsInProgress.Entry snapshot, final boolean partial, final List<String> indices, final Repository repository, final ActionListener<Snapshot> userCreateSnapshotListener) {
threadPool.executor(ThreadPool.Names.SNAPSHOT).execute(new AbstractRunnable() {
// Set in execute() when this snapshot's entry is found ABORTED; read in clusterStateProcessed() to end that entry.
boolean hadAbortedInitializations;
@Override
protected void doRun() {
assert initializingSnapshots.contains(snapshot.snapshot());
if (repository.isReadOnly()) {
throw new RepositoryException(repository.getMetadata().name(), "cannot create snapshot in a readonly repository");
}
final String snapshotName = snapshot.snapshot().getSnapshotId().getName();
final StepListener<RepositoryData> repositoryDataListener = new StepListener<>();
repository.getRepositoryData(repositoryDataListener);
// Continue once the repository data is available; failures are routed to this runnable's onFailure.
repositoryDataListener.whenComplete(repositoryData -> {
// check if the snapshot name already exists in the repository
if (repositoryData.getSnapshotIds().stream().anyMatch(s -> s.getName().equals(snapshotName))) {
throw new InvalidSnapshotNameException(repository.getMetadata().name(), snapshotName, "snapshot with the same name already exists");
}
if (clusterState.nodes().getMinNodeVersion().onOrAfter(NO_REPO_INITIALIZE_VERSION) == false) {
// In mixed version clusters we initialize the snapshot in the repository so that in case of a master failover to an
// older version master node snapshot finalization (that assumes initializeSnapshot was called) produces a valid
// snapshot.
repository.initializeSnapshot(snapshot.snapshot().getSnapshotId(), snapshot.indices(), metadataForSnapshot(snapshot, clusterState.metadata()));
}
logger.info("snapshot [{}] started", snapshot.snapshot());
final Version version = minCompatibleVersion(clusterState.nodes().getMinNodeVersion(), repositoryData, null);
if (indices.isEmpty()) {
// No indices in this snapshot - we are done
userCreateSnapshotListener.onResponse(snapshot.snapshot());
endSnapshot(SnapshotsInProgress.startedEntry(snapshot.snapshot(), snapshot.includeGlobalState(), snapshot.partial(), Collections.emptyList(), Collections.emptyList(), threadPool.absoluteTimeInMillis(), repositoryData.getGenId(), ImmutableOpenMap.of(), snapshot.userMetadata(), version), clusterState.metadata(), repositoryData);
return;
}
clusterService.submitStateUpdateTask("update_snapshot [" + snapshot.snapshot() + "]", new ClusterStateUpdateTask() {
@Override
public ClusterState execute(ClusterState currentState) {
SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE);
List<SnapshotsInProgress.Entry> entries = new ArrayList<>();
for (SnapshotsInProgress.Entry entry : snapshots.entries()) {
// Entries that belong to other snapshots are carried over unchanged.
if (entry.snapshot().equals(snapshot.snapshot()) == false) {
entries.add(entry);
continue;
}
if (entry.state() == State.ABORTED) {
// This snapshot was aborted while initializing: keep the entry as is and remember to end it afterwards.
entries.add(entry);
assert entry.shards().isEmpty();
hadAbortedInitializations = true;
} else {
final List<IndexId> indexIds = repositoryData.resolveNewIndices(indices, Collections.emptyMap());
// Replace the snapshot that was just initialized
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards = shards(snapshots, currentState.custom(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY), currentState.metadata(), currentState.routingTable(), indexIds, useShardGenerations(version), repositoryData, entry.repository());
// A non-partial snapshot is marked FAILED when indicesWithMissingShards reports any index; per the messages
// below, v1 holds indices without primary shards and v2 holds closed indices.
if (!partial) {
Tuple<Set<String>, Set<String>> indicesWithMissingShards = indicesWithMissingShards(shards, currentState.metadata());
Set<String> missing = indicesWithMissingShards.v1();
Set<String> closed = indicesWithMissingShards.v2();
if (missing.isEmpty() == false || closed.isEmpty() == false) {
final StringBuilder failureMessage = new StringBuilder();
if (missing.isEmpty() == false) {
failureMessage.append("Indices don't have primary shards ");
failureMessage.append(missing);
}
if (closed.isEmpty() == false) {
if (failureMessage.length() > 0) {
failureMessage.append("; ");
}
failureMessage.append("Indices are closed ");
failureMessage.append(closed);
}
entries.add(new SnapshotsInProgress.Entry(entry, State.FAILED, indexIds, repositoryData.getGenId(), shards, version, failureMessage.toString()));
continue;
}
}
entries.add(new SnapshotsInProgress.Entry(entry, State.STARTED, indexIds, repositoryData.getGenId(), shards, version, null));
}
}
return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, SnapshotsInProgress.of(unmodifiableList(entries))).build();
}
@Override
public void onFailure(String source, Exception e) {
logger.warn(() -> new ParameterizedMessage("[{}] failed to create snapshot", snapshot.snapshot().getSnapshotId()), e);
removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, null, new CleanupAfterErrorListener(userCreateSnapshotListener, e));
}
@Override
public void onNoLongerMaster(String source) {
// We are no longer a master - we shouldn't try to do any cleanup
// The new master will take care of it
logger.warn("[{}] failed to create snapshot - no longer a master", snapshot.snapshot().getSnapshotId());
userCreateSnapshotListener.onFailure(new SnapshotException(snapshot.snapshot(), "master changed during snapshot initialization"));
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
// The userCreateSnapshotListener.onResponse() notifies caller that the snapshot was accepted
// for processing. If client wants to wait for the snapshot completion, it can register snapshot
// completion listener in this method. For the snapshot completion to work properly, the snapshot
// should still exist when listener is registered.
userCreateSnapshotListener.onResponse(snapshot.snapshot());
if (hadAbortedInitializations) {
// The entry was aborted during initialization: end exactly that entry using the state that was just applied.
final SnapshotsInProgress snapshotsInProgress = newState.custom(SnapshotsInProgress.TYPE);
assert snapshotsInProgress != null;
final SnapshotsInProgress.Entry entry = snapshotsInProgress.snapshot(snapshot.snapshot());
assert entry != null;
endSnapshot(entry, newState.metadata(), repositoryData);
} else {
endCompletedSnapshots(newState);
}
}
});
}, this::onFailure);
}
@Override
public void onFailure(Exception e) {
// Any failure before or outside the cluster state update: log it, remove the snapshot from the cluster state and
// hand the error to the user listener via CleanupAfterErrorListener.
logger.warn(() -> new ParameterizedMessage("failed to create snapshot [{}]", snapshot.snapshot().getSnapshotId()), e);
removeFailedSnapshotFromClusterState(snapshot.snapshot(), e, null, new CleanupAfterErrorListener(userCreateSnapshotListener, e));
}
});
}
Use of org.opensearch.repositories.RepositoryException in project OpenSearch by opensearch-project.
From the class SnapshotsService, method createSnapshot.
/**
* Initializes the snapshotting process.
* <p>
* This method is used by clients to start a snapshot. It rejects read-only repositories up front and then runs a cluster
* state update that validates the request (snapshot name not already used or running, no repository cleanup in progress,
* concurrency limits) and, if accepted, adds a started snapshot entry to the {@link SnapshotsInProgress} custom in the
* cluster state.
* <p>
* When the minimum node version is before {@code FULL_CONCURRENCY_VERSION}, the request is additionally rejected while a
* snapshot deletion is in progress or while any running snapshot is in a state other than {@code INIT}.
*
* @param request snapshot request
* @param listener snapshot creation listener; receives the {@link Snapshot} once the cluster state update has been
*                 processed, or the failure otherwise
*/
public void createSnapshot(final CreateSnapshotRequest request, final ActionListener<Snapshot> listener) {
final String repositoryName = request.repository();
final String snapshotName = indexNameExpressionResolver.resolveDateMathExpression(request.snapshot());
validate(repositoryName, snapshotName);
// TODO: create snapshot UUID in CreateSnapshotRequest and make this operation idempotent to cleanly deal with transport layer
// retries
// new UUID for the snapshot
final SnapshotId snapshotId = new SnapshotId(snapshotName, UUIDs.randomBase64UUID());
Repository repository = repositoriesService.repository(request.repository());
if (repository.isReadOnly()) {
listener.onFailure(new RepositoryException(repository.getMetadata().name(), "cannot create snapshot in a readonly repository"));
return;
}
final Snapshot snapshot = new Snapshot(repositoryName, snapshotId);
final Map<String, Object> userMeta = repository.adaptUserMetadata(request.userMetadata());
repository.executeConsistentStateUpdate(repositoryData -> new ClusterStateUpdateTask() {
// Entry created in execute(); read again in clusterStateProcessed().
private SnapshotsInProgress.Entry newEntry;
@Override
public ClusterState execute(ClusterState currentState) {
ensureSnapshotNameAvailableInRepo(repositoryData, snapshotName, repository);
final SnapshotsInProgress snapshots = currentState.custom(SnapshotsInProgress.TYPE, SnapshotsInProgress.EMPTY);
final List<SnapshotsInProgress.Entry> runningSnapshots = snapshots.entries();
ensureSnapshotNameNotRunning(runningSnapshots, repositoryName, snapshotName);
validate(repositoryName, snapshotName, currentState);
final boolean concurrentOperationsAllowed = currentState.nodes().getMinNodeVersion().onOrAfter(FULL_CONCURRENCY_VERSION);
final SnapshotDeletionsInProgress deletionsInProgress = currentState.custom(SnapshotDeletionsInProgress.TYPE, SnapshotDeletionsInProgress.EMPTY);
if (deletionsInProgress.hasDeletionsInProgress() && concurrentOperationsAllowed == false) {
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, "cannot snapshot while a snapshot deletion is in-progress in [" + deletionsInProgress + "]");
}
final RepositoryCleanupInProgress repositoryCleanupInProgress = currentState.custom(RepositoryCleanupInProgress.TYPE, RepositoryCleanupInProgress.EMPTY);
if (repositoryCleanupInProgress.hasCleanupInProgress()) {
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, "cannot snapshot while a repository cleanup is in-progress in [" + repositoryCleanupInProgress + "]");
}
// When concurrent operations are not allowed, fail if any running snapshot is in a state other than INIT.
// NOTE(review): the start of the original comment is missing here; its surviving tail suggests INIT entries are
// ignored because they would be cleaned up from the cluster state anyway in #applyClusterState - confirm.
if (concurrentOperationsAllowed == false && runningSnapshots.stream().anyMatch(entry -> entry.state() != State.INIT)) {
throw new ConcurrentSnapshotExecutionException(repositoryName, snapshotName, " a snapshot is already running");
}
ensureNoCleanupInProgress(currentState, repositoryName, snapshotName);
ensureBelowConcurrencyLimit(repositoryName, snapshotName, snapshots, deletionsInProgress);
// Store newSnapshot here to be processed in clusterStateProcessed
List<String> indices = Arrays.asList(indexNameExpressionResolver.concreteIndexNames(currentState, request));
final List<String> dataStreams = indexNameExpressionResolver.dataStreamNames(currentState, request.indicesOptions(), request.indices());
logger.trace("[{}][{}] creating snapshot for indices [{}]", repositoryName, snapshotName, indices);
final List<IndexId> indexIds = repositoryData.resolveNewIndices(indices, getInFlightIndexIds(runningSnapshots, repositoryName));
final Version version = minCompatibleVersion(currentState.nodes().getMinNodeVersion(), repositoryData, null);
ImmutableOpenMap<ShardId, ShardSnapshotStatus> shards = shards(snapshots, deletionsInProgress, currentState.metadata(), currentState.routingTable(), indexIds, useShardGenerations(version), repositoryData, repositoryName);
// A non-partial snapshot is rejected when any shard is in state MISSING.
if (request.partial() == false) {
Set<String> missing = new HashSet<>();
for (ObjectObjectCursor<ShardId, SnapshotsInProgress.ShardSnapshotStatus> entry : shards) {
if (entry.value.state() == ShardState.MISSING) {
missing.add(entry.key.getIndex().getName());
}
}
if (missing.isEmpty() == false) {
throw new SnapshotException(new Snapshot(repositoryName, snapshotId), "Indices don't have primary shards " + missing);
}
}
newEntry = SnapshotsInProgress.startedEntry(new Snapshot(repositoryName, snapshotId), request.includeGlobalState(), request.partial(), indexIds, dataStreams, threadPool.absoluteTimeInMillis(), repositoryData.getGenId(), shards, userMeta, version);
// Append the new entry to the snapshots that are already running.
final List<SnapshotsInProgress.Entry> newEntries = new ArrayList<>(runningSnapshots);
newEntries.add(newEntry);
return ClusterState.builder(currentState).putCustom(SnapshotsInProgress.TYPE, SnapshotsInProgress.of(new ArrayList<>(newEntries))).build();
}
@Override
public void onFailure(String source, Exception e) {
logger.warn(() -> new ParameterizedMessage("[{}][{}] failed to create snapshot", repositoryName, snapshotName), e);
listener.onFailure(e);
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, final ClusterState newState) {
try {
logger.info("snapshot [{}] started", snapshot);
listener.onResponse(snapshot);
} finally {
// Runs even if the listener throws: an entry that is already in a completed state is ended right away.
if (newEntry.state().completed()) {
endSnapshot(newEntry, newState.metadata(), repositoryData);
}
}
}
@Override
public TimeValue timeout() {
return request.masterNodeTimeout();
}
}, "create_snapshot [" + snapshotName + ']', listener::onFailure);
}
Use of org.opensearch.repositories.RepositoryException in project OpenSearch by opensearch-project.
From the class BlobStoreRepository, method doGetRepositoryData.
/**
 * Loads the {@link RepositoryData} for the current repository generation and passes it to the given listener.
 * <p>
 * Loading is retried when it fails with a {@link RepositoryException} while the known generation changed concurrently,
 * unless the same generation already failed on the previous attempt. Every exit path notifies the listener exactly once.
 *
 * @param listener receives the loaded repository data or the failure
 */
private void doGetRepositoryData(ActionListener<RepositoryData> listener) {
    // Remember the generation that failed last so that failing on the same generation twice ends the retry loop.
    long previouslyFailedGen = RepositoryData.UNKNOWN_REPO_GEN;
    for (;;) {
        final long targetGen;
        if (bestEffortConsistency == false) {
            // Only the generation tracked in #latestKnownRepoGen (exclusively updated from the cluster state) is trusted here.
            targetGen = latestKnownRepoGen.get();
        } else {
            // In this mode #latestKnownRepoGen is merely a hint; listing the repository contents may reveal a higher
            // generation.
            final long listedGen;
            try {
                listedGen = latestIndexBlobId();
            } catch (IOException ioe) {
                listener.onFailure(new RepositoryException(metadata.name(), "Could not determine repository generation from root blobs", ioe));
                return;
            }
            targetGen = latestKnownRepoGen.updateAndGet(known -> Math.max(known, listedGen));
            if (targetGen > listedGen) {
                logger.info("Determined repository generation [" + listedGen + "] from repository contents but correct generation must be at least [" + targetGen + "]");
            }
        }
        try {
            final Tuple<Long, BytesReference> cacheEntry = latestKnownRepositoryData.get();
            final RepositoryData result;
            // The cache is never consulted with #bestEffortConsistency (see docs on #cacheRepositoryData) and is only
            // usable when it holds exactly the generation we are about to load.
            if (bestEffortConsistency || cacheEntry == null || cacheEntry.v1() != targetGen) {
                result = getRepositoryData(targetGen);
                // Serializing in the most recent version is fine regardless of the repository metadata version: only
                // information that was just written gets cached, so nothing unsafe can end up in the cache.
                cacheRepositoryData(BytesReference.bytes(result.snapshotsToXContent(XContentFactory.jsonBuilder(), Version.CURRENT)), targetGen);
            } else {
                result = repositoryDataFromCachedEntry(cacheEntry);
            }
            listener.onResponse(result);
            return;
        } catch (RepositoryException e) {
            // Retry when the generation moved concurrently, but not if this very generation already failed last time.
            if (targetGen != latestKnownRepoGen.get() && targetGen != previouslyFailedGen) {
                previouslyFailedGen = targetGen;
                logger.warn("Failed to load repository data generation [" + targetGen + "] because a concurrent operation moved the current generation to [" + latestKnownRepoGen.get() + "]", e);
                continue;
            }
            final boolean expectedIndexBlobMissing = bestEffortConsistency == false
                && ExceptionsHelper.unwrap(e, NoSuchFileException.class) != null;
            if (expectedIndexBlobMissing) {
                // The cluster state still points at index-N but that blob could not be found, so the repository is
                // marked as corrupted.
                markRepoCorrupted(genToLoadFailure(targetGen), e, ActionListener.wrap(v -> listener.onFailure(corruptedStateException(e)), listener::onFailure));
            } else {
                listener.onFailure(e);
            }
            return;
        } catch (Exception e) {
            listener.onFailure(new RepositoryException(metadata.name(), "Unexpected exception when loading repository data", e));
            return;
        }
    }
}

// Identity helper that names the generation handed to markRepoCorrupted.
private static long genToLoadFailure(long generation) {
    return generation;
}
Use of org.opensearch.repositories.RepositoryException in project OpenSearch by opensearch-project.
From the class ExceptionSerializationTests, method testRepositoryException.
/**
 * Round-trips {@link RepositoryException} through serialization, once with a repository name and once with a
 * {@code null} name, and checks the repository accessor and the rendered message in both cases.
 */
public void testRepositoryException() throws IOException {
    // Named repository: the name is kept and used as the message prefix.
    final RepositoryException withRepoName = serialize(new RepositoryException("repo", "msg"));
    assertEquals("repo", withRepoName.repository());
    assertEquals("[repo] msg", withRepoName.getMessage());

    // Null repository: the accessor stays null while the message uses the "_na" placeholder.
    final RepositoryException withoutRepoName = serialize(new RepositoryException(null, "msg"));
    assertNull(withoutRepoName.repository());
    assertEquals("[_na] msg", withoutRepoName.getMessage());
}
Aggregations