use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.
the class IndexMetadataUpdater method updateInSyncAllocations.
/**
* Updates in-sync allocations with routing changes that were made to the routing table.
* <p>
* Two paths are handled. If a primary was initialized with an allocation id that is not part of the (non-empty) old in-sync set,
* the set is reset: to the empty set for an {@code EMPTY_STORE} recovery source, otherwise to a singleton set. In every other case
* the added ids are merged in, the removed ids are dropped, and the result is trimmed to {@code number_of_replicas + 1} entries
* if it grew beyond that bound.
*
* @param newRoutingTable      routing table used to look up the current routing entries of the shard
* @param oldIndexMetadata     index metadata that supplies the previous in-sync set and the number of replicas
* @param indexMetadataBuilder builder to write into; if {@code null}, one is created from {@code oldIndexMetadata} before the first write
* @param shardId              shard whose in-sync allocation ids are updated
* @param updates              added and removed allocation ids, plus the initialized primary and first failed primary (each may be null)
* @return the builder that was passed in or created here; {@code null} if none was passed in and nothing was written
*/
private IndexMetadata.Builder updateInSyncAllocations(RoutingTable newRoutingTable, IndexMetadata oldIndexMetadata, IndexMetadata.Builder indexMetadataBuilder, ShardId shardId, Updates updates) {
assert Sets.haveEmptyIntersection(updates.addedAllocationIds, updates.removedAllocationIds) : "allocation ids cannot be both added and removed in the same allocation round, added ids: " + updates.addedAllocationIds + ", removed ids: " + updates.removedAllocationIds;
Set<String> oldInSyncAllocationIds = oldIndexMetadata.inSyncAllocationIds(shardId.id());
// check if we have been force-initializing an empty primary or a stale primary
if (updates.initializedPrimary != null && oldInSyncAllocationIds.isEmpty() == false && oldInSyncAllocationIds.contains(updates.initializedPrimary.allocationId().getId()) == false) {
// we're not reusing an existing in-sync allocation id to initialize a primary, which means that we're either force-allocating
// an empty or a stale primary (see AllocateEmptyPrimaryAllocationCommand or AllocateStalePrimaryAllocationCommand).
RecoverySource recoverySource = updates.initializedPrimary.recoverySource();
RecoverySource.Type recoverySourceType = recoverySource.getType();
boolean emptyPrimary = recoverySourceType == RecoverySource.Type.EMPTY_STORE;
assert updates.addedAllocationIds.isEmpty() : (emptyPrimary ? "empty" : "stale") + " primary is not force-initialized in same allocation round where shards are started";
// the builder may not exist yet (null); create it from the old metadata before writing to it
if (indexMetadataBuilder == null) {
indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
}
if (emptyPrimary) {
// forcing an empty primary resets the in-sync allocations to the empty set (ShardRouting.allocatedPostIndexCreate)
indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.emptySet());
} else {
final String allocationId;
// identity comparison against the FORCE_STALE_PRIMARY_INSTANCE constant selects the forced (fake) allocation id
if (recoverySource == RecoverySource.ExistingStoreRecoverySource.FORCE_STALE_PRIMARY_INSTANCE) {
allocationId = RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID;
} else {
// any other non-empty-store source is expected to be a snapshot recovery; the new primary's own allocation id is used
assert recoverySource instanceof RecoverySource.SnapshotRecoverySource : recoverySource;
allocationId = updates.initializedPrimary.allocationId().getId();
}
// forcing a stale primary resets the in-sync allocations to the singleton set with the stale id
indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.singleton(allocationId));
}
} else {
// standard path for updating in-sync ids
// work on a copy so that the old set can still be compared against below
Set<String> inSyncAllocationIds = new HashSet<>(oldInSyncAllocationIds);
inSyncAllocationIds.addAll(updates.addedAllocationIds);
inSyncAllocationIds.removeAll(updates.removedAllocationIds);
// if the old set contained the forced (fake) allocation id, it must be gone after applying the updates
assert oldInSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false || inSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false : "fake allocation id has to be removed, inSyncAllocationIds:" + inSyncAllocationIds;
// Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
// but repeatedly shut down nodes that have active replicas.
// We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
// Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
// of replicas was decreased while shards were unassigned.
// +1 for the primary
int maxActiveShards = oldIndexMetadata.getNumberOfReplicas() + 1;
IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
// the allocation id of a relocation target must not already be part of the in-sync set
assert newShardRoutingTable.assignedShards().stream().filter(ShardRouting::isRelocationTarget).map(s -> s.allocationId().getId()).noneMatch(inSyncAllocationIds::contains) : newShardRoutingTable.assignedShards() + " vs " + inSyncAllocationIds;
// trim only if the set grew in this round and now exceeds the bound
if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
// trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards().stream().filter(s -> s.isRelocationTarget() == false).collect(Collectors.toList());
assert assignedShards.size() <= maxActiveShards : "cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
// Boolean ordering is false < true, so the reversed comparator sorts ids that have a routing entry to the front;
// limit() then keeps at most maxActiveShards of them
inSyncAllocationIds = inSyncAllocationIds.stream().sorted(// values with routing entries first
Comparator.comparing(assignedAllocations::contains).reversed()).limit(maxActiveShards).collect(Collectors.toSet());
}
// Only drop the allocation id of a failed primary if at least one active shard remains. If no active shard is left
// (there is nothing to fail over to) and the id of the failed primary were removed from the
// in-sync set, this could create an empty primary on the next allocation.
if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
// add back allocation id of failed primary
inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
}
assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() : "in-sync allocations cannot become empty after they have been non-empty: " + oldInSyncAllocationIds;
// be extra safe here and only update in-sync set if it is non-empty
if (inSyncAllocationIds.isEmpty() == false) {
// the builder may not exist yet (null); create it from the old metadata before writing to it
if (indexMetadataBuilder == null) {
indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
}
indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), inSyncAllocationIds);
}
}
return indexMetadataBuilder;
}
use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.
the class RestoreInProgressAllocationDecider method canAllocate.
/**
 * Decides whether a shard that recovers from a snapshot may be allocated.
 * <p>
 * Returns YES when the shard does not recover from a snapshot, when the restore was not started through the API,
 * or when the matching restore entry reports the shard as not yet completed. Returns NO in every other case.
 *
 * @param shardRouting the shard to decide on
 * @param allocation   the allocation context that provides the restore-in-progress custom and builds the decision
 * @return the allocation decision together with its explanation
 */
@Override
public Decision canAllocate(final ShardRouting shardRouting, final RoutingAllocation allocation) {
    final RecoverySource recoverySource = shardRouting.recoverySource();
    final boolean fromSnapshot = recoverySource != null && recoverySource.getType() == RecoverySource.Type.SNAPSHOT;
    if (fromSnapshot == false) {
        return allocation.decision(Decision.YES, NAME, "ignored as shard is not being recovered from a snapshot");
    }

    final RecoverySource.SnapshotRecoverySource source = (RecoverySource.SnapshotRecoverySource) recoverySource;
    final String restoreUuid = source.restoreUUID();
    if (restoreUuid.equals(RecoverySource.SnapshotRecoverySource.NO_API_RESTORE_UUID)) {
        return allocation.decision(Decision.YES, NAME, "not an API-level restore");
    }

    // Walk restores -> entry -> shard status, propagating null as soon as one level is missing.
    final RestoreInProgress restoresInProgress = allocation.custom(RestoreInProgress.TYPE);
    final RestoreInProgress.Entry entry = restoresInProgress == null ? null : restoresInProgress.get(restoreUuid);
    final RestoreInProgress.ShardRestoreStatus status = entry == null ? null : entry.shards().get(shardRouting.shardId());

    if (status != null && status.state().completed() == false) {
        assert status.state() != RestoreInProgress.State.SUCCESS : "expected shard ["
            + shardRouting
            + "] to be in initializing state but got ["
            + status.state()
            + "]";
        return allocation.decision(Decision.YES, NAME, "shard is currently being restored");
    }

    // No running restore covers this shard any more.
    return allocation.decision(
        Decision.NO,
        NAME,
        "shard has failed to be restored from the snapshot [%s] because of [%s] - "
            + "manually close or delete the index [%s] in order to retry to restore the snapshot again or use the reroute API to force the "
            + "allocation of an empty primary shard",
        source.snapshot(),
        shardRouting.unassignedInfo().getDetails(),
        shardRouting.getIndexName()
    );
}
use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.
the class RestoreInProgressAllocationDeciderTests method testCanAllocatePrimaryExistingInRestoreInProgress.
/**
 * Checks the decision for a primary that is listed in a restore-in-progress entry: a shard whose restore state is
 * STARTED or INIT yields YES, a shard whose restore state is FAILURE yields NO with the failure details in the explanation.
 */
public void testCanAllocatePrimaryExistingInRestoreInProgress() {
    final RecoverySource.SnapshotRecoverySource snapshotSource = createSnapshotRecoverySource("_existing");

    // Add the "test" index to the routing table as a restore from the snapshot source.
    ClusterState state = createInitialClusterState();
    RoutingTable routing = RoutingTable.builder(state.getRoutingTable())
        .addAsRestore(state.getMetadata().index("test"), snapshotSource)
        .build();
    state = ClusterState.builder(state).routingTable(routing).build();

    ShardRouting primary = state.getRoutingTable().shardRoutingTable("test", 0).primaryShard();
    assertEquals(ShardRoutingState.UNASSIGNED, primary.state());
    assertEquals(RecoverySource.Type.SNAPSHOT, primary.recoverySource().getType());

    routing = state.routingTable();

    final RestoreInProgress.State shardState;
    if (randomBoolean() == false) {
        shardState = RestoreInProgress.State.FAILURE;

        // Copy the unassigned info, replacing only the failure with an I/O exception.
        final UnassignedInfo before = primary.unassignedInfo();
        final UnassignedInfo withFailure = new UnassignedInfo(
            before.getReason(),
            before.getMessage(),
            new IOException("i/o failure"),
            before.getNumFailedAllocations(),
            before.getUnassignedTimeInNanos(),
            before.getUnassignedTimeInMillis(),
            before.isDelayed(),
            before.getLastAllocationStatus(),
            before.getFailedNodeIds()
        );
        primary = primary.updateUnassigned(withFailure, primary.recoverySource());

        // Rebuild the index routing table with the updated primary swapped in for the old one.
        final IndexRoutingTable current = routing.index("test");
        final IndexRoutingTable.Builder rebuilt = IndexRoutingTable.builder(current.getIndex());
        for (final ObjectCursor<IndexShardRoutingTable> cursor : current.getShards().values()) {
            for (final ShardRouting copy : cursor.value.getShards()) {
                rebuilt.addShard(copy.primary() ? primary : copy);
            }
        }
        routing = RoutingTable.builder(routing).add(rebuilt).build();
    } else {
        shardState = randomFrom(RestoreInProgress.State.STARTED, RestoreInProgress.State.INIT);
    }

    // Publish a STARTED restore entry that lists the primary with the chosen shard state.
    final ImmutableOpenMap.Builder<ShardId, RestoreInProgress.ShardRestoreStatus> shardStatuses = ImmutableOpenMap.builder();
    shardStatuses.put(primary.shardId(), new RestoreInProgress.ShardRestoreStatus(state.getNodes().getLocalNodeId(), shardState));
    final RestoreInProgress.Entry entry = new RestoreInProgress.Entry(
        snapshotSource.restoreUUID(),
        snapshotSource.snapshot(),
        RestoreInProgress.State.STARTED,
        singletonList("test"),
        shardStatuses.build()
    );
    final RestoreInProgress restores = new RestoreInProgress.Builder().add(entry).build();
    state = ClusterState.builder(state).putCustom(RestoreInProgress.TYPE, restores).routingTable(routing).build();

    final Decision decision = executeAllocation(state, primary);
    if (shardState != RestoreInProgress.State.FAILURE) {
        assertEquals(Decision.Type.YES, decision.type());
        assertEquals("shard is currently being restored", decision.getExplanation());
        return;
    }
    assertEquals(Decision.Type.NO, decision.type());
    assertEquals(
        "shard has failed to be restored from the snapshot [_repository:_existing/_uuid] because of "
            + "[restore_source[_repository/_existing], failure IOException[i/o failure]] - manually close or delete the index "
            + "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of "
            + "an empty primary shard",
        decision.getExplanation()
    );
}
use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.
the class IndexShardTestCase method recoverShardFromSnapshot.
/**
 * Restores the given shard from a snapshot stored in the supplied repository and blocks until the restore call completes.
 *
 * @param shard      the shard to restore into; it is marked as recovering before the restore starts
 * @param snapshot   the snapshot to restore from
 * @param repository the repository that performs the shard restore
 */
protected void recoverShardFromSnapshot(final IndexShard shard, final Snapshot snapshot, final Repository repository) {
    final ShardId shardId = shard.shardId();
    final IndexId indexId = new IndexId(shardId.getIndex().getName(), shardId.getIndex().getUUID());
    final DiscoveryNode targetNode = getFakeDiscoNode(shard.routingEntry().currentNodeId());

    // Build an initializing primary routing entry whose recovery source points at the snapshot.
    final RecoverySource.SnapshotRecoverySource snapshotSource = new RecoverySource.SnapshotRecoverySource(
        UUIDs.randomBase64UUID(),
        snapshot,
        Version.CURRENT,
        indexId
    );
    final ShardRouting initializingPrimary = TestShardRouting.newShardRouting(
        shardId,
        targetNode.getId(),
        true,
        ShardRoutingState.INITIALIZING,
        snapshotSource
    );
    shard.markAsRecovering("from snapshot", new RecoveryState(initializingPrimary, targetNode, null));

    // Run the restore and wait for its listener to complete.
    final PlainActionFuture<Void> restoreDone = PlainActionFuture.newFuture();
    repository.restoreShard(shard.store(), snapshot.getSnapshotId(), indexId, shardId, shard.recoveryState(), restoreDone);
    restoreDone.actionGet();
}
use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.
the class IndexRecoveryIT method testReplicaRecovery.
/**
 * Indexes documents on one node, optionally closes the index, then starts a second node and raises the replica count to one.
 * Verifies the recovery state reported for each node and that the documents are still searchable after the first node is stopped.
 */
public void testReplicaRecovery() throws Exception {
    final String nodeA = internalCluster().startNode();
    final Settings indexSettings = Settings.builder()
        .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, SHARD_COUNT)
        .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, REPLICA_COUNT)
        .build();
    createIndex(INDEX_NAME, indexSettings);
    ensureGreen(INDEX_NAME);

    final int docCount = scaledRandomIntBetween(0, 200);
    try (BackgroundIndexer backgroundIndexer = new BackgroundIndexer(INDEX_NAME, "_doc", client(), docCount)) {
        waitForDocs(docCount, backgroundIndexer);
    }
    refresh(INDEX_NAME);
    assertHitCount(client().prepareSearch(INDEX_NAME).setSize(0).get(), docCount);

    final boolean closedIndex = randomBoolean();
    if (closedIndex) {
        assertAcked(client().admin().indices().prepareClose(INDEX_NAME));
        ensureGreen(INDEX_NAME);
    }

    // force a shard recovery from nodeA to nodeB
    final String nodeB = internalCluster().startNode();
    assertAcked(
        client().admin()
            .indices()
            .prepareUpdateSettings(INDEX_NAME)
            .setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1))
    );
    ensureGreen(INDEX_NAME);

    // two shard copies are expected in total: one recovery targeting each node
    final RecoveryResponse recoveries = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet();
    final List<RecoveryState> allStates = recoveries.shardRecoveryStates().get(INDEX_NAME);
    assertThat(allStates.size(), equalTo(2));
    final List<RecoveryState> targetingA = findRecoveriesForTargetNode(nodeA, allStates);
    assertThat(targetingA.size(), equalTo(1));
    final List<RecoveryState> targetingB = findRecoveriesForTargetNode(nodeB, allStates);
    assertThat(targetingB.size(), equalTo(1));

    // validate node A recovery: an existing store is expected if the index was closed, an empty store otherwise
    final RecoveryState stateOnA = targetingA.get(0);
    final RecoverySource expectedSourceOnA = closedIndex
        ? RecoverySource.ExistingStoreRecoverySource.INSTANCE
        : RecoverySource.EmptyStoreRecoverySource.INSTANCE;
    assertRecoveryState(stateOnA, 0, expectedSourceOnA, true, Stage.DONE, null, nodeA);
    validateIndexRecoveryState(stateOnA.getIndex());

    // validate node B recovery: a peer recovery from node A
    final RecoveryState stateOnB = targetingB.get(0);
    assertRecoveryState(stateOnB, 0, PeerRecoverySource.INSTANCE, false, Stage.DONE, nodeA, nodeB);
    validateIndexRecoveryState(stateOnB.getIndex());

    // stop node A, then confirm that every document can still be found
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeA));
    if (closedIndex) {
        assertAcked(client().admin().indices().prepareOpen(INDEX_NAME));
    }
    assertHitCount(client().prepareSearch(INDEX_NAME).setSize(0).get(), docCount);
}
Aggregations