use of org.opensearch.cluster.metadata.IndexMetadata in project OpenSearch by opensearch-project.
the class TransportBroadcastReplicationAction method shards.
/**
 * Resolves the request to concrete index names and collects the shard id of every
 * shard routing table entry that belongs to those indices.
 *
 * @param request      the request whose index expressions are resolved
 * @param clusterState the cluster state supplying index metadata and the routing table
 * @return all shard ids the request should run on; index names without metadata contribute nothing
 */
protected List<ShardId> shards(Request request, ClusterState clusterState) {
    final List<ShardId> targets = new ArrayList<>();
    for (String indexName : indexNameExpressionResolver.concreteIndexNames(clusterState, request)) {
        if (clusterState.metadata().getIndices().get(indexName) == null) {
            // no metadata registered under this name, so there are no shards to target
            continue;
        }
        for (IntObjectCursor<IndexShardRoutingTable> cursor : clusterState.getRoutingTable().indicesRouting().get(indexName).getShards()) {
            targets.add(cursor.value.shardId());
        }
    }
    return targets;
}
use of org.opensearch.cluster.metadata.IndexMetadata in project OpenSearch by opensearch-project.
the class IndexMetadataUpdater method removeStaleIdsWithoutRoutings.
/**
 * Removes allocation ids from the in-sync set for shard copies that have no routing entry in the routing table.
 * This method is called in AllocationService before any changes to the routing table are made.
 *
 * @param clusterState the cluster state whose index metadata is inspected
 * @param staleShards  the shard copies (shard id plus allocation id) to drop from the in-sync sets
 * @param logger       receives one warning per affected shard
 * @return {@code clusterState} itself when no in-sync set was rewritten, otherwise a new cluster state
 *         carrying the updated metadata
 */
public static ClusterState removeStaleIdsWithoutRoutings(ClusterState clusterState, List<StaleShard> staleShards, Logger logger) {
    final Metadata currentMetadata = clusterState.metadata();
    final RoutingTable currentRoutingTable = clusterState.routingTable();

    // first bucket the stale entries by the index they belong to
    final Map<Index, List<StaleShard>> staleByIndex = staleShards.stream()
        .collect(Collectors.groupingBy(stale -> stale.getShardId().getIndex()));

    Metadata.Builder updatedMetadata = null;
    for (Map.Entry<Index, List<StaleShard>> perIndex : staleByIndex.entrySet()) {
        final IndexMetadata currentIndexMetadata = currentMetadata.getIndexSafe(perIndex.getKey());

        // then bucket this index's entries by shard id
        final Map<ShardId, List<StaleShard>> staleByShard = perIndex.getValue().stream()
            .collect(Collectors.groupingBy(StaleShard::getShardId));

        IndexMetadata.Builder updatedIndexMetadata = null;
        for (Map.Entry<ShardId, List<StaleShard>> perShard : staleByShard.entrySet()) {
            final ShardId staleShardId = perShard.getKey();
            final int shardNumber = staleShardId.getId();
            final Set<String> previousInSync = currentIndexMetadata.inSyncAllocationIds(shardNumber);
            final Set<String> staleIds = perShard.getValue().stream()
                .map(StaleShard::getAllocationId)
                .collect(Collectors.toSet());
            assert staleIds.stream().allMatch(id -> currentRoutingTable.getByAllocationId(staleShardId, id) == null) : "removing stale ids: " + staleIds + ", some of which have still a routing entry: " + currentRoutingTable;

            final Set<String> stillInSync = Sets.difference(previousInSync, staleIds);
            assert stillInSync.isEmpty() == false : "Set of in-sync ids cannot become empty for shard " + staleShardId + " (before: " + previousInSync + ", ids to remove: " + staleIds + ")";

            // be extra safe: never write an empty in-sync set (see ShardRouting#allocatedPostIndexCreate)
            if (stillInSync.isEmpty() == false) {
                if (updatedIndexMetadata == null) {
                    updatedIndexMetadata = IndexMetadata.builder(currentIndexMetadata);
                }
                updatedIndexMetadata.putInSyncAllocationIds(shardNumber, stillInSync);
            }
            logger.warn("{} marking unavailable shards as stale: {}", staleShardId, staleIds);
        }

        if (updatedIndexMetadata != null) {
            // builders are created lazily so an untouched state can be returned as-is
            if (updatedMetadata == null) {
                updatedMetadata = Metadata.builder(currentMetadata);
            }
            updatedMetadata.put(updatedIndexMetadata);
        }
    }

    if (updatedMetadata == null) {
        return clusterState;
    }
    return ClusterState.builder(clusterState).metadata(updatedMetadata).build();
}
use of org.opensearch.cluster.metadata.IndexMetadata in project OpenSearch by opensearch-project.
the class IndexMetadataUpdater method updateInSyncAllocations.
/**
* Updates in-sync allocations with routing changes that were made to the routing table.
* <p>
* Two paths exist. When a primary was initialized with an allocation id that is not part of the shard's
* existing, non-empty in-sync set, the in-sync set is reset: to the empty set if the recovery source type is
* EMPTY_STORE, otherwise to a singleton set. In every other case the added ids are merged in, the removed ids
* are dropped, the set is trimmed if it grew beyond number_of_replicas + 1, and the result is written only if
* it is non-empty.
*
* @param newRoutingTable routing table used to look up the shard's current routing entries
* @param oldIndexMetadata index metadata supplying the previous in-sync set and the number of replicas
* @param indexMetadataBuilder builder that receives the change; may be {@code null}, in which case a builder is
*        created from {@code oldIndexMetadata} the first time something is written
* @param shardId the shard whose in-sync set is updated
* @param updates the added and removed allocation ids, plus the initialized primary and the first failed
*        primary (either of which may be {@code null})
* @return the builder that was passed in, a newly created builder, or {@code null} if none was passed in and
*         nothing was written
*/
private IndexMetadata.Builder updateInSyncAllocations(RoutingTable newRoutingTable, IndexMetadata oldIndexMetadata, IndexMetadata.Builder indexMetadataBuilder, ShardId shardId, Updates updates) {
assert Sets.haveEmptyIntersection(updates.addedAllocationIds, updates.removedAllocationIds) : "allocation ids cannot be both added and removed in the same allocation round, added ids: " + updates.addedAllocationIds + ", removed ids: " + updates.removedAllocationIds;
Set<String> oldInSyncAllocationIds = oldIndexMetadata.inSyncAllocationIds(shardId.id());
// check if we have been force-initializing an empty primary or a stale primary
if (updates.initializedPrimary != null && oldInSyncAllocationIds.isEmpty() == false && oldInSyncAllocationIds.contains(updates.initializedPrimary.allocationId().getId()) == false) {
// we're not reusing an existing in-sync allocation id to initialize a primary, which means that we're either force-allocating
// an empty or a stale primary (see AllocateEmptyPrimaryAllocationCommand or AllocateStalePrimaryAllocationCommand).
RecoverySource recoverySource = updates.initializedPrimary.recoverySource();
RecoverySource.Type recoverySourceType = recoverySource.getType();
// EMPTY_STORE selects the "empty primary" branch; every other type on this path is handled as a stale primary
boolean emptyPrimary = recoverySourceType == RecoverySource.Type.EMPTY_STORE;
assert updates.addedAllocationIds.isEmpty() : (emptyPrimary ? "empty" : "stale") + " primary is not force-initialized in same allocation round where shards are started";
if (indexMetadataBuilder == null) {
indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
}
if (emptyPrimary) {
// forcing an empty primary resets the in-sync allocations to the empty set (ShardRouting.allocatedPostIndexCreate)
indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.emptySet());
} else {
final String allocationId;
// the force-stale-primary recovery source is recorded under the fixed FORCED_ALLOCATION_ID instead of the
// primary's own allocation id; the only other source expected here is a snapshot recovery source
if (recoverySource == RecoverySource.ExistingStoreRecoverySource.FORCE_STALE_PRIMARY_INSTANCE) {
allocationId = RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID;
} else {
assert recoverySource instanceof RecoverySource.SnapshotRecoverySource : recoverySource;
allocationId = updates.initializedPrimary.allocationId().getId();
}
// forcing a stale primary resets the in-sync allocations to the singleton set with the stale id
indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.singleton(allocationId));
}
} else {
// standard path for updating in-sync ids
Set<String> inSyncAllocationIds = new HashSet<>(oldInSyncAllocationIds);
inSyncAllocationIds.addAll(updates.addedAllocationIds);
inSyncAllocationIds.removeAll(updates.removedAllocationIds);
assert oldInSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false || inSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false : "fake allocation id has to be removed, inSyncAllocationIds:" + inSyncAllocationIds;
// Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
// but repeatedly shut down nodes that have active replicas.
// We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
// Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
// of replicas was decreased while shards were unassigned.
// +1 for the primary
int maxActiveShards = oldIndexMetadata.getNumberOfReplicas() + 1;
IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
assert newShardRoutingTable.assignedShards().stream().filter(ShardRouting::isRelocationTarget).map(s -> s.allocationId().getId()).noneMatch(inSyncAllocationIds::contains) : newShardRoutingTable.assignedShards() + " vs " + inSyncAllocationIds;
if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
// trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards().stream().filter(s -> s.isRelocationTarget() == false).collect(Collectors.toList());
assert assignedShards.size() <= maxActiveShards : "cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
// order ids that have an assigned routing entry ahead of those that do not, then keep at most maxActiveShards ids
inSyncAllocationIds = inSyncAllocationIds.stream().sorted(// values with routing entries first
Comparator.comparing(assignedAllocations::contains).reversed()).limit(maxActiveShards).collect(Collectors.toSet());
}
// Only drop the allocation id of a failed primary when at least one active shard remains. If no shard is
// active and the failed primary's id were removed from the in-sync set, this could create an empty primary
// on the next allocation.
if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
// add back allocation id of failed primary
inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
}
assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() : "in-sync allocations cannot become empty after they have been non-empty: " + oldInSyncAllocationIds;
// be extra safe here and only update in-sync set if it is non-empty
if (inSyncAllocationIds.isEmpty() == false) {
if (indexMetadataBuilder == null) {
indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
}
indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), inSyncAllocationIds);
}
}
return indexMetadataBuilder;
}
use of org.opensearch.cluster.metadata.IndexMetadata in project OpenSearch by opensearch-project.
the class AllocationService method applyFailedShards.
/**
 * Applies the failed shards. Note, only assigned ShardRouting instances that exist in the routing table should be
 * provided as parameter. Also applies a list of allocation ids to remove from the in-sync set for shard copies for
 * which there are no routing entries in the routing table.
 *
 * <p>
 * If the same instance of ClusterState is returned, then no change has been made.</p>
 *
 * @param clusterState the cluster state to apply the failures to
 * @param failedShards the shard copies to fail; each is re-resolved by allocation id before it is failed
 * @param staleShards  the shard copies to remove from the in-sync sets before failures are applied
 * @return {@code clusterState} itself when both lists are empty, otherwise the result built after rerouting
 */
public ClusterState applyFailedShards(final ClusterState clusterState, final List<FailedShard> failedShards, final List<StaleShard> staleShards) {
    assert assertInitialized();
    if (staleShards.isEmpty() && failedShards.isEmpty()) {
        return clusterState;
    }

    final ClusterState stateWithoutStaleIds = IndexMetadataUpdater.removeStaleIdsWithoutRoutings(clusterState, staleShards, logger);
    final RoutingNodes routingNodes = getMutableRoutingNodes(stateWithoutStaleIds);
    // shuffle the unassigned nodes, just so we won't have things like poison failed shards
    routingNodes.unassigned().shuffle();
    final long nowNanos = currentNanoTime();
    final RoutingAllocation allocation = new RoutingAllocation(
        allocationDeciders,
        routingNodes,
        stateWithoutStaleIds,
        clusterInfoService.getClusterInfo(),
        snapshotsInfoService.snapshotShardSizes(),
        nowNanos
    );

    for (FailedShard entry : failedShards) {
        final ShardRouting requested = entry.getRoutingEntry();
        final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(requested.shardId().getIndex());
        allocation.addIgnoreShardForNode(requested.shardId(), requested.currentNodeId());

        // failing a primary also fails initializing replica shards, so look the routing up again
        final ShardRouting current = routingNodes.getByAllocationId(requested.shardId(), requested.allocationId().getId());
        if (current == null) {
            logger.trace("{} shard routing failed in an earlier iteration (routing: {})", requested.shardId(), requested);
            continue;
        }
        if (current != requested) {
            logger.trace("{} shard routing modified in an earlier iteration (previous: {}, current: {})", requested.shardId(), requested, current);
        }

        // carry over the failure history from the shard's existing unassigned info, if it has one
        final UnassignedInfo previousInfo = current.unassignedInfo();
        final int priorFailures;
        final Set<String> failedNodeIds;
        if (previousInfo == null) {
            priorFailures = 0;
            failedNodeIds = Collections.emptySet();
        } else {
            priorFailures = previousInfo.getNumFailedAllocations();
            failedNodeIds = new HashSet<>(previousInfo.getFailedNodeIds().size() + 1);
            failedNodeIds.addAll(previousInfo.getFailedNodeIds());
            failedNodeIds.add(current.currentNodeId());
        }

        final String message = "failed shard on node [" + requested.currentNodeId() + "]: " + entry.getMessage();
        final UnassignedInfo failureInfo = new UnassignedInfo(
            UnassignedInfo.Reason.ALLOCATION_FAILED,
            message,
            entry.getFailure(),
            priorFailures + 1,
            nowNanos,
            System.currentTimeMillis(),
            false,
            UnassignedInfo.AllocationStatus.NO_ATTEMPT,
            failedNodeIds
        );
        if (entry.markAsStale()) {
            allocation.removeAllocationId(current);
        }
        logger.warn(new ParameterizedMessage("failing shard [{}]", entry), entry.getFailure());
        routingNodes.failShard(logger, current, failureInfo, indexMetadata, allocation.changes());
    }

    // let every existing-shards allocator react to the failures before rerouting
    for (final ExistingShardsAllocator shardsAllocator : existingShardsAllocators.values()) {
        shardsAllocator.applyFailedShards(failedShards, allocation);
    }
    reroute(allocation);

    final String failedShardsAsString = firstListElementsToCommaDelimitedString(
        failedShards,
        s -> s.getRoutingEntry().shardId().toString(),
        logger.isDebugEnabled()
    );
    return buildResultAndLogHealthChange(clusterState, allocation, "shards failed [" + failedShardsAsString + "]");
}
use of org.opensearch.cluster.metadata.IndexMetadata in project OpenSearch by opensearch-project.
the class CancelAllocationCommand method execute.
/**
 * Cancels the allocation of the configured shard on the configured node by failing its routing entry and
 * removing its allocation id.
 *
 * @param allocation the routing allocation to operate on
 * @param explain    when {@code true}, a refusal is reported as a {@code Decision.NO} explanation instead of
 *                   being thrown
 * @return the explanation of the decision that was taken
 * @throws IndexNotFoundException   if the node has a routing node entry but the index has no metadata
 * @throws IllegalArgumentException if {@code explain} is {@code false} and the shard is not found on the node,
 *                                  or it is a primary that may not be cancelled
 */
@Override
public RerouteExplanation execute(RoutingAllocation allocation, boolean explain) {
    final DiscoveryNode discoNode = allocation.nodes().resolveNode(node);
    final RoutingNodes routingNodes = allocation.routingNodes();
    final RoutingNode routingNode = routingNodes.node(discoNode.getId());

    IndexMetadata indexMetadata = null;
    ShardRouting shardRouting = null;
    if (routingNode != null) {
        indexMetadata = allocation.metadata().index(index());
        if (indexMetadata == null) {
            throw new IndexNotFoundException(index());
        }
        shardRouting = routingNode.getByShardId(new ShardId(indexMetadata.getIndex(), shardId()));
    }

    if (shardRouting == null) {
        final String reason = "can't cancel " + shardId + ", failed to find it on node " + discoNode;
        if (explain) {
            return new RerouteExplanation(this, allocation.decision(Decision.NO, "cancel_allocation_command", reason));
        }
        throw new IllegalArgumentException("[cancel_allocation] " + reason);
    }

    // only allow cancelling initializing shard of primary relocation without allowPrimary flag
    if (shardRouting.primary()
        && allowPrimary == false
        && (shardRouting.initializing() && shardRouting.relocatingNodeId() != null) == false) {
        final String reason = "can't cancel " + shardId + " on node " + discoNode + ", shard is primary and " + shardRouting.state().name().toLowerCase(Locale.ROOT);
        if (explain) {
            return new RerouteExplanation(this, allocation.decision(Decision.NO, "cancel_allocation_command", reason));
        }
        throw new IllegalArgumentException("[cancel_allocation] " + reason);
    }

    routingNodes.failShard(
        LogManager.getLogger(CancelAllocationCommand.class),
        shardRouting,
        new UnassignedInfo(UnassignedInfo.Reason.REROUTE_CANCELLED, null),
        indexMetadata,
        allocation.changes()
    );
    // TODO: We don't have to remove a cancelled shard from in-sync set once we have a strict resync implementation.
    allocation.removeAllocationId(shardRouting);
    return new RerouteExplanation(
        this,
        allocation.decision(Decision.YES, "cancel_allocation_command", "shard " + shardId + " on node " + discoNode + " can be cancelled")
    );
}
Aggregations