Use of org.opensearch.cluster.routing.ShardRouting in project OpenSearch by opensearch-project.
The class DiskThresholdMonitor, method markNodesMissingUsageIneligibleForRelease.
private void markNodesMissingUsageIneligibleForRelease(RoutingNodes routingNodes, ImmutableOpenMap<String, DiskUsage> usages, Set<String> indicesToMarkIneligibleForAutoRelease) {
    for (RoutingNode routingNode : routingNodes) {
        if (usages.containsKey(routingNode.nodeId()) == false) {
            if (routingNode != null) {
                for (ShardRouting routing : routingNode) {
                    String indexName = routing.index().getName();
                    indicesToMarkIneligibleForAutoRelease.add(indexName);
                }
            }
        }
    }
}
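In this monitor, a node missing from the DiskUsage map means its disk state is unknown, so every index with a shard copy on that node is kept in the set of indices that must not have their block auto-released. The only ShardRouting calls involved are routing.index() and getName() on the returned Index. As a point of reference, a minimal sketch of the same iteration pattern, here grouping index names by hosting node id (collectIndicesByNode is a hypothetical helper, not an OpenSearch API):

// Illustrative sketch only: group index names by the node id that hosts their shard copies.
private static Map<String, Set<String>> collectIndicesByNode(RoutingNodes routingNodes) {
    Map<String, Set<String>> indicesByNode = new HashMap<>();
    for (RoutingNode routingNode : routingNodes) {
        for (ShardRouting routing : routingNode) {
            // ShardRouting exposes the owning index; RoutingNode exposes the node id
            indicesByNode.computeIfAbsent(routingNode.nodeId(), k -> new HashSet<>()).add(routing.index().getName());
        }
    }
    return indicesByNode;
}

The nested loop works because RoutingNodes is iterable over RoutingNode, and each RoutingNode is in turn iterable over its ShardRouting entries, as in the snippet above.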
Use of org.opensearch.cluster.routing.ShardRouting in project OpenSearch by opensearch-project.
The class IndexMetadataUpdater, method removeStaleIdsWithoutRoutings.
/**
 * Removes allocation ids from the in-sync set for shard copies for which there are no routing entries in the routing table.
 * This method is called in AllocationService before any changes to the routing table are made.
 */
public static ClusterState removeStaleIdsWithoutRoutings(ClusterState clusterState, List<StaleShard> staleShards, Logger logger) {
    Metadata oldMetadata = clusterState.metadata();
    RoutingTable oldRoutingTable = clusterState.routingTable();
    Metadata.Builder metadataBuilder = null;
    // group staleShards entries by index
    for (Map.Entry<Index, List<StaleShard>> indexEntry : staleShards.stream().collect(Collectors.groupingBy(fs -> fs.getShardId().getIndex())).entrySet()) {
        final IndexMetadata oldIndexMetadata = oldMetadata.getIndexSafe(indexEntry.getKey());
        IndexMetadata.Builder indexMetadataBuilder = null;
        // group staleShards entries by shard id
        for (Map.Entry<ShardId, List<StaleShard>> shardEntry : indexEntry.getValue().stream().collect(Collectors.groupingBy(staleShard -> staleShard.getShardId())).entrySet()) {
            int shardNumber = shardEntry.getKey().getId();
            Set<String> oldInSyncAllocations = oldIndexMetadata.inSyncAllocationIds(shardNumber);
            Set<String> idsToRemove = shardEntry.getValue().stream().map(e -> e.getAllocationId()).collect(Collectors.toSet());
            assert idsToRemove.stream().allMatch(id -> oldRoutingTable.getByAllocationId(shardEntry.getKey(), id) == null) : "removing stale ids: " + idsToRemove + ", some of which have still a routing entry: " + oldRoutingTable;
            Set<String> remainingInSyncAllocations = Sets.difference(oldInSyncAllocations, idsToRemove);
            assert remainingInSyncAllocations.isEmpty() == false : "Set of in-sync ids cannot become empty for shard " + shardEntry.getKey() + " (before: " + oldInSyncAllocations + ", ids to remove: " + idsToRemove + ")";
            // be extra safe here: if the in-sync set were to become empty, this could create an empty primary on the next allocation (see ShardRouting#allocatedPostIndexCreate)
            if (remainingInSyncAllocations.isEmpty() == false) {
                if (indexMetadataBuilder == null) {
                    indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
                }
                indexMetadataBuilder.putInSyncAllocationIds(shardNumber, remainingInSyncAllocations);
            }
            logger.warn("{} marking unavailable shards as stale: {}", shardEntry.getKey(), idsToRemove);
        }
        if (indexMetadataBuilder != null) {
            if (metadataBuilder == null) {
                metadataBuilder = Metadata.builder(oldMetadata);
            }
            metadataBuilder.put(indexMetadataBuilder);
        }
    }
    if (metadataBuilder != null) {
        return ClusterState.builder(clusterState).metadata(metadataBuilder).build();
    } else {
        return clusterState;
    }
}
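The assertion in this method documents the invariant the cleanup relies on: an allocation id may only be dropped from the in-sync set once no ShardRouting in the routing table carries it. A minimal sketch of that check in isolation, reusing RoutingTable#getByAllocationId exactly as the assertion does (isStale is a hypothetical helper, not part of IndexMetadataUpdater):

// Illustrative sketch only: an allocation id is stale in this sense when no ShardRouting
// in the routing table carries it any more.
private static boolean isStale(RoutingTable routingTable, ShardId shardId, String allocationId) {
    ShardRouting routing = routingTable.getByAllocationId(shardId, allocationId);
    return routing == null;
}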
Use of org.opensearch.cluster.routing.ShardRouting in project OpenSearch by opensearch-project.
The class IndexMetadataUpdater, method updateInSyncAllocations.
/**
* Updates in-sync allocations with routing changes that were made to the routing table.
*/
private IndexMetadata.Builder updateInSyncAllocations(RoutingTable newRoutingTable, IndexMetadata oldIndexMetadata, IndexMetadata.Builder indexMetadataBuilder, ShardId shardId, Updates updates) {
    assert Sets.haveEmptyIntersection(updates.addedAllocationIds, updates.removedAllocationIds) : "allocation ids cannot be both added and removed in the same allocation round, added ids: " + updates.addedAllocationIds + ", removed ids: " + updates.removedAllocationIds;
    Set<String> oldInSyncAllocationIds = oldIndexMetadata.inSyncAllocationIds(shardId.id());
    // check if we have been force-initializing an empty primary or a stale primary
    if (updates.initializedPrimary != null && oldInSyncAllocationIds.isEmpty() == false && oldInSyncAllocationIds.contains(updates.initializedPrimary.allocationId().getId()) == false) {
        // we're not reusing an existing in-sync allocation id to initialize a primary, which means that we're either force-allocating
        // an empty or a stale primary (see AllocateEmptyPrimaryAllocationCommand or AllocateStalePrimaryAllocationCommand).
        RecoverySource recoverySource = updates.initializedPrimary.recoverySource();
        RecoverySource.Type recoverySourceType = recoverySource.getType();
        boolean emptyPrimary = recoverySourceType == RecoverySource.Type.EMPTY_STORE;
        assert updates.addedAllocationIds.isEmpty() : (emptyPrimary ? "empty" : "stale") + " primary is not force-initialized in same allocation round where shards are started";
        if (indexMetadataBuilder == null) {
            indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
        }
        if (emptyPrimary) {
            // forcing an empty primary resets the in-sync allocations to the empty set (ShardRouting.allocatedPostIndexCreate)
            indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.emptySet());
        } else {
            final String allocationId;
            if (recoverySource == RecoverySource.ExistingStoreRecoverySource.FORCE_STALE_PRIMARY_INSTANCE) {
                allocationId = RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID;
            } else {
                assert recoverySource instanceof RecoverySource.SnapshotRecoverySource : recoverySource;
                allocationId = updates.initializedPrimary.allocationId().getId();
            }
            // forcing a stale primary resets the in-sync allocations to the singleton set with the stale id
            indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.singleton(allocationId));
        }
    } else {
        // standard path for updating in-sync ids
        Set<String> inSyncAllocationIds = new HashSet<>(oldInSyncAllocationIds);
        inSyncAllocationIds.addAll(updates.addedAllocationIds);
        inSyncAllocationIds.removeAll(updates.removedAllocationIds);
        assert oldInSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false || inSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false : "fake allocation id has to be removed, inSyncAllocationIds:" + inSyncAllocationIds;
        // Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
        // but repeatedly shut down nodes that have active replicas.
        // We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
        // Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
        // of replicas was decreased while shards were unassigned.
        // +1 for the primary
        int maxActiveShards = oldIndexMetadata.getNumberOfReplicas() + 1;
        IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
        assert newShardRoutingTable.assignedShards().stream().filter(ShardRouting::isRelocationTarget).map(s -> s.allocationId().getId()).noneMatch(inSyncAllocationIds::contains) : newShardRoutingTable.assignedShards() + " vs " + inSyncAllocationIds;
        if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
            // trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
            List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards().stream().filter(s -> s.isRelocationTarget() == false).collect(Collectors.toList());
            assert assignedShards.size() <= maxActiveShards : "cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
            Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
            // values with routing entries first
            inSyncAllocationIds = inSyncAllocationIds.stream()
                .sorted(Comparator.comparing(assignedAllocations::contains).reversed())
                .limit(maxActiveShards)
                .collect(Collectors.toSet());
        }
        // if the failed primary was the last active shard copy, removing its allocation id from the
        // in-sync set could create an empty primary on the next allocation, so add it back below.
        if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
            // add back allocation id of failed primary
            inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
        }
        assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() : "in-sync allocations cannot become empty after they have been non-empty: " + oldInSyncAllocationIds;
        // be extra safe here and only update in-sync set if it is non-empty
        if (inSyncAllocationIds.isEmpty() == false) {
            if (indexMetadataBuilder == null) {
                indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
            }
            indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), inSyncAllocationIds);
        }
    }
    return indexMetadataBuilder;
}
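The trimming branch above is the part that touches ShardRouting directly: it collects the allocation ids of the currently assigned, non-relocation-target copies and uses them to sort the in-sync set so that ids still backed by a routing entry survive the limit of number_of_replicas + 1. A minimal sketch of just that step, assuming assignedAllocationIds was built from IndexShardRoutingTable#assignedShards as in the method (trimInSyncIds is a hypothetical helper):

// Illustrative sketch only: keep at most maxActiveShards ids, preferring ids that still
// have a corresponding assigned ShardRouting in the routing table.
private static Set<String> trimInSyncIds(Set<String> inSyncAllocationIds, Set<String> assignedAllocationIds, int maxActiveShards) {
    return inSyncAllocationIds.stream()
        .sorted(Comparator.comparing((String id) -> assignedAllocationIds.contains(id)).reversed())
        .limit(maxActiveShards)
        .collect(Collectors.toSet());
}

Because Comparator.comparing on a Boolean key sorts false before true, reversing it puts ids with routing entries first, so they are the ones kept by limit().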
Use of org.opensearch.cluster.routing.ShardRouting in project OpenSearch by opensearch-project.
The class AllocationService, method allocateExistingUnassignedShards.
private void allocateExistingUnassignedShards(RoutingAllocation allocation) {
    // sort for priority ordering
    allocation.routingNodes().unassigned().sort(PriorityComparator.getAllocationComparator(allocation));
    for (final ExistingShardsAllocator existingShardsAllocator : existingShardsAllocators.values()) {
        existingShardsAllocator.beforeAllocation(allocation);
    }
    final RoutingNodes.UnassignedShards.UnassignedIterator primaryIterator = allocation.routingNodes().unassigned().iterator();
    while (primaryIterator.hasNext()) {
        final ShardRouting shardRouting = primaryIterator.next();
        if (shardRouting.primary()) {
            getAllocatorForShard(shardRouting, allocation).allocateUnassigned(shardRouting, allocation, primaryIterator);
        }
    }
    for (final ExistingShardsAllocator existingShardsAllocator : existingShardsAllocators.values()) {
        existingShardsAllocator.afterPrimariesBeforeReplicas(allocation);
    }
    final RoutingNodes.UnassignedShards.UnassignedIterator replicaIterator = allocation.routingNodes().unassigned().iterator();
    while (replicaIterator.hasNext()) {
        final ShardRouting shardRouting = replicaIterator.next();
        if (shardRouting.primary() == false) {
            getAllocatorForShard(shardRouting, allocation).allocateUnassigned(shardRouting, allocation, replicaIterator);
        }
    }
}
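The two passes matter because primary copies must be allocated before the replicas that will recover from them; ShardRouting#primary is the only accessor the loops need to tell the two apart. A minimal sketch of the same unassigned traversal, here only counting unassigned primaries and replicas (countUnassigned is a hypothetical helper and assumes the unassigned shards collection is iterable over ShardRouting, as the iterators above suggest):

// Illustrative sketch only: count how many unassigned shard copies are primaries vs. replicas.
private static int[] countUnassigned(RoutingNodes routingNodes) {
    int primaries = 0;
    int replicas = 0;
    for (ShardRouting shardRouting : routingNodes.unassigned()) {
        if (shardRouting.primary()) {
            primaries++;
        } else {
            replicas++;
        }
    }
    return new int[] { primaries, replicas };
}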
Use of org.opensearch.cluster.routing.ShardRouting in project OpenSearch by opensearch-project.
The class AllocationService, method applyFailedShards.
/**
 * Applies the failed shards. Note that only assigned ShardRouting instances that exist in the routing table should be
 * provided as parameters. Also applies a list of allocation ids to remove from the in-sync set for shard copies for which there
 * are no routing entries in the routing table.
 *
 * <p>
 * If the same instance of ClusterState is returned, then no change has been made.</p>
 */
public ClusterState applyFailedShards(final ClusterState clusterState, final List<FailedShard> failedShards, final List<StaleShard> staleShards) {
    assert assertInitialized();
    if (staleShards.isEmpty() && failedShards.isEmpty()) {
        return clusterState;
    }
    ClusterState tmpState = IndexMetadataUpdater.removeStaleIdsWithoutRoutings(clusterState, staleShards, logger);
    RoutingNodes routingNodes = getMutableRoutingNodes(tmpState);
    // shuffle the unassigned nodes, just so we won't have things like poison failed shards
    routingNodes.unassigned().shuffle();
    long currentNanoTime = currentNanoTime();
    RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, tmpState, clusterInfoService.getClusterInfo(), snapshotsInfoService.snapshotShardSizes(), currentNanoTime);
    for (FailedShard failedShardEntry : failedShards) {
        ShardRouting shardToFail = failedShardEntry.getRoutingEntry();
        IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardToFail.shardId().getIndex());
        allocation.addIgnoreShardForNode(shardToFail.shardId(), shardToFail.currentNodeId());
        // failing a primary also fails initializing replica shards, re-resolve ShardRouting
        ShardRouting failedShard = routingNodes.getByAllocationId(shardToFail.shardId(), shardToFail.allocationId().getId());
        if (failedShard != null) {
            if (failedShard != shardToFail) {
                logger.trace("{} shard routing modified in an earlier iteration (previous: {}, current: {})", shardToFail.shardId(), shardToFail, failedShard);
            }
            int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
            final Set<String> failedNodeIds;
            if (failedShard.unassignedInfo() != null) {
                failedNodeIds = new HashSet<>(failedShard.unassignedInfo().getFailedNodeIds().size() + 1);
                failedNodeIds.addAll(failedShard.unassignedInfo().getFailedNodeIds());
                failedNodeIds.add(failedShard.currentNodeId());
            } else {
                failedNodeIds = Collections.emptySet();
            }
            String message = "failed shard on node [" + shardToFail.currentNodeId() + "]: " + failedShardEntry.getMessage();
            UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, message, failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false, UnassignedInfo.AllocationStatus.NO_ATTEMPT, failedNodeIds);
            if (failedShardEntry.markAsStale()) {
                allocation.removeAllocationId(failedShard);
            }
            logger.warn(new ParameterizedMessage("failing shard [{}]", failedShardEntry), failedShardEntry.getFailure());
            routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetadata, allocation.changes());
        } else {
            logger.trace("{} shard routing failed in an earlier iteration (routing: {})", shardToFail.shardId(), shardToFail);
        }
    }
    for (final ExistingShardsAllocator allocator : existingShardsAllocators.values()) {
        allocator.applyFailedShards(failedShards, allocation);
    }
    reroute(allocation);
    String failedShardsAsString = firstListElementsToCommaDelimitedString(failedShards, s -> s.getRoutingEntry().shardId().toString(), logger.isDebugEnabled());
    return buildResultAndLogHealthChange(clusterState, allocation, "shards failed [" + failedShardsAsString + "]");
}
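Callers hand this method FailedShard entries wrapping the assigned ShardRouting instances to fail; the getters used above (getRoutingEntry, getMessage, getFailure, markAsStale) indicate the shape of such an entry. A hedged sketch of wrapping a single failed routing entry and applying it, where failAndReroute is a hypothetical helper and the four-argument FailedShard constructor is assumed from those getters rather than confirmed:

// Illustrative sketch only: wrap one failed ShardRouting and ask the AllocationService to apply it.
private static ClusterState failAndReroute(AllocationService allocationService, ClusterState state, ShardRouting failedRouting, Exception failure) {
    // markAsStale = true removes the copy's allocation id from the in-sync set, as in the loop above
    FailedShard failedShard = new FailedShard(failedRouting, "shard failure reported by caller", failure, true);
    return allocationService.applyFailedShards(state, Collections.singletonList(failedShard), Collections.emptyList());
}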