Search in sources:

Example 1 with RoutingNodes

Use of org.opensearch.cluster.routing.RoutingNodes in project OpenSearch by opensearch-project.

From the class TransportClusterAllocationExplainAction, method masterOperation.

@Override
protected void masterOperation(final ClusterAllocationExplainRequest request, final ClusterState state, final ActionListener<ClusterAllocationExplainResponse> listener) {
    final RoutingNodes routingNodes = state.getRoutingNodes();
    final ClusterInfo clusterInfo = clusterInfoService.getClusterInfo();
    final RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, state, clusterInfo, snapshotsInfoService.snapshotShardSizes(), System.nanoTime());
    ShardRouting shardRouting = findShardToExplain(request, allocation);
    logger.debug("explaining the allocation for [{}], found shard [{}]", request, shardRouting);
    ClusterAllocationExplanation cae = explainShard(shardRouting, allocation, request.includeDiskInfo() ? clusterInfo : null, request.includeYesDecisions(), allocationService);
    listener.onResponse(new ClusterAllocationExplainResponse(cae));
}
Also used : ClusterInfo(org.opensearch.cluster.ClusterInfo) RoutingNodes(org.opensearch.cluster.routing.RoutingNodes) RoutingAllocation(org.opensearch.cluster.routing.allocation.RoutingAllocation) ShardRouting(org.opensearch.cluster.routing.ShardRouting)
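
A short sketch (not taken from the OpenSearch sources) of how a consumer might walk the RoutingNodes view obtained this way. The class and method names (RoutingNodesWalkExample, summarizeShardPlacement) are hypothetical; the RoutingNodes, RoutingNode, and ShardRouting calls mirror those used in the examples on this page.

import org.apache.logging.log4j.Logger;
import org.opensearch.cluster.ClusterState;
import org.opensearch.cluster.routing.RoutingNode;
import org.opensearch.cluster.routing.RoutingNodes;
import org.opensearch.cluster.routing.ShardRouting;

class RoutingNodesWalkExample {
    // Walk every node's assigned shards; RoutingNodes is iterable over RoutingNode,
    // and each RoutingNode is iterable over the ShardRouting entries assigned to it.
    static void summarizeShardPlacement(ClusterState state, Logger logger) {
        RoutingNodes routingNodes = state.getRoutingNodes();
        for (RoutingNode routingNode : routingNodes) {
            for (ShardRouting shard : routingNode) {
                logger.trace("node [{}] hosts shard [{}] in state [{}]", routingNode.nodeId(), shard.shardId(), shard.state());
            }
        }
        logger.trace("[{}] shards remain unassigned", routingNodes.unassigned().size());
    }
}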

Example 2 with RoutingNodes

Use of org.opensearch.cluster.routing.RoutingNodes in project OpenSearch by opensearch-project.

From the class ReplicaShardAllocator, method makeAllocationDecision.

@Override
public AllocateUnassignedDecision makeAllocationDecision(final ShardRouting unassignedShard, final RoutingAllocation allocation, final Logger logger) {
    if (isResponsibleFor(unassignedShard) == false) {
        // this allocator is not responsible for deciding on this shard
        return AllocateUnassignedDecision.NOT_TAKEN;
    }
    final RoutingNodes routingNodes = allocation.routingNodes();
    final boolean explain = allocation.debugDecision();
    // pre-check if it can be allocated to any node that currently exists, so we won't list the store for it for nothing
    Tuple<Decision, Map<String, NodeAllocationResult>> result = canBeAllocatedToAtLeastOneNode(unassignedShard, allocation);
    Decision allocateDecision = result.v1();
    if (allocateDecision.type() != Decision.Type.YES && (explain == false || hasInitiatedFetching(unassignedShard) == false)) {
        // only return early if we are not in explain mode, or we are in explain mode but we have not
        // yet attempted to fetch any shard data
        logger.trace("{}: ignoring allocation, can't be allocated on any node", unassignedShard);
        return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), result.v2() != null ? new ArrayList<>(result.v2().values()) : null);
    }
    AsyncShardFetch.FetchResult<NodeStoreFilesMetadata> shardStores = fetchData(unassignedShard, allocation);
    if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", unassignedShard);
        allocation.setHasPendingAsyncFetch();
        List<NodeAllocationResult> nodeDecisions = null;
        if (explain) {
            nodeDecisions = buildDecisionsForAllNodes(unassignedShard, allocation);
        }
        return AllocateUnassignedDecision.no(AllocationStatus.FETCHING_SHARD_DATA, nodeDecisions);
    }
    ShardRouting primaryShard = routingNodes.activePrimary(unassignedShard.shardId());
    if (primaryShard == null) {
        assert explain : "primary should only be null here if we are in explain mode, so we didn't " + "exit early when canBeAllocatedToAtLeastOneNode didn't return a YES decision";
        return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), new ArrayList<>(result.v2().values()));
    }
    assert primaryShard.currentNodeId() != null;
    final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
    final TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore = findStore(primaryNode, shardStores);
    if (primaryStore == null) {
        // if we can't find the primary data, it is probably because the primary shard is corrupted (and listing failed)
        // we want to let the replica be allocated in order to expose the actual problem with the primary that the replica
        // will try and recover from
        // Note, this is the existing behavior, as exposed in running CorruptFileTest#testNoPrimaryData
        logger.trace("{}: no primary shard store found or allocated, letting actual allocation figure it out", unassignedShard);
        return AllocateUnassignedDecision.NOT_TAKEN;
    }
    MatchingNodes matchingNodes = findMatchingNodes(unassignedShard, allocation, false, primaryNode, primaryStore, shardStores, explain);
    assert explain == false || matchingNodes.nodeDecisions != null : "in explain mode, we must have individual node decisions";
    List<NodeAllocationResult> nodeDecisions = augmentExplanationsWithStoreInfo(result.v2(), matchingNodes.nodeDecisions);
    if (allocateDecision.type() != Decision.Type.YES) {
        return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), nodeDecisions);
    } else if (matchingNodes.getNodeWithHighestMatch() != null) {
        RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().getId());
        // we only check on THROTTLE since we checked before on NO
        Decision decision = allocation.deciders().canAllocate(unassignedShard, nodeWithHighestMatch, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
            logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store", unassignedShard.index(), unassignedShard.id(), unassignedShard, nodeWithHighestMatch.node());
            // we are throttling this, as we have enough other shards to allocate to this node, so ignore it for now
            return AllocateUnassignedDecision.throttle(nodeDecisions);
        } else {
            logger.debug("[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store", unassignedShard.index(), unassignedShard.id(), unassignedShard, nodeWithHighestMatch.node());
            // we found a match
            return AllocateUnassignedDecision.yes(nodeWithHighestMatch.node(), null, nodeDecisions, true);
        }
    } else if (matchingNodes.hasAnyData() == false && unassignedShard.unassignedInfo().isDelayed()) {
        // if we didn't manage to find *any* data (regardless of matching sizes), and the replica is
        // unassigned due to a node leaving, delay allocation of this replica to see if the
        // node with the shard copy will rejoin so we can reuse that copy
        logger.debug("{}: allocation of [{}] is delayed", unassignedShard.shardId(), unassignedShard);
        long remainingDelayMillis = 0L;
        long totalDelayMillis = 0L;
        if (explain) {
            UnassignedInfo unassignedInfo = unassignedShard.unassignedInfo();
            Metadata metadata = allocation.metadata();
            IndexMetadata indexMetadata = metadata.index(unassignedShard.index());
            totalDelayMillis = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexMetadata.getSettings()).getMillis();
            long remainingDelayNanos = unassignedInfo.getRemainingDelay(System.nanoTime(), indexMetadata.getSettings());
            remainingDelayMillis = TimeValue.timeValueNanos(remainingDelayNanos).millis();
        }
        return AllocateUnassignedDecision.delayed(remainingDelayMillis, totalDelayMillis, nodeDecisions);
    }
    return AllocateUnassignedDecision.NOT_TAKEN;
}
Also used : NodeStoreFilesMetadata(org.opensearch.indices.store.TransportNodesListShardStoreMetadata.NodeStoreFilesMetadata) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) RoutingNodes(org.opensearch.cluster.routing.RoutingNodes) UnassignedInfo(org.opensearch.cluster.routing.UnassignedInfo) ArrayList(java.util.ArrayList) Metadata(org.opensearch.cluster.metadata.Metadata) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) StoreFileMetadata(org.opensearch.index.store.StoreFileMetadata) TransportNodesListShardStoreMetadata(org.opensearch.indices.store.TransportNodesListShardStoreMetadata) Decision(org.opensearch.cluster.routing.allocation.decider.Decision) AllocateUnassignedDecision(org.opensearch.cluster.routing.allocation.AllocateUnassignedDecision) RoutingNode(org.opensearch.cluster.routing.RoutingNode) ShardRouting(org.opensearch.cluster.routing.ShardRouting) HashMap(java.util.HashMap) Map(java.util.Map) NodeAllocationResult(org.opensearch.cluster.routing.allocation.NodeAllocationResult)
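
A minimal sketch isolating the delay arithmetic from the explain branch above: the remaining delay is computed in nanoseconds and reported in milliseconds. The class and method names are hypothetical; the UnassignedInfo and TimeValue calls are the ones used in the snippet.

import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.cluster.routing.UnassignedInfo;
import org.opensearch.common.unit.TimeValue;

class RemainingDelayExample {
    // The remaining delay is derived from the shard's index settings and the current
    // clock, then converted from nanos to millis for reporting.
    static long remainingDelayMillis(UnassignedInfo unassignedInfo, IndexMetadata indexMetadata) {
        long remainingDelayNanos = unassignedInfo.getRemainingDelay(System.nanoTime(), indexMetadata.getSettings());
        return TimeValue.timeValueNanos(remainingDelayNanos).millis();
    }
}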

Example 3 with RoutingNodes

Use of org.opensearch.cluster.routing.RoutingNodes in project OpenSearch by opensearch-project.

From the class TransportIndicesShardStoresAction, method masterOperation.

@Override
protected void masterOperation(IndicesShardStoresRequest request, ClusterState state, ActionListener<IndicesShardStoresResponse> listener) {
    final RoutingTable routingTables = state.routingTable();
    final RoutingNodes routingNodes = state.getRoutingNodes();
    final String[] concreteIndices = indexNameExpressionResolver.concreteIndexNames(state, request);
    final Set<Tuple<ShardId, String>> shardsToFetch = new HashSet<>();
    logger.trace("using cluster state version [{}] to determine shards", state.version());
    // collect relevant shard ids of the requested indices for fetching store infos
    for (String index : concreteIndices) {
        IndexRoutingTable indexShardRoutingTables = routingTables.index(index);
        if (indexShardRoutingTables == null) {
            continue;
        }
        final String customDataPath = IndexMetadata.INDEX_DATA_PATH_SETTING.get(state.metadata().index(index).getSettings());
        for (IndexShardRoutingTable routing : indexShardRoutingTables) {
            final int shardId = routing.shardId().id();
            ClusterShardHealth shardHealth = new ClusterShardHealth(shardId, routing);
            if (request.shardStatuses().contains(shardHealth.getStatus())) {
                shardsToFetch.add(Tuple.tuple(routing.shardId(), customDataPath));
            }
        }
    }
    // async fetch store infos from all the nodes
    // NOTE: instead of fetching shard store info one by one from every node (nShards * nNodes requests),
    // we could fetch all shard store info from every node once (nNodes requests); that would require
    // implementing a TransportNodesAction that operates on a list of shards, instead of using
    // TransportNodesListGatewayStartedShards, which fetches store info for a single shard at a time
    new AsyncShardStoresInfoFetches(state.nodes(), routingNodes, shardsToFetch, listener).start();
}
Also used : IndexRoutingTable(org.opensearch.cluster.routing.IndexRoutingTable) IndexShardRoutingTable(org.opensearch.cluster.routing.IndexShardRoutingTable) RoutingTable(org.opensearch.cluster.routing.RoutingTable) RoutingNodes(org.opensearch.cluster.routing.RoutingNodes) ClusterShardHealth(org.opensearch.cluster.health.ClusterShardHealth) Tuple(org.opensearch.common.collect.Tuple) HashSet(java.util.HashSet)
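
A small sketch of the filtering step above, assuming request.shardStatuses() yields a set of ClusterHealthStatus values: a shard's stores are fetched only when its computed health is among the requested statuses. The class and method names are hypothetical.

import java.util.Set;
import org.opensearch.cluster.health.ClusterHealthStatus;
import org.opensearch.cluster.health.ClusterShardHealth;
import org.opensearch.cluster.routing.IndexShardRoutingTable;

class ShardStoreFilterExample {
    // Compute the shard's health from its routing table entry and test it against
    // the statuses the caller asked for.
    static boolean shouldFetchStores(IndexShardRoutingTable routing, Set<ClusterHealthStatus> requestedStatuses) {
        ClusterShardHealth shardHealth = new ClusterShardHealth(routing.shardId().id(), routing);
        return requestedStatuses.contains(shardHealth.getStatus());
    }
}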

Example 4 with RoutingNodes

Use of org.opensearch.cluster.routing.RoutingNodes in project OpenSearch by opensearch-project.

From the class DiskThresholdMonitor, method onNewInfo.

public void onNewInfo(ClusterInfo info) {
    // if a check is already in progress, skip this ClusterInfo update rather than running checks concurrently
    if (checkInProgress.compareAndSet(false, true) == false) {
        logger.info("skipping monitor as a check is already in progress");
        return;
    }
    final ImmutableOpenMap<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
    if (usages == null) {
        logger.trace("skipping monitor as no disk usage information is available");
        checkFinished();
        return;
    }
    logger.trace("processing new cluster info");
    boolean reroute = false;
    String explanation = "";
    final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
    // Clean up nodes that have been removed from the cluster
    final ObjectLookupContainer<String> nodes = usages.keys();
    cleanUpRemovedNodes(nodes, nodesOverLowThreshold);
    cleanUpRemovedNodes(nodes, nodesOverHighThreshold);
    cleanUpRemovedNodes(nodes, nodesOverHighThresholdAndRelocating);
    final ClusterState state = clusterStateSupplier.get();
    final Set<String> indicesToMarkReadOnly = new HashSet<>();
    RoutingNodes routingNodes = state.getRoutingNodes();
    Set<String> indicesNotToAutoRelease = new HashSet<>();
    markNodesMissingUsageIneligibleForRelease(routingNodes, usages, indicesNotToAutoRelease);
    final List<DiskUsage> usagesOverHighThreshold = new ArrayList<>();
    for (final ObjectObjectCursor<String, DiskUsage> entry : usages) {
        final String node = entry.key;
        final DiskUsage usage = entry.value;
        final RoutingNode routingNode = routingNodes.node(node);
        if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes() || usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
            nodesOverLowThreshold.add(node);
            nodesOverHighThreshold.add(node);
            nodesOverHighThresholdAndRelocating.remove(node);
            if (routingNode != null) {
                // might be temporarily null if the ClusterInfoService and the ClusterService are out of step
                for (ShardRouting routing : routingNode) {
                    String indexName = routing.index().getName();
                    indicesToMarkReadOnly.add(indexName);
                    indicesNotToAutoRelease.add(indexName);
                }
            }
            logger.warn("flood stage disk watermark [{}] exceeded on {}, all indices on this node will be marked read-only", diskThresholdSettings.describeFloodStageThreshold(), usage);
            continue;
        }
        if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() || usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
            if (routingNode != null) {
                // might be temporarily null if the ClusterInfoService and the ClusterService are out of step
                for (ShardRouting routing : routingNode) {
                    String indexName = routing.index().getName();
                    indicesNotToAutoRelease.add(indexName);
                }
            }
        }
        final long reservedSpace = info.getReservedSpace(usage.getNodeId(), usage.getPath()).getTotal();
        final DiskUsage usageWithReservedSpace = new DiskUsage(usage.getNodeId(), usage.getNodeName(), usage.getPath(), usage.getTotalBytes(), Math.max(0L, usage.getFreeBytes() - reservedSpace));
        if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() || usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
            nodesOverLowThreshold.add(node);
            nodesOverHighThreshold.add(node);
            if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
                reroute = true;
                explanation = "high disk watermark exceeded on one or more nodes";
                usagesOverHighThreshold.add(usage);
            // will log about this node when the reroute completes
            } else {
                logger.debug("high disk watermark exceeded on {} but an automatic reroute has occurred " + "in the last [{}], skipping reroute", node, diskThresholdSettings.getRerouteInterval());
            }
        } else if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes() || usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) {
            nodesOverHighThresholdAndRelocating.remove(node);
            final boolean wasUnderLowThreshold = nodesOverLowThreshold.add(node);
            final boolean wasOverHighThreshold = nodesOverHighThreshold.remove(node);
            assert (wasUnderLowThreshold && wasOverHighThreshold) == false;
            if (wasUnderLowThreshold) {
                logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node", diskThresholdSettings.describeLowThreshold(), usage);
            } else if (wasOverHighThreshold) {
                logger.info("high disk watermark [{}] no longer exceeded on {}, but low disk watermark [{}] is still exceeded", diskThresholdSettings.describeHighThreshold(), usage, diskThresholdSettings.describeLowThreshold());
            }
        } else {
            nodesOverHighThresholdAndRelocating.remove(node);
            if (nodesOverLowThreshold.contains(node)) {
                // the node was previously over the low watermark but is no longer, so more
                // shards may become allocatable if we reroute now
                if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
                    reroute = true;
                    explanation = "one or more nodes has gone under the high or low watermark";
                    nodesOverLowThreshold.remove(node);
                    nodesOverHighThreshold.remove(node);
                    logger.info("low disk watermark [{}] no longer exceeded on {}", diskThresholdSettings.describeLowThreshold(), usage);
                } else {
                    logger.debug("{} has gone below a disk threshold, but an automatic reroute has occurred " + "in the last [{}], skipping reroute", node, diskThresholdSettings.getRerouteInterval());
                }
            }
        }
    }
    final ActionListener<Void> listener = new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 3);
    if (reroute) {
        logger.debug("rerouting shards: [{}]", explanation);
        rerouteService.reroute("disk threshold monitor", Priority.HIGH, ActionListener.wrap(reroutedClusterState -> {
            for (DiskUsage diskUsage : usagesOverHighThreshold) {
                final RoutingNode routingNode = reroutedClusterState.getRoutingNodes().node(diskUsage.getNodeId());
                final DiskUsage usageIncludingRelocations;
                final long relocatingShardsSize;
                if (routingNode != null) {
                    // might be temporarily null if the ClusterInfoService and the ClusterService are out of step
                    relocatingShardsSize = sizeOfRelocatingShards(routingNode, diskUsage, info, reroutedClusterState);
                    usageIncludingRelocations = new DiskUsage(diskUsage.getNodeId(), diskUsage.getNodeName(), diskUsage.getPath(), diskUsage.getTotalBytes(), diskUsage.getFreeBytes() - relocatingShardsSize);
                } else {
                    usageIncludingRelocations = diskUsage;
                    relocatingShardsSize = 0L;
                }
                if (usageIncludingRelocations.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() || usageIncludingRelocations.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
                    nodesOverHighThresholdAndRelocating.remove(diskUsage.getNodeId());
                    logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes; the node is expected to continue to exceed " + "the high disk watermark when these relocations are complete", diskThresholdSettings.describeHighThreshold(), diskUsage, -relocatingShardsSize);
                } else if (nodesOverHighThresholdAndRelocating.add(diskUsage.getNodeId())) {
                    logger.info("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes; the node is expected to be below the high " + "disk watermark when these relocations are complete", diskThresholdSettings.describeHighThreshold(), diskUsage, -relocatingShardsSize);
                } else {
                    logger.debug("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes", diskThresholdSettings.describeHighThreshold(), diskUsage, -relocatingShardsSize);
                }
            }
            setLastRunTimeMillis();
            listener.onResponse(null);
        }, e -> {
            logger.debug("reroute failed", e);
            setLastRunTimeMillis();
            listener.onFailure(e);
        }));
    } else {
        logger.trace("no reroute required");
        listener.onResponse(null);
    }
    final Set<String> indicesToAutoRelease = StreamSupport.stream(state.routingTable().indicesRouting().spliterator(), false).map(c -> c.key).filter(index -> indicesNotToAutoRelease.contains(index) == false).filter(index -> state.getBlocks().hasIndexBlock(index, IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK)).collect(Collectors.toSet());
    if (indicesToAutoRelease.isEmpty() == false) {
        if (diskThresholdSettings.isAutoReleaseIndexEnabled()) {
            logger.info("releasing read-only-allow-delete block on indices: [{}]", indicesToAutoRelease);
            updateIndicesReadOnly(indicesToAutoRelease, listener, false);
        } else {
            deprecationLogger.deprecate(DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY.replace(".", "_"), "[{}] will be removed in version {}", DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY, LegacyESVersion.V_7_4_0.major + 1);
            logger.debug("[{}] disabled, not releasing read-only-allow-delete block on indices: [{}]", DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY, indicesToAutoRelease);
            listener.onResponse(null);
        }
    } else {
        logger.trace("no auto-release required");
        listener.onResponse(null);
    }
    indicesToMarkReadOnly.removeIf(index -> state.getBlocks().indexBlocked(ClusterBlockLevel.WRITE, index));
    logger.trace("marking indices as read-only: [{}]", indicesToMarkReadOnly);
    if (indicesToMarkReadOnly.isEmpty() == false) {
        updateIndicesReadOnly(indicesToMarkReadOnly, listener, true);
    } else {
        listener.onResponse(null);
    }
}
Also used : ImmutableOpenMap(org.opensearch.common.collect.ImmutableOpenMap) LongSupplier(java.util.function.LongSupplier) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Priority(org.opensearch.common.Priority) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) Supplier(java.util.function.Supplier) Strings(org.opensearch.common.Strings) ArrayList(java.util.ArrayList) DeprecationLogger(org.opensearch.common.logging.DeprecationLogger) HashSet(java.util.HashSet) ObjectObjectCursor(com.carrotsearch.hppc.cursors.ObjectObjectCursor) GroupedActionListener(org.opensearch.action.support.GroupedActionListener) ClusterState(org.opensearch.cluster.ClusterState) LegacyESVersion(org.opensearch.LegacyESVersion) RerouteService(org.opensearch.cluster.routing.RerouteService) RoutingNodes(org.opensearch.cluster.routing.RoutingNodes) StreamSupport(java.util.stream.StreamSupport) ActionListener(org.opensearch.action.ActionListener) DiskUsage(org.opensearch.cluster.DiskUsage) ClusterSettings(org.opensearch.common.settings.ClusterSettings) Client(org.opensearch.client.Client) DiskThresholdDecider(org.opensearch.cluster.routing.allocation.decider.DiskThresholdDecider) ClusterInfo(org.opensearch.cluster.ClusterInfo) ClusterBlockLevel(org.opensearch.cluster.block.ClusterBlockLevel) Set(java.util.Set) Settings(org.opensearch.common.settings.Settings) Collectors(java.util.stream.Collectors) ShardRouting(org.opensearch.cluster.routing.ShardRouting) AtomicLong(java.util.concurrent.atomic.AtomicLong) Sets(org.opensearch.common.util.set.Sets) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RoutingNode(org.opensearch.cluster.routing.RoutingNode) LogManager(org.apache.logging.log4j.LogManager) ObjectLookupContainer(com.carrotsearch.hppc.ObjectLookupContainer)
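
A compact sketch of the watermark test repeated throughout onNewInfo: a node breaches a watermark when either its absolute free bytes or its free-disk percentage drops below the configured threshold. The class and method names are hypothetical; the DiskUsage and DiskThresholdSettings calls are the ones used above.

import org.opensearch.cluster.DiskUsage;
import org.opensearch.cluster.routing.allocation.DiskThresholdSettings;

class WatermarkCheckExample {
    // A node is over the high watermark if it fails either the byte-based or the
    // percentage-based free-space check.
    static boolean exceedsHighWatermark(DiskUsage usage, DiskThresholdSettings settings) {
        return usage.getFreeBytes() < settings.getFreeBytesThresholdHigh().getBytes()
            || usage.getFreeDiskAsPercentage() < settings.getFreeDiskThresholdHigh();
    }
}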

Example 5 with RoutingNodes

Use of org.opensearch.cluster.routing.RoutingNodes in project OpenSearch by opensearch-project.

From the class AllocationService, method applyFailedShards.

/**
 * Applies the failed shards. Note, only assigned ShardRouting instances that exist in the routing table should be
 * provided as parameters. Also applies a list of allocation ids to remove from the in-sync set for shard copies for which there
 * are no routing entries in the routing table.
 *
 * <p>
 * If the same instance of ClusterState is returned, then no change has been made.</p>
 */
public ClusterState applyFailedShards(final ClusterState clusterState, final List<FailedShard> failedShards, final List<StaleShard> staleShards) {
    assert assertInitialized();
    if (staleShards.isEmpty() && failedShards.isEmpty()) {
        return clusterState;
    }
    ClusterState tmpState = IndexMetadataUpdater.removeStaleIdsWithoutRoutings(clusterState, staleShards, logger);
    RoutingNodes routingNodes = getMutableRoutingNodes(tmpState);
    // shuffle the unassigned shards, just so we won't have things like poison failed shards
    routingNodes.unassigned().shuffle();
    long currentNanoTime = currentNanoTime();
    RoutingAllocation allocation = new RoutingAllocation(allocationDeciders, routingNodes, tmpState, clusterInfoService.getClusterInfo(), snapshotsInfoService.snapshotShardSizes(), currentNanoTime);
    for (FailedShard failedShardEntry : failedShards) {
        ShardRouting shardToFail = failedShardEntry.getRoutingEntry();
        IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(shardToFail.shardId().getIndex());
        allocation.addIgnoreShardForNode(shardToFail.shardId(), shardToFail.currentNodeId());
        // failing a primary also fails initializing replica shards, re-resolve ShardRouting
        ShardRouting failedShard = routingNodes.getByAllocationId(shardToFail.shardId(), shardToFail.allocationId().getId());
        if (failedShard != null) {
            if (failedShard != shardToFail) {
                logger.trace("{} shard routing modified in an earlier iteration (previous: {}, current: {})", shardToFail.shardId(), shardToFail, failedShard);
            }
            int failedAllocations = failedShard.unassignedInfo() != null ? failedShard.unassignedInfo().getNumFailedAllocations() : 0;
            final Set<String> failedNodeIds;
            if (failedShard.unassignedInfo() != null) {
                failedNodeIds = new HashSet<>(failedShard.unassignedInfo().getFailedNodeIds().size() + 1);
                failedNodeIds.addAll(failedShard.unassignedInfo().getFailedNodeIds());
                failedNodeIds.add(failedShard.currentNodeId());
            } else {
                failedNodeIds = Collections.emptySet();
            }
            String message = "failed shard on node [" + shardToFail.currentNodeId() + "]: " + failedShardEntry.getMessage();
            UnassignedInfo unassignedInfo = new UnassignedInfo(UnassignedInfo.Reason.ALLOCATION_FAILED, message, failedShardEntry.getFailure(), failedAllocations + 1, currentNanoTime, System.currentTimeMillis(), false, UnassignedInfo.AllocationStatus.NO_ATTEMPT, failedNodeIds);
            if (failedShardEntry.markAsStale()) {
                allocation.removeAllocationId(failedShard);
            }
            logger.warn(new ParameterizedMessage("failing shard [{}]", failedShardEntry), failedShardEntry.getFailure());
            routingNodes.failShard(logger, failedShard, unassignedInfo, indexMetadata, allocation.changes());
        } else {
            logger.trace("{} shard routing failed in an earlier iteration (routing: {})", shardToFail.shardId(), shardToFail);
        }
    }
    for (final ExistingShardsAllocator allocator : existingShardsAllocators.values()) {
        allocator.applyFailedShards(failedShards, allocation);
    }
    reroute(allocation);
    String failedShardsAsString = firstListElementsToCommaDelimitedString(failedShards, s -> s.getRoutingEntry().shardId().toString(), logger.isDebugEnabled());
    return buildResultAndLogHealthChange(clusterState, allocation, "shards failed [" + failedShardsAsString + "]");
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) RoutingNodes(org.opensearch.cluster.routing.RoutingNodes) UnassignedInfo(org.opensearch.cluster.routing.UnassignedInfo) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) ShardRouting(org.opensearch.cluster.routing.ShardRouting) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata)
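
A brief sketch of the failed-node bookkeeping above: each failure appends the shard's current node id to the ids carried by its previous UnassignedInfo, so repeated failures on the same nodes remain visible to later allocation rounds. The class and method names are hypothetical.

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.opensearch.cluster.routing.ShardRouting;

class FailedNodeIdsExample {
    // Size the set for the existing ids plus the one node id being added.
    static Set<String> accumulateFailedNodeIds(ShardRouting failedShard) {
        if (failedShard.unassignedInfo() == null) {
            return Collections.emptySet();
        }
        Set<String> failedNodeIds = new HashSet<>(failedShard.unassignedInfo().getFailedNodeIds().size() + 1);
        failedNodeIds.addAll(failedShard.unassignedInfo().getFailedNodeIds());
        failedNodeIds.add(failedShard.currentNodeId());
        return failedNodeIds;
    }
}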

Aggregations

RoutingNodes (org.opensearch.cluster.routing.RoutingNodes): 63
ClusterState (org.opensearch.cluster.ClusterState): 48
IndexMetadata (org.opensearch.cluster.metadata.IndexMetadata): 45
Metadata (org.opensearch.cluster.metadata.Metadata): 42
RoutingTable (org.opensearch.cluster.routing.RoutingTable): 42
ShardRouting (org.opensearch.cluster.routing.ShardRouting): 24
RoutingNode (org.opensearch.cluster.routing.RoutingNode): 17
DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode): 13
IndexShardRoutingTable (org.opensearch.cluster.routing.IndexShardRoutingTable): 11
DiscoveryNodes (org.opensearch.cluster.node.DiscoveryNodes): 9
RoutingAllocation (org.opensearch.cluster.routing.allocation.RoutingAllocation): 8
Settings (org.opensearch.common.settings.Settings): 8
IndexRoutingTable (org.opensearch.cluster.routing.IndexRoutingTable): 7
TestShardRouting (org.opensearch.cluster.routing.TestShardRouting): 7
UnassignedInfo (org.opensearch.cluster.routing.UnassignedInfo): 7
ClusterSettings (org.opensearch.common.settings.ClusterSettings): 7
ClusterInfo (org.opensearch.cluster.ClusterInfo): 6
ArrayList (java.util.ArrayList): 5
BalancedShardsAllocator (org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator): 5
ImmutableOpenMap (org.opensearch.common.collect.ImmutableOpenMap): 5