use of org.opensearch.cluster.routing.RoutingNode in project OpenSearch by opensearch-project.
the class MockDiskUsagesIT method getShardCountByNodeId.
private Map<String, Integer> getShardCountByNodeId() {
final Map<String, Integer> shardCountByNodeId = new HashMap<>();
final ClusterState clusterState = client().admin().cluster().prepareState().get().getState();
for (final RoutingNode node : clusterState.getRoutingNodes()) {
logger.info("----> node {} has {} shards", node.nodeId(), clusterState.getRoutingNodes().node(node.nodeId()).numberOfOwningShards());
shardCountByNodeId.put(node.nodeId(), clusterState.getRoutingNodes().node(node.nodeId()).numberOfOwningShards());
}
return shardCountByNodeId;
}
use of org.opensearch.cluster.routing.RoutingNode in project OpenSearch by opensearch-project.
the class ReplicaShardAllocator method makeAllocationDecision.
@Override
public AllocateUnassignedDecision makeAllocationDecision(final ShardRouting unassignedShard, final RoutingAllocation allocation, final Logger logger) {
if (isResponsibleFor(unassignedShard) == false) {
// this allocator is not responsible for deciding on this shard
return AllocateUnassignedDecision.NOT_TAKEN;
}
final RoutingNodes routingNodes = allocation.routingNodes();
final boolean explain = allocation.debugDecision();
// pre-check if it can be allocated to any node that currently exists, so we won't list the store for it for nothing
Tuple<Decision, Map<String, NodeAllocationResult>> result = canBeAllocatedToAtLeastOneNode(unassignedShard, allocation);
Decision allocateDecision = result.v1();
if (allocateDecision.type() != Decision.Type.YES && (explain == false || hasInitiatedFetching(unassignedShard) == false)) {
// only return early if we are not in explain mode, or we are in explain mode but we have not
// yet attempted to fetch any shard data
logger.trace("{}: ignoring allocation, can't be allocated on any node", unassignedShard);
return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), result.v2() != null ? new ArrayList<>(result.v2().values()) : null);
}
AsyncShardFetch.FetchResult<NodeStoreFilesMetadata> shardStores = fetchData(unassignedShard, allocation);
if (shardStores.hasData() == false) {
logger.trace("{}: ignoring allocation, still fetching shard stores", unassignedShard);
allocation.setHasPendingAsyncFetch();
List<NodeAllocationResult> nodeDecisions = null;
if (explain) {
nodeDecisions = buildDecisionsForAllNodes(unassignedShard, allocation);
}
return AllocateUnassignedDecision.no(AllocationStatus.FETCHING_SHARD_DATA, nodeDecisions);
}
ShardRouting primaryShard = routingNodes.activePrimary(unassignedShard.shardId());
if (primaryShard == null) {
assert explain : "primary should only be null here if we are in explain mode, so we didn't " + "exit early when canBeAllocatedToAtLeastOneNode didn't return a YES decision";
return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), new ArrayList<>(result.v2().values()));
}
assert primaryShard.currentNodeId() != null;
final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
final TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore = findStore(primaryNode, shardStores);
if (primaryStore == null) {
// if we can't find the primary data, it is probably because the primary shard is corrupted (and listing failed)
// we want to let the replica be allocated in order to expose the actual problem with the primary that the replica
// will try and recover from
// Note, this is the existing behavior, as exposed in running CorruptFileTest#testNoPrimaryData
logger.trace("{}: no primary shard store found or allocated, letting actual allocation figure it out", unassignedShard);
return AllocateUnassignedDecision.NOT_TAKEN;
}
MatchingNodes matchingNodes = findMatchingNodes(unassignedShard, allocation, false, primaryNode, primaryStore, shardStores, explain);
assert explain == false || matchingNodes.nodeDecisions != null : "in explain mode, we must have individual node decisions";
List<NodeAllocationResult> nodeDecisions = augmentExplanationsWithStoreInfo(result.v2(), matchingNodes.nodeDecisions);
if (allocateDecision.type() != Decision.Type.YES) {
return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), nodeDecisions);
} else if (matchingNodes.getNodeWithHighestMatch() != null) {
RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().getId());
// we only check on THROTTLE since we checked before on NO
Decision decision = allocation.deciders().canAllocate(unassignedShard, nodeWithHighestMatch, allocation);
if (decision.type() == Decision.Type.THROTTLE) {
logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store", unassignedShard.index(), unassignedShard.id(), unassignedShard, nodeWithHighestMatch.node());
// we are throttling this, as we have enough other shards to allocate to this node, so ignore it for now
return AllocateUnassignedDecision.throttle(nodeDecisions);
} else {
logger.debug("[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store", unassignedShard.index(), unassignedShard.id(), unassignedShard, nodeWithHighestMatch.node());
// we found a match
return AllocateUnassignedDecision.yes(nodeWithHighestMatch.node(), null, nodeDecisions, true);
}
} else if (matchingNodes.hasAnyData() == false && unassignedShard.unassignedInfo().isDelayed()) {
// if we didn't manage to find *any* data (regardless of matching sizes), and the replica is
// unassigned due to a node leaving, so we delay allocation of this replica to see if the
// node with the shard copy will rejoin so we can re-use the copy it has
logger.debug("{}: allocation of [{}] is delayed", unassignedShard.shardId(), unassignedShard);
long remainingDelayMillis = 0L;
long totalDelayMillis = 0L;
if (explain) {
UnassignedInfo unassignedInfo = unassignedShard.unassignedInfo();
Metadata metadata = allocation.metadata();
IndexMetadata indexMetadata = metadata.index(unassignedShard.index());
totalDelayMillis = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexMetadata.getSettings()).getMillis();
long remainingDelayNanos = unassignedInfo.getRemainingDelay(System.nanoTime(), indexMetadata.getSettings());
remainingDelayMillis = TimeValue.timeValueNanos(remainingDelayNanos).millis();
}
return AllocateUnassignedDecision.delayed(remainingDelayMillis, totalDelayMillis, nodeDecisions);
}
return AllocateUnassignedDecision.NOT_TAKEN;
}
use of org.opensearch.cluster.routing.RoutingNode in project OpenSearch by opensearch-project.
the class BaseGatewayShardAllocator method buildDecisionsForAllNodes.
/**
* Builds decisions for all nodes in the cluster, so that the explain API can provide information on
* allocation decisions for each node, while still waiting to allocate the shard (e.g. due to fetching shard data).
*/
protected static List<NodeAllocationResult> buildDecisionsForAllNodes(ShardRouting shard, RoutingAllocation allocation) {
List<NodeAllocationResult> results = new ArrayList<>();
for (RoutingNode node : allocation.routingNodes()) {
Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
results.add(new NodeAllocationResult(node.node(), null, decision));
}
return results;
}
use of org.opensearch.cluster.routing.RoutingNode in project OpenSearch by opensearch-project.
the class DiskThresholdMonitor method onNewInfo.
public void onNewInfo(ClusterInfo info) {
// all ClusterInfo updates are processed and never ignored
if (checkInProgress.compareAndSet(false, true) == false) {
logger.info("skipping monitor as a check is already in progress");
return;
}
final ImmutableOpenMap<String, DiskUsage> usages = info.getNodeLeastAvailableDiskUsages();
if (usages == null) {
logger.trace("skipping monitor as no disk usage information is available");
checkFinished();
return;
}
logger.trace("processing new cluster info");
boolean reroute = false;
String explanation = "";
final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
// Clean up nodes that have been removed from the cluster
final ObjectLookupContainer<String> nodes = usages.keys();
cleanUpRemovedNodes(nodes, nodesOverLowThreshold);
cleanUpRemovedNodes(nodes, nodesOverHighThreshold);
cleanUpRemovedNodes(nodes, nodesOverHighThresholdAndRelocating);
final ClusterState state = clusterStateSupplier.get();
final Set<String> indicesToMarkReadOnly = new HashSet<>();
RoutingNodes routingNodes = state.getRoutingNodes();
Set<String> indicesNotToAutoRelease = new HashSet<>();
markNodesMissingUsageIneligibleForRelease(routingNodes, usages, indicesNotToAutoRelease);
final List<DiskUsage> usagesOverHighThreshold = new ArrayList<>();
for (final ObjectObjectCursor<String, DiskUsage> entry : usages) {
final String node = entry.key;
final DiskUsage usage = entry.value;
final RoutingNode routingNode = routingNodes.node(node);
if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes() || usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) {
nodesOverLowThreshold.add(node);
nodesOverHighThreshold.add(node);
nodesOverHighThresholdAndRelocating.remove(node);
if (routingNode != null) {
// might be temporarily null if the ClusterInfoService and the ClusterService are out of step
for (ShardRouting routing : routingNode) {
String indexName = routing.index().getName();
indicesToMarkReadOnly.add(indexName);
indicesNotToAutoRelease.add(indexName);
}
}
logger.warn("flood stage disk watermark [{}] exceeded on {}, all indices on this node will be marked read-only", diskThresholdSettings.describeFloodStageThreshold(), usage);
continue;
}
if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() || usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
if (routingNode != null) {
// might be temporarily null if the ClusterInfoService and the ClusterService are out of step
for (ShardRouting routing : routingNode) {
String indexName = routing.index().getName();
indicesNotToAutoRelease.add(indexName);
}
}
}
final long reservedSpace = info.getReservedSpace(usage.getNodeId(), usage.getPath()).getTotal();
final DiskUsage usageWithReservedSpace = new DiskUsage(usage.getNodeId(), usage.getNodeName(), usage.getPath(), usage.getTotalBytes(), Math.max(0L, usage.getFreeBytes() - reservedSpace));
if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() || usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
nodesOverLowThreshold.add(node);
nodesOverHighThreshold.add(node);
if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
reroute = true;
explanation = "high disk watermark exceeded on one or more nodes";
usagesOverHighThreshold.add(usage);
// will log about this node when the reroute completes
} else {
logger.debug("high disk watermark exceeded on {} but an automatic reroute has occurred " + "in the last [{}], skipping reroute", node, diskThresholdSettings.getRerouteInterval());
}
} else if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes() || usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) {
nodesOverHighThresholdAndRelocating.remove(node);
final boolean wasUnderLowThreshold = nodesOverLowThreshold.add(node);
final boolean wasOverHighThreshold = nodesOverHighThreshold.remove(node);
assert (wasUnderLowThreshold && wasOverHighThreshold) == false;
if (wasUnderLowThreshold) {
logger.info("low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node", diskThresholdSettings.describeLowThreshold(), usage);
} else if (wasOverHighThreshold) {
logger.info("high disk watermark [{}] no longer exceeded on {}, but low disk watermark [{}] is still exceeded", diskThresholdSettings.describeHighThreshold(), usage, diskThresholdSettings.describeLowThreshold());
}
} else {
nodesOverHighThresholdAndRelocating.remove(node);
if (nodesOverLowThreshold.contains(node)) {
// if we reroute now.
if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) {
reroute = true;
explanation = "one or more nodes has gone under the high or low watermark";
nodesOverLowThreshold.remove(node);
nodesOverHighThreshold.remove(node);
logger.info("low disk watermark [{}] no longer exceeded on {}", diskThresholdSettings.describeLowThreshold(), usage);
} else {
logger.debug("{} has gone below a disk threshold, but an automatic reroute has occurred " + "in the last [{}], skipping reroute", node, diskThresholdSettings.getRerouteInterval());
}
}
}
}
final ActionListener<Void> listener = new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 3);
if (reroute) {
logger.debug("rerouting shards: [{}]", explanation);
rerouteService.reroute("disk threshold monitor", Priority.HIGH, ActionListener.wrap(reroutedClusterState -> {
for (DiskUsage diskUsage : usagesOverHighThreshold) {
final RoutingNode routingNode = reroutedClusterState.getRoutingNodes().node(diskUsage.getNodeId());
final DiskUsage usageIncludingRelocations;
final long relocatingShardsSize;
if (routingNode != null) {
// might be temporarily null if the ClusterInfoService and the ClusterService are out of step
relocatingShardsSize = sizeOfRelocatingShards(routingNode, diskUsage, info, reroutedClusterState);
usageIncludingRelocations = new DiskUsage(diskUsage.getNodeId(), diskUsage.getNodeName(), diskUsage.getPath(), diskUsage.getTotalBytes(), diskUsage.getFreeBytes() - relocatingShardsSize);
} else {
usageIncludingRelocations = diskUsage;
relocatingShardsSize = 0L;
}
if (usageIncludingRelocations.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() || usageIncludingRelocations.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) {
nodesOverHighThresholdAndRelocating.remove(diskUsage.getNodeId());
logger.warn("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes; the node is expected to continue to exceed " + "the high disk watermark when these relocations are complete", diskThresholdSettings.describeHighThreshold(), diskUsage, -relocatingShardsSize);
} else if (nodesOverHighThresholdAndRelocating.add(diskUsage.getNodeId())) {
logger.info("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes; the node is expected to be below the high " + "disk watermark when these relocations are complete", diskThresholdSettings.describeHighThreshold(), diskUsage, -relocatingShardsSize);
} else {
logger.debug("high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes", diskThresholdSettings.describeHighThreshold(), diskUsage, -relocatingShardsSize);
}
}
setLastRunTimeMillis();
listener.onResponse(null);
}, e -> {
logger.debug("reroute failed", e);
setLastRunTimeMillis();
listener.onFailure(e);
}));
} else {
logger.trace("no reroute required");
listener.onResponse(null);
}
final Set<String> indicesToAutoRelease = StreamSupport.stream(state.routingTable().indicesRouting().spliterator(), false).map(c -> c.key).filter(index -> indicesNotToAutoRelease.contains(index) == false).filter(index -> state.getBlocks().hasIndexBlock(index, IndexMetadata.INDEX_READ_ONLY_ALLOW_DELETE_BLOCK)).collect(Collectors.toSet());
if (indicesToAutoRelease.isEmpty() == false) {
if (diskThresholdSettings.isAutoReleaseIndexEnabled()) {
logger.info("releasing read-only-allow-delete block on indices: [{}]", indicesToAutoRelease);
updateIndicesReadOnly(indicesToAutoRelease, listener, false);
} else {
deprecationLogger.deprecate(DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY.replace(".", "_"), "[{}] will be removed in version {}", DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY, LegacyESVersion.V_7_4_0.major + 1);
logger.debug("[{}] disabled, not releasing read-only-allow-delete block on indices: [{}]", DiskThresholdSettings.AUTO_RELEASE_INDEX_ENABLED_KEY, indicesToAutoRelease);
listener.onResponse(null);
}
} else {
logger.trace("no auto-release required");
listener.onResponse(null);
}
indicesToMarkReadOnly.removeIf(index -> state.getBlocks().indexBlocked(ClusterBlockLevel.WRITE, index));
logger.trace("marking indices as read-only: [{}]", indicesToMarkReadOnly);
if (indicesToMarkReadOnly.isEmpty() == false) {
updateIndicesReadOnly(indicesToMarkReadOnly, listener, true);
} else {
listener.onResponse(null);
}
}
use of org.opensearch.cluster.routing.RoutingNode in project OpenSearch by opensearch-project.
the class DiskThresholdMonitor method markNodesMissingUsageIneligibleForRelease.
private void markNodesMissingUsageIneligibleForRelease(RoutingNodes routingNodes, ImmutableOpenMap<String, DiskUsage> usages, Set<String> indicesToMarkIneligibleForAutoRelease) {
for (RoutingNode routingNode : routingNodes) {
if (usages.containsKey(routingNode.nodeId()) == false) {
if (routingNode != null) {
for (ShardRouting routing : routingNode) {
String indexName = routing.index().getName();
indicesToMarkIneligibleForAutoRelease.add(indexName);
}
}
}
}
}
Aggregations