Use of org.opensearch.cluster.node.DiscoveryNode in project OpenSearch by opensearch-project.
Class PrimaryShardAllocator, method makeAllocationDecision.
@Override
public AllocateUnassignedDecision makeAllocationDecision(final ShardRouting unassignedShard, final RoutingAllocation allocation, final Logger logger) {
    if (isResponsibleFor(unassignedShard) == false) {
        // this allocator is not responsible for allocating this shard
        return AllocateUnassignedDecision.NOT_TAKEN;
    }
    final boolean explain = allocation.debugDecision();
    if (unassignedShard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT
        && allocation.snapshotShardSizeInfo().getShardSize(unassignedShard) == null) {
        List<NodeAllocationResult> nodeDecisions = null;
        if (explain) {
            nodeDecisions = buildDecisionsForAllNodes(unassignedShard, allocation);
        }
        return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.FETCHING_SHARD_DATA, nodeDecisions);
    }
    final FetchResult<NodeGatewayStartedShards> shardState = fetchData(unassignedShard, allocation);
    if (shardState.hasData() == false) {
        allocation.setHasPendingAsyncFetch();
        List<NodeAllocationResult> nodeDecisions = null;
        if (explain) {
            nodeDecisions = buildDecisionsForAllNodes(unassignedShard, allocation);
        }
        return AllocateUnassignedDecision.no(AllocationStatus.FETCHING_SHARD_DATA, nodeDecisions);
    }
    // don't create a new IndexSetting object for every shard as this could cause a lot of garbage
    // on cluster restart if we allocate a boat load of shards
    final IndexMetadata indexMetadata = allocation.metadata().getIndexSafe(unassignedShard.index());
    final Set<String> inSyncAllocationIds = indexMetadata.inSyncAllocationIds(unassignedShard.id());
    final boolean snapshotRestore = unassignedShard.recoverySource().getType() == RecoverySource.Type.SNAPSHOT;
    assert inSyncAllocationIds.isEmpty() == false;
    // use in-sync allocation ids to select nodes
    final NodeShardsResult nodeShardsResult = buildNodeShardsResult(unassignedShard, snapshotRestore,
        allocation.getIgnoreNodes(unassignedShard.shardId()), inSyncAllocationIds, shardState, logger);
    final boolean enoughAllocationsFound = nodeShardsResult.orderedAllocationCandidates.size() > 0;
    logger.debug("[{}][{}]: found {} allocation candidates of {} based on allocation ids: [{}]", unassignedShard.index(),
        unassignedShard.id(), nodeShardsResult.orderedAllocationCandidates.size(), unassignedShard, inSyncAllocationIds);
    if (enoughAllocationsFound == false) {
        if (snapshotRestore) {
            // let BalancedShardsAllocator take care of allocating this shard
            logger.debug("[{}][{}]: missing local data, will restore from [{}]",
                unassignedShard.index(), unassignedShard.id(), unassignedShard.recoverySource());
            return AllocateUnassignedDecision.NOT_TAKEN;
        } else {
            // We have a shard that was previously allocated, but we could not find a valid shard copy to allocate the primary.
            // We could just be waiting for the node that holds the primary to start back up, in which case the allocation for
            // this shard will be picked up when the node joins and we do another allocation reroute
            logger.debug("[{}][{}]: not allocating, number_of_allocated_shards_found [{}]",
                unassignedShard.index(), unassignedShard.id(), nodeShardsResult.allocationsFound);
            return AllocateUnassignedDecision.no(AllocationStatus.NO_VALID_SHARD_COPY,
                explain ? buildNodeDecisions(null, shardState, inSyncAllocationIds) : null);
        }
    }
    NodesToAllocate nodesToAllocate = buildNodesToAllocate(allocation, nodeShardsResult.orderedAllocationCandidates, unassignedShard, false);
    DiscoveryNode node = null;
    String allocationId = null;
    boolean throttled = false;
    if (nodesToAllocate.yesNodeShards.isEmpty() == false) {
        DecidedNode decidedNode = nodesToAllocate.yesNodeShards.get(0);
        logger.debug("[{}][{}]: allocating [{}] to [{}] on primary allocation",
            unassignedShard.index(), unassignedShard.id(), unassignedShard, decidedNode.nodeShardState.getNode());
        node = decidedNode.nodeShardState.getNode();
        allocationId = decidedNode.nodeShardState.allocationId();
    } else if (nodesToAllocate.throttleNodeShards.isEmpty() && !nodesToAllocate.noNodeShards.isEmpty()) {
        // The deciders returned a NO decision for all nodes with shard copies, so we check if primary shard
        // can be force-allocated to one of the nodes.
        nodesToAllocate = buildNodesToAllocate(allocation, nodeShardsResult.orderedAllocationCandidates, unassignedShard, true);
        if (nodesToAllocate.yesNodeShards.isEmpty() == false) {
            final DecidedNode decidedNode = nodesToAllocate.yesNodeShards.get(0);
            final NodeGatewayStartedShards nodeShardState = decidedNode.nodeShardState;
            logger.debug("[{}][{}]: allocating [{}] to [{}] on forced primary allocation",
                unassignedShard.index(), unassignedShard.id(), unassignedShard, nodeShardState.getNode());
            node = nodeShardState.getNode();
            allocationId = nodeShardState.allocationId();
        } else if (nodesToAllocate.throttleNodeShards.isEmpty() == false) {
            logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on forced primary allocation",
                unassignedShard.index(), unassignedShard.id(), unassignedShard, nodesToAllocate.throttleNodeShards);
            throttled = true;
        } else {
            logger.debug("[{}][{}]: forced primary allocation denied [{}]", unassignedShard.index(), unassignedShard.id(), unassignedShard);
        }
    } else {
        // we are throttling this, since we are allowed to allocate to this node but there are enough allocations
        // taking place on the node currently, ignore it for now
        logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation",
            unassignedShard.index(), unassignedShard.id(), unassignedShard, nodesToAllocate.throttleNodeShards);
        throttled = true;
    }
    List<NodeAllocationResult> nodeResults = null;
    if (explain) {
        nodeResults = buildNodeDecisions(nodesToAllocate, shardState, inSyncAllocationIds);
    }
    if (allocation.hasPendingAsyncFetch()) {
        return AllocateUnassignedDecision.no(AllocationStatus.FETCHING_SHARD_DATA, nodeResults);
    } else if (node != null) {
        return AllocateUnassignedDecision.yes(node, allocationId, nodeResults, false);
    } else if (throttled) {
        return AllocateUnassignedDecision.throttle(nodeResults);
    } else {
        return AllocateUnassignedDecision.no(AllocationStatus.DECIDERS_NO, nodeResults, true);
    }
}
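The method above resolves to one of four outcomes: still fetching shard data, a YES decision carrying the chosen DiscoveryNode and allocation id, a throttle, or a deciders-no. As a rough illustration of how a caller might inspect such a result, the sketch below assumes accessors like isDecisionTaken(), getTargetNode(), getAllocationId() and getAllocationStatus() on AllocateUnassignedDecision; exact names can differ between OpenSearch versions.

    // Hypothetical caller-side sketch, not part of PrimaryShardAllocator; accessor names are assumptions.
    static void logPrimaryDecision(AllocateUnassignedDecision decision, Logger logger) {
        if (decision.isDecisionTaken() == false) {
            logger.trace("allocator did not take a decision for this shard");
            return;
        }
        DiscoveryNode target = decision.getTargetNode();
        if (target != null) {
            // YES decision: the primary will be initialized on this node with the given allocation id
            logger.debug("assigning primary to [{}] with allocation id [{}]", target.getId(), decision.getAllocationId());
        } else {
            // NO or THROTTLE: the status explains why (e.g. FETCHING_SHARD_DATA, NO_VALID_SHARD_COPY)
            logger.debug("primary not assigned, status [{}]", decision.getAllocationStatus());
        }
    }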
Use of org.opensearch.cluster.node.DiscoveryNode in project OpenSearch by opensearch-project.
Class PrimaryShardAllocator, method buildNodeDecisions.
/**
 * Builds a list of nodes and the corresponding allocation decisions for those nodes.
 */
private static List<NodeAllocationResult> buildNodeDecisions(NodesToAllocate nodesToAllocate, FetchResult<NodeGatewayStartedShards> fetchedShardData, Set<String> inSyncAllocationIds) {
    List<NodeAllocationResult> nodeResults = new ArrayList<>();
    Collection<NodeGatewayStartedShards> ineligibleShards;
    if (nodesToAllocate != null) {
        final Set<DiscoveryNode> discoNodes = new HashSet<>();
        nodeResults.addAll(
            Stream.of(nodesToAllocate.yesNodeShards, nodesToAllocate.throttleNodeShards, nodesToAllocate.noNodeShards)
                .flatMap(Collection::stream)
                .map(dnode -> {
                    discoNodes.add(dnode.nodeShardState.getNode());
                    return new NodeAllocationResult(dnode.nodeShardState.getNode(),
                        shardStoreInfo(dnode.nodeShardState, inSyncAllocationIds), dnode.decision);
                })
                .collect(Collectors.toList())
        );
        ineligibleShards = fetchedShardData.getData().values().stream()
            .filter(shardData -> discoNodes.contains(shardData.getNode()) == false)
            .collect(Collectors.toList());
    } else {
        // there were no shard copies that were eligible for being assigned the allocation,
        // so all fetched shard data are ineligible shards
        ineligibleShards = fetchedShardData.getData().values();
    }
    nodeResults.addAll(
        ineligibleShards.stream()
            .map(shardData -> new NodeAllocationResult(shardData.getNode(), shardStoreInfo(shardData, inSyncAllocationIds), null))
            .collect(Collectors.toList())
    );
    return nodeResults;
}
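The helper above keeps a set of the DiscoveryNodes that already received a decider verdict and treats every other fetched shard copy as ineligible. A condensed sketch of that split, reusing only calls already visible above (getNode(), shardStoreInfo(...) and the NodeAllocationResult constructor) but with a hypothetical method name, could look like this:

    // Hypothetical helper name; mirrors the "decided vs. ineligible" split performed above.
    private static List<NodeAllocationResult> resultsForIneligibleCopies(Collection<NodeGatewayStartedShards> fetchedCopies,
                                                                         Set<DiscoveryNode> alreadyDecidedNodes,
                                                                         Set<String> inSyncAllocationIds) {
        return fetchedCopies.stream()
            .filter(copy -> alreadyDecidedNodes.contains(copy.getNode()) == false)
            // a null decision marks the copy as ineligible rather than rejected by a decider
            .map(copy -> new NodeAllocationResult(copy.getNode(), shardStoreInfo(copy, inSyncAllocationIds), null))
            .collect(Collectors.toList());
    }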
Use of org.opensearch.cluster.node.DiscoveryNode in project OpenSearch by opensearch-project.
Class ReplicaShardAllocator, method makeAllocationDecision.
@Override
public AllocateUnassignedDecision makeAllocationDecision(final ShardRouting unassignedShard, final RoutingAllocation allocation, final Logger logger) {
    if (isResponsibleFor(unassignedShard) == false) {
        // this allocator is not responsible for deciding on this shard
        return AllocateUnassignedDecision.NOT_TAKEN;
    }
    final RoutingNodes routingNodes = allocation.routingNodes();
    final boolean explain = allocation.debugDecision();
    // pre-check if it can be allocated to any node that currently exists, so we won't list the store for it for nothing
    Tuple<Decision, Map<String, NodeAllocationResult>> result = canBeAllocatedToAtLeastOneNode(unassignedShard, allocation);
    Decision allocateDecision = result.v1();
    if (allocateDecision.type() != Decision.Type.YES
        && (explain == false || hasInitiatedFetching(unassignedShard) == false)) {
        // only return early if we are not in explain mode, or we are in explain mode but we have not
        // yet attempted to fetch any shard data
        logger.trace("{}: ignoring allocation, can't be allocated on any node", unassignedShard);
        return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()),
            result.v2() != null ? new ArrayList<>(result.v2().values()) : null);
    }
    AsyncShardFetch.FetchResult<NodeStoreFilesMetadata> shardStores = fetchData(unassignedShard, allocation);
    if (shardStores.hasData() == false) {
        logger.trace("{}: ignoring allocation, still fetching shard stores", unassignedShard);
        allocation.setHasPendingAsyncFetch();
        List<NodeAllocationResult> nodeDecisions = null;
        if (explain) {
            nodeDecisions = buildDecisionsForAllNodes(unassignedShard, allocation);
        }
        return AllocateUnassignedDecision.no(AllocationStatus.FETCHING_SHARD_DATA, nodeDecisions);
    }
    ShardRouting primaryShard = routingNodes.activePrimary(unassignedShard.shardId());
    if (primaryShard == null) {
        assert explain : "primary should only be null here if we are in explain mode, so we didn't "
            + "exit early when canBeAllocatedToAtLeastOneNode didn't return a YES decision";
        return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()),
            new ArrayList<>(result.v2().values()));
    }
    assert primaryShard.currentNodeId() != null;
    final DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
    final TransportNodesListShardStoreMetadata.StoreFilesMetadata primaryStore = findStore(primaryNode, shardStores);
    if (primaryStore == null) {
        // if we can't find the primary data, it is probably because the primary shard is corrupted (and listing failed)
        // we want to let the replica be allocated in order to expose the actual problem with the primary that the replica
        // will try and recover from
        // Note, this is the existing behavior, as exposed in running CorruptFileTest#testNoPrimaryData
        logger.trace("{}: no primary shard store found or allocated, letting actual allocation figure it out", unassignedShard);
        return AllocateUnassignedDecision.NOT_TAKEN;
    }
    MatchingNodes matchingNodes = findMatchingNodes(unassignedShard, allocation, false, primaryNode, primaryStore, shardStores, explain);
    assert explain == false || matchingNodes.nodeDecisions != null : "in explain mode, we must have individual node decisions";
    List<NodeAllocationResult> nodeDecisions = augmentExplanationsWithStoreInfo(result.v2(), matchingNodes.nodeDecisions);
    if (allocateDecision.type() != Decision.Type.YES) {
        return AllocateUnassignedDecision.no(UnassignedInfo.AllocationStatus.fromDecision(allocateDecision.type()), nodeDecisions);
    } else if (matchingNodes.getNodeWithHighestMatch() != null) {
        RoutingNode nodeWithHighestMatch = allocation.routingNodes().node(matchingNodes.getNodeWithHighestMatch().getId());
        // we only check on THROTTLE since we checked before on NO
        Decision decision = allocation.deciders().canAllocate(unassignedShard, nodeWithHighestMatch, allocation);
        if (decision.type() == Decision.Type.THROTTLE) {
            logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store",
                unassignedShard.index(), unassignedShard.id(), unassignedShard, nodeWithHighestMatch.node());
            // we are throttling this, as we have enough other shards to allocate to this node, so ignore it for now
            return AllocateUnassignedDecision.throttle(nodeDecisions);
        } else {
            logger.debug("[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store",
                unassignedShard.index(), unassignedShard.id(), unassignedShard, nodeWithHighestMatch.node());
            // we found a match
            return AllocateUnassignedDecision.yes(nodeWithHighestMatch.node(), null, nodeDecisions, true);
        }
    } else if (matchingNodes.hasAnyData() == false && unassignedShard.unassignedInfo().isDelayed()) {
        // if we didn't manage to find *any* data (regardless of matching sizes), and the replica is
        // unassigned due to a node leaving, so we delay allocation of this replica to see if the
        // node with the shard copy will rejoin so we can re-use the copy it has
        logger.debug("{}: allocation of [{}] is delayed", unassignedShard.shardId(), unassignedShard);
        long remainingDelayMillis = 0L;
        long totalDelayMillis = 0L;
        if (explain) {
            UnassignedInfo unassignedInfo = unassignedShard.unassignedInfo();
            Metadata metadata = allocation.metadata();
            IndexMetadata indexMetadata = metadata.index(unassignedShard.index());
            totalDelayMillis = INDEX_DELAYED_NODE_LEFT_TIMEOUT_SETTING.get(indexMetadata.getSettings()).getMillis();
            long remainingDelayNanos = unassignedInfo.getRemainingDelay(System.nanoTime(), indexMetadata.getSettings());
            remainingDelayMillis = TimeValue.timeValueNanos(remainingDelayNanos).millis();
        }
        return AllocateUnassignedDecision.delayed(remainingDelayMillis, totalDelayMillis, nodeDecisions);
    }
    return AllocateUnassignedDecision.NOT_TAKEN;
}
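The core idea of ReplicaShardAllocator is to reuse the node whose on-disk store best matches the primary's, and only then to ask the deciders whether that node can accept the shard now (YES) or must wait (THROTTLE). Stripped of OpenSearch types other than DiscoveryNode, the "pick the highest match" step reduces to a sketch like the following, where the map of matching byte counts is a hypothetical stand-in for MatchingNodes:

    // Simplified stand-in for MatchingNodes.getNodeWithHighestMatch(); the byte counts are assumed inputs.
    static DiscoveryNode nodeWithHighestMatch(Map<DiscoveryNode, Long> matchingBytesByNode) {
        return matchingBytesByNode.entrySet().stream()
            .max(Map.Entry.comparingByValue())  // prefer the copy sharing the most bytes with the primary store
            .map(Map.Entry::getKey)
            .orElse(null);                      // null means no shard data was found on any node
    }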
Use of org.opensearch.cluster.node.DiscoveryNode in project OpenSearch by opensearch-project.
Class PeerFinder, method handleWakeUp.
/**
* @return whether any peers were removed due to disconnection
*/
private boolean handleWakeUp() {
    assert holdsLock() : "PeerFinder mutex not held";
    final boolean peersRemoved = peersByAddress.values().removeIf(Peer::handleWakeUp);
    if (active == false) {
        logger.trace("not active");
        return peersRemoved;
    }
    logger.trace("probing master nodes from cluster state: {}", lastAcceptedNodes);
    for (ObjectCursor<DiscoveryNode> discoveryNodeObjectCursor : lastAcceptedNodes.getMasterNodes().values()) {
        startProbe(discoveryNodeObjectCursor.value.getAddress());
    }
    configuredHostsResolver.resolveConfiguredHosts(providedAddresses -> {
        synchronized (mutex) {
            lastResolvedAddresses = providedAddresses;
            logger.trace("probing resolved transport addresses {}", providedAddresses);
            providedAddresses.forEach(this::startProbe);
        }
    });
    transportService.getThreadPool().scheduleUnlessShuttingDown(findPeersInterval, Names.GENERIC, new AbstractRunnable() {
        @Override
        public boolean isForceExecution() {
            return true;
        }

        @Override
        public void onFailure(Exception e) {
            assert false : e;
            logger.debug("unexpected exception in wakeup", e);
        }

        @Override
        protected void doRun() {
            synchronized (mutex) {
                if (handleWakeUp() == false) {
                    return;
                }
            }
            onFoundPeersUpdated();
        }

        @Override
        public String toString() {
            return "PeerFinder handling wakeup";
        }
    });
    return peersRemoved;
}
Use of org.opensearch.cluster.node.DiscoveryNode in project OpenSearch by opensearch-project.
Class SplitIndexIT, method testCreateSplitIndex.
public void testCreateSplitIndex() throws Exception {
    internalCluster().ensureAtLeastNumDataNodes(2);
    Version version = VersionUtils.randomIndexCompatibleVersion(random());
    prepareCreate("source").setSettings(
        Settings.builder().put(indexSettings()).put("number_of_shards", 1).put("index.version.created", version)
    ).get();
    final int docs = randomIntBetween(0, 128);
    for (int i = 0; i < docs; i++) {
        client().prepareIndex("source").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", XContentType.JSON).get();
    }
    // ensure all shards are allocated otherwise the ensure green below might not succeed since we require the merge node
    // if we change the setting too quickly we will end up with one replica unassigned which can't be assigned anymore due
    // to the require._name below.
    ensureGreen();
    // relocate all shards to one node such that we can merge it.
    client().admin().indices().prepareUpdateSettings("source")
        .setSettings(Settings.builder().put("index.blocks.write", true)).get();
    ensureGreen();
    final IndicesStatsResponse sourceStats = client().admin().indices().prepareStats("source").setSegments(true).get();
    // disable rebalancing to be able to capture the right stats. balancing can move the target primary
    // making it hard to pin point the source shards.
    client().admin().cluster().prepareUpdateSettings()
        .setTransientSettings(Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none"))
        .get();
    try {
        final boolean createWithReplicas = randomBoolean();
        assertAcked(
            client().admin().indices().prepareResizeIndex("source", "target")
                .setResizeType(ResizeType.SPLIT)
                .setSettings(Settings.builder()
                    .put("index.number_of_replicas", createWithReplicas ? 1 : 0)
                    .put("index.number_of_shards", 2)
                    .putNull("index.blocks.write")
                    .build())
                .get()
        );
        ensureGreen();
        final ClusterState state = client().admin().cluster().prepareState().get().getState();
        DiscoveryNode mergeNode = state.nodes().get(state.getRoutingTable().index("target").shard(0).primaryShard().currentNodeId());
        logger.info("split node {}", mergeNode);
        final long maxSeqNo = Arrays.stream(sourceStats.getShards())
            .filter(shard -> shard.getShardRouting().currentNodeId().equals(mergeNode.getId()))
            .map(ShardStats::getSeqNoStats)
            .mapToLong(SeqNoStats::getMaxSeqNo)
            .max()
            .getAsLong();
        final long maxUnsafeAutoIdTimestamp = Arrays.stream(sourceStats.getShards())
            .filter(shard -> shard.getShardRouting().currentNodeId().equals(mergeNode.getId()))
            .map(ShardStats::getStats)
            .map(CommonStats::getSegments)
            .mapToLong(SegmentsStats::getMaxUnsafeAutoIdTimestamp)
            .max()
            .getAsLong();
        final IndicesStatsResponse targetStats = client().admin().indices().prepareStats("target").get();
        for (final ShardStats shardStats : targetStats.getShards()) {
            final SeqNoStats seqNoStats = shardStats.getSeqNoStats();
            final ShardRouting shardRouting = shardStats.getShardRouting();
            assertThat("failed on " + shardRouting, seqNoStats.getMaxSeqNo(), equalTo(maxSeqNo));
            assertThat("failed on " + shardRouting, seqNoStats.getLocalCheckpoint(), equalTo(maxSeqNo));
            assertThat("failed on " + shardRouting, shardStats.getStats().getSegments().getMaxUnsafeAutoIdTimestamp(), equalTo(maxUnsafeAutoIdTimestamp));
        }
        final int size = docs > 0 ? 2 * docs : 1;
        assertHitCount(client().prepareSearch("target").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs);
        if (createWithReplicas == false) {
            // bump replicas
            client().admin().indices().prepareUpdateSettings("target")
                .setSettings(Settings.builder().put("index.number_of_replicas", 1)).get();
            ensureGreen();
            assertHitCount(client().prepareSearch("target").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs);
        }
        for (int i = docs; i < 2 * docs; i++) {
            client().prepareIndex("target").setSource("{\"foo\" : \"bar\", \"i\" : " + i + "}", XContentType.JSON).get();
        }
        flushAndRefresh();
        assertHitCount(client().prepareSearch("target").setSize(2 * size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), 2 * docs);
        assertHitCount(client().prepareSearch("source").setSize(size).setQuery(new TermsQueryBuilder("foo", "bar")).get(), docs);
        GetSettingsResponse target = client().admin().indices().prepareGetSettings("target").get();
        assertEquals(version, target.getIndexToSettings().get("target").getAsVersion("index.version.created", null));
    } finally {
        // clean up
        client().admin().cluster().prepareUpdateSettings()
            .setTransientSettings(Settings.builder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), (String) null))
            .get();
    }
}
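Every snippet on this page consumes DiscoveryNode instances that the cluster produced, for example the mergeNode lookup in the test above. For completeness, a hand-built node as one might create in a unit test is sketched below; it assumes the (name, id, address, attributes, roles, version) constructor and DiscoveryNodeRole.BUILT_IN_ROLES, which can vary across OpenSearch releases.

    // Minimal sketch: constructing a DiscoveryNode by hand, e.g. for a unit test.
    // Constructor arity and DiscoveryNodeRole.BUILT_IN_ROLES are assumptions; check your OpenSearch version.
    DiscoveryNode localTestNode = new DiscoveryNode(
        "test-node",                                                    // node name
        "test-node-id",                                                 // node id
        new TransportAddress(InetAddress.getLoopbackAddress(), 9300),   // transport address
        Collections.emptyMap(),                                         // node attributes
        DiscoveryNodeRole.BUILT_IN_ROLES,                               // default roles
        Version.CURRENT
    );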