use of org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration in project OpenSearch by opensearch-project.
the class Coordinator method doStart.
/**
 * Brings the coordinator up. While holding {@code mutex} this:
 * <ol>
 *   <li>obtains the persisted state from the supplier and installs a new {@link CoordinationState} built from it;</li>
 *   <li>passes the current term to the peer finder and starts the configured-hosts resolver;</li>
 *   <li>logs the cluster UUID if the last-accepted state reports it as committed;</li>
 *   <li>rejects start-up under single-node discovery when the local node on its own has no quorum;</li>
 *   <li>hands the cluster applier an initial state containing only the local node, with the
 *       {@code STATE_NOT_RECOVERED_BLOCK} and the no-master block applied globally.</li>
 * </ol>
 *
 * @throws IllegalStateException if single-node discovery is enabled, the last-committed voting configuration is
 *                               non-empty, and the local node's id alone does not form a quorum in it
 */
@Override
protected void doStart() {
    synchronized (mutex) {
        // The supplier is consulted before anything else so the coordination state is built from what it returns.
        final CoordinationState.PersistedState persisted = persistedStateSupplier.get();
        coordinationState.set(new CoordinationState(getLocalNode(), persisted, electionStrategy));
        peerFinder.setCurrentTerm(getCurrentTerm());
        configuredHostsResolver.start();

        final ClusterState acceptedState = coordinationState.get().getLastAcceptedState();
        if (acceptedState.metadata().clusterUUIDCommitted()) {
            logger.info("cluster UUID [{}]", acceptedState.metadata().clusterUUID());
        }

        // An empty configuration is always acceptable; a non-empty one must be satisfiable by this node alone
        // when running with single-node discovery.
        final VotingConfiguration committedConfig = acceptedState.getLastCommittedConfiguration();
        if (singleNodeDiscovery && committedConfig.isEmpty() == false) {
            if (committedConfig.hasQuorum(Collections.singleton(getLocalNode().getId())) == false) {
                throw new IllegalStateException(
                    "cannot start with ["
                        + DiscoveryModule.DISCOVERY_TYPE_SETTING.getKey()
                        + "] set to ["
                        + DiscoveryModule.SINGLE_NODE_DISCOVERY_TYPE
                        + "] when local node "
                        + getLocalNode()
                        + " does not have quorum in voting configuration "
                        + committedConfig
                );
            }
        }

        // Starting point for the applier: just this node, blocked until state is recovered and a master is known.
        final ClusterState startupState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.get(settings))
            .blocks(
                ClusterBlocks.builder()
                    .addGlobalBlock(STATE_NOT_RECOVERED_BLOCK)
                    .addGlobalBlock(noMasterBlockService.getNoMasterBlock())
            )
            .nodes(DiscoveryNodes.builder().add(getLocalNode()).localNodeId(getLocalNode().getId()))
            .build();
        applierState = startupState;
        clusterApplier.setInitialState(startupState);
    }
}
use of org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration in project OpenSearch by opensearch-project.
the class Reconfigurator method reconfigure.
/**
 * Works out the best voting configuration for the cluster given who is alive and who is retiring.
 *
 * <p>Candidates are the live, master-eligible, non-retired nodes plus any non-live, non-retired members of the
 * current configuration. They are held in a {@link TreeSet}, so the order in which they are picked is decided by
 * {@code VotingConfigNode}'s own ordering. The first {@code targetSize} candidates form the proposal.
 *
 * @param liveNodes      The live nodes in the cluster. Live nodes are preferred over non-live nodes wherever possible.
 * @param retiredNodeIds Ids of nodes that are leaving the cluster and should be kept out of the configuration if
 *                       possible. A retired node that is not in the current configuration never appears in the
 *                       result; this lets the vote be shifted in a 2-node cluster so that one node can be restarted
 *                       without harming availability.
 * @param currentMaster  The current master; it must be one of {@code liveNodes}. Unless retired, it is preferred
 *                       for inclusion in the configuration.
 * @param currentConfig  The current configuration, which is preserved as far as possible.
 * @return the proposed configuration if the live master-eligible nodes form a quorum in it, otherwise
 *         {@code currentConfig} unchanged
 */
public VotingConfiguration reconfigure(Set<DiscoveryNode> liveNodes, Set<String> retiredNodeIds, DiscoveryNode currentMaster, VotingConfiguration currentConfig) {
    assert liveNodes.contains(currentMaster) : "liveNodes = " + liveNodes + " master = " + currentMaster;
    logger.trace("{} reconfiguring {} based on liveNodes={}, retiredNodeIds={}, currentMaster={}", this, currentConfig, liveNodes, retiredNodeIds, currentMaster);

    final Set<String> liveNodeIds = liveNodes.stream().filter(DiscoveryNode::isMasterNode).map(DiscoveryNode::getId).collect(Collectors.toSet());
    final Set<String> currentConfigNodeIds = currentConfig.getNodeIds();

    final Set<VotingConfigNode> candidates = new TreeSet<>();

    // Live, master-eligible nodes that are not being retired.
    for (DiscoveryNode node : liveNodes) {
        if (node.isMasterNode() == false) {
            continue;
        }
        final String nodeId = node.getId();
        if (retiredNodeIds.contains(nodeId)) {
            continue;
        }
        candidates.add(new VotingConfigNode(nodeId, true, nodeId.equals(currentMaster.getId()), currentConfigNodeIds.contains(nodeId)));
    }

    // Members of the current configuration that are neither live nor retired.
    for (String configNodeId : currentConfigNodeIds) {
        if (liveNodeIds.contains(configNodeId) || retiredNodeIds.contains(configNodeId)) {
            continue;
        }
        candidates.add(new VotingConfigNode(configNodeId, false, false, true));
    }

    /*
     * Now we work out how many nodes should be in the configuration:
     */
    int nonRetiredConfigSize = 0;
    int nonRetiredLiveNodeCount = 0;
    for (VotingConfigNode candidate : candidates) {
        if (candidate.inCurrentConfig) {
            nonRetiredConfigSize++;
        }
        if (candidate.live) {
            nonRetiredLiveNodeCount++;
        }
    }

    // With auto-shrink the floor is 1 or 3 depending on the current size; without it the configuration never shrinks.
    final int minimumConfigEnforcedSize;
    if (autoShrinkVotingConfiguration == false) {
        minimumConfigEnforcedSize = nonRetiredConfigSize;
    } else if (nonRetiredConfigSize < 3) {
        minimumConfigEnforcedSize = 1;
    } else {
        minimumConfigEnforcedSize = 3;
    }
    final int targetSize = Math.max(roundDownToOdd(nonRetiredLiveNodeCount), minimumConfigEnforcedSize);

    final Set<String> proposedNodeIds = candidates.stream().limit(targetSize).map(candidate -> candidate.id).collect(Collectors.toSet());
    final VotingConfiguration proposedConfig = new VotingConfiguration(proposedNodeIds);

    // The proposal is only adopted if the live nodes form a quorum in it; otherwise it is better to do nothing.
    return proposedConfig.hasQuorum(liveNodeIds) ? proposedConfig : currentConfig;
}
use of org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration in project OpenSearch by opensearch-project.
the class NodeJoinTests method testConcurrentJoining.
/**
 * Sends a randomised mix of join requests to the coordinator from many threads at the same moment while a
 * separate thread repeatedly checks {@code coordinator.invariant()}.
 *
 * <p>A subset of nodes that forms a quorum in the voting configuration sends correct joins; every remaining node
 * sends a join that is correct, has too low a term, or carries a better state, and some of those are duplicated.
 * Afterwards the local node must be the elected master, and every node from the quorum subset must be in the
 * cluster state with its join vote recorded.
 *
 * <p>The order of the randomised calls below is significant for reproducing a run from its seed, so statements
 * that draw random values must not be reordered.
 */
public void testConcurrentJoining() {
    List<DiscoveryNode> masterNodes = IntStream.rangeClosed(1, randomIntBetween(2, 5)).mapToObj(nodeId -> newNode(nodeId, true)).collect(Collectors.toList());
    List<DiscoveryNode> otherNodes = IntStream.rangeClosed(masterNodes.size() + 1, masterNodes.size() + 1 + randomIntBetween(0, 5)).mapToObj(nodeId -> newNode(nodeId, false)).collect(Collectors.toList());
    List<DiscoveryNode> allNodes = Stream.concat(masterNodes.stream(), otherNodes.stream()).collect(Collectors.toList());
    DiscoveryNode localNode = masterNodes.get(0);
    VotingConfiguration votingConfiguration = new VotingConfiguration(randomValueOtherThan(singletonList(localNode), () -> randomSubsetOf(randomIntBetween(1, masterNodes.size()), masterNodes)).stream().map(DiscoveryNode::getId).collect(Collectors.toSet()));
    logger.info("Voting configuration: {}", votingConfiguration);
    long initialTerm = randomLongBetween(1, 10);
    long initialVersion = randomLongBetween(1, 10);
    setupRealMasterServiceAndCoordinator(initialTerm, initialState(localNode, initialTerm, initialVersion, votingConfiguration));
    long newTerm = initialTerm + randomLongBetween(1, 10);

    // we need at least a quorum of voting nodes with a correct term and worse state
    List<DiscoveryNode> successfulNodes;
    do {
        successfulNodes = randomSubsetOf(allNodes);
    } while (votingConfiguration.hasQuorum(successfulNodes.stream().map(DiscoveryNode::getId).collect(Collectors.toList())) == false);
    logger.info("Successful voting nodes: {}", successfulNodes);
    List<JoinRequest> correctJoinRequests = successfulNodes.stream().map(node -> new JoinRequest(node, newTerm, Optional.of(new Join(node, localNode, newTerm, initialTerm, initialVersion)))).collect(Collectors.toList());

    List<DiscoveryNode> possiblyUnsuccessfulNodes = new ArrayList<>(allNodes);
    possiblyUnsuccessfulNodes.removeAll(successfulNodes);
    logger.info("Possibly unsuccessful voting nodes: {}", possiblyUnsuccessfulNodes);
    List<JoinRequest> possiblyFailingJoinRequests = possiblyUnsuccessfulNodes.stream().map(node -> {
        if (randomBoolean()) {
            // a correct request
            return new JoinRequest(node, newTerm, Optional.of(new Join(node, localNode, newTerm, initialTerm, initialVersion)));
        } else if (randomBoolean()) {
            // term too low
            return new JoinRequest(node, newTerm, Optional.of(new Join(node, localNode, randomLongBetween(0, initialTerm), initialTerm, initialVersion)));
        } else {
            // better state
            return new JoinRequest(node, newTerm, Optional.of(new Join(node, localNode, newTerm, initialTerm, initialVersion + randomLongBetween(1, 10))));
        }
    }).collect(Collectors.toList());
    // duplicate some requests, which will be unsuccessful
    possiblyFailingJoinRequests.addAll(randomSubsetOf(possiblyFailingJoinRequests));

    // One party per join thread plus one for the invariant-checking thread, so that all of them are released together.
    CyclicBarrier barrier = new CyclicBarrier(correctJoinRequests.size() + possiblyFailingJoinRequests.size() + 1);
    final Runnable awaitBarrier = () -> {
        try {
            barrier.await();
        } catch (InterruptedException e) {
            // Keep the interrupt visible to whoever owns this thread before converting to an unchecked failure.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (BrokenBarrierException e) {
            throw new RuntimeException(e);
        }
    };

    final AtomicBoolean stopAsserting = new AtomicBoolean();
    final Thread assertionThread = new Thread(() -> {
        awaitBarrier.run();
        while (stopAsserting.get() == false) {
            coordinator.invariant();
        }
    }, "assert invariants");

    final List<Thread> joinThreads = Stream.concat(correctJoinRequests.stream().map(joinRequest -> new Thread(() -> {
        awaitBarrier.run();
        joinNode(joinRequest);
    }, "process " + joinRequest)), possiblyFailingJoinRequests.stream().map(joinRequest -> new Thread(() -> {
        awaitBarrier.run();
        try {
            joinNode(joinRequest);
        } catch (CoordinationStateRejectedException e) {
            // ignore - these requests are expected to fail
        }
    }, "process " + joinRequest))).collect(Collectors.toList());

    assertionThread.start();
    try {
        joinThreads.forEach(Thread::start);
        joinThreads.forEach(t -> {
            try {
                t.join();
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
        });
    } finally {
        // Always release the invariant checker, otherwise it would keep spinning if waiting for the joins failed.
        stopAsserting.set(true);
    }
    try {
        assertionThread.join();
    } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
        throw new RuntimeException(e);
    }

    assertTrue(MasterServiceTests.discoveryState(masterService).nodes().isLocalNodeElectedMaster());
    for (DiscoveryNode successfulNode : successfulNodes) {
        assertTrue(successfulNode + " joined cluster", clusterStateHasNode(successfulNode));
        assertFalse(successfulNode + " voted for master", coordinator.missingJoinVoteFrom(successfulNode));
    }
}
use of org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration in project OpenSearch by opensearch-project.
the class CoordinatorTests method testCanShrinkFromFiveNodesToThree.
/**
 * Exercises shrinking of the voting configuration in a cluster created with {@code new Cluster(5)}.
 *
 * <p>With auto-shrink switched off, two nodes are disconnected and the committed voting configuration is
 * expected to still list every node of the cluster. Auto-shrink is then switched back on, after which the
 * committed configuration is expected to hold exactly three node ids, none of them a disconnected node.
 */
public void testCanShrinkFromFiveNodesToThree() {
    try (Cluster cluster = new Cluster(5)) {
        cluster.runRandomly();
        cluster.stabilise();

        // Phase 1: turn auto-shrink off through whichever node currently leads.
        final ClusterNode initialLeader = cluster.getAnyLeader();
        logger.info("setting auto-shrink reconfiguration to false");
        initialLeader.submitSetAutoShrinkVotingConfiguration(false);
        cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
        assertFalse(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(initialLeader.getLastAppliedClusterState().metadata().settings()));

        // Phase 2: cut off two distinct nodes and let the remainder settle.
        final ClusterNode firstIsolated = cluster.getAnyNode();
        final ClusterNode secondIsolated = cluster.getAnyNodeExcept(firstIsolated);
        logger.info("--> disconnecting {} and {}", firstIsolated, secondIsolated);
        firstIsolated.disconnect();
        secondIsolated.disconnect();
        cluster.stabilise();

        // The leader is looked up again because it may differ from the one used in phase 1.
        final ClusterNode leader = cluster.getAnyLeader();

        // With auto-shrink off nothing may have been dropped from the configuration.
        final VotingConfiguration configBeforeShrink = leader.getLastAppliedClusterState().getLastCommittedConfiguration();
        assertThat(
            configBeforeShrink + " should be all nodes",
            configBeforeShrink.getNodeIds(),
            equalTo(cluster.clusterNodes.stream().map(node -> node.getId()).collect(Collectors.toSet()))
        );

        // Phase 3: turn auto-shrink back on.
        logger.info("setting auto-shrink reconfiguration to true");
        leader.submitSetAutoShrinkVotingConfiguration(true);
        // allow for a reconfiguration
        cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY * 2);
        assertTrue(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(leader.getLastAppliedClusterState().metadata().settings()));

        // Exactly three voters are expected, and neither disconnected node may be among them.
        final VotingConfiguration configAfterShrink = leader.getLastAppliedClusterState().getLastCommittedConfiguration();
        assertThat(configAfterShrink + " should be 3 nodes", configAfterShrink.getNodeIds().size(), equalTo(3));
        assertFalse(configAfterShrink.getNodeIds().contains(firstIsolated.getId()));
        assertFalse(configAfterShrink.getNodeIds().contains(secondIsolated.getId()));
    }
}
use of org.opensearch.cluster.coordination.CoordinationMetadata.VotingConfiguration in project OpenSearch by opensearch-project.
the class CoordinatorTests method testUnhealthyNodesGetsRemoved.
/**
 * Checks that nodes which start reporting an unhealthy status are dropped from the voting configuration.
 *
 * <p>Two extra nodes that share one mutable health status are added to a cluster created with
 * {@code new Cluster(3)}; once they have joined, the committed voting configuration is expected to list every
 * cluster node. Auto-shrink is then enabled and the shared status flipped to {@code UNHEALTHY}. After
 * stabilisation the committed configuration is expected to hold exactly three node ids and neither added node.
 */
public void testUnhealthyNodesGetsRemoved() {
    // Both added nodes read their health from this single reference, so one update affects both.
    AtomicReference<StatusInfo> sharedHealth = new AtomicReference<>(new StatusInfo(HEALTHY, "healthy-info"));
    try (Cluster cluster = new Cluster(3)) {
        cluster.runRandomly();
        cluster.stabilise();

        final ClusterNode leader = cluster.getAnyLeader();

        logger.info("--> adding two new healthy nodes");
        ClusterNode firstAdded = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings, sharedHealth::get);
        ClusterNode secondAdded = cluster.new ClusterNode(nextNodeIndex.getAndIncrement(), true, leader.nodeSettings, sharedHealth::get);
        cluster.clusterNodes.add(firstAdded);
        cluster.clusterNodes.add(secondAdded);

        final long joinBudget =
            // The first pinging discovers the master
            defaultMillis(DISCOVERY_FIND_PEERS_INTERVAL_SETTING)
                // One message delay to send a join
                + DEFAULT_DELAY_VARIABILITY
                // followup reconfiguration
                + 2 * 2 * DEFAULT_CLUSTER_STATE_UPDATE_DELAY;
        cluster.stabilise(joinBudget);

        assertThat(leader.coordinator.getMode(), is(Mode.LEADER));
        final VotingConfiguration configAfterJoin = leader.getLastAppliedClusterState().getLastCommittedConfiguration();
        assertThat(
            configAfterJoin + " should be all nodes",
            configAfterJoin.getNodeIds(),
            equalTo(cluster.clusterNodes.stream().map(node -> node.getId()).collect(Collectors.toSet()))
        );

        logger.info("setting auto-shrink reconfiguration to true");
        leader.submitSetAutoShrinkVotingConfiguration(true);
        cluster.stabilise(DEFAULT_CLUSTER_STATE_UPDATE_DELAY);
        assertTrue(CLUSTER_AUTO_SHRINK_VOTING_CONFIGURATION.get(leader.getLastAppliedClusterState().metadata().settings()));

        logger.info("--> changing health of newly added nodes to unhealthy");
        sharedHealth.getAndSet(new StatusInfo(UNHEALTHY, "unhealthy-info"));

        final long followerSideDetection =
            // Each follower may have just sent a leader check, which receives no response
            defaultMillis(LEADER_CHECK_TIMEOUT_SETTING)
                // then wait for the follower to check the leader
                + defaultMillis(LEADER_CHECK_INTERVAL_SETTING)
                // then wait for the exception response
                + DEFAULT_DELAY_VARIABILITY;
        final long leaderSideDetection =
            // ALSO the leader may have just sent a follower check, which receives no response
            defaultMillis(FOLLOWER_CHECK_TIMEOUT_SETTING)
                // wait for the leader to check its followers
                + defaultMillis(FOLLOWER_CHECK_INTERVAL_SETTING)
                // then wait for the exception response
                + DEFAULT_DELAY_VARIABILITY;
        cluster.stabilise(
            Math.max(followerSideDetection, leaderSideDetection)
                // wait for the removal to be committed
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
                // then wait for the followup reconfiguration
                + DEFAULT_CLUSTER_STATE_UPDATE_DELAY
        );

        // The leader is looked up afresh here rather than reusing the earlier reference.
        final ClusterNode newLeader = cluster.getAnyLeader();
        final VotingConfiguration configAfterRemoval = newLeader.getLastAppliedClusterState().getLastCommittedConfiguration();
        assertThat(configAfterRemoval + " should be 3 nodes", configAfterRemoval.getNodeIds().size(), equalTo(3));
        assertFalse(configAfterRemoval.getNodeIds().contains(firstAdded.getId()));
        assertFalse(configAfterRemoval.getNodeIds().contains(secondAdded.getId()));
    }
}
Aggregations