Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method allOwners.
/**
 * {@inheritDoc}
 */
@Override
public List<List<ClusterNode>> allOwners() {
    lock.readLock().lock();

    try {
        int parts = partitions();

        List<List<ClusterNode>> res = new ArrayList<>(parts);

        for (int i = 0; i < parts; i++)
            res.add(new ArrayList<>());

        List<ClusterNode> allNodes = discoCache.cacheGroupAffinityNodes(grp.groupId());

        for (int i = 0; i < allNodes.size(); i++) {
            ClusterNode node = allNodes.get(i);

            GridDhtPartitionMap nodeParts = node2part.get(node.id());

            if (nodeParts != null) {
                for (Map.Entry<Integer, GridDhtPartitionState> e : nodeParts.map().entrySet()) {
                    if (e.getValue() == OWNING) {
                        int part = e.getKey();

                        List<ClusterNode> owners = res.get(part);

                        owners.add(node);
                    }
                }
            }
        }

        return res;
    }
    finally {
        lock.readLock().unlock();
    }
}
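allOwners() inverts the node-to-partition state map into a per-partition list of owning nodes, all under the topology read lock. The same inversion can be sketched with plain JDK collections; the class and type names below (OwnersByPartition, node IDs as UUID, states as strings) are illustrative stand-ins for the Ignite-internal types, not Ignite API:

import java.util.*;

public class OwnersByPartition {
    /** Builds a per-partition list of owning node IDs from a node -> (partition -> state) map. */
    static List<List<UUID>> allOwners(int parts, Map<UUID, Map<Integer, String>> node2part) {
        List<List<UUID>> res = new ArrayList<>(parts);

        for (int i = 0; i < parts; i++)
            res.add(new ArrayList<>());

        for (Map.Entry<UUID, Map<Integer, String>> node : node2part.entrySet()) {
            for (Map.Entry<Integer, String> e : node.getValue().entrySet()) {
                if ("OWNING".equals(e.getValue()))
                    res.get(e.getKey()).add(node.getKey());
            }
        }

        return res;
    }

    public static void main(String[] args) {
        UUID n1 = UUID.randomUUID(), n2 = UUID.randomUUID();

        Map<UUID, Map<Integer, String>> node2part = new HashMap<>();
        node2part.put(n1, Map.of(0, "OWNING", 1, "MOVING"));
        node2part.put(n2, Map.of(0, "OWNING", 1, "OWNING"));

        // Partition 0 ends up with two owners, partition 1 with one.
        System.out.println(allOwners(2, node2part));
    }
}

As in the real method, every partition index gets an owner list (possibly empty), so callers can index the result directly by partition number.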
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method resetOwners.
/**
 * {@inheritDoc}
 */
@Override
public Map<UUID, Set<Integer>> resetOwners(Map<Integer, Set<UUID>> ownersByUpdCounters, Set<Integer> haveHist, GridDhtPartitionsExchangeFuture exchFut) {
    Map<UUID, Set<Integer>> res = new HashMap<>();

    Collection<DiscoveryEvent> evts = exchFut.events().events();

    Set<UUID> joinedNodes = U.newHashSet(evts.size());

    for (DiscoveryEvent evt : evts) {
        if (evt.type() == EVT_NODE_JOINED)
            joinedNodes.add(evt.eventNode().id());
    }

    ctx.database().checkpointReadLock();

    try {
        Map<UUID, Set<Integer>> addToWaitGroups = new HashMap<>();

        lock.writeLock().lock();

        try {
            // First process local partitions.
            UUID locNodeId = ctx.localNodeId();

            for (Map.Entry<Integer, Set<UUID>> entry : ownersByUpdCounters.entrySet()) {
                int part = entry.getKey();
                Set<UUID> maxCounterPartOwners = entry.getValue();

                GridDhtLocalPartition locPart = localPartition(part);

                if (locPart == null || locPart.state() != OWNING)
                    continue;

                // Partition state should be mutated only on joining nodes if they exist for the exchange.
                if (joinedNodes.isEmpty() && !maxCounterPartOwners.contains(locNodeId)) {
                    rebalancePartition(part, !haveHist.contains(part), exchFut);

                    res.computeIfAbsent(locNodeId, n -> new HashSet<>()).add(part);
                }
            }

            // Then process node maps.
            for (Map.Entry<Integer, Set<UUID>> entry : ownersByUpdCounters.entrySet()) {
                int part = entry.getKey();
                Set<UUID> maxCounterPartOwners = entry.getValue();

                for (Map.Entry<UUID, GridDhtPartitionMap> remotes : node2part.entrySet()) {
                    UUID remoteNodeId = remotes.getKey();

                    if (!joinedNodes.isEmpty() && !joinedNodes.contains(remoteNodeId))
                        continue;

                    GridDhtPartitionMap partMap = remotes.getValue();

                    GridDhtPartitionState state = partMap.get(part);

                    if (state != OWNING)
                        continue;

                    if (!maxCounterPartOwners.contains(remoteNodeId)) {
                        partMap.put(part, MOVING);

                        partMap.updateSequence(partMap.updateSequence() + 1, partMap.topologyVersion());

                        if (partMap.nodeId().equals(locNodeId))
                            updateSeq.setIfGreater(partMap.updateSequence());

                        res.computeIfAbsent(remoteNodeId, n -> new HashSet<>()).add(part);
                    }
                }
            }

            for (Map.Entry<UUID, Set<Integer>> entry : res.entrySet()) {
                UUID nodeId = entry.getKey();
                Set<Integer> rebalancedParts = entry.getValue();

                addToWaitGroups.put(nodeId, new HashSet<>(rebalancedParts));

                if (!rebalancedParts.isEmpty()) {
                    Set<Integer> historical = rebalancedParts.stream().filter(haveHist::contains).collect(Collectors.toSet());

                    // Filter out partitions having WAL history.
                    rebalancedParts.removeAll(historical);

                    U.warn(log, "Partitions have been scheduled for rebalancing due to outdated update counter " + "[grp=" + grp.cacheOrGroupName() + ", readyTopVer=" + readyTopVer + ", topVer=" + exchFut.initialVersion() + ", nodeId=" + nodeId + ", partsFull=" + S.compact(rebalancedParts) + ", partsHistorical=" + S.compact(historical) + "]");
                }
            }

            node2part = new GridDhtPartitionFullMap(node2part, updateSeq.incrementAndGet());
        }
        finally {
            lock.writeLock().unlock();
        }

        List<List<ClusterNode>> ideal = ctx.affinity().affinity(groupId()).idealAssignmentRaw();

        for (Map.Entry<UUID, Set<Integer>> entry : addToWaitGroups.entrySet()) {
            // Add to wait groups to ensure late assignment switch after all partitions are rebalanced.
            for (Integer part : entry.getValue()) {
                ctx.cache().context().affinity().addToWaitGroup(groupId(), part, topologyVersionFuture().initialVersion(), ideal.get(part));
            }
        }
    }
    finally {
        ctx.database().checkpointReadUnlock();
    }

    return res;
}
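resetOwners() demotes owners whose partition copies lag behind the maximal update counter to MOVING so they get rebalanced, and then splits the scheduled partitions into those that can be rebalanced historically from WAL (haveHist) and those that need a full rebalance. A simplified, self-contained sketch of that selection logic, with plain collections standing in for the Ignite-internal partition maps (all names here are illustrative, not Ignite API):

import java.util.*;
import java.util.stream.Collectors;

public class ResetOwnersSketch {
    /**
     * For each partition, collects the nodes that currently own it but are not among the
     * max-update-counter owners; those nodes must rebalance the partition.
     */
    static Map<UUID, Set<Integer>> outdatedOwners(
        Map<Integer, Set<UUID>> ownersByUpdCounters,   // partition -> nodes with the max counter
        Map<UUID, Set<Integer>> owningByNode           // node -> partitions it currently owns
    ) {
        Map<UUID, Set<Integer>> res = new HashMap<>();

        for (Map.Entry<Integer, Set<UUID>> e : ownersByUpdCounters.entrySet()) {
            int part = e.getKey();
            Set<UUID> maxCntrOwners = e.getValue();

            for (Map.Entry<UUID, Set<Integer>> node : owningByNode.entrySet()) {
                if (node.getValue().contains(part) && !maxCntrOwners.contains(node.getKey()))
                    res.computeIfAbsent(node.getKey(), k -> new HashSet<>()).add(part);
            }
        }

        return res;
    }

    public static void main(String[] args) {
        UUID n1 = UUID.randomUUID(), n2 = UUID.randomUUID();

        Map<Integer, Set<UUID>> ownersByUpdCounters = Map.of(0, Set.of(n1), 1, Set.of(n1, n2));
        Map<UUID, Set<Integer>> owningByNode = Map.of(n1, Set.of(0, 1), n2, Set.of(0, 1));

        Map<UUID, Set<Integer>> outdated = outdatedOwners(ownersByUpdCounters, owningByNode);

        // Partitions with WAL history can be rebalanced historically; the rest need a full rebalance.
        Set<Integer> haveHist = Set.of(0);

        for (Map.Entry<UUID, Set<Integer>> e : outdated.entrySet()) {
            Set<Integer> hist = e.getValue().stream().filter(haveHist::contains).collect(Collectors.toSet());
            Set<Integer> full = e.getValue().stream().filter(p -> !haveHist.contains(p)).collect(Collectors.toSet());

            System.out.println("node=" + e.getKey() + ", historical=" + hist + ", full=" + full);
        }
    }
}

In the sample, n2 owns partition 0 with a stale counter and is scheduled for historical rebalance, while partition 1 is untouched because both nodes are among its max-counter owners.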
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method afterExchange.
/**
 * {@inheritDoc}
 */
@Override
public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut) {
    boolean changed = false;

    int partitions = grp.affinity().partitions();

    AffinityTopologyVersion topVer = exchFut.context().events().topologyVersion();

    assert grp.affinity().lastVersion().equals(topVer) : "Affinity is not initialized " + "[grp=" + grp.cacheOrGroupName() + ", topVer=" + topVer + ", affVer=" + grp.affinity().lastVersion() + ", fut=" + exchFut + ']';

    ctx.database().checkpointReadLock();

    try {
        lock.writeLock().lock();

        try {
            if (stopping)
                return false;

            assert readyTopVer.initialized() : readyTopVer;
            assert lastTopChangeVer.equals(readyTopVer);

            if (log.isDebugEnabled()) {
                log.debug("Partition map before afterExchange [grp=" + grp.cacheOrGroupName() + ", exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']');
            }

            if (log.isTraceEnabled()) {
                log.trace("Partition states before afterExchange [grp=" + grp.cacheOrGroupName() + ", exchVer=" + exchFut.exchangeId() + ", states=" + dumpPartitionStates() + ']');
            }

            long updateSeq = this.updateSeq.incrementAndGet();

            // Skip partition updates in case of a non-real exchange.
            if (!ctx.localNode().isClient() && exchFut.exchangeType() == ALL) {
                for (int p = 0; p < partitions; p++) {
                    GridDhtLocalPartition locPart = localPartition0(p, topVer, false, true);

                    if (partitionLocalNode(p, topVer)) {
                        // Prepare the partition for rebalancing if that did not happen on the full map update phase.
                        if (locPart == null || locPart.state() == RENTING || locPart.state() == EVICTED)
                            locPart = rebalancePartition(p, true, exchFut);

                        GridDhtPartitionState state = locPart.state();

                        if (state == MOVING) {
                            if (grp.rebalanceEnabled()) {
                                Collection<ClusterNode> owners = owners(p);

                                // then new exchange should be started with detecting lost partitions.
                                if (!F.isEmpty(owners)) {
                                    if (log.isDebugEnabled())
                                        log.debug("Will not own partition (there are owners to rebalance from) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ", owners = " + owners + ']');
                                }
                            }
                            else
                                updateSeq = updateLocal(p, locPart.state(), updateSeq, topVer);
                        }
                    }
                    else {
                        if (locPart != null) {
                            GridDhtPartitionState state = locPart.state();

                            if (state == MOVING) {
                                locPart.rent();

                                updateSeq = updateLocal(p, locPart.state(), updateSeq, topVer);

                                changed = true;

                                if (log.isDebugEnabled()) {
                                    log.debug("Evicting MOVING partition (it does not belong to affinity) [" + "grp=" + grp.cacheOrGroupName() + ", p=" + locPart.id() + ']');
                                }
                            }
                        }
                    }
                }
            }

            AffinityAssignment aff = grp.affinity().readyAffinity(topVer);

            if (node2part != null && node2part.valid())
                changed |= checkEvictions(updateSeq, aff);

            updateRebalanceVersion(aff.topologyVersion(), aff.assignment());

            consistencyCheck();

            if (log.isTraceEnabled()) {
                log.trace("Partition states after afterExchange [grp=" + grp.cacheOrGroupName() + ", exchVer=" + exchFut.exchangeId() + ", states=" + dumpPartitionStates() + ']');
            }
        }
        finally {
            lock.writeLock().unlock();
        }
    }
    finally {
        ctx.database().checkpointReadUnlock();
    }

    return changed;
}
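Ignoring logging and sequence bookkeeping, the per-partition branching in afterExchange() reduces to a small decision table: partitions the local node should host are prepared for rebalancing when they were RENTING or EVICTED, MOVING ones either wait for owners to rebalance from (rebalancing enabled) or just have their local state re-published (rebalancing disabled), and MOVING partitions the node no longer hosts are rented for eviction. A condensed sketch of that table with hypothetical enums, ignoring the null-partition and no-owner corner cases:

/** Condensed decision table for a local partition after exchange (illustrative only, not Ignite API). */
public class AfterExchangeDecision {
    enum State { MOVING, RENTING, EVICTED, OWNING }

    enum Action { PREPARE_REBALANCE, WAIT_FOR_REBALANCE, UPDATE_LOCAL_STATE, RENT, NONE }

    static Action decide(boolean affinityNode, State state, boolean rebalanceEnabled) {
        if (affinityNode) {
            // The partition should be hosted on this node after the exchange.
            if (state == State.RENTING || state == State.EVICTED)
                return Action.PREPARE_REBALANCE;

            if (state == State.MOVING)
                return rebalanceEnabled ? Action.WAIT_FOR_REBALANCE : Action.UPDATE_LOCAL_STATE;

            return Action.NONE;
        }

        // The partition no longer belongs to this node's affinity: evict it if it never finished rebalancing.
        return state == State.MOVING ? Action.RENT : Action.NONE;
    }

    public static void main(String[] args) {
        System.out.println(decide(true, State.MOVING, true));   // WAIT_FOR_REBALANCE
        System.out.println(decide(false, State.MOVING, true));  // RENT
    }
}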
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method nodes.
/**
* @param p Partition.
* @param topVer Topology version ({@code -1} for all nodes).
* @param state Partition state.
* @param states Additional partition states.
* @return List of nodes for the partition.
*/
private List<ClusterNode> nodes(int p, AffinityTopologyVersion topVer, GridDhtPartitionState state, GridDhtPartitionState... states) {
    Collection<UUID> allIds = F.nodeIds(discoCache.cacheGroupAffinityNodes(grp.groupId()));

    lock.readLock().lock();

    try {
        assert node2part != null && node2part.valid() : "Invalid node-to-partitions map [topVer=" + topVer + ", grp=" + grp.cacheOrGroupName() + ", allIds=" + allIds + ", node2part=" + node2part + ']';

        // Node IDs can be null if both, primary and backup, nodes disappear.
        // Empirical size to reduce growing of ArrayList.
        // We bear in mind that most of the time we filter OWNING partitions.
        List<ClusterNode> nodes = new ArrayList<>(allIds.size() / 2 + 1);

        for (UUID id : allIds) {
            if (hasState(p, id, state, states)) {
                ClusterNode n = ctx.discovery().node(id);

                if (n != null && (topVer.topologyVersion() < 0 || n.order() <= topVer.topologyVersion()))
                    nodes.add(n);
            }
        }

        return nodes;
    }
    finally {
        lock.readLock().unlock();
    }
}
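nodes() starts from the cache group's affinity node IDs and keeps only those whose partition map reports one of the requested states for partition p, additionally dropping nodes that are no longer in discovery or whose join order is newer than the requested topology version. A standalone sketch of that filter over plain collections (the Node record and string states are illustrative stand-ins, not Ignite API):

import java.util.*;

public class NodesByState {
    /** A node with its discovery order, standing in for ClusterNode. */
    record Node(UUID id, long order) {}

    /** Returns alive nodes whose state for the given partition is one of the requested states. */
    static List<Node> nodes(int part, long topVer, Map<UUID, Map<Integer, String>> node2part,
        Map<UUID, Node> alive, String... states) {
        List<Node> res = new ArrayList<>();

        for (Map.Entry<UUID, Map<Integer, String>> e : node2part.entrySet()) {
            String state = e.getValue().get(part);

            if (state == null || !Arrays.asList(states).contains(state))
                continue;

            Node n = alive.get(e.getKey());

            // Skip nodes that already left or joined after the requested topology version.
            if (n != null && (topVer < 0 || n.order() <= topVer))
                res.add(n);
        }

        return res;
    }

    public static void main(String[] args) {
        Node n1 = new Node(UUID.randomUUID(), 1), n2 = new Node(UUID.randomUUID(), 5);

        Map<UUID, Node> alive = Map.of(n1.id(), n1, n2.id(), n2);
        Map<UUID, Map<Integer, String>> node2part = Map.of(
            n1.id(), Map.of(0, "OWNING"),
            n2.id(), Map.of(0, "MOVING"));

        System.out.println(nodes(0, 3, node2part, alive, "OWNING", "MOVING")); // only n1; n2 joined later
    }
}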
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method checkEvictions.
/**
 * Finds local partitions which don't belong to affinity and runs eviction process for such partitions.
 *
 * @param updateSeq Update sequence.
 * @param aff Affinity assignments.
 * @return {@code True} if there are local partitions that need to be evicted.
 */
private boolean checkEvictions(long updateSeq, AffinityAssignment aff) {
    assert lock.isWriteLockedByCurrentThread();

    if (!ctx.kernalContext().state().evictionsAllowed())
        return false;

    boolean hasEvictedPartitions = false;

    UUID locId = ctx.localNodeId();

    for (int p = 0; p < locParts.length(); p++) {
        GridDhtLocalPartition part = locParts.get(p);

        if (part == null || !part.state().active())
            continue;

        List<ClusterNode> affNodes = aff.get(p);

        // This node is affinity node for partition, no need to run eviction.
        if (affNodes.contains(ctx.localNode()))
            continue;

        List<ClusterNode> nodes = nodes(p, aff.topologyVersion(), OWNING);
        Collection<UUID> nodeIds = F.nodeIds(nodes);

        // If all affinity nodes are owners, then evict partition from local node.
        if (nodeIds.containsAll(F.nodeIds(affNodes))) {
            GridDhtPartitionState state0 = part.state();

            part.rent();

            updateSeq = updateLocal(part.id(), part.state(), updateSeq, aff.topologyVersion());

            boolean stateChanged = state0 != part.state();

            hasEvictedPartitions |= stateChanged;

            if (stateChanged && log.isDebugEnabled()) {
                log.debug("Partition has been scheduled for eviction (all affinity nodes are owners) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + part.id() + ", prevState=" + state0 + ", state=" + part.state() + "]");
            }
        }
        else {
            int ownerCnt = nodeIds.size();
            int affCnt = affNodes.size();

            if (ownerCnt > affCnt) {
                // Sort by node orders in ascending order.
                Collections.sort(nodes, CU.nodeComparator(true));

                int diff = nodes.size() - affCnt;

                for (int i = 0; i < diff; i++) {
                    ClusterNode n = nodes.get(i);

                    if (locId.equals(n.id())) {
                        GridDhtPartitionState state0 = part.state();

                        part.rent();

                        updateSeq = updateLocal(part.id(), part.state(), updateSeq, aff.topologyVersion());

                        boolean stateChanged = state0 != part.state();

                        hasEvictedPartitions |= stateChanged;

                        if (stateChanged && log.isDebugEnabled()) {
                            log.debug("Partition has been scheduled for eviction (this node is oldest non-affinity node) " + "[grp=" + grp.cacheOrGroupName() + ", p=" + part.id() + ", prevState=" + state0 + ", state=" + part.state() + "]");
                        }

                        break;
                    }
                }
            }
        }
    }

    return hasEvictedPartitions;
}
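In the surplus-owner branch, the owners are sorted by ascending node order and the first ownerCnt - affCnt of them are the candidates that should evict; each node runs this check locally and only rents its own copy if it falls into that surplus. A small standalone sketch of the selection (illustrative types; CU.nodeComparator(true) is approximated here by sorting on node order):

import java.util.*;

public class SurplusOwnerSelection {
    record Node(UUID id, long order) {}

    /** Returns the owners that should evict the partition so that only affCnt owners remain. */
    static List<Node> ownersToEvict(List<Node> owners, int affCnt) {
        if (owners.size() <= affCnt)
            return List.of();

        List<Node> sorted = new ArrayList<>(owners);

        // Ascending node order, i.e. the oldest nodes come first.
        sorted.sort(Comparator.comparingLong(Node::order));

        return sorted.subList(0, owners.size() - affCnt);
    }

    public static void main(String[] args) {
        Node a = new Node(UUID.randomUUID(), 1);
        Node b = new Node(UUID.randomUUID(), 2);
        Node c = new Node(UUID.randomUUID(), 3);

        // Three owners, affinity wants two: the oldest owner (order 1) falls into the surplus and evicts.
        System.out.println(ownersToEvict(List.of(a, b, c), 2));
    }
}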