Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionTopology in project ignite by apache.
The class CacheAffinitySharedManager, method initAffinityBasedOnPartitionsAvailability.
/**
* Initializes the current affinity assignment based on partitions availability. Nodes that have the most recent
* data are considered affinity nodes.
*
* @param topVer Topology version.
* @param fut Exchange future.
* @param c Closure that converts cluster nodes in the affinity diff.
* @param initAff {@code True} if affinity needs to be initialized.
* @return Affinity assignment for each registered cache group.
*/
private <T> Map<Integer, Map<Integer, List<T>>> initAffinityBasedOnPartitionsAvailability(
    final AffinityTopologyVersion topVer,
    final GridDhtPartitionsExchangeFuture fut,
    final IgniteClosure<ClusterNode, T> c,
    final boolean initAff) {
final boolean enforcedCentralizedAssignment = DiscoveryCustomEvent.requiresCentralizedAffinityAssignment(fut.firstEvent());
final WaitRebalanceInfo waitRebalanceInfo = enforcedCentralizedAssignment
    ? new WaitRebalanceInfo(fut.exchangeId().topologyVersion())
    : new WaitRebalanceInfo(fut.context().events().lastServerEventVersion());
final Collection<ClusterNode> evtNodes = fut.context().events().discoveryCache().serverNodes();
final Map<Integer, Map<Integer, List<T>>> assignment = new ConcurrentHashMap<>();
forAllRegisteredCacheGroups(new IgniteInClosureX<CacheGroupDescriptor>() {
@Override
public void applyx(CacheGroupDescriptor desc) throws IgniteCheckedException {
CacheGroupHolder grpHolder = getOrCreateGroupHolder(topVer, desc);
if (!grpHolder.rebalanceEnabled || (fut.cacheGroupAddedOnExchange(desc.groupId(), desc.receivedFrom()) && !enforcedCentralizedAssignment))
return;
AffinityTopologyVersion affTopVer = grpHolder.affinity().lastVersion();
assert (affTopVer.topologyVersion() > 0 && !affTopVer.equals(topVer)) || enforcedCentralizedAssignment :
    "Invalid affinity version [last=" + affTopVer + ", futVer=" + topVer + ", grp=" + desc.cacheOrGroupName() + ']';
List<List<ClusterNode>> curAssignment = grpHolder.affinity().assignments(affTopVer);
List<List<ClusterNode>> newAssignment = grpHolder.affinity().idealAssignmentRaw();
assert newAssignment != null;
List<List<ClusterNode>> newAssignment0 = initAff ? new ArrayList<>(newAssignment) : null;
GridDhtPartitionTopology top = grpHolder.topology(fut.context().events().discoveryCache());
Map<Integer, List<T>> cacheAssignment = null;
for (int p = 0; p < newAssignment.size(); p++) {
List<ClusterNode> newNodes = newAssignment.get(p);
List<ClusterNode> curNodes = curAssignment.get(p);
assert evtNodes.containsAll(newNodes) : "Invalid new assignment [grp=" + grpHolder.aff.cacheOrGroupName() +
    ", nodes=" + newNodes + ", topVer=" + fut.context().events().discoveryCache().version() +
    ", evts=" + fut.context().events().events() + "]";
ClusterNode curPrimary = !curNodes.isEmpty() ? curNodes.get(0) : null;
ClusterNode newPrimary = !newNodes.isEmpty() ? newNodes.get(0) : null;
List<ClusterNode> newNodes0 = null;
assert newPrimary == null || evtNodes.contains(newPrimary) :
    "Invalid new primary [grp=" + desc.cacheOrGroupName() + ", node=" + newPrimary + ", topVer=" + topVer + ']';
List<ClusterNode> owners = top.owners(p, topVer);
// It is essential that the curPrimary node has the partition in OWNING state.
if (!owners.isEmpty() && !owners.contains(curPrimary))
curPrimary = owners.get(0);
// If the new assignment is empty, preserve current ownership for alive nodes.
if (curPrimary != null && newPrimary == null) {
newNodes0 = new ArrayList<>(curNodes.size());
for (ClusterNode node : curNodes) {
if (evtNodes.contains(node))
newNodes0.add(node);
}
} else if (curPrimary != null && !curPrimary.equals(newPrimary)) {
GridDhtPartitionState state = top.partitionState(newPrimary.id(), p);
if (evtNodes.contains(curPrimary)) {
if (state != OWNING) {
newNodes0 = latePrimaryAssignment(grpHolder.affinity(), p, curPrimary, newNodes, waitRebalanceInfo);
}
} else {
if (state != OWNING) {
for (int i = 1; i < curNodes.size(); i++) {
ClusterNode curNode = curNodes.get(i);
if (top.partitionState(curNode.id(), p) == OWNING && evtNodes.contains(curNode)) {
newNodes0 = latePrimaryAssignment(grpHolder.affinity(), p, curNode, newNodes, waitRebalanceInfo);
break;
}
}
if (newNodes0 == null) {
for (ClusterNode owner : owners) {
if (evtNodes.contains(owner)) {
newNodes0 = latePrimaryAssignment(grpHolder.affinity(), p, owner, newNodes, waitRebalanceInfo);
break;
}
}
}
}
}
}
// This will happen if no primary has changed but some backups still need to be rebalanced.
if (!owners.isEmpty() && !owners.containsAll(newNodes) && !top.lostPartitions().contains(p))
waitRebalanceInfo.add(grpHolder.groupId(), p, newNodes);
if (newNodes0 != null) {
assert evtNodes.containsAll(newNodes0) : "Invalid late assignment [grp=" + grpHolder.aff.cacheOrGroupName() +
    ", nodes=" + newNodes + ", topVer=" + fut.context().events().discoveryCache().version() +
    ", evts=" + fut.context().events().events() + "]";
if (newAssignment0 != null)
newAssignment0.set(p, newNodes0);
if (cacheAssignment == null)
cacheAssignment = new HashMap<>();
List<T> n = new ArrayList<>(newNodes0.size());
for (int i = 0; i < newNodes0.size(); i++)
    n.add(c.apply(newNodes0.get(i)));
cacheAssignment.put(p, n);
}
}
if (cacheAssignment != null)
assignment.put(grpHolder.groupId(), cacheAssignment);
if (initAff)
grpHolder.affinity().initialize(topVer, newAssignment0);
fut.timeBag().finishLocalStage("Affinity recalculation (partitions availability) [grp=" + desc.cacheOrGroupName() + "]");
}
});
if (log.isDebugEnabled()) {
log.debug("Computed new affinity after node left [topVer=" + topVer + ", waitGrps=" + groupNames(waitRebalanceInfo.waitGrps.keySet()) + ']');
}
synchronized (mux) {
waitInfo = !waitRebalanceInfo.empty() ? waitRebalanceInfo : null;
}
return assignment;
}
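The key idea above is late affinity assignment: if the ideal primary for a partition does not yet own it, the current owner is kept as primary until rebalancing completes. Below is a minimal standalone sketch of that decision, using plain collections instead of Ignite's internal types; the class and method names are hypothetical, not Ignite API.
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
/** Simplified sketch of the late-primary decision for a single partition (hypothetical helper). */
public class LatePrimarySketch {
    /**
     * @param idealNodes Ideal affinity order for the partition (first element is the ideal primary).
     * @param curPrimary Node that currently serves as primary, or {@code null} if none.
     * @param owners Nodes that currently have the partition in OWNING state.
     * @return Assignment to use until the ideal primary finishes rebalancing.
     */
    static List<String> lateAssignment(List<String> idealNodes, String curPrimary, Set<String> owners) {
        String idealPrimary = idealNodes.isEmpty() ? null : idealNodes.get(0);
        // If the ideal primary already owns the partition (or there is nothing to preserve), use the ideal assignment.
        if (idealPrimary == null || curPrimary == null || owners.contains(idealPrimary))
            return idealNodes;
        // Otherwise keep the current owner as primary and append the ideal nodes as backups.
        List<String> res = new ArrayList<>();
        res.add(curPrimary);
        for (String node : idealNodes) {
            if (!node.equals(curPrimary))
                res.add(node);
        }
        return res;
    }
}
The real method additionally records such partitions in waitRebalanceInfo so that affinity can be switched to the ideal assignment once rebalancing finishes.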
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionTopology in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method distributedExchange.
/**
* @throws IgniteCheckedException If failed.
*/
private void distributedExchange() throws IgniteCheckedException {
assert crd != null;
assert !cctx.kernalContext().clientNode();
for (CacheGroupContext grp : cctx.cache().cacheGroups()) {
if (grp.isLocal())
continue;
cctx.exchange().exchangerBlockingSectionBegin();
try {
grp.preloader().onTopologyChanged(this);
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
timeBag.finishGlobalStage("Preloading notification");
// Skipping wait on local join is available when all cluster nodes have the same protocol.
boolean skipWaitOnLocalJoin = localJoinExchange() && cctx.exchange().latch().canSkipJoiningNodes(initialVersion());
if (context().exchangeFreeSwitch() && isBaselineNodeFailed()) {
// MVCC currently does not support operations on a partially switched cluster.
if (cctx.kernalContext().coordinators().mvccEnabled())
waitPartitionRelease(EXCHANGE_FREE_LATCH_ID, true, false);
else
waitPartitionRelease(null, false, false);
} else if (!skipWaitOnLocalJoin) {
// Skip partition release if the node has locally joined (it doesn't have any updates to finish).
boolean distributed = true;
// Do not perform distributed partition release in case of cluster activation.
if (activateCluster())
distributed = false;
// In the first phase we wait for all local tx updates, atomic updates and lock releases to finish on all nodes.
waitPartitionRelease(EXCHANGE_LATCH_ID, distributed, true);
// The second phase waits for any tx updates from primary to backup nodes that remain after the first phase.
if (distributed)
waitPartitionRelease(EXCHANGE_LATCH_ID, false, false);
} else {
if (log.isInfoEnabled())
log.info("Skipped waiting for partitions release future (local node is joining) [topVer=" + initialVersion() + "]");
}
boolean topChanged = firstDiscoEvt.type() != EVT_DISCOVERY_CUSTOM_EVT || affChangeMsg != null;
for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
if (cacheCtx.isLocal() || cacheStopping(cacheCtx.cacheId()))
continue;
if (topChanged) {
// Partition release future is done so we can flush the write-behind store.
cctx.exchange().exchangerBlockingSectionBegin();
try {
cacheCtx.store().forceFlush();
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
}
cctx.exchange().exchangerBlockingSectionBegin();
try {
/* It is necessary to run the database callback before all topology callbacks.
If the persistent store is enabled, we first restore the partitions present on disk.
We need to guarantee that no partition state changes are logged to the WAL before this callback,
so that the last actual states are restored correctly. */
cctx.database().beforeExchange(this);
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
// Pre-create missing partitions using current affinity.
if (!exchCtx.mergeExchanges() && !exchCtx.exchangeFreeSwitch()) {
for (CacheGroupContext grp : cctx.cache().cacheGroups()) {
if (grp.isLocal() || cacheGroupStopping(grp.groupId()))
continue;
// Affinity may not be initialized yet if the node is joining the cluster.
if (grp.affinity().lastVersion().topologyVersion() > 0) {
cctx.exchange().exchangerBlockingSectionBegin();
try {
grp.topology().beforeExchange(this, !centralizedAff && !forceAffReassignment, false);
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
}
}
// After all partitions have been restored and pre-created it's safe to make first checkpoint.
if (localJoinExchange() || activateCluster()) {
cctx.exchange().exchangerBlockingSectionBegin();
try {
cctx.database().onStateRestored(initialVersion());
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
timeBag.finishGlobalStage("After states restored callback");
cctx.exchange().exchangerBlockingSectionBegin();
try {
cctx.database().releaseHistoryForPreloading();
// To rebalance correctly when persistence is enabled, history must be reserved within the exchange.
partHistReserved = cctx.database().reserveHistoryForExchange();
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
clearingPartitions = new HashMap<>();
timeBag.finishGlobalStage("WAL history reservation");
changeWalModeIfNeeded();
if (events().hasServerLeft())
finalizePartitionCounters();
cctx.exchange().exchangerBlockingSectionBegin();
try {
if (context().exchangeFreeSwitch()) {
// Update local maps, see CachePartitionLossWithRestartsTest.
doInParallel(
    U.availableThreadCount(cctx.kernalContext(), GridIoPolicy.SYSTEM_POOL, 2),
    cctx.kernalContext().pools().getSystemExecutorService(),
    cctx.affinity().cacheGroups().values(),
    desc -> {
if (desc.config().getCacheMode() == CacheMode.LOCAL)
return null;
CacheGroupContext grp = cctx.cache().cacheGroup(desc.groupId());
GridDhtPartitionTopology top = grp != null ? grp.topology() : cctx.exchange().clientTopology(desc.groupId(), events().discoveryCache());
// Not expecting new moving partitions.
top.beforeExchange(this, true, false);
return null;
});
} else {
if (crd.isLocal()) {
if (remaining.isEmpty()) {
initFut.onDone(true);
onAllReceived(null);
}
} else
sendPartitions(crd);
initDone();
}
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
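Nearly every potentially long operation in distributedExchange() is bracketed by exchangerBlockingSectionBegin()/exchangerBlockingSectionEnd(), which appears to mark stretches where the exchange worker may legitimately block (waiting on latches, flushing stores, checkpointing). The repeated try/finally discipline could be captured by a tiny helper; the sketch below is hypothetical and not part of Ignite's API.
/** Hypothetical helper that guarantees paired begin/end markers around a task. */
public final class BlockingSections {
    private BlockingSections() {
        // No instances.
    }
    /** Runs {@code task} between {@code begin} and {@code end}, always invoking {@code end}. */
    public static void run(Runnable begin, Runnable end, Runnable task) {
        begin.run();
        try {
            task.run();
        }
        finally {
            end.run();
        }
    }
}
The begin/end method references of the exchange manager would be passed as the markers; tasks that throw checked exceptions would need a variant accepting a throwing functional interface.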
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionTopology in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method updatePartitionFullMap.
/**
* Updates the partition map in all caches.
*
* @param resTopVer Result topology version.
* @param msg Partitions full message.
*/
private void updatePartitionFullMap(AffinityTopologyVersion resTopVer, GridDhtPartitionsFullMessage msg) {
cctx.versions().onExchange(msg.lastVersion().order());
assert partHistSuppliers.isEmpty();
partHistSuppliers.putAll(msg.partitionHistorySuppliers());
// Reserve at least 2 threads for system operations.
int parallelismLvl = U.availableThreadCount(cctx.kernalContext(), GridIoPolicy.SYSTEM_POOL, 2);
try {
Map<Integer, Map<Integer, Long>> partsSizes = msg.partitionSizes(cctx);
doInParallel(
    parallelismLvl,
    cctx.kernalContext().pools().getSystemExecutorService(),
    msg.partitions().keySet(),
    grpId -> {
CacheGroupContext grp = cctx.cache().cacheGroup(grpId);
if (grp != null) {
CachePartitionFullCountersMap cntrMap = msg.partitionUpdateCounters(grpId, grp.topology().partitions());
grp.topology().update(
    resTopVer,
    msg.partitions().get(grpId),
    cntrMap,
    msg.partsToReload(cctx.localNodeId(), grpId),
    partsSizes.getOrDefault(grpId, Collections.emptyMap()),
    null,
    this,
    msg.lostPartitions(grpId));
} else {
GridDhtPartitionTopology top = cctx.exchange().clientTopology(grpId, events().discoveryCache());
CachePartitionFullCountersMap cntrMap = msg.partitionUpdateCounters(grpId, top.partitions());
top.update(
    resTopVer,
    msg.partitions().get(grpId),
    cntrMap,
    Collections.emptySet(),
    null,
    null,
    this,
    msg.lostPartitions(grpId));
}
return null;
});
} catch (IgniteCheckedException e) {
throw new IgniteException(e);
}
timeBag.finishGlobalStage("Full map updating");
}
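doInParallel is an Ignite-internal utility that splits work across the system pool; here it applies the full partition map per cache group. A minimal standalone equivalent built on a standard ExecutorService might look like the following sketch (hypothetical helper, simplified error handling).
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.function.Consumer;
/** Minimal standalone sketch of the parallel per-group processing pattern (not Ignite's doInParallel). */
public class ParallelForEach {
    static <T> void forEachInParallel(ExecutorService exec, Collection<T> items, Consumer<T> task)
        throws InterruptedException, ExecutionException {
        List<Future<?>> futs = new ArrayList<>(items.size());
        // Submit one task per item, e.g. per cache group id.
        for (T item : items)
            futs.add(exec.submit(() -> task.accept(item)));
        // Wait for completion and propagate the first failure.
        for (Future<?> fut : futs)
            fut.get();
    }
}
The real code additionally distinguishes started cache groups (grp.topology()) from groups tracked only through a client topology (cctx.exchange().clientTopology(...)).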
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionTopology in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method assignPartitionStates.
/**
* Collects update counters and determines new owners of partitions across all nodes for the given {@code top}.
*
* @param top Topology to assign.
* @param resetOwners True if partition states should be reset based on update counters, false otherwise.
* @return Partitions supply info list.
*/
private List<SupplyPartitionInfo> assignPartitionStates(GridDhtPartitionTopology top, boolean resetOwners) {
Map<Integer, CounterWithNodes> maxCntrs = new HashMap<>();
Map<Integer, TreeSet<Long>> varCntrs = new HashMap<>();
for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : msgs.entrySet()) {
CachePartitionPartialCountersMap nodeCntrs = e.getValue().partitionUpdateCounters(top.groupId(), top.partitions());
assert nodeCntrs != null;
for (int i = 0; i < nodeCntrs.size(); i++) {
int p = nodeCntrs.partitionAt(i);
UUID remoteNodeId = e.getKey();
GridDhtPartitionState state = top.partitionState(remoteNodeId, p);
if (state != GridDhtPartitionState.OWNING && state != GridDhtPartitionState.MOVING)
continue;
long cntr = state == GridDhtPartitionState.MOVING ? nodeCntrs.initialUpdateCounterAt(i) : nodeCntrs.updateCounterAt(i);
varCntrs.computeIfAbsent(p, key -> new TreeSet<>()).add(cntr);
if (state != GridDhtPartitionState.OWNING)
continue;
CounterWithNodes maxCntr = maxCntrs.get(p);
if (maxCntr == null || cntr > maxCntr.cnt)
maxCntrs.put(p, new CounterWithNodes(cntr, e.getValue().partitionSizes(top.groupId()).get(p), remoteNodeId));
else if (cntr == maxCntr.cnt)
maxCntr.nodes.add(remoteNodeId);
}
}
// Also must process counters from the local node.
for (GridDhtLocalPartition part : top.currentLocalPartitions()) {
GridDhtPartitionState state = top.partitionState(cctx.localNodeId(), part.id());
if (state != GridDhtPartitionState.OWNING && state != GridDhtPartitionState.MOVING)
continue;
final long cntr = state == GridDhtPartitionState.MOVING ? part.initialUpdateCounter() : part.updateCounter();
varCntrs.computeIfAbsent(part.id(), key -> new TreeSet<>()).add(cntr);
if (state != GridDhtPartitionState.OWNING)
continue;
CounterWithNodes maxCntr = maxCntrs.get(part.id());
if (maxCntr == null && cntr == 0) {
CounterWithNodes cntrObj = new CounterWithNodes(0, 0L, cctx.localNodeId());
for (UUID nodeId : msgs.keySet()) {
if (top.partitionState(nodeId, part.id()) == GridDhtPartitionState.OWNING)
cntrObj.nodes.add(nodeId);
}
maxCntrs.put(part.id(), cntrObj);
} else if (maxCntr == null || cntr > maxCntr.cnt)
maxCntrs.put(part.id(), new CounterWithNodes(cntr, part.fullSize(), cctx.localNodeId()));
else if (cntr == maxCntr.cnt)
maxCntr.nodes.add(cctx.localNodeId());
}
Set<Integer> haveHistory = new HashSet<>();
List<SupplyPartitionInfo> list = assignHistoricalSuppliers(top, maxCntrs, varCntrs, haveHistory);
if (resetOwners)
resetOwnersByCounter(top, maxCntrs, haveHistory);
return list;
}
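The counter bookkeeping above keeps, per partition, the maximum update counter seen among OWNING nodes together with all nodes that reported it. A simplified standalone sketch of that selection for a single partition follows (hypothetical helper and types).
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.UUID;
/** Simplified sketch of choosing partition owners by the highest update counter. */
public class MaxCounterOwners {
    /**
     * @param cntrsByNode Update counter reported by each node that has the partition in OWNING state.
     * @return Nodes that reported the maximum counter; other owners would need to rebalance from them.
     */
    static Set<UUID> ownersByMaxCounter(Map<UUID, Long> cntrsByNode) {
        long max = Long.MIN_VALUE;
        Set<UUID> owners = new HashSet<>();
        for (Map.Entry<UUID, Long> e : cntrsByNode.entrySet()) {
            long cntr = e.getValue();
            if (cntr > max) {
                max = cntr;
                owners.clear();
                owners.add(e.getKey());
            }
            else if (cntr == max)
                owners.add(e.getKey());
        }
        return owners;
    }
}
In the real method, MOVING partitions contribute their initial counters to varCntrs but not to maxCntrs, and resetOwnersByCounter(...) is applied only when resetOwners is true.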
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionTopology in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method updateTopologies.
/**
* Updates topology versions and discovery caches on all topologies.
*
* @param crd Coordinator flag.
* @throws IgniteCheckedException If failed.
*/
private void updateTopologies(boolean crd) throws IgniteCheckedException {
for (CacheGroupContext grp : cctx.cache().cacheGroups()) {
if (grp.isLocal())
continue;
GridClientPartitionTopology clientTop = cctx.exchange().clearClientTopology(grp.groupId());
long updSeq = clientTop == null ? -1 : clientTop.lastUpdateSequence();
GridDhtPartitionTopology top = grp.topology();
if (crd) {
boolean updateTop = exchId.topologyVersion().equals(grp.localStartVersion());
if (updateTop && clientTop != null) {
cctx.exchange().exchangerBlockingSectionBegin();
try {
top.update(null, clientTop.partitionMap(true), clientTop.fullUpdateCounters(), emptySet(), null, null, null, clientTop.lostPartitions());
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
}
cctx.exchange().exchangerBlockingSectionBegin();
try {
top.updateTopologyVersion(this, events().discoveryCache(), updSeq, cacheGroupStopping(grp.groupId()));
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
cctx.exchange().exchangerBlockingSectionBegin();
try {
for (GridClientPartitionTopology top : cctx.exchange().clientTopologies()) {
top.updateTopologyVersion(this, events().discoveryCache(), -1, cacheGroupStopping(top.groupId()));
}
} finally {
cctx.exchange().exchangerBlockingSectionEnd();
}
}
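On the coordinator, a cache group whose local start version equals the exchange version has its server topology seeded from the client topology that tracked the group so far, and the client topology's last update sequence is carried into updateTopologyVersion (falling back to -1 when there is none). A compact sketch of that decision, with plain booleans standing in for the Ignite types (hypothetical names):
/** Compact sketch of the coordinator-side decision to seed a new server topology from a client topology. */
public class SeedDecision {
    /**
     * @param isCoordinator Whether the local node coordinates the exchange.
     * @param groupStartsOnThisExchange Whether the group's local start version equals the exchange version.
     * @param clientTopologyExists Whether a client-side topology was tracking the group before it started.
     * @return {@code true} if the server topology should be initialized from the client topology state.
     */
    static boolean seedFromClientTopology(boolean isCoordinator, boolean groupStartsOnThisExchange, boolean clientTopologyExists) {
        return isCoordinator && groupStartsOnThisExchange && clientTopologyExists;
    }
    /** Update sequence to pass along: the client topology's last sequence, or -1 when there is none to inherit. */
    static long updateSequence(Long clientTopologyUpdateSeq) {
        return clientTopologyUpdateSeq == null ? -1 : clientTopologyUpdateSeq;
    }
}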