Use of org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method finishExchangeOnCoordinator.
/**
 * @param sndResNodes Additional nodes to send finish message to.
 */
private void finishExchangeOnCoordinator(@Nullable Collection<ClusterNode> sndResNodes) {
    if (isDone() || !enterBusy())
        return;

    try {
        if (!F.isEmpty(exchangeGlobalExceptions) && dynamicCacheStartExchange() && isRollbackSupported()) {
            sendExchangeFailureMessage();

            return;
        }

        AffinityTopologyVersion resTopVer = exchCtx.events().topologyVersion();

        if (log.isInfoEnabled()) {
            log.info("finishExchangeOnCoordinator [topVer=" + initialVersion() + ", resVer=" + resTopVer + ']');
        }

        Map<Integer, CacheGroupAffinityMessage> idealAffDiff = null;

        // Reserve at least 2 threads for system operations.
        int parallelismLvl = U.availableThreadCount(cctx.kernalContext(), GridIoPolicy.SYSTEM_POOL, 2);

        if (exchCtx.mergeExchanges()) {
            synchronized (mux) {
                if (mergedJoinExchMsgs != null) {
                    for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : mergedJoinExchMsgs.entrySet()) {
                        msgs.put(e.getKey(), e.getValue());

                        updatePartitionSingleMap(e.getKey(), e.getValue());
                    }
                }
            }

            assert exchCtx.events().hasServerJoin() || exchCtx.events().hasServerLeft();

            exchCtx.events().processEvents(this);

            if (exchCtx.events().hasServerLeft())
                idealAffDiff = cctx.affinity().onServerLeftWithExchangeMergeProtocol(this);
            else
                cctx.affinity().onServerJoinWithExchangeMergeProtocol(this, true);

            doInParallel(
                parallelismLvl,
                cctx.kernalContext().pools().getSystemExecutorService(),
                cctx.affinity().cacheGroups().values(),
                desc -> {
                    if (desc.config().getCacheMode() == CacheMode.LOCAL)
                        return null;

                    CacheGroupContext grp = cctx.cache().cacheGroup(desc.groupId());

                    GridDhtPartitionTopology top = grp != null
                        ? grp.topology()
                        : cctx.exchange().clientTopology(desc.groupId(), events().discoveryCache());

                    top.beforeExchange(this, true, true);

                    return null;
                });
        }

        span.addLog(() -> "Affinity recalculation (crd)");

        timeBag.finishGlobalStage("Affinity recalculation (crd)");

        Map<Integer, CacheGroupAffinityMessage> joinedNodeAff = new ConcurrentHashMap<>(cctx.cache().cacheGroups().size());

        doInParallel(
            parallelismLvl,
            cctx.kernalContext().pools().getSystemExecutorService(),
            msgs.values(),
            msg -> {
                processSingleMessageOnCrdFinish(msg, joinedNodeAff);

                return null;
            });

        timeBag.finishGlobalStage("Collect update counters and create affinity messages");

        if (firstDiscoEvt.type() == EVT_DISCOVERY_CUSTOM_EVT) {
            assert firstDiscoEvt instanceof DiscoveryCustomEvent;

            if (activateCluster() || changedBaseline())
                assignPartitionsStates(null);

            DiscoveryCustomMessage discoveryCustomMessage = ((DiscoveryCustomEvent) firstDiscoEvt).customMessage();

            if (discoveryCustomMessage instanceof DynamicCacheChangeBatch) {
                if (exchActions != null) {
                    Set<String> caches = exchActions.cachesToResetLostPartitions();

                    if (!F.isEmpty(caches))
                        resetLostPartitions(caches);

                    Set<Integer> cacheGroupsToResetOwners = concat(
                        exchActions.cacheGroupsToStart().stream().map(grp -> grp.descriptor().groupId()),
                        exchActions.cachesToResetLostPartitions().stream().map(CU::cacheId)
                    ).collect(Collectors.toSet());

                    assignPartitionsStates(cacheGroupsToResetOwners);
                }
            }
            else if (discoveryCustomMessage instanceof SnapshotDiscoveryMessage
                && ((SnapshotDiscoveryMessage) discoveryCustomMessage).needAssignPartitions()) {
                markAffinityReassign();

                assignPartitionsStates(null);
            }
        }
        else if (exchCtx.events().hasServerJoin())
            assignPartitionsStates(null);
        else if (exchCtx.events().hasServerLeft())
            assignPartitionsStates(emptySet());

        // Validation should happen after resetting owners to avoid false desync reporting.
        validatePartitionsState();

        // Recalculate new affinity based on partitions availability.
        if (!exchCtx.mergeExchanges() && forceAffReassignment) {
            idealAffDiff = cctx.affinity().onCustomEventWithEnforcedAffinityReassignment(this);

            timeBag.finishGlobalStage("Ideal affinity diff calculation (enforced)");
        }

        for (CacheGroupContext grpCtx : cctx.cache().cacheGroups()) {
            if (!grpCtx.isLocal())
                grpCtx.topology().applyUpdateCounters();
        }

        timeBag.finishGlobalStage("Apply update counters");

        updateLastVersion(cctx.versions().last());

        cctx.versions().onExchange(lastVer.get().order());

        IgniteProductVersion minVer = exchCtx.events().discoveryCache().minimumNodeVersion();

        GridDhtPartitionsFullMessage msg = createPartitionsMessage(true,
            minVer.compareToIgnoreTimestamp(PARTIAL_COUNTERS_MAP_SINCE) >= 0);

        if (!cctx.affinity().rebalanceRequired() && !deactivateCluster())
            msg.rebalanced(true);

        if (exchCtx.mergeExchanges()) {
            assert !centralizedAff;

            msg.resultTopologyVersion(resTopVer);

            if (exchCtx.events().hasServerLeft())
                msg.idealAffinityDiff(idealAffDiff);
        }
        else if (forceAffReassignment)
            msg.idealAffinityDiff(idealAffDiff);

        msg.prepareMarshal(cctx);

        timeBag.finishGlobalStage("Full message preparing");

        synchronized (mux) {
            finishState = new FinishState(crd.id(), resTopVer, msg);

            state = ExchangeLocalState.DONE;
        }

        if (centralizedAff) {
            assert !exchCtx.mergeExchanges();

            IgniteInternalFuture<Map<Integer, Map<Integer, List<UUID>>>> fut = cctx.affinity().initAffinityOnNodeLeft(this);

            if (!fut.isDone())
                fut.listen(this::onAffinityInitialized);
            else
                onAffinityInitialized(fut);
        }
        else {
            Set<ClusterNode> nodes;

            Map<UUID, GridDhtPartitionsSingleMessage> mergedJoinExchMsgs0;

            synchronized (mux) {
                srvNodes.remove(cctx.localNode());

                nodes = new LinkedHashSet<>(srvNodes);

                mergedJoinExchMsgs0 = mergedJoinExchMsgs;

                if (mergedJoinExchMsgs != null) {
                    for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : mergedJoinExchMsgs.entrySet()) {
                        if (e.getValue() != null) {
                            ClusterNode node = cctx.discovery().node(e.getKey());

                            if (node != null)
                                nodes.add(node);
                        }
                    }
                }
                else
                    mergedJoinExchMsgs0 = Collections.emptyMap();

                if (!F.isEmpty(sndResNodes))
                    nodes.addAll(sndResNodes);
            }

            if (msg.rebalanced())
                markRebalanced();

            if (!nodes.isEmpty())
                sendAllPartitions(msg, nodes, mergedJoinExchMsgs0, joinedNodeAff);

            timeBag.finishGlobalStage("Full message sending");

            discoveryLag = calculateDiscoveryLag(msgs, mergedJoinExchMsgs0);

            if (!stateChangeExchange())
                onDone(exchCtx.events().topologyVersion(), null);

            for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : pendingSingleMsgs.entrySet()) {
                if (log.isInfoEnabled()) {
                    log.info("Process pending message on coordinator [node=" + e.getKey() +
                        ", ver=" + initialVersion() + ", resVer=" + resTopVer + ']');
                }

                processSingleMessage(e.getKey(), e.getValue());
            }
        }

        if (stateChangeExchange()) {
            StateChangeRequest req = exchActions.stateChangeRequest();

            assert req != null : exchActions;

            boolean stateChangeErr = false;

            if (!F.isEmpty(exchangeGlobalExceptions)) {
                stateChangeErr = true;

                cctx.kernalContext().state().onStateChangeError(exchangeGlobalExceptions, req);
            }
            else {
                boolean hasMoving = !partsToReload.isEmpty();

                Set<Integer> waitGrps = cctx.affinity().waitGroups();

                if (!hasMoving) {
                    for (CacheGroupContext grpCtx : cctx.cache().cacheGroups()) {
                        if (waitGrps.contains(grpCtx.groupId()) && grpCtx.topology().hasMovingPartitions()) {
                            hasMoving = true;

                            break;
                        }
                    }
                }

                cctx.kernalContext().state().onExchangeFinishedOnCoordinator(this, hasMoving);
            }

            if (!cctx.kernalContext().state().clusterState().localBaselineAutoAdjustment()) {
                ClusterState state = stateChangeErr ? ClusterState.INACTIVE : req.state();

                ChangeGlobalStateFinishMessage stateFinishMsg =
                    new ChangeGlobalStateFinishMessage(req.requestId(), state, !stateChangeErr);

                cctx.discovery().sendCustomEvent(stateFinishMsg);
            }

            timeBag.finishGlobalStage("State finish message sending");

            if (!centralizedAff)
                onDone(exchCtx.events().topologyVersion(), null);
        }

        // Try switch late affinity right now if an exchange has been completed normally.
        if (!centralizedAff && isDone() && error() == null && !cctx.kernalContext().isStopping())
            cctx.exchange().checkRebalanceState();
    }
    catch (IgniteCheckedException e) {
        if (reconnectOnError(e))
            onDone(new IgniteNeedReconnectException(cctx.localNode(), e));
        else
            onDone(e);
    }
    finally {
        leaveBusy();
    }
}
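Note how finishExchangeOnCoordinator fans the per-cache-group beforeExchange calls and the per-node single-message processing out to the system pool through doInParallel, with the parallelism level capped by U.availableThreadCount so a couple of system-pool threads stay free for other work. The helper below is only a rough sketch of that batching idea written against plain java.util.concurrent; the method name, the round-robin batching, and the error handling are assumptions for illustration, not Ignite's internal U.doInParallel.

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.function.Function;

public final class ParallelApplySketch {
    /** Applies {@code task} to every item on {@code exec}, occupying at most {@code parallelismLvl} threads. */
    public static <T, R> List<R> doInParallel(
        int parallelismLvl,
        ExecutorService exec,
        Collection<T> items,
        Function<? super T, ? extends R> task
    ) throws Exception {
        // Split the work into at most parallelismLvl batches so the pool is never saturated by this call.
        List<List<T>> batches = new ArrayList<>();

        for (int i = 0; i < parallelismLvl; i++)
            batches.add(new ArrayList<>());

        int i = 0;

        for (T item : items)
            batches.get(i++ % parallelismLvl).add(item);

        List<Future<List<R>>> futs = new ArrayList<>();

        for (List<T> batch : batches) {
            if (batch.isEmpty())
                continue;

            futs.add(exec.submit(() -> {
                List<R> batchRes = new ArrayList<>(batch.size());

                for (T item : batch)
                    batchRes.add(task.apply(item));

                return batchRes;
            }));
        }

        List<R> res = new ArrayList<>(items.size());

        // Wait for every batch; the first failed batch rethrows as ExecutionException.
        for (Future<List<R>> fut : futs)
            res.addAll(fut.get());

        return res;
    }
}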
Use of org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method onAllReceived.
/**
 * Called only for the coordinator node when all {@link GridDhtPartitionsSingleMessage}s have been received.
 *
 * @param sndResNodes Additional nodes to send finish message to.
 */
private void onAllReceived(@Nullable Collection<ClusterNode> sndResNodes) {
    try {
        initFut.get();

        span.addLog(() -> "Waiting for all single messages");

        timeBag.finishGlobalStage("Waiting for all single messages");

        assert crd.isLocal();
        assert partHistSuppliers.isEmpty() : partHistSuppliers;

        if (!exchCtx.mergeExchanges() && !crd.equals(events().discoveryCache().serverNodes().get(0))) {
            for (CacheGroupContext grp : cctx.cache().cacheGroups()) {
                if (grp.isLocal())
                    continue;

                // For example, dynamic cache start failed.
                if (grp.affinity().lastVersion().topologyVersion() > 0)
                    grp.topology().beforeExchange(this, !centralizedAff && !forceAffReassignment, false);
                else
                    assert exchangeLocE != null :
                        "Affinity is not calculated for the cache group [groupName=" + grp.name() + "]";
            }
        }

        if (exchCtx.mergeExchanges()) {
            if (log.isInfoEnabled())
                log.info("Coordinator received all messages, try merge [ver=" + initialVersion() + ']');

            AffinityTopologyVersion threshold = newCrdFut != null ? newCrdFut.resultTopologyVersion() : null;

            if (threshold != null) {
                assert newCrdFut.fullMessage() == null :
                    "There is full message in new coordinator future, but exchange was not finished using it: " +
                    newCrdFut.fullMessage();
            }

            boolean finish = cctx.exchange().mergeExchangesOnCoordinator(this, threshold);

            timeBag.finishGlobalStage("Exchanges merge");

            if (!finish)
                return;
        }

        finishExchangeOnCoordinator(sndResNodes);
    }
    catch (IgniteCheckedException e) {
        if (reconnectOnError(e))
            onDone(new IgniteNeedReconnectException(cctx.localNode(), e));
        else
            onDone(e);
    }
}
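Once onAllReceived hands off to finishExchangeOnCoordinator, the centralizedAff branch shown above handles the affinity-initialization future either inline (if it is already done) or through a listener. The snippet below is a generic sketch of that complete-or-listen idiom using a standard CompletableFuture; it illustrates the pattern only and is not the IgniteInternalFuture API.

import java.util.concurrent.CompletableFuture;
import java.util.function.Consumer;

final class CompleteOrListenSketch {
    /** Runs {@code handler} right away if {@code fut} has already finished, otherwise on completion. */
    static <T> void onCompleted(CompletableFuture<T> fut, Consumer<? super T> handler) {
        if (fut.isDone())
            handler.accept(fut.join()); // Already complete: handle synchronously on the calling thread.
        else
            fut.thenAccept(handler);    // Still running: handler is invoked by the completing thread.
    }
}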
Use of org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method beforeExchange.
/** {@inheritDoc} */
@Override public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut, boolean affReady, boolean updateMoving)
    throws IgniteCheckedException {
    ctx.database().checkpointReadLock();

    try {
        U.writeLock(lock);

        try {
            if (stopping)
                return;

            assert lastTopChangeVer.equals(exchFut.initialVersion()) : "Invalid topology version [topVer=" +
                lastTopChangeVer + ", exchId=" + exchFut.exchangeId() + ']';

            ExchangeDiscoveryEvents evts = exchFut.context().events();

            if (affReady) {
                assert grp.affinity().lastVersion().equals(evts.topologyVersion()) : "Invalid affinity version [" +
                    "grp=" + grp.cacheOrGroupName() +
                    ", affVer=" + grp.affinity().lastVersion() +
                    ", evtsVer=" + evts.topologyVersion() + ']';

                lastTopChangeVer = readyTopVer = evts.topologyVersion();

                discoCache = evts.discoveryCache();
            }

            if (log.isDebugEnabled()) {
                log.debug("Partition map beforeExchange [grp=" + grp.cacheOrGroupName() +
                    ", exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']');
            }

            long updateSeq = this.updateSeq.incrementAndGet();

            if (exchFut.exchangeType() == ALL && !exchFut.rebalanced())
                cntrMap.clear();

            initializeFullMap(updateSeq);

            boolean grpStarted = exchFut.cacheGroupAddedOnExchange(grp.groupId(), grp.receivedFrom());

            if (evts.hasServerLeft()) {
                for (DiscoveryEvent evt : evts.events()) {
                    if (ExchangeDiscoveryEvents.serverLeftEvent(evt))
                        removeNode(evt.eventNode().id());
                }
            }
            else if (affReady && grpStarted && exchFut.exchangeType() == NONE) {
                assert !exchFut.context().mergeExchanges() : exchFut;
                assert node2part != null && node2part.valid() : exchFut;

                // Initialize node maps if group was started from joining client.
                final List<ClusterNode> nodes = exchFut.firstEventCache().cacheGroupAffinityNodes(grp.groupId());

                for (ClusterNode node : nodes) {
                    if (!node2part.containsKey(node.id()) && ctx.discovery().alive(node)) {
                        final GridDhtPartitionMap partMap = new GridDhtPartitionMap(node.id(),
                            1L,
                            exchFut.initialVersion(),
                            new GridPartitionStateMap(),
                            false);

                        final AffinityAssignment aff = grp.affinity().cachedAffinity(exchFut.initialVersion());

                        for (Integer p0 : aff.primaryPartitions(node.id()))
                            partMap.put(p0, OWNING);

                        for (Integer p0 : aff.backupPartitions(node.id()))
                            partMap.put(p0, OWNING);

                        node2part.put(node.id(), partMap);
                    }
                }
            }

            if (grp.affinityNode()) {
                if (grpStarted || exchFut.firstEvent().type() == EVT_DISCOVERY_CUSTOM_EVT ||
                    exchFut.serverNodeDiscoveryEvent()) {
                    AffinityTopologyVersion affVer;
                    List<List<ClusterNode>> affAssignment;

                    if (affReady) {
                        affVer = evts.topologyVersion();

                        assert grp.affinity().lastVersion().equals(affVer) : "Invalid affinity [topVer=" +
                            grp.affinity().lastVersion() +
                            ", grp=" + grp.cacheOrGroupName() +
                            ", affVer=" + affVer +
                            ", fut=" + exchFut + ']';

                        affAssignment = grp.affinity().readyAssignments(affVer);
                    }
                    else {
                        assert !exchFut.context().mergeExchanges();

                        affVer = exchFut.initialVersion();

                        affAssignment = grp.affinity().idealAssignmentRaw();
                    }

                    initPartitions(affVer, affAssignment, exchFut, updateSeq);
                }
            }

            consistencyCheck();

            if (updateMoving) {
                assert grp.affinity().lastVersion().equals(evts.topologyVersion());

                createMovingPartitions(grp.affinity().readyAffinity(evts.topologyVersion()));
            }

            if (log.isDebugEnabled()) {
                log.debug("Partition map after beforeExchange [grp=" + grp.cacheOrGroupName() +
                    ", exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']');
            }

            if (log.isTraceEnabled()) {
                log.trace("Partition states after beforeExchange [grp=" + grp.cacheOrGroupName() +
                    ", exchId=" + exchFut.exchangeId() + ", states=" + dumpPartitionStates() + ']');
            }
        }
        finally {
            lock.writeLock().unlock();
        }
    }
    finally {
        ctx.database().checkpointReadUnlock();
    }
}
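Both beforeExchange and afterExchange follow the same locking discipline: the database checkpoint read lock is acquired first, then the topology write lock, and both are released in reverse order in finally blocks so a checkpoint can never observe a half-applied partition map. A minimal sketch of that nesting, with illustrative lock fields standing in for Ignite's internal checkpoint and topology locks, looks like this:

import java.util.concurrent.locks.ReentrantReadWriteLock;

class LockNestingSketch {
    // Stand-ins for the database checkpoint lock and the topology read-write lock.
    private final ReentrantReadWriteLock checkpointLock = new ReentrantReadWriteLock();
    private final ReentrantReadWriteLock topologyLock = new ReentrantReadWriteLock();

    void mutateTopology(Runnable mutation) {
        checkpointLock.readLock().lock(); // Many topology mutations may overlap, but no checkpoint can start.

        try {
            topologyLock.writeLock().lock(); // Exclusive access to the partition map itself.

            try {
                mutation.run();
            }
            finally {
                topologyLock.writeLock().unlock();
            }
        }
        finally {
            checkpointLock.readLock().unlock();
        }
    }
}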
Use of org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method afterExchange.
/** {@inheritDoc} */
@Override public boolean afterExchange(GridDhtPartitionsExchangeFuture exchFut) {
    boolean changed = false;

    int partitions = grp.affinity().partitions();

    AffinityTopologyVersion topVer = exchFut.context().events().topologyVersion();

    assert grp.affinity().lastVersion().equals(topVer) : "Affinity is not initialized " +
        "[grp=" + grp.cacheOrGroupName() +
        ", topVer=" + topVer +
        ", affVer=" + grp.affinity().lastVersion() +
        ", fut=" + exchFut + ']';

    ctx.database().checkpointReadLock();

    try {
        lock.writeLock().lock();

        try {
            if (stopping)
                return false;

            assert readyTopVer.initialized() : readyTopVer;
            assert lastTopChangeVer.equals(readyTopVer);

            if (log.isDebugEnabled()) {
                log.debug("Partition map before afterExchange [grp=" + grp.cacheOrGroupName() +
                    ", exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']');
            }

            if (log.isTraceEnabled()) {
                log.trace("Partition states before afterExchange [grp=" + grp.cacheOrGroupName() +
                    ", exchVer=" + exchFut.exchangeId() + ", states=" + dumpPartitionStates() + ']');
            }

            long updateSeq = this.updateSeq.incrementAndGet();

            // Skip partition updates in case of not real exchange.
            if (!ctx.localNode().isClient() && exchFut.exchangeType() == ALL) {
                for (int p = 0; p < partitions; p++) {
                    GridDhtLocalPartition locPart = localPartition0(p, topVer, false, true);

                    if (partitionLocalNode(p, topVer)) {
                        // Prepare partition to rebalance if it's not happened on full map update phase.
                        if (locPart == null || locPart.state() == RENTING || locPart.state() == EVICTED)
                            locPart = rebalancePartition(p, true, exchFut);

                        GridDhtPartitionState state = locPart.state();

                        if (state == MOVING) {
                            if (grp.rebalanceEnabled()) {
                                Collection<ClusterNode> owners = owners(p);

                                // then new exchange should be started with detecting lost partitions.
                                if (!F.isEmpty(owners)) {
                                    if (log.isDebugEnabled())
                                        log.debug("Will not own partition (there are owners to rebalance from) " +
                                            "[grp=" + grp.cacheOrGroupName() + ", p=" + p + ", owners = " + owners + ']');
                                }
                            }
                            else
                                updateSeq = updateLocal(p, locPart.state(), updateSeq, topVer);
                        }
                    }
                    else {
                        if (locPart != null) {
                            GridDhtPartitionState state = locPart.state();

                            if (state == MOVING) {
                                locPart.rent();

                                updateSeq = updateLocal(p, locPart.state(), updateSeq, topVer);

                                changed = true;

                                if (log.isDebugEnabled()) {
                                    log.debug("Evicting MOVING partition (it does not belong to affinity) [" +
                                        "grp=" + grp.cacheOrGroupName() + ", p=" + locPart.id() + ']');
                                }
                            }
                        }
                    }
                }
            }

            AffinityAssignment aff = grp.affinity().readyAffinity(topVer);

            if (node2part != null && node2part.valid())
                changed |= checkEvictions(updateSeq, aff);

            updateRebalanceVersion(aff.topologyVersion(), aff.assignment());

            consistencyCheck();

            if (log.isTraceEnabled()) {
                log.trace("Partition states after afterExchange [grp=" + grp.cacheOrGroupName() +
                    ", exchVer=" + exchFut.exchangeId() + ", states=" + dumpPartitionStates() + ']');
            }
        }
        finally {
            lock.writeLock().unlock();
        }
    }
    finally {
        ctx.database().checkpointReadUnlock();
    }

    return changed;
}
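The per-partition loop in afterExchange boils down to a small decision: a local partition left in MOVING state that is no longer assigned to this node by the ready affinity is rented (scheduled for eviction), while an assigned MOVING partition waits for rebalancing when there are remote owners to rebalance from. The snippet below is an assumed simplification of just that predicate; the real method additionally recreates RENTING/EVICTED partitions and maintains the update sequence.

final class MovingPartitionDecisionSketch {
    enum PartState { MOVING, OWNING, RENTING, EVICTED, LOST }

    /** True if a local partition should be rented: it is MOVING but the node no longer owns it by affinity. */
    static boolean shouldRent(boolean assignedByAffinity, PartState state) {
        return !assignedByAffinity && state == PartState.MOVING;
    }

    /** True if an assigned MOVING partition must wait for rebalancing instead of owning immediately. */
    static boolean mustRebalance(boolean assignedByAffinity, PartState state, int remoteOwners, boolean rebalanceEnabled) {
        return assignedByAffinity && state == PartState.MOVING && rebalanceEnabled && remoteOwners > 0;
    }
}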
Use of org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion in project ignite by apache.
The class GridClientPartitionTopology, method updateTopologyVersion.
/** {@inheritDoc} */
@Override public void updateTopologyVersion(GridDhtTopologyFuture exchFut,
    DiscoCache discoCache,
    long updSeq,
    boolean stopping) throws IgniteInterruptedCheckedException {
    U.writeLock(lock);

    try {
        AffinityTopologyVersion exchTopVer = exchFut.initialVersion();

        assert exchTopVer.compareTo(topVer) > 0 : "Invalid topology version [grp=" + grpId +
            ", topVer=" + topVer +
            ", exchVer=" + exchTopVer +
            ", discoCacheVer=" + (this.discoCache != null ? this.discoCache.version() : "None") +
            ", exchDiscoCacheVer=" + discoCache.version() + ']';

        this.stopping = stopping;

        topVer = exchTopVer;

        this.discoCache = discoCache;

        updateSeq.setIfGreater(updSeq);

        topReadyFut = exchFut;
    }
    finally {
        lock.writeLock().unlock();
    }
}
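updateTopologyVersion only ever moves the update sequence forward: updateSeq.setIfGreater(updSeq) raises the counter if, and only if, the supplied value is larger than the current one. Assuming those semantics for Ignite's internal counter, an equivalent with a plain AtomicLong is a one-liner:

import java.util.concurrent.atomic.AtomicLong;

final class SetIfGreaterSketch {
    private final AtomicLong updateSeq = new AtomicLong();

    /** Atomically raises the sequence to {@code candidate} if it exceeds the current value; returns the new value. */
    long setIfGreater(long candidate) {
        return updateSeq.accumulateAndGet(candidate, Math::max);
    }
}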