Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method onBecomeCoordinator.
/**
* @param newCrdFut Coordinator initialization future.
*/
private void onBecomeCoordinator(InitNewCoordinatorFuture newCrdFut) {
boolean allRcvd = false;
cctx.exchange().onCoordinatorInitialized();
if (newCrdFut.restoreState()) {
GridDhtPartitionsFullMessage fullMsg = newCrdFut.fullMessage();
assert msgs.isEmpty() : msgs;
if (fullMsg != null) {
if (log.isInfoEnabled()) {
log.info("New coordinator restored state [ver=" + initialVersion() + ", resVer=" + fullMsg.resultTopologyVersion() + ']');
}
synchronized (mux) {
state = ExchangeLocalState.DONE;
finishState = new FinishState(crd.id(), fullMsg.resultTopologyVersion(), fullMsg);
}
fullMsg.exchangeId(exchId);
processFullMessage(false, null, fullMsg);
Map<ClusterNode, GridDhtPartitionsSingleMessage> msgs = newCrdFut.messages();
if (!F.isEmpty(msgs)) {
Map<Integer, CacheGroupAffinityMessage> joinedNodeAff = new ConcurrentHashMap<>();
// Reserve at least 2 threads for system operations.
int parallelismLvl = U.availableThreadCount(cctx.kernalContext(), GridIoPolicy.SYSTEM_POOL, 2);
try {
U.doInParallel(parallelismLvl, cctx.kernalContext().pools().getSystemExecutorService(), msgs.entrySet(), entry -> {
this.msgs.put(entry.getKey().id(), entry.getValue());
GridDhtPartitionsSingleMessage msg = entry.getValue();
Collection<Integer> affReq = msg.cacheGroupsAffinityRequest();
if (!F.isEmpty(affReq)) {
CacheGroupAffinityMessage.createAffinityMessages(cctx, fullMsg.resultTopologyVersion(), affReq, joinedNodeAff);
}
return null;
});
} catch (IgniteCheckedException e) {
throw new IgniteException(e);
}
Map<UUID, GridDhtPartitionsSingleMessage> mergedJoins = newCrdFut.mergedJoinExchangeMessages();
if (log.isInfoEnabled()) {
log.info("New coordinator sends full message [ver=" + initialVersion() + ", resVer=" + fullMsg.resultTopologyVersion() + ", nodes=" + F.nodeIds(msgs.keySet()) + ", mergedJoins=" + (mergedJoins != null ? mergedJoins.keySet() : null) + ']');
}
sendAllPartitions(fullMsg, msgs.keySet(), mergedJoins, joinedNodeAff);
}
return;
} else {
if (log.isInfoEnabled())
log.info("New coordinator restore state finished [ver=" + initialVersion() + ']');
for (Map.Entry<ClusterNode, GridDhtPartitionsSingleMessage> e : newCrdFut.messages().entrySet()) {
GridDhtPartitionsSingleMessage msg = e.getValue();
if (!msg.client()) {
msgs.put(e.getKey().id(), e.getValue());
if (dynamicCacheStartExchange() && msg.getError() != null)
exchangeGlobalExceptions.put(e.getKey().id(), msg.getError());
updatePartitionSingleMap(e.getKey().id(), msg);
}
}
}
allRcvd = true;
synchronized (mux) {
// Do not process messages.
remaining.clear();
assert crd != null && crd.isLocal();
state = ExchangeLocalState.CRD;
assert mergedJoinExchMsgs == null;
}
} else {
Set<UUID> remaining0 = null;
synchronized (mux) {
assert crd != null && crd.isLocal();
state = ExchangeLocalState.CRD;
assert mergedJoinExchMsgs == null;
if (log.isInfoEnabled()) {
log.info("New coordinator initialization finished [ver=" + initialVersion() + ", remaining=" + remaining + ']');
}
if (!remaining.isEmpty())
remaining0 = new HashSet<>(remaining);
}
if (remaining0 != null) {
// It is possible that some nodes finished the exchange with the previous coordinator.
GridDhtPartitionsSingleRequest req = new GridDhtPartitionsSingleRequest(exchId);
for (UUID nodeId : remaining0) {
try {
if (!pendingSingleMsgs.containsKey(nodeId)) {
if (log.isInfoEnabled()) {
log.info("New coordinator sends request [ver=" + initialVersion() + ", node=" + nodeId + ']');
}
cctx.io().send(nodeId, req, SYSTEM_POOL);
}
} catch (ClusterTopologyCheckedException ignored) {
if (log.isDebugEnabled())
log.debug("Node left during partition exchange [nodeId=" + nodeId + ", exchId=" + exchId + ']');
} catch (IgniteCheckedException e) {
U.error(log, "Failed to request partitions from node: " + nodeId, e);
}
}
for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> m : pendingSingleMsgs.entrySet()) {
if (log.isInfoEnabled()) {
log.info("New coordinator process pending message [ver=" + initialVersion() + ", node=" + m.getKey() + ']');
}
processSingleMessage(m.getKey(), m.getValue());
}
}
}
if (allRcvd) {
awaitSingleMapUpdates();
onAllReceived(newCrdFut.messages().keySet());
}
}
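The parallel processing above fans the received single messages out over the system pool (U.doInParallel) while collecting affinity messages into a shared ConcurrentHashMap. Below is a minimal, self-contained sketch of that pattern using only plain JDK executors; SingleMessage, the string affinity values, and the thread-count rule are hypothetical stand-ins, not Ignite types or APIs.

import java.util.*;
import java.util.concurrent.*;

public class ParallelMessageProcessing {
    // Hypothetical stand-in for GridDhtPartitionsSingleMessage; not an Ignite class.
    static class SingleMessage {
        final UUID nodeId;
        final Collection<Integer> affinityRequest;
        SingleMessage(UUID nodeId, Collection<Integer> affinityRequest) {
            this.nodeId = nodeId;
            this.affinityRequest = affinityRequest;
        }
    }

    public static void main(String[] args) throws Exception {
        List<SingleMessage> msgs = List.of(
            new SingleMessage(UUID.randomUUID(), List.of(1, 2)),
            new SingleMessage(UUID.randomUUID(), List.of(2, 3)));
        // Shared accumulator, analogous to the joinedNodeAff ConcurrentHashMap above.
        Map<Integer, String> joinedNodeAff = new ConcurrentHashMap<>();
        // Leave headroom for other work, mirroring the "reserve at least 2 threads" rule above.
        int parallelism = Math.max(1, Runtime.getRuntime().availableProcessors() - 2);
        ExecutorService pool = Executors.newFixedThreadPool(parallelism);
        try {
            List<Future<?>> futs = new ArrayList<>();
            for (SingleMessage msg : msgs) {
                futs.add(pool.submit(() -> {
                    for (Integer grpId : msg.affinityRequest)
                        joinedNodeAff.computeIfAbsent(grpId, id -> "affinity for group " + id);
                }));
            }
            // Propagate any processing failure, as the try/catch above rethrows it as IgniteException.
            for (Future<?> f : futs)
                f.get();
        }
        finally {
            pool.shutdown();
        }
        System.out.println(joinedNodeAff);
    }
}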
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method finishExchangeOnCoordinator.
/**
* @param sndResNodes Additional nodes to send finish message to.
*/
private void finishExchangeOnCoordinator(@Nullable Collection<ClusterNode> sndResNodes) {
if (isDone() || !enterBusy())
return;
try {
if (!F.isEmpty(exchangeGlobalExceptions) && dynamicCacheStartExchange() && isRollbackSupported()) {
sendExchangeFailureMessage();
return;
}
AffinityTopologyVersion resTopVer = exchCtx.events().topologyVersion();
if (log.isInfoEnabled()) {
log.info("finishExchangeOnCoordinator [topVer=" + initialVersion() + ", resVer=" + resTopVer + ']');
}
Map<Integer, CacheGroupAffinityMessage> idealAffDiff = null;
// Reserve at least 2 threads for system operations.
int parallelismLvl = U.availableThreadCount(cctx.kernalContext(), GridIoPolicy.SYSTEM_POOL, 2);
if (exchCtx.mergeExchanges()) {
synchronized (mux) {
if (mergedJoinExchMsgs != null) {
for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : mergedJoinExchMsgs.entrySet()) {
msgs.put(e.getKey(), e.getValue());
updatePartitionSingleMap(e.getKey(), e.getValue());
}
}
}
assert exchCtx.events().hasServerJoin() || exchCtx.events().hasServerLeft();
exchCtx.events().processEvents(this);
if (exchCtx.events().hasServerLeft())
idealAffDiff = cctx.affinity().onServerLeftWithExchangeMergeProtocol(this);
else
cctx.affinity().onServerJoinWithExchangeMergeProtocol(this, true);
doInParallel(parallelismLvl, cctx.kernalContext().pools().getSystemExecutorService(), cctx.affinity().cacheGroups().values(), desc -> {
if (desc.config().getCacheMode() == CacheMode.LOCAL)
return null;
CacheGroupContext grp = cctx.cache().cacheGroup(desc.groupId());
GridDhtPartitionTopology top = grp != null ? grp.topology() : cctx.exchange().clientTopology(desc.groupId(), events().discoveryCache());
top.beforeExchange(this, true, true);
return null;
});
}
span.addLog(() -> "Affinity recalculation (crd)");
timeBag.finishGlobalStage("Affinity recalculation (crd)");
Map<Integer, CacheGroupAffinityMessage> joinedNodeAff = new ConcurrentHashMap<>(cctx.cache().cacheGroups().size());
doInParallel(parallelismLvl, cctx.kernalContext().pools().getSystemExecutorService(), msgs.values(), msg -> {
processSingleMessageOnCrdFinish(msg, joinedNodeAff);
return null;
});
timeBag.finishGlobalStage("Collect update counters and create affinity messages");
if (firstDiscoEvt.type() == EVT_DISCOVERY_CUSTOM_EVT) {
assert firstDiscoEvt instanceof DiscoveryCustomEvent;
if (activateCluster() || changedBaseline())
assignPartitionsStates(null);
DiscoveryCustomMessage discoveryCustomMessage = ((DiscoveryCustomEvent) firstDiscoEvt).customMessage();
if (discoveryCustomMessage instanceof DynamicCacheChangeBatch) {
if (exchActions != null) {
Set<String> caches = exchActions.cachesToResetLostPartitions();
if (!F.isEmpty(caches))
resetLostPartitions(caches);
Set<Integer> cacheGroupsToResetOwners = concat(exchActions.cacheGroupsToStart().stream().map(grp -> grp.descriptor().groupId()), exchActions.cachesToResetLostPartitions().stream().map(CU::cacheId)).collect(Collectors.toSet());
assignPartitionsStates(cacheGroupsToResetOwners);
}
} else if (discoveryCustomMessage instanceof SnapshotDiscoveryMessage && ((SnapshotDiscoveryMessage) discoveryCustomMessage).needAssignPartitions()) {
markAffinityReassign();
assignPartitionsStates(null);
}
} else if (exchCtx.events().hasServerJoin())
assignPartitionsStates(null);
else if (exchCtx.events().hasServerLeft())
assignPartitionsStates(emptySet());
// Validation should happen after resetting owners to avoid false desync reporting.
validatePartitionsState();
// Recalculate new affinity based on partitions availability.
if (!exchCtx.mergeExchanges() && forceAffReassignment) {
idealAffDiff = cctx.affinity().onCustomEventWithEnforcedAffinityReassignment(this);
timeBag.finishGlobalStage("Ideal affinity diff calculation (enforced)");
}
for (CacheGroupContext grpCtx : cctx.cache().cacheGroups()) {
if (!grpCtx.isLocal())
grpCtx.topology().applyUpdateCounters();
}
timeBag.finishGlobalStage("Apply update counters");
updateLastVersion(cctx.versions().last());
cctx.versions().onExchange(lastVer.get().order());
IgniteProductVersion minVer = exchCtx.events().discoveryCache().minimumNodeVersion();
GridDhtPartitionsFullMessage msg = createPartitionsMessage(true, minVer.compareToIgnoreTimestamp(PARTIAL_COUNTERS_MAP_SINCE) >= 0);
if (!cctx.affinity().rebalanceRequired() && !deactivateCluster())
msg.rebalanced(true);
if (exchCtx.mergeExchanges()) {
assert !centralizedAff;
msg.resultTopologyVersion(resTopVer);
if (exchCtx.events().hasServerLeft())
msg.idealAffinityDiff(idealAffDiff);
} else if (forceAffReassignment)
msg.idealAffinityDiff(idealAffDiff);
msg.prepareMarshal(cctx);
timeBag.finishGlobalStage("Full message preparing");
synchronized (mux) {
finishState = new FinishState(crd.id(), resTopVer, msg);
state = ExchangeLocalState.DONE;
}
if (centralizedAff) {
assert !exchCtx.mergeExchanges();
IgniteInternalFuture<Map<Integer, Map<Integer, List<UUID>>>> fut = cctx.affinity().initAffinityOnNodeLeft(this);
if (!fut.isDone())
fut.listen(this::onAffinityInitialized);
else
onAffinityInitialized(fut);
} else {
Set<ClusterNode> nodes;
Map<UUID, GridDhtPartitionsSingleMessage> mergedJoinExchMsgs0;
synchronized (mux) {
srvNodes.remove(cctx.localNode());
nodes = new LinkedHashSet<>(srvNodes);
mergedJoinExchMsgs0 = mergedJoinExchMsgs;
if (mergedJoinExchMsgs != null) {
for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : mergedJoinExchMsgs.entrySet()) {
if (e.getValue() != null) {
ClusterNode node = cctx.discovery().node(e.getKey());
if (node != null)
nodes.add(node);
}
}
} else
mergedJoinExchMsgs0 = Collections.emptyMap();
if (!F.isEmpty(sndResNodes))
nodes.addAll(sndResNodes);
}
if (msg.rebalanced())
markRebalanced();
if (!nodes.isEmpty())
sendAllPartitions(msg, nodes, mergedJoinExchMsgs0, joinedNodeAff);
timeBag.finishGlobalStage("Full message sending");
discoveryLag = calculateDiscoveryLag(msgs, mergedJoinExchMsgs0);
if (!stateChangeExchange())
onDone(exchCtx.events().topologyVersion(), null);
for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : pendingSingleMsgs.entrySet()) {
if (log.isInfoEnabled()) {
log.info("Process pending message on coordinator [node=" + e.getKey() + ", ver=" + initialVersion() + ", resVer=" + resTopVer + ']');
}
processSingleMessage(e.getKey(), e.getValue());
}
}
if (stateChangeExchange()) {
StateChangeRequest req = exchActions.stateChangeRequest();
assert req != null : exchActions;
boolean stateChangeErr = false;
if (!F.isEmpty(exchangeGlobalExceptions)) {
stateChangeErr = true;
cctx.kernalContext().state().onStateChangeError(exchangeGlobalExceptions, req);
} else {
boolean hasMoving = !partsToReload.isEmpty();
Set<Integer> waitGrps = cctx.affinity().waitGroups();
if (!hasMoving) {
for (CacheGroupContext grpCtx : cctx.cache().cacheGroups()) {
if (waitGrps.contains(grpCtx.groupId()) && grpCtx.topology().hasMovingPartitions()) {
hasMoving = true;
break;
}
}
}
cctx.kernalContext().state().onExchangeFinishedOnCoordinator(this, hasMoving);
}
if (!cctx.kernalContext().state().clusterState().localBaselineAutoAdjustment()) {
ClusterState state = stateChangeErr ? ClusterState.INACTIVE : req.state();
ChangeGlobalStateFinishMessage stateFinishMsg = new ChangeGlobalStateFinishMessage(req.requestId(), state, !stateChangeErr);
cctx.discovery().sendCustomEvent(stateFinishMsg);
}
timeBag.finishGlobalStage("State finish message sending");
if (!centralizedAff)
onDone(exchCtx.events().topologyVersion(), null);
}
// Try to switch late affinity right now if the exchange completed normally.
if (!centralizedAff && isDone() && error() == null && !cctx.kernalContext().isStopping())
cctx.exchange().checkRebalanceState();
} catch (IgniteCheckedException e) {
if (reconnectOnError(e))
onDone(new IgniteNeedReconnectException(cctx.localNode(), e));
else
onDone(e);
} finally {
leaveBusy();
}
}
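The cacheGroupsToResetOwners computation above concatenates two id streams into one set before reassigning partition states. A small sketch of that stream pattern with plain JDK classes follows; the id sources and the hashCode-based cache id are made-up placeholders for exchActions.cacheGroupsToStart(), exchActions.cachesToResetLostPartitions(), and CU.cacheId().

import java.util.*;
import java.util.stream.*;

public class ResetOwnersGroups {
    public static void main(String[] args) {
        // Placeholder for exchActions.cacheGroupsToStart(): ids of cache groups being started.
        List<Integer> groupsToStart = List.of(101, 102);
        // Placeholder for exchActions.cachesToResetLostPartitions(): names of caches to reset.
        List<String> cachesToReset = List.of("cacheA", "cacheB");

        // Same shape as the concat(...).collect(Collectors.toSet()) expression above.
        Set<Integer> cacheGroupsToResetOwners = Stream.concat(
                groupsToStart.stream(),
                cachesToReset.stream().map(String::hashCode)) // stand-in for CU.cacheId(name)
            .collect(Collectors.toSet());

        System.out.println(cacheGroupsToResetOwners);
    }
}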
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method sendAllPartitions.
/**
* @param fullMsg Message to send.
* @param nodes Target nodes.
* @param mergedJoinExchMsgs Messages received from merged 'join node' exchanges.
* @param affinityForJoinedNodes Affinity, if it was requested by some nodes.
*/
private void sendAllPartitions(GridDhtPartitionsFullMessage fullMsg, Collection<ClusterNode> nodes, Map<UUID, GridDhtPartitionsSingleMessage> mergedJoinExchMsgs, Map<Integer, CacheGroupAffinityMessage> affinityForJoinedNodes) {
assert !nodes.contains(cctx.localNode());
if (log.isTraceEnabled()) {
log.trace("Sending full partition map [nodeIds=" + F.viewReadOnly(nodes, F.node2id()) + ", exchId=" + exchId + ", msg=" + fullMsg + ']');
}
// Find any single message with affinity request. This request exists only for newly joined nodes.
Optional<GridDhtPartitionsSingleMessage> singleMsgWithAffinityReq = nodes.stream().flatMap(node -> Optional.ofNullable(msgs.get(node.id())).filter(singleMsg -> singleMsg.cacheGroupsAffinityRequest() != null).map(Stream::of).orElse(Stream.empty())).findAny();
// Prepare full message for newly joined nodes with affinity request.
final GridDhtPartitionsFullMessage fullMsgWithAffinity = singleMsgWithAffinityReq.filter(singleMessage -> affinityForJoinedNodes != null).map(singleMessage -> fullMsg.copy().joinedNodeAffinity(affinityForJoinedNodes)).orElse(null);
// Prepare and send full messages for given nodes.
nodes.stream().map(node -> {
// No joined nodes, just send a regular full message.
if (fullMsgWithAffinity == null)
return new T2<>(node, fullMsg);
// If single message contains affinity request, use special full message for such single messages.
return new T2<>(node, Optional.ofNullable(msgs.get(node.id())).filter(singleMsg -> singleMsg.cacheGroupsAffinityRequest() != null).map(singleMsg -> fullMsgWithAffinity).orElse(fullMsg));
}).map(nodeAndMsg -> {
ClusterNode node = nodeAndMsg.get1();
GridDhtPartitionsFullMessage fullMsgToSend = nodeAndMsg.get2();
// If exchange has merged, use merged version of exchange id.
GridDhtPartitionExchangeId sndExchId = mergedJoinExchMsgs != null ? Optional.ofNullable(mergedJoinExchMsgs.get(node.id())).map(GridDhtPartitionsAbstractMessage::exchangeId).orElse(exchangeId()) : exchangeId();
if (sndExchId != null && !sndExchId.equals(exchangeId())) {
GridDhtPartitionsFullMessage fullMsgWithUpdatedExchangeId = fullMsgToSend.copy();
fullMsgWithUpdatedExchangeId.exchangeId(sndExchId);
return new T2<>(node, fullMsgWithUpdatedExchangeId);
}
return new T2<>(node, fullMsgToSend);
}).forEach(nodeAndMsg -> {
ClusterNode node = nodeAndMsg.get1();
GridDhtPartitionsFullMessage fullMsgToSend = nodeAndMsg.get2();
try {
cctx.io().send(node, fullMsgToSend, SYSTEM_POOL);
} catch (ClusterTopologyCheckedException e) {
if (log.isDebugEnabled())
log.debug("Failed to send partitions, node failed: " + node);
} catch (IgniteCheckedException e) {
U.error(log, "Failed to send partitions [node=" + node + ']', e);
}
});
}
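The per-node selection above sends the affinity-enriched full message only to nodes whose single message carried a cache groups affinity request, and the regular full message to everyone else. Here is a hedged, self-contained sketch of that Optional/Stream selection rule; Node and SingleMsg are hypothetical records (Java 16+), not Ignite classes, and the messages are plain strings.

import java.util.*;
import java.util.stream.*;

public class PerNodeFullMessageChoice {
    // Hypothetical stand-ins for ClusterNode and GridDhtPartitionsSingleMessage.
    record Node(UUID id) { }
    record SingleMsg(boolean hasAffinityRequest) { }

    public static void main(String[] args) {
        Node joined = new Node(UUID.randomUUID());
        Node existing = new Node(UUID.randomUUID());
        // Only the newly joined node sent a single message with an affinity request.
        Map<UUID, SingleMsg> singleMsgs = Map.of(joined.id(), new SingleMsg(true));

        String fullMsg = "full message";
        String fullMsgWithAffinity = "full message + joined node affinity";

        // Same rule as above: the affinity-enriched copy goes only to nodes whose
        // single message carried a cache groups affinity request.
        Map<Node, String> toSend = Stream.of(joined, existing).collect(Collectors.toMap(
            node -> node,
            node -> Optional.ofNullable(singleMsgs.get(node.id()))
                .filter(SingleMsg::hasAffinityRequest)
                .map(m -> fullMsgWithAffinity)
                .orElse(fullMsg)));

        toSend.forEach((node, msg) -> System.out.println(node.id() + " -> " + msg));
    }
}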
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method beforeExchange.
/**
* {@inheritDoc}
*/
@Override
public void beforeExchange(GridDhtPartitionsExchangeFuture exchFut, boolean affReady, boolean updateMoving) throws IgniteCheckedException {
ctx.database().checkpointReadLock();
try {
U.writeLock(lock);
try {
if (stopping)
return;
assert lastTopChangeVer.equals(exchFut.initialVersion()) : "Invalid topology version [topVer=" + lastTopChangeVer + ", exchId=" + exchFut.exchangeId() + ']';
ExchangeDiscoveryEvents evts = exchFut.context().events();
if (affReady) {
assert grp.affinity().lastVersion().equals(evts.topologyVersion()) : "Invalid affinity version [" + "grp=" + grp.cacheOrGroupName() + ", affVer=" + grp.affinity().lastVersion() + ", evtsVer=" + evts.topologyVersion() + ']';
lastTopChangeVer = readyTopVer = evts.topologyVersion();
discoCache = evts.discoveryCache();
}
if (log.isDebugEnabled()) {
log.debug("Partition map beforeExchange [grp=" + grp.cacheOrGroupName() + ", exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']');
}
long updateSeq = this.updateSeq.incrementAndGet();
if (exchFut.exchangeType() == ALL && !exchFut.rebalanced())
cntrMap.clear();
initializeFullMap(updateSeq);
boolean grpStarted = exchFut.cacheGroupAddedOnExchange(grp.groupId(), grp.receivedFrom());
if (evts.hasServerLeft()) {
for (DiscoveryEvent evt : evts.events()) {
if (ExchangeDiscoveryEvents.serverLeftEvent(evt))
removeNode(evt.eventNode().id());
}
} else if (affReady && grpStarted && exchFut.exchangeType() == NONE) {
assert !exchFut.context().mergeExchanges() : exchFut;
assert node2part != null && node2part.valid() : exchFut;
// Initialize node maps if group was started from joining client.
final List<ClusterNode> nodes = exchFut.firstEventCache().cacheGroupAffinityNodes(grp.groupId());
for (ClusterNode node : nodes) {
if (!node2part.containsKey(node.id()) && ctx.discovery().alive(node)) {
final GridDhtPartitionMap partMap = new GridDhtPartitionMap(node.id(), 1L, exchFut.initialVersion(), new GridPartitionStateMap(), false);
final AffinityAssignment aff = grp.affinity().cachedAffinity(exchFut.initialVersion());
for (Integer p0 : aff.primaryPartitions(node.id())) partMap.put(p0, OWNING);
for (Integer p0 : aff.backupPartitions(node.id())) partMap.put(p0, OWNING);
node2part.put(node.id(), partMap);
}
}
}
if (grp.affinityNode()) {
if (grpStarted || exchFut.firstEvent().type() == EVT_DISCOVERY_CUSTOM_EVT || exchFut.serverNodeDiscoveryEvent()) {
AffinityTopologyVersion affVer;
List<List<ClusterNode>> affAssignment;
if (affReady) {
affVer = evts.topologyVersion();
assert grp.affinity().lastVersion().equals(affVer) : "Invalid affinity [topVer=" + grp.affinity().lastVersion() + ", grp=" + grp.cacheOrGroupName() + ", affVer=" + affVer + ", fut=" + exchFut + ']';
affAssignment = grp.affinity().readyAssignments(affVer);
} else {
assert !exchFut.context().mergeExchanges();
affVer = exchFut.initialVersion();
affAssignment = grp.affinity().idealAssignmentRaw();
}
initPartitions(affVer, affAssignment, exchFut, updateSeq);
}
}
consistencyCheck();
if (updateMoving) {
assert grp.affinity().lastVersion().equals(evts.topologyVersion());
createMovingPartitions(grp.affinity().readyAffinity(evts.topologyVersion()));
}
if (log.isDebugEnabled()) {
log.debug("Partition map after beforeExchange [grp=" + grp.cacheOrGroupName() + ", " + "exchId=" + exchFut.exchangeId() + ", fullMap=" + fullMapString() + ']');
}
if (log.isTraceEnabled()) {
log.trace("Partition states after beforeExchange [grp=" + grp.cacheOrGroupName() + ", exchId=" + exchFut.exchangeId() + ", states=" + dumpPartitionStates() + ']');
}
} finally {
lock.writeLock().unlock();
}
} finally {
ctx.database().checkpointReadUnlock();
}
}
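The joining-client branch above builds a per-node partition map by marking every primary and backup partition of the node as OWNING and registering it in node2part. A minimal sketch of that initialization with plain JDK maps follows; the partition id sets and the PartState enum are hypothetical placeholders for aff.primaryPartitions()/aff.backupPartitions() and GridDhtPartitionState.

import java.util.*;

public class JoiningNodePartitionMap {
    // Simplified partition state; the real code uses GridDhtPartitionState.OWNING.
    enum PartState { OWNING, MOVING }

    public static void main(String[] args) {
        UUID nodeId = UUID.randomUUID();

        // Hypothetical results of aff.primaryPartitions(nodeId) and aff.backupPartitions(nodeId).
        Set<Integer> primaryParts = Set.of(0, 3, 7);
        Set<Integer> backupParts = Set.of(1, 4);

        // Analogue of GridDhtPartitionMap: partition id -> state for one node.
        Map<Integer, PartState> partMap = new HashMap<>();
        for (Integer p : primaryParts)
            partMap.put(p, PartState.OWNING);
        for (Integer p : backupParts)
            partMap.put(p, PartState.OWNING);

        // Analogue of node2part: node id -> that node's partition map.
        Map<UUID, Map<Integer, PartState>> node2part = new HashMap<>();
        node2part.putIfAbsent(nodeId, partMap);

        System.out.println(node2part);
    }
}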
Use of org.apache.ignite.cluster.ClusterNode in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method nodes.
/**
* {@inheritDoc}
*/
@Override
public List<ClusterNode> nodes(int p, AffinityTopologyVersion topVer) {
AffinityAssignment affAssignment = grp.affinity().cachedAffinity(topVer);
List<ClusterNode> affNodes = affAssignment.get(p);
List<ClusterNode> nodes = nodes0(p, affAssignment, affNodes);
return nodes != null ? nodes : affNodes;
}
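The nodes(p, topVer) method above prefers the list derived from the current partition map (nodes0) and falls back to the cached affinity owners when that list is null. A tiny sketch of that fallback shape, under the assumption that both lookups are stubbed out; affinityNodes and topologyNodes are hypothetical helpers, not Ignite methods.

import java.util.*;

public class PartitionNodesLookup {
    // Hypothetical affinity owners for a partition, standing in for affAssignment.get(p).
    static List<String> affinityNodes(int part) {
        return List.of("nodeA", "nodeB");
    }

    // Hypothetical refinement based on the current partition map (nodes0 above);
    // returns null when the affinity owners already describe the topology.
    static List<String> topologyNodes(int part, List<String> affNodes) {
        return null;
    }

    // Mirrors nodes(p, topVer): prefer the topology-derived list, fall back to affinity owners.
    static List<String> nodes(int part) {
        List<String> affNodes = affinityNodes(part);
        List<String> nodes = topologyNodes(part, affNodes);
        return nodes != null ? nodes : affNodes;
    }

    public static void main(String[] args) {
        System.out.println(nodes(5));
    }
}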