Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState in project ignite by apache.
The class CacheAffinitySharedManager, method initAffinityBasedOnPartitionsAvailability.
/**
* Initializes the current affinity assignment based on partition availability. Nodes that have the most recent data
* are considered affinity nodes.
*
* @param topVer Topology version.
* @param fut Exchange future.
* @param c Closure converting affinity diff.
* @param initAff {@code True} if need initialize affinity.
* @return Affinity assignment for each of registered cache group.
*/
private <T> Map<Integer, Map<Integer, List<T>>> initAffinityBasedOnPartitionsAvailability(final AffinityTopologyVersion topVer, final GridDhtPartitionsExchangeFuture fut, final IgniteClosure<ClusterNode, T> c, final boolean initAff) {
final boolean enforcedCentralizedAssignment = DiscoveryCustomEvent.requiresCentralizedAffinityAssignment(fut.firstEvent());
final WaitRebalanceInfo waitRebalanceInfo = enforcedCentralizedAssignment ? new WaitRebalanceInfo(fut.exchangeId().topologyVersion()) : new WaitRebalanceInfo(fut.context().events().lastServerEventVersion());
final Collection<ClusterNode> evtNodes = fut.context().events().discoveryCache().serverNodes();
final Map<Integer, Map<Integer, List<T>>> assignment = new ConcurrentHashMap<>();
forAllRegisteredCacheGroups(new IgniteInClosureX<CacheGroupDescriptor>() {
@Override
public void applyx(CacheGroupDescriptor desc) throws IgniteCheckedException {
CacheGroupHolder grpHolder = getOrCreateGroupHolder(topVer, desc);
if (!grpHolder.rebalanceEnabled || (fut.cacheGroupAddedOnExchange(desc.groupId(), desc.receivedFrom()) && !enforcedCentralizedAssignment))
return;
AffinityTopologyVersion affTopVer = grpHolder.affinity().lastVersion();
assert (affTopVer.topologyVersion() > 0 && !affTopVer.equals(topVer)) || enforcedCentralizedAssignment : "Invalid affinity version [last=" + affTopVer + ", futVer=" + topVer + ", grp=" + desc.cacheOrGroupName() + ']';
List<List<ClusterNode>> curAssignment = grpHolder.affinity().assignments(affTopVer);
List<List<ClusterNode>> newAssignment = grpHolder.affinity().idealAssignmentRaw();
assert newAssignment != null;
List<List<ClusterNode>> newAssignment0 = initAff ? new ArrayList<>(newAssignment) : null;
GridDhtPartitionTopology top = grpHolder.topology(fut.context().events().discoveryCache());
Map<Integer, List<T>> cacheAssignment = null;
for (int p = 0; p < newAssignment.size(); p++) {
List<ClusterNode> newNodes = newAssignment.get(p);
List<ClusterNode> curNodes = curAssignment.get(p);
assert evtNodes.containsAll(newNodes) : "Invalid new assignment [grp=" + grpHolder.aff.cacheOrGroupName() + ", nodes=" + newNodes + ", topVer=" + fut.context().events().discoveryCache().version() + ", evts=" + fut.context().events().events() + "]";
ClusterNode curPrimary = !curNodes.isEmpty() ? curNodes.get(0) : null;
ClusterNode newPrimary = !newNodes.isEmpty() ? newNodes.get(0) : null;
List<ClusterNode> newNodes0 = null;
assert newPrimary == null || evtNodes.contains(newPrimary) : "Invalid new primary [" + "grp=" + desc.cacheOrGroupName() + ", node=" + newPrimary + ", topVer=" + topVer + ']';
List<ClusterNode> owners = top.owners(p, topVer);
// It is essential that curPrimary node has partition in OWNING state.
if (!owners.isEmpty() && !owners.contains(curPrimary))
curPrimary = owners.get(0);
// If new assignment is empty preserve current ownership for alive nodes.
if (curPrimary != null && newPrimary == null) {
newNodes0 = new ArrayList<>(curNodes.size());
for (ClusterNode node : curNodes) {
if (evtNodes.contains(node))
newNodes0.add(node);
}
} else if (curPrimary != null && !curPrimary.equals(newPrimary)) {
GridDhtPartitionState state = top.partitionState(newPrimary.id(), p);
if (evtNodes.contains(curPrimary)) {
if (state != OWNING) {
newNodes0 = latePrimaryAssignment(grpHolder.affinity(), p, curPrimary, newNodes, waitRebalanceInfo);
}
} else {
if (state != OWNING) {
for (int i = 1; i < curNodes.size(); i++) {
ClusterNode curNode = curNodes.get(i);
if (top.partitionState(curNode.id(), p) == OWNING && evtNodes.contains(curNode)) {
newNodes0 = latePrimaryAssignment(grpHolder.affinity(), p, curNode, newNodes, waitRebalanceInfo);
break;
}
}
if (newNodes0 == null) {
for (ClusterNode owner : owners) {
if (evtNodes.contains(owner)) {
newNodes0 = latePrimaryAssignment(grpHolder.affinity(), p, owner, newNodes, waitRebalanceInfo);
break;
}
}
}
}
}
}
// This will happen if no primary has changed but some backups still need to be rebalanced.
if (!owners.isEmpty() && !owners.containsAll(newNodes) && !top.lostPartitions().contains(p))
waitRebalanceInfo.add(grpHolder.groupId(), p, newNodes);
if (newNodes0 != null) {
assert evtNodes.containsAll(newNodes0) : "Invalid late assignment [grp=" + grpHolder.aff.cacheOrGroupName() + ", nodes=" + newNodes + ", topVer=" + fut.context().events().discoveryCache().version() + ", evts=" + fut.context().events().events() + "]";
if (newAssignment0 != null)
newAssignment0.set(p, newNodes0);
if (cacheAssignment == null)
cacheAssignment = new HashMap<>();
List<T> n = new ArrayList<>(newNodes0.size());
for (int i = 0; i < newNodes0.size(); i++) n.add(c.apply(newNodes0.get(i)));
cacheAssignment.put(p, n);
}
}
if (cacheAssignment != null)
assignment.put(grpHolder.groupId(), cacheAssignment);
if (initAff)
grpHolder.affinity().initialize(topVer, newAssignment0);
fut.timeBag().finishLocalStage("Affinity recalculation (partitions availability) " + "[grp=" + desc.cacheOrGroupName() + "]");
}
});
if (log.isDebugEnabled()) {
log.debug("Computed new affinity after node left [topVer=" + topVer + ", waitGrps=" + groupNames(waitRebalanceInfo.waitGrps.keySet()) + ']');
}
synchronized (mux) {
waitInfo = !waitRebalanceInfo.empty() ? waitRebalanceInfo : null;
}
return assignment;
}
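The heart of the method above is the late-primary rule: if the node that should become primary under the ideal assignment does not yet own the partition, the previous (still alive) owner stays primary until rebalancing completes. Below is a minimal standalone sketch of that rule, assuming a simplified model; PartState, pickAssignment and the string node ids are hypothetical stand-ins, not Ignite's API.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;
    import java.util.Set;

    public class LatePrimarySketch {
        /** Simplified partition state, mirroring OWNING/MOVING semantics. */
        enum PartState { OWNING, MOVING }

        /**
         * Chooses the effective assignment for one partition: if the ideal primary
         * already owns the partition, the ideal assignment is used; otherwise the
         * current (alive) primary is kept in front until the new primary finishes rebalancing.
         */
        static List<String> pickAssignment(List<String> idealNodes,
                                           List<String> currentNodes,
                                           Map<String, PartState> stateByNode,
                                           Set<String> aliveNodes) {
            String newPrimary = idealNodes.isEmpty() ? null : idealNodes.get(0);
            String curPrimary = currentNodes.isEmpty() ? null : currentNodes.get(0);

            boolean newPrimaryOwns = newPrimary != null
                && stateByNode.getOrDefault(newPrimary, PartState.MOVING) == PartState.OWNING;

            if (newPrimaryOwns || curPrimary == null || !aliveNodes.contains(curPrimary) || curPrimary.equals(newPrimary))
                return idealNodes;

            // Late assignment: temporary primary first, then the ideal nodes.
            List<String> late = new ArrayList<>();
            late.add(curPrimary);
            for (String n : idealNodes) {
                if (!n.equals(curPrimary))
                    late.add(n);
            }
            return late;
        }

        public static void main(String[] args) {
            List<String> ideal = List.of("B", "C");
            List<String> current = List.of("A", "C");
            Map<String, PartState> states =
                Map.of("A", PartState.OWNING, "B", PartState.MOVING, "C", PartState.OWNING);

            // Prints [A, B, C]: node A stays primary until B finishes rebalancing.
            System.out.println(pickAssignment(ideal, current, states, Set.of("A", "B", "C")));
        }
    }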
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState in project ignite by apache.
The class CacheMetricsImpl, method getEntriesStat.
/**
* Calculates entry-count and partition-count metrics in a single iteration over the local partitions.
*/
public EntriesStatMetrics getEntriesStat() {
int owningPartCnt = 0;
int movingPartCnt = 0;
long offHeapEntriesCnt = 0L;
long offHeapPrimaryEntriesCnt = 0L;
long offHeapBackupEntriesCnt = 0L;
long heapEntriesCnt = 0L;
int size = 0;
long sizeLong = 0L;
boolean isEmpty;
try {
AffinityTopologyVersion topVer = cctx.affinity().affinityTopologyVersion();
if (AffinityTopologyVersion.NONE.equals(topVer))
return unknownEntriesStat();
final GridCacheAdapter<?, ?> cache = cctx.cache();
if (cache != null) {
offHeapEntriesCnt = cache.offHeapEntriesCount();
size = cache.localSize(null);
sizeLong = cache.localSizeLong(null);
}
if (cctx.isLocal()) {
if (cache != null) {
offHeapPrimaryEntriesCnt = offHeapEntriesCnt;
heapEntriesCnt = cache.sizeLong();
}
} else {
IntSet primaries = ImmutableIntSet.wrap(cctx.affinity().primaryPartitions(cctx.localNodeId(), topVer));
IntSet backups = ImmutableIntSet.wrap(cctx.affinity().backupPartitions(cctx.localNodeId(), topVer));
if (cctx.isNear() && cache != null)
heapEntriesCnt = cache.nearSize();
for (GridDhtLocalPartition part : cctx.topology().currentLocalPartitions()) {
// Partitions count.
GridDhtPartitionState partState = part.state();
if (partState == GridDhtPartitionState.OWNING)
owningPartCnt++;
if (partState == GridDhtPartitionState.MOVING)
movingPartCnt++;
// Offheap entries count
if (cache == null)
continue;
long cacheSize = part.dataStore().cacheSize(cctx.cacheId());
if (primaries.contains(part.id()))
offHeapPrimaryEntriesCnt += cacheSize;
else if (backups.contains(part.id()))
offHeapBackupEntriesCnt += cacheSize;
heapEntriesCnt += part.publicSize(cctx.cacheId());
}
}
} catch (Exception e) {
return unknownEntriesStat();
}
isEmpty = (offHeapEntriesCnt == 0);
EntriesStatMetrics stat = new EntriesStatMetrics();
stat.offHeapEntriesCount(offHeapEntriesCnt);
stat.offHeapPrimaryEntriesCount(offHeapPrimaryEntriesCnt);
stat.offHeapBackupEntriesCount(offHeapBackupEntriesCnt);
stat.heapEntriesCount(heapEntriesCnt);
stat.size(size);
stat.cacheSize(sizeLong);
stat.keySize(size);
stat.isEmpty(isEmpty);
stat.totalPartitionsCount(owningPartCnt + movingPartCnt);
stat.rebalancingPartitionsCount(movingPartCnt);
return stat;
}
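As the Javadoc notes, the method computes all entry and partition counters in a single pass over the local partitions. The sketch below illustrates that single-pass accumulation with simplified types; the Part record and the primary/backup id sets are hypothetical stand-ins for GridDhtLocalPartition and the affinity lookups.

    import java.util.List;
    import java.util.Set;

    public class EntriesStatSketch {
        enum PartState { OWNING, MOVING, RENTING }

        /** Hypothetical stand-in for a local partition: id, state and per-cache size. */
        record Part(int id, PartState state, long cacheSize) { }

        public static void main(String[] args) {
            List<Part> localParts = List.of(
                new Part(0, PartState.OWNING, 100),
                new Part(1, PartState.MOVING, 40),
                new Part(2, PartState.OWNING, 60));

            Set<Integer> primaries = Set.of(0);
            Set<Integer> backups = Set.of(2);

            int owning = 0, moving = 0;
            long primaryEntries = 0, backupEntries = 0;

            // Single pass over local partitions: count states and classify sizes.
            for (Part p : localParts) {
                if (p.state() == PartState.OWNING)
                    owning++;
                else if (p.state() == PartState.MOVING)
                    moving++;

                if (primaries.contains(p.id()))
                    primaryEntries += p.cacheSize();
                else if (backups.contains(p.id()))
                    backupEntries += p.cacheSize();
            }

            System.out.printf("owning=%d moving=%d primary=%d backup=%d%n",
                owning, moving, primaryEntries, backupEntries);
        }
    }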
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState in project ignite by apache.
The class GridDhtPartitionsExchangeFuture, method assignPartitionStates.
/**
* Collects partition update counters from all nodes and determines the new partition owners for the given {@code top}.
*
* @param top Topology to assign.
* @param resetOwners {@code True} if partition ownership should be reset based on update counters, {@code false} otherwise.
* @return Partitions supply info list.
*/
private List<SupplyPartitionInfo> assignPartitionStates(GridDhtPartitionTopology top, boolean resetOwners) {
Map<Integer, CounterWithNodes> maxCntrs = new HashMap<>();
Map<Integer, TreeSet<Long>> varCntrs = new HashMap<>();
for (Map.Entry<UUID, GridDhtPartitionsSingleMessage> e : msgs.entrySet()) {
CachePartitionPartialCountersMap nodeCntrs = e.getValue().partitionUpdateCounters(top.groupId(), top.partitions());
assert nodeCntrs != null;
for (int i = 0; i < nodeCntrs.size(); i++) {
int p = nodeCntrs.partitionAt(i);
UUID remoteNodeId = e.getKey();
GridDhtPartitionState state = top.partitionState(remoteNodeId, p);
if (state != GridDhtPartitionState.OWNING && state != GridDhtPartitionState.MOVING)
continue;
long cntr = state == GridDhtPartitionState.MOVING ? nodeCntrs.initialUpdateCounterAt(i) : nodeCntrs.updateCounterAt(i);
varCntrs.computeIfAbsent(p, key -> new TreeSet<>()).add(cntr);
if (state != GridDhtPartitionState.OWNING)
continue;
CounterWithNodes maxCntr = maxCntrs.get(p);
if (maxCntr == null || cntr > maxCntr.cnt)
maxCntrs.put(p, new CounterWithNodes(cntr, e.getValue().partitionSizes(top.groupId()).get(p), remoteNodeId));
else if (cntr == maxCntr.cnt)
maxCntr.nodes.add(remoteNodeId);
}
}
// Also must process counters from the local node.
for (GridDhtLocalPartition part : top.currentLocalPartitions()) {
GridDhtPartitionState state = top.partitionState(cctx.localNodeId(), part.id());
if (state != GridDhtPartitionState.OWNING && state != GridDhtPartitionState.MOVING)
continue;
final long cntr = state == GridDhtPartitionState.MOVING ? part.initialUpdateCounter() : part.updateCounter();
varCntrs.computeIfAbsent(part.id(), key -> new TreeSet<>()).add(cntr);
if (state != GridDhtPartitionState.OWNING)
continue;
CounterWithNodes maxCntr = maxCntrs.get(part.id());
if (maxCntr == null && cntr == 0) {
CounterWithNodes cntrObj = new CounterWithNodes(0, 0L, cctx.localNodeId());
for (UUID nodeId : msgs.keySet()) {
if (top.partitionState(nodeId, part.id()) == GridDhtPartitionState.OWNING)
cntrObj.nodes.add(nodeId);
}
maxCntrs.put(part.id(), cntrObj);
} else if (maxCntr == null || cntr > maxCntr.cnt)
maxCntrs.put(part.id(), new CounterWithNodes(cntr, part.fullSize(), cctx.localNodeId()));
else if (cntr == maxCntr.cnt)
maxCntr.nodes.add(cctx.localNodeId());
}
Set<Integer> haveHistory = new HashSet<>();
List<SupplyPartitionInfo> list = assignHistoricalSuppliers(top, maxCntrs, varCntrs, haveHistory);
if (resetOwners)
resetOwnersByCounter(top, maxCntrs, haveHistory);
return list;
}
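The counter aggregation above reduces to a simple rule: for each partition, the candidate owners after the exchange are the nodes that reported the maximum update counter among OWNING copies. A rough sketch of that selection, assuming plain maps in place of CachePartitionPartialCountersMap and CounterWithNodes:

    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    public class MaxCounterOwnersSketch {
        /**
         * For each partition, keeps the set of nodes that reported the maximum
         * update counter. countersByNode maps nodeId -> (partition -> counter).
         */
        static Map<Integer, Set<String>> ownersByMaxCounter(Map<String, Map<Integer, Long>> countersByNode) {
            Map<Integer, Long> maxCntr = new HashMap<>();
            Map<Integer, Set<String>> owners = new HashMap<>();

            for (Map.Entry<String, Map<Integer, Long>> e : countersByNode.entrySet()) {
                for (Map.Entry<Integer, Long> pc : e.getValue().entrySet()) {
                    int part = pc.getKey();
                    long cntr = pc.getValue();
                    Long max = maxCntr.get(part);

                    if (max == null || cntr > max) {
                        maxCntr.put(part, cntr);
                        owners.put(part, new HashSet<>(Set.of(e.getKey())));
                    }
                    else if (cntr == max)
                        owners.get(part).add(e.getKey());
                }
            }
            return owners;
        }

        public static void main(String[] args) {
            Map<String, Map<Integer, Long>> counters = Map.of(
                "nodeA", Map.of(0, 10L, 1, 5L),
                "nodeB", Map.of(0, 10L, 1, 7L));

            // Partition 0 is owned by both nodes (equal counters), partition 1 only by nodeB.
            System.out.println(ownersByMaxCounter(counters));
        }
    }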
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState in project ignite by apache.
The class GridCacheOffheapManager, method saveStoreMetadata.
/**
* @param store Store to save metadata for.
* @param ctx Checkpoint context used to collect partition statistics.
* @param beforeDestroy {@code True} if metadata is saved right before the partition store is destroyed.
* @param needSnapshot {@code True} if metadata is collected for a snapshot.
* @throws IgniteCheckedException If failed.
*/
private void saveStoreMetadata(CacheDataStore store, Context ctx, boolean beforeDestroy, boolean needSnapshot) throws IgniteCheckedException {
RowStore rowStore0 = store.rowStore();
if (rowStore0 != null && (partitionStatesRestored || grp.isLocal())) {
((CacheFreeList) rowStore0.freeList()).saveMetadata(grp.statisticsHolderData());
PartitionMetaStorage<SimpleDataRow> partStore = store.partStorage();
long updCntr = store.updateCounter();
long size = store.fullSize();
long rmvId = globalRemoveId().get();
byte[] updCntrsBytes = store.partUpdateCounter().getBytes();
PageMemoryEx pageMem = (PageMemoryEx) grp.dataRegion().pageMemory();
IgniteWriteAheadLogManager wal = this.ctx.wal();
GridEncryptionManager encMgr = this.ctx.kernalContext().encryption();
if (size > 0 || updCntr > 0 || !store.partUpdateCounter().sequential() || (grp.config().isEncryptionEnabled() && encMgr.getEncryptionState(grp.groupId(), store.partId()) > 0)) {
GridDhtPartitionState state = null;
// localPartition will not acquire writeLock here because create=false.
GridDhtLocalPartition part = null;
if (!grp.isLocal()) {
if (beforeDestroy)
state = GridDhtPartitionState.EVICTED;
else {
part = getPartition(store);
if (part != null && part.state() != GridDhtPartitionState.EVICTED)
state = part.state();
}
// Do not save meta for evicted partitions on next checkpoints.
if (state == null)
return;
}
int grpId = grp.groupId();
long partMetaId = pageMem.partitionMetaPageId(grpId, store.partId());
long partMetaPage = pageMem.acquirePage(grpId, partMetaId);
try {
long partMetaPageAddr = pageMem.writeLock(grpId, partMetaId, partMetaPage);
if (partMetaPageAddr == 0L) {
U.warn(log, "Failed to acquire write lock for meta page [metaPage=" + partMetaPage + ", beforeDestroy=" + beforeDestroy + ", size=" + size + ", updCntr=" + updCntr + ", state=" + state + ']');
return;
}
boolean changed = false;
try {
PagePartitionMetaIOV3 io = PageIO.getPageIO(partMetaPageAddr);
long link = io.getGapsLink(partMetaPageAddr);
if (updCntrsBytes == null && link != 0) {
partStore.removeDataRowByLink(link, grp.statisticsHolderData());
io.setGapsLink(partMetaPageAddr, (link = 0));
changed = true;
} else if (updCntrsBytes != null && link == 0) {
SimpleDataRow row = new SimpleDataRow(store.partId(), updCntrsBytes);
partStore.insertDataRow(row, grp.statisticsHolderData());
io.setGapsLink(partMetaPageAddr, (link = row.link()));
changed = true;
} else if (updCntrsBytes != null && link != 0) {
byte[] prev = partStore.readRow(link);
assert prev != null : "Read null gaps using link=" + link;
if (!Arrays.equals(prev, updCntrsBytes)) {
partStore.removeDataRowByLink(link, grp.statisticsHolderData());
SimpleDataRow row = new SimpleDataRow(store.partId(), updCntrsBytes);
partStore.insertDataRow(row, grp.statisticsHolderData());
io.setGapsLink(partMetaPageAddr, (link = row.link()));
changed = true;
}
}
if (changed)
partStore.saveMetadata(grp.statisticsHolderData());
changed |= io.setUpdateCounter(partMetaPageAddr, updCntr);
changed |= io.setGlobalRemoveId(partMetaPageAddr, rmvId);
changed |= io.setSize(partMetaPageAddr, size);
int encryptIdx = 0;
int encryptCnt = 0;
if (grp.config().isEncryptionEnabled()) {
long reencryptState = encMgr.getEncryptionState(grpId, store.partId());
if (reencryptState != 0) {
encryptIdx = ReencryptStateUtils.pageIndex(reencryptState);
encryptCnt = ReencryptStateUtils.pageCount(reencryptState);
if (encryptIdx == encryptCnt) {
encMgr.setEncryptionState(grp, store.partId(), 0, 0);
encryptIdx = encryptCnt = 0;
}
changed |= io.setEncryptedPageIndex(partMetaPageAddr, encryptIdx);
changed |= io.setEncryptedPageCount(partMetaPageAddr, encryptCnt);
}
}
if (state != null)
changed |= io.setPartitionState(partMetaPageAddr, (byte) state.ordinal());
else
assert grp.isLocal() : grp.cacheOrGroupName();
long cntrsPageId;
if (grp.sharedGroup()) {
long initCntrPageId = io.getCountersPageId(partMetaPageAddr);
Map<Integer, Long> newSizes = store.cacheSizes();
Map<Integer, Long> prevSizes = readSharedGroupCacheSizes(pageMem, grpId, initCntrPageId);
if (prevSizes != null && prevSizes.equals(newSizes))
// Prevent modification of the sizes pages for this store.
cntrsPageId = initCntrPageId;
else {
cntrsPageId = writeSharedGroupCacheSizes(pageMem, grpId, initCntrPageId, store.partId(), newSizes);
if (initCntrPageId == 0 && cntrsPageId != 0) {
io.setCountersPageId(partMetaPageAddr, cntrsPageId);
changed = true;
}
}
} else
cntrsPageId = 0L;
int pageCnt;
if (needSnapshot) {
pageCnt = this.ctx.pageStore().pages(grpId, store.partId());
io.setCandidatePageCount(partMetaPageAddr, size == 0 ? 0 : pageCnt);
if (state == OWNING) {
assert part != null;
if (!addPartition(part, ctx.partitionStatMap(), partMetaPageAddr, io, grpId, store.partId(), this.ctx.pageStore().pages(grpId, store.partId()), store.fullSize()))
U.warn(log, "Partition was concurrently evicted grpId=" + grpId + ", partitionId=" + part.id());
} else if (state == MOVING || state == RENTING) {
if (ctx.partitionStatMap().forceSkipIndexPartition(grpId)) {
if (log.isInfoEnabled())
log.info("Will not include SQL indexes to snapshot because there is " + "a partition not in " + OWNING + " state [grp=" + grp.cacheOrGroupName() + ", partId=" + store.partId() + ", state=" + state + ']');
}
}
changed = true;
} else
pageCnt = io.getCandidatePageCount(partMetaPageAddr);
if (changed && isWalDeltaRecordNeeded(pageMem, grpId, partMetaId, partMetaPage, wal, null))
wal.log(new MetaPageUpdatePartitionDataRecordV3(grpId, partMetaId, updCntr, rmvId, // TODO: Partition size may be long
(int) size, cntrsPageId, state == null ? -1 : (byte) state.ordinal(), pageCnt, link, encryptIdx, encryptCnt));
if (changed) {
partStore.saveMetadata(grp.statisticsHolderData());
io.setPartitionMetaStoreReuseListRoot(partMetaPageAddr, partStore.metaPageId());
}
} finally {
pageMem.writeUnlock(grpId, partMetaId, partMetaPage, null, changed);
}
} finally {
pageMem.releasePage(grpId, partMetaId, partMetaPage);
}
} else if (needSnapshot)
tryAddEmptyPartitionToSnapshot(store, ctx);
} else if (needSnapshot)
tryAddEmptyPartitionToSnapshot(store, ctx);
}
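The gaps-link handling in the middle of the method is a three-way reconciliation: drop the persisted row when the counter has no gaps, insert a row when gaps appear, and rewrite it when the gap bytes changed. The snippet below sketches just that branch logic against an in-memory map; the store, link generation and byte comparison are simplified assumptions, not the PartitionMetaStorage API.

    import java.util.Arrays;
    import java.util.HashMap;
    import java.util.Map;
    import java.util.concurrent.atomic.AtomicLong;

    public class GapsLinkSketch {
        /** Hypothetical in-memory stand-in for the partition meta storage. */
        static final Map<Long, byte[]> STORE = new HashMap<>();
        static final AtomicLong LINKS = new AtomicLong(1);

        /**
         * Reconciles the persisted gaps row with the current counter bytes and
         * returns the (possibly new) link, or 0 if nothing is stored.
         */
        static long reconcile(long link, byte[] updCntrsBytes) {
            if (updCntrsBytes == null && link != 0) {
                STORE.remove(link);                 // Gaps disappeared: drop the row.
                return 0;
            }
            if (updCntrsBytes != null && link == 0) {
                long newLink = LINKS.getAndIncrement();
                STORE.put(newLink, updCntrsBytes);  // Gaps appeared: insert a new row.
                return newLink;
            }
            if (updCntrsBytes != null && !Arrays.equals(STORE.get(link), updCntrsBytes)) {
                STORE.remove(link);                 // Gaps changed: rewrite the row.
                long newLink = LINKS.getAndIncrement();
                STORE.put(newLink, updCntrsBytes);
                return newLink;
            }
            return link;                            // Unchanged.
        }

        public static void main(String[] args) {
            long link = reconcile(0, new byte[] {1, 2});     // insert
            link = reconcile(link, new byte[] {1, 2, 3});    // rewrite
            link = reconcile(link, null);                    // remove
            System.out.println("final link=" + link + ", store=" + STORE);
        }
    }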
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState in project ignite by apache.
The class GridCacheOffheapManager, method updateState.
/**
* @param part Partition to restore state for.
* @param stateId Persisted state ordinal, or {@code -1} if no state was stored.
*/
private void updateState(GridDhtLocalPartition part, int stateId) {
if (stateId != -1) {
GridDhtPartitionState state = GridDhtPartitionState.fromOrdinal(stateId);
assert state != null;
part.restoreState(state == EVICTED ? RENTING : state);
}
}
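In other words, a partition persisted as EVICTED is restored as RENTING so that eviction can be re-driven after recovery, while every other persisted state is restored as-is. A tiny illustrative sketch, assuming a local copy of the state enum rather than the real GridDhtPartitionState:

    public class RestoreStateSketch {
        /** Local stand-in for the example; the real ordinals are defined by GridDhtPartitionState. */
        enum PartState { MOVING, OWNING, RENTING, EVICTED, LOST }

        /** Maps a persisted state ordinal to the state a partition is restored with. */
        static PartState restoredState(int stateId) {
            if (stateId == -1)
                return null; // Nothing persisted; the caller keeps the default state.

            PartState persisted = PartState.values()[stateId];
            return persisted == PartState.EVICTED ? PartState.RENTING : persisted;
        }

        public static void main(String[] args) {
            System.out.println(restoredState(PartState.EVICTED.ordinal())); // RENTING
            System.out.println(restoredState(PartState.OWNING.ordinal()));  // OWNING
        }
    }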