use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition in project ignite by apache.
the class IgniteTxAdapter method applyTxSizes.
* Makes the cache size changes accumulated during the transaction visible outside of the transaction.
// Applies the size deltas recorded in this transaction's TxCounters to the data stores of the
// corresponding local partitions, one (cacheId, partition) pair at a time.
protected void applyTxSizes() {
TxCounters txCntrs = txCounters(false);
// NOTE(review): the statement guarded by this null check (presumably an early return) is not
// visible in this excerpt - confirm against the full source.
if (txCntrs == null)
Map<Integer, ? extends Map<Integer, AtomicLong>> sizeDeltas = txCntrs.sizeDeltas();
// Outer map key is used as the cache id; the inner map key is used as the partition id and its
// value holds the accumulated delta for that partition.
for (Map.Entry<Integer, ? extends Map<Integer, AtomicLong>> entry : sizeDeltas.entrySet()) {
Integer cacheId = entry.getKey();
Map<Integer, AtomicLong> deltas = entry.getValue();
assert !F.isEmpty(deltas);
GridDhtPartitionTopology top = cctx.cacheContext(cacheId).topology();
// Need to reserve on backups only
// (reservation is requested only when both dht() and remote() are true).
boolean reserve = dht() && remote();
for (Map.Entry<Integer, AtomicLong> e : deltas.entrySet()) {
// Set when the delta could not be applied to the partition; reported via debug log below.
boolean invalid = false;
int p = e.getKey();
long delta = e.getValue().get();
try {
GridDhtLocalPartition part = top.localPartition(p);
// '&&' binds tighter than '||': when no reservation is needed the branch is always taken,
// otherwise it is taken only if the partition exists and part.reserve() succeeds.
if (!reserve || part != null && part.reserve()) {
assert part != null;
try {
// The size is updated only when the partition is not in the RENTING state.
// NOTE(review): an 'else' in front of 'invalid = true' appears to have been dropped from
// this excerpt - confirm against the full source.
if (part.state() != GridDhtPartitionState.RENTING)
part.dataStore().updateSize(cacheId, delta);
invalid = true;
} finally {
// NOTE(review): the statement guarded by 'if (reserve)' (presumably releasing the
// reservation taken above) is not visible in this excerpt.
if (reserve)
} else
invalid = true;
} catch (GridDhtInvalidPartitionException e1) {
invalid = true;
// An invalid partition is only expected when reservation was required (see the assertion).
if (invalid) {
assert reserve;
if (log.isDebugEnabled())
log.debug("Trying to apply size delta for invalid partition: " + "[cacheId=" + cacheId + ", part=" + p + "]");
the class IgniteTxHandler method applyPartitionsUpdatesCounters.
* Applies partition counter updates for transactions.
* <p>
* Called after entries are written to WAL on commit or during rollback to close gaps in update counter sequence.
* <p>
* On rollback, counters should be applied on the primary only after the backup nodes; otherwise, if the primary fails
* before sending rollback requests to the backups, remote transactions can be committed by the recovery protocol and
* partition consistency will not be restored when the primary returns to the grid, because a RollbackRecord was written
* (relevant for persistent mode only).
* @param counters Counter values to be updated.
* @param rollback {@code True} if applied during rollbacks.
* @param rollbackOnPrimary {@code True} if rollback happens on primary node. Passed to CQ engine.
// Walks every partition listed in each counters message and applies the (initial counter, updates count)
// range to the matching local partition. On rollback, a RollbackRecord is written to the WAL for each
// applied range when the group's persistence/WAL/MVCC flags allow it, and the WAL is flushed at the end.
public void applyPartitionsUpdatesCounters(Iterable<PartitionUpdateCountersMessage> counters, boolean rollback, boolean rollbackOnPrimary) throws IgniteCheckedException {
// NOTE(review): the statement guarded by this null check (presumably an early return) is not
// visible in this excerpt.
if (counters == null)
// Pointer returned by the most recent WAL log call; stays null if nothing was logged.
WALPointer ptr = null;
try {
for (PartitionUpdateCountersMessage counter : counters) {
GridCacheContext ctx0 = ctx.cacheContext(counter.cacheId());
GridDhtPartitionTopology top = ctx0.topology();
AffinityTopologyVersion topVer = top.readyTopologyVersion();
// NOTE(review): this assertion runs after 'top' has already been dereferenced on the previous
// line, so it cannot catch a null topology before the dereference.
assert top != null;
for (int i = 0; i < counter.size(); i++) {
// Set when the partition is missing, cannot be reserved, or is reported invalid by the topology.
boolean invalid = false;
try {
GridDhtLocalPartition part = top.localPartition(counter.partition(i));
if (part != null && part.reserve()) {
try {
if (part.state() != RENTING) {
// Check is actual only for backup node.
long start = counter.initialCounter(i);
long delta = counter.updatesCount(i);
boolean updated = part.updateCounter(start, delta);
// Need to log rolled back range for logical recovery.
if (updated && rollback) {
// NOTE(review): the initializer of grpCtx is truncated in this excerpt.
CacheGroupContext grpCtx =;
// A rollback record is logged only when persistence and WAL are enabled and MVCC is disabled.
if (grpCtx.persistenceEnabled() && grpCtx.walEnabled() && !grpCtx.mvccEnabled()) {
// NOTE(review): the second constructor argument is missing in this excerpt.
RollbackRecord rec = new RollbackRecord(grpCtx.groupId(),, start, delta);
ptr = ctx.wal().log(rec);
// Notifies the continuous query manager once per counter value in (start, start + delta].
// NOTE(review): the second argument of skipUpdateCounter is missing in this excerpt.
for (int cntr = 1; cntr <= delta; cntr++) {
ctx0.continuousQueries().skipUpdateCounter(null,, start + cntr, topVer, rollbackOnPrimary);
} else
invalid = true;
// NOTE(review): the body of this finally block (presumably releasing the reservation) is not
// visible in this excerpt.
} finally {
} else
invalid = true;
} catch (GridDhtInvalidPartitionException e) {
invalid = true;
// Invalid partitions are ignored; they are only reported at debug level.
if (log.isDebugEnabled() && invalid) {
log.debug("Received partition update counters message for invalid partition, ignoring: " + "[cacheId=" + counter.cacheId() + ", part=" + counter.partition(i) + ']');
} finally {
// Flush the WAL up to the last logged pointer, if any record was logged.
if (ptr != null)
ctx.wal().flush(ptr, false);
the class GridCommandHandlerIndexingUtils method breakSqlIndex.
* Deleting records from the index bypassing cache.
* @param internalCache Cache.
* @param partId Partition number.
* @param filter Row filter.
* @throws Exception If failed.
// Iterates over cache data rows and removes each row that passes the optional filter through the
// query processor, using the cache context obtained from the given internal cache.
static <K, V> void breakSqlIndex(IgniteInternalCache<K, V> internalCache, int partId, @Nullable Predicate<CacheDataRow> filter) throws Exception {
GridCacheContext<K, V> cacheCtx = internalCache.context();
GridDhtLocalPartition locPart = cacheCtx.topology().localPartitions().get(partId);
// NOTE(review): the initializer of the row iterator is truncated in this excerpt; presumably it
// is derived from locPart - confirm against the full source.
GridIterator<CacheDataRow> cacheDataGridIter =;
GridQueryProcessor qryProcessor = internalCache.context().kernalContext().query();
while (cacheDataGridIter.hasNextX()) {
CacheDataRow cacheDataRow = cacheDataGridIter.nextX();
// A null filter accepts every row. NOTE(review): the statement executed for rejected rows
// (presumably 'continue') is not visible in this excerpt.
if (nonNull(filter) && !filter.test(cacheDataRow))
try {
qryProcessor.remove(cacheCtx, cacheDataRow);
// NOTE(review): the body of this finally block is not visible in this excerpt.
} finally {
the class GridDhtPartitionDemander method handleSupplyMessage.
* Handles supply message from {@code nodeId} with specified {@code topicId}.
* Supply message contains entries to populate rebalancing partitions.
* There is a cyclic process:
* Populate rebalancing partitions with entries from Supply message.
* If not all partitions specified in {@link #rebalanceFut} were rebalanced or marked as missed
* send new Demand message to request next batch of entries.
* @param nodeId Node id.
* @param supplyMsg Supply message.
// Processes one supply message from a supplier node: validates that the message is still relevant,
// reports a supplier-side error if one is attached, preloads the received entries into partitions
// that are in MOVING state, records missed partitions, and sends the next demand message while the
// rebalance future is not done.
public void handleSupplyMessage(final UUID nodeId, final GridDhtPartitionSupplyMessage supplyMsg) {
AffinityTopologyVersion topVer = supplyMsg.topologyVersion();
// Local snapshot of the rebalanceFut field.
RebalanceFuture fut = rebalanceFut;
ClusterNode node = ctx.node(nodeId);
try {
// Stays null when the message should be processed; otherwise holds the reason it is ignored.
String errMsg = null;
if (fut.isDone())
errMsg = "rebalance completed";
else if (node == null)
errMsg = "supplier has left cluster";
// NOTE(review): this check reads the rebalanceFut field rather than the local 'fut' snapshot
// taken above - confirm that this is intended.
else if (!rebalanceFut.isActual(supplyMsg.rebalanceId()))
errMsg = "topology changed";
// NOTE(review): the exit from the method after logging an ignored message is not visible in
// this excerpt.
if (errMsg != null) {
if (log.isDebugEnabled()) {
log.debug("Supply message has been ignored (" + errMsg + ") [" + demandRoutineInfo(nodeId, supplyMsg) + ']');
if (log.isDebugEnabled())
log.debug("Received supply message [" + demandRoutineInfo(nodeId, supplyMsg) + ']');
// Check whether there was an error during the supplying process.
Throwable msgExc = null;
final GridDhtPartitionTopology top = grp.topology();
// classError() takes precedence over error() when both are set.
if (supplyMsg.classError() != null)
msgExc = supplyMsg.classError();
else if (supplyMsg.error() != null)
msgExc = supplyMsg.error();
if (msgExc != null) {
GridDhtPartitionMap partMap = top.localPartitionMap();
// Partitions from this message whose state in the local partition map is MOVING.
Set<Integer> unstableParts = supplyMsg.infos().keySet().stream().filter(p -> partMap.get(p) == MOVING).collect(Collectors.toSet());
U.error(log, "Rebalancing routine has failed, some partitions could be unavailable for reading" + " [" + demandRoutineInfo(nodeId, supplyMsg) + ", unavailablePartitions=" + S.compact(unstableParts) + ']', msgExc);
// Statistics branch: shared groups are handled per cache, otherwise the single cache context is used.
if (grp.sharedGroup()) {
for (GridCacheContext cctx : grp.caches()) {
if (cctx.statisticsEnabled()) {
long keysCnt = supplyMsg.keysForCache(cctx.cacheId());
// NOTE(review): the statement guarded by this check is not visible in this excerpt.
if (keysCnt != -1)
// Can not be calculated per cache.
} else {
GridCacheContext cctx = grp.singleCacheContext();
if (cctx.statisticsEnabled()) {
// NOTE(review): the statement guarded by this check is not visible in this excerpt.
if (supplyMsg.estimatedKeysCount() != -1)
try {
AffinityAssignment aff = grp.affinity().cachedAffinity(topVer);
// Preload.
for (Map.Entry<Integer, CacheEntryInfoCollection> e : supplyMsg.infos().entrySet()) {
int p = e.getKey();
// Entries are preloaded only if the local node is in the affinity assignment for p at topVer.
if (aff.get(p).contains(ctx.localNode())) {
GridDhtLocalPartition part;
try {
part = top.localPartition(p, topVer, true);
} catch (GridDhtInvalidPartitionException err) {
assert !topVer.equals(top.lastTopologyChangeVersion());
if (log.isDebugEnabled()) {
log.debug("Failed to get partition for rebalancing [" + "grp=" + grp.cacheOrGroupName() + ", err=" + err + ", p=" + p + ", topVer=" + topVer + ", lastTopVer=" + top.lastTopologyChangeVersion() + ']');
assert part != null;
// True when the supply message's last() map contains this partition.
boolean last = supplyMsg.last().containsKey(p);
if (part.state() == MOVING) {
boolean reserved = part.reserve();
assert reserved : "Failed to reserve partition [igniteInstanceName=" + ctx.igniteInstanceName() + ", grp=" + grp.cacheOrGroupName() + ", part=" + part + ']';
try {
// Single-element array so the anonymous iterator wrapper below can accumulate into it.
long[] byteRcv = { 0 };
GridIterableAdapter<GridCacheEntryInfo> infosWrap = new GridIterableAdapter<>(new IteratorWrapper<GridCacheEntryInfo>(e.getValue().infos().iterator()) {
* {@inheritDoc}
public GridCacheEntryInfo nextX() throws IgniteCheckedException {
GridCacheEntryInfo i = super.nextX();
// Adds the marshalled size of every entry handed out by the iterator.
byteRcv[0] += i.marshalledSize(ctx.cacheObjectContext(i.cacheId()));
return i;
try {
if (grp.mvccEnabled())
mvccPreloadEntries(topVer, node, p, infosWrap);
else {
preloadEntries(topVer, part, infosWrap);
rebalanceFut.onReceivedKeys(p, e.getValue().infos().size(), node);
} catch (GridDhtInvalidPartitionException ignored) {
if (log.isDebugEnabled())
log.debug("Partition became invalid during rebalancing (will ignore): " + p);
fut.onReceivedBytes(p, byteRcv[0], node);
// If message was last for this partition, then we take ownership.
if (last)
ownPartition(fut, p, nodeId, supplyMsg);
// NOTE(review): the body of this finally block (presumably releasing the reservation taken
// above) is not visible in this excerpt.
} finally {
} else {
if (last)
fut.partitionDone(nodeId, p, false);
if (log.isDebugEnabled())
log.debug("Skipping rebalancing partition (state is not MOVING): " + '[' + demandRoutineInfo(nodeId, supplyMsg) + ", p=" + p + ']');
} else {
fut.partitionDone(nodeId, p, false);
if (log.isDebugEnabled())
log.debug("Skipping rebalancing partition (affinity changed): " + '[' + demandRoutineInfo(nodeId, supplyMsg) + ", p=" + p + ']');
// Only request partitions based on latest topology version.
for (Integer miss : supplyMsg.missed()) {
if (aff.get(miss).contains(ctx.localNode()))
fut.partitionMissed(nodeId, miss);
// Every missed partition is then marked as done on the future.
for (Integer miss : supplyMsg.missed()) fut.partitionDone(nodeId, miss, false);
GridDhtPartitionDemandMessage d = new GridDhtPartitionDemandMessage(supplyMsg.rebalanceId(), supplyMsg.topologyVersion(), grp.groupId());
if (!fut.isDone()) {
// Send demand message.
// NOTE(review): the call that sends the message is truncated in this excerpt; only its
// trailing arguments are visible.
try {, d.topic(), d.convertIfNeeded(node.version()), grp.ioPolicy(), grp.preloader().timeout());
if (log.isDebugEnabled())
log.debug("Send next demand message [" + demandRoutineInfo(nodeId, supplyMsg) + "]");
} catch (ClusterTopologyCheckedException e) {
if (log.isDebugEnabled())
log.debug("Supplier has left [" + demandRoutineInfo(nodeId, supplyMsg) + ", errMsg=" + e.getMessage() + ']');
} else {
if (log.isDebugEnabled())
log.debug("Will not request next demand message [" + demandRoutineInfo(nodeId, supplyMsg) + ", rebalanceFuture=" + fut + ']');
} catch (IgniteSpiException | IgniteCheckedException e) {
LT.error(log, e, "Error during rebalancing [" + demandRoutineInfo(nodeId, supplyMsg) + ", err=" + e + ']');
// NOTE(review): the body of this finally block is not visible in this excerpt.
} finally {
the class GridDhtPartitionSupplier method handleDemandMessage.
* For each demand message, the method looks up (or creates) a supply context and starts iterating over entries across the requested partitions.
* Each entry returned by the iterator is placed into the prepared supply message.
* If the supply message size in bytes becomes greater than {@link IgniteConfiguration#getRebalanceBatchSize()},
* the method sends this message to the demand node and saves the partial state of the iterated entries to the supply context,
* then restores the context again after a new demand message with the same context id has arrived.
* @param topicId Id of the topic is used for the supply-demand communication.
* @param nodeId Id of the node which sent the demand message.
* @param demandMsg Demand message.
public void handleDemandMessage(int topicId, UUID nodeId, GridDhtPartitionDemandMessage demandMsg) {
assert demandMsg != null;
assert nodeId != null;
T3<UUID, Integer, AffinityTopologyVersion> contextId = new T3<>(nodeId, topicId, demandMsg.topologyVersion());
if (demandMsg.rebalanceId() < 0) {
// Demand node requested context cleanup.
synchronized (scMap) {
SupplyContext sctx = scMap.get(contextId);
if (sctx != null && sctx.rebalanceId == -demandMsg.rebalanceId()) {
clearContext(scMap.remove(contextId), log);
if (log.isDebugEnabled())
log.debug("Supply context cleaned [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", supplyContext=" + sctx + "]");
} else {
if (log.isDebugEnabled())
log.debug("Stale supply context cleanup message [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", supplyContext=" + sctx + "]");
ClusterNode demanderNode = grp.shared().discovery().node(nodeId);
if (demanderNode == null) {
if (log.isDebugEnabled())
log.debug("Demand message rejected (demander left cluster) [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
IgniteRebalanceIterator iter = null;
SupplyContext sctx = null;
Set<Integer> remainingParts = null;
GridDhtPartitionSupplyMessage supplyMsg = new GridDhtPartitionSupplyMessage(demandMsg.rebalanceId(), grp.groupId(), demandMsg.topologyVersion(), grp.deploymentEnabled());
try {
synchronized (scMap) {
sctx = scMap.remove(contextId);
if (sctx != null && demandMsg.rebalanceId() < sctx.rebalanceId) {
// Stale message, return context back and return.
scMap.put(contextId, sctx);
if (log.isDebugEnabled())
log.debug("Stale demand message [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", actualContext=" + sctx + "]");
// Demand request should not contain empty partitions if no supply context is associated with it.
if (sctx == null && (demandMsg.partitions() == null || demandMsg.partitions().isEmpty())) {
if (log.isDebugEnabled())
log.debug("Empty demand message (no context and partitions) [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
if (log.isDebugEnabled())
log.debug("Demand message accepted [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
assert !(sctx != null && !demandMsg.partitions().isEmpty());
long maxBatchesCnt = /* Each thread should gain prefetched batches. */
grp.preloader().batchesPrefetchCount() * grp.shared().gridConfig().getRebalanceThreadPoolSize();
if (sctx == null) {
if (log.isDebugEnabled())
log.debug("Starting supplying rebalancing [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", fullPartitions=" + S.compact(demandMsg.partitions().fullSet()) + ", histPartitions=" + S.compact(demandMsg.partitions().historicalSet()) + "]");
} else
maxBatchesCnt = 1;
if (sctx == null || sctx.iterator == null) {
remainingParts = new HashSet<>(demandMsg.partitions().fullSet());
CachePartitionPartialCountersMap histMap = demandMsg.partitions().historicalMap();
for (int i = 0; i < histMap.size(); i++) {
int p = histMap.partitionAt(i);
iter = grp.offheap().rebalanceIterator(demandMsg.partitions(), demandMsg.topologyVersion());
for (Integer part : demandMsg.partitions().fullSet()) {
if (iter.isPartitionMissing(part))
GridDhtLocalPartition loc = top.localPartition(part, demandMsg.topologyVersion(), false);
assert loc != null && loc.state() == GridDhtPartitionState.OWNING : "Partition should be in OWNING state: " + loc;
for (int i = 0; i < histMap.size(); i++) {
int p = histMap.partitionAt(i);
if (iter.isPartitionMissing(p))
supplyMsg.addEstimatedKeysCount(histMap.updateCounterAt(i) - histMap.initialUpdateCounterAt(i));
} else {
iter = sctx.iterator;
remainingParts = sctx.remainingParts;
final int msgMaxSize = grp.preloader().batchSize();
long batchesCnt = 0;
CacheDataRow prevRow = null;
while (iter.hasNext()) {
CacheDataRow row = iter.peek();
// Prevent mvcc entry history splitting into separate batches.
boolean canFlushHistory = !grp.mvccEnabled() || prevRow != null && ((grp.sharedGroup() && row.cacheId() != prevRow.cacheId()) || !row.key().equals(prevRow.key()));
if (canFlushHistory && supplyMsg.messageSize() >= msgMaxSize) {
if (++batchesCnt >= maxBatchesCnt) {
saveSupplyContext(contextId, iter, remainingParts, demandMsg.rebalanceId());
reply(topicId, demanderNode, demandMsg, supplyMsg, contextId);
} else {
if (!reply(topicId, demanderNode, demandMsg, supplyMsg, contextId))
supplyMsg = new GridDhtPartitionSupplyMessage(demandMsg.rebalanceId(), grp.groupId(), demandMsg.topologyVersion(), grp.deploymentEnabled());
row =;
prevRow = row;
int part = row.partition();
GridDhtLocalPartition loc = top.localPartition(part, demandMsg.topologyVersion(), false);
assert (loc != null && loc.state() == OWNING && loc.reservations() > 0) || iter.isPartitionMissing(part) : "Partition should be in OWNING state and has at least 1 reservation " + loc;
if (iter.isPartitionMissing(part) && remainingParts.contains(part)) {
if (grp.eventRecordable(EVT_CACHE_REBALANCE_PART_MISSED))
if (log.isDebugEnabled())
log.debug("Requested partition is marked as missing [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", p=" + part + "]");
if (!remainingParts.contains(part))
GridCacheEntryInfo info = extractEntryInfo(row);
if (info == null)
supplyMsg.addEntry0(part, iter.historical(part), info, grp.shared(), grp.cacheObjectContext());
if (iter.isPartitionDone(part)) {
supplyMsg.last(part, loc.updateCounter());
Iterator<Integer> remainingIter = remainingParts.iterator();
while (remainingIter.hasNext()) {
int p =;
if (iter.isPartitionDone(p)) {
GridDhtLocalPartition loc = top.localPartition(p, demandMsg.topologyVersion(), false);
assert loc != null : "Supply partition is gone: grp=" + grp.cacheOrGroupName() + ", p=" + p;
supplyMsg.last(p, loc.updateCounter());
} else if (iter.isPartitionMissing(p)) {
if (grp.eventRecordable(EVT_CACHE_REBALANCE_PART_MISSED))
assert remainingParts.isEmpty() : "Partitions after rebalance should be either done or missing: " + remainingParts;
if (sctx != null)
clearContext(sctx, log);
reply(topicId, demanderNode, demandMsg, supplyMsg, contextId);
if (log.isInfoEnabled())"Finished supplying rebalancing [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
} catch (Throwable t) {
if (iter != null && !iter.isClosed()) {
try {
} catch (IgniteCheckedException e) {
if (grp.shared().kernalContext().isStopping())
// Sending supply messages with error requires new protocol.
boolean sendErrMsg = demanderNode.version().compareTo(GridDhtPartitionSupplyMessageV2.AVAILABLE_SINCE) >= 0;
if (t instanceof IgniteSpiException) {
if (log.isDebugEnabled())
log.debug("Failed to send message to node (current node is stopping?) [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", msg=" + t.getMessage() + ']');
sendErrMsg = false;
} else
U.error(log, "Failed to continue supplying [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t);
try {
if (sctx != null)
clearContext(sctx, log);
} catch (Throwable t1) {
U.error(log, "Failed to cleanup supplying context [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t1);
if (!sendErrMsg)
boolean fallbackToFullRebalance = X.hasCause(t, IgniteHistoricalIteratorException.class);
try {
GridDhtPartitionSupplyMessage errMsg;
if (fallbackToFullRebalance) {
// Mark the last checkpoint as not applicable for WAL rebalance.
// Mark all remaining partitions as missed to trigger full rebalance.
if (iter == null && F.isEmpty(remainingParts)) {
remainingParts = new HashSet<>(demandMsg.partitions().fullSet());
for (int p : Optional.ofNullable(remainingParts).orElseGet(Collections::emptySet)) supplyMsg.missed(p);
errMsg = supplyMsg;
} else {
errMsg = new GridDhtPartitionSupplyMessageV2(demandMsg.rebalanceId(), grp.groupId(), demandMsg.topologyVersion(), grp.deploymentEnabled(), t);
reply(topicId, demanderNode, demandMsg, errMsg, contextId);
} catch (Throwable t1) {
U.error(log, "Failed to send supply error message [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t1);
// instead of triggering failure handler.
if (!fallbackToFullRebalance) {
grp.shared().kernalContext().failure().process(new FailureContext(FailureType.CRITICAL_ERROR, new IgniteCheckedException("Failed to continue supplying [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t)));