Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.MOVING in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method resetOwners.
/**
* {@inheritDoc}
*/
@Override
public Map<UUID, Set<Integer>> resetOwners(Map<Integer, Set<UUID>> ownersByUpdCounters, Set<Integer> haveHist, GridDhtPartitionsExchangeFuture exchFut) {
Map<UUID, Set<Integer>> res = new HashMap<>();
Collection<DiscoveryEvent> evts = exchFut.events().events();
Set<UUID> joinedNodes = U.newHashSet(evts.size());
for (DiscoveryEvent evt : evts) {
if (evt.type() == EVT_NODE_JOINED)
joinedNodes.add(evt.eventNode().id());
}
ctx.database().checkpointReadLock();
try {
Map<UUID, Set<Integer>> addToWaitGroups = new HashMap<>();
lock.writeLock().lock();
try {
// First process local partitions.
UUID locNodeId = ctx.localNodeId();
for (Map.Entry<Integer, Set<UUID>> entry : ownersByUpdCounters.entrySet()) {
int part = entry.getKey();
Set<UUID> maxCounterPartOwners = entry.getValue();
GridDhtLocalPartition locPart = localPartition(part);
if (locPart == null || locPart.state() != OWNING)
continue;
// Partition state should be mutated only on joining nodes if they exist for the exchange.
if (joinedNodes.isEmpty() && !maxCounterPartOwners.contains(locNodeId)) {
rebalancePartition(part, !haveHist.contains(part), exchFut);
res.computeIfAbsent(locNodeId, n -> new HashSet<>()).add(part);
}
}
// Then process node maps.
for (Map.Entry<Integer, Set<UUID>> entry : ownersByUpdCounters.entrySet()) {
int part = entry.getKey();
Set<UUID> maxCounterPartOwners = entry.getValue();
for (Map.Entry<UUID, GridDhtPartitionMap> remotes : node2part.entrySet()) {
UUID remoteNodeId = remotes.getKey();
if (!joinedNodes.isEmpty() && !joinedNodes.contains(remoteNodeId))
continue;
GridDhtPartitionMap partMap = remotes.getValue();
GridDhtPartitionState state = partMap.get(part);
if (state != OWNING)
continue;
if (!maxCounterPartOwners.contains(remoteNodeId)) {
partMap.put(part, MOVING);
partMap.updateSequence(partMap.updateSequence() + 1, partMap.topologyVersion());
if (partMap.nodeId().equals(locNodeId))
updateSeq.setIfGreater(partMap.updateSequence());
res.computeIfAbsent(remoteNodeId, n -> new HashSet<>()).add(part);
}
}
}
for (Map.Entry<UUID, Set<Integer>> entry : res.entrySet()) {
UUID nodeId = entry.getKey();
Set<Integer> rebalancedParts = entry.getValue();
addToWaitGroups.put(nodeId, new HashSet<>(rebalancedParts));
if (!rebalancedParts.isEmpty()) {
Set<Integer> historical = rebalancedParts.stream().filter(haveHist::contains).collect(Collectors.toSet());
// Filter out partitions having WAL history.
rebalancedParts.removeAll(historical);
U.warn(log, "Partitions have been scheduled for rebalancing due to outdated update counter " + "[grp=" + grp.cacheOrGroupName() + ", readyTopVer=" + readyTopVer + ", topVer=" + exchFut.initialVersion() + ", nodeId=" + nodeId + ", partsFull=" + S.compact(rebalancedParts) + ", partsHistorical=" + S.compact(historical) + "]");
}
}
node2part = new GridDhtPartitionFullMap(node2part, updateSeq.incrementAndGet());
} finally {
lock.writeLock().unlock();
}
List<List<ClusterNode>> ideal = ctx.affinity().affinity(groupId()).idealAssignmentRaw();
for (Map.Entry<UUID, Set<Integer>> entry : addToWaitGroups.entrySet()) {
// Add to wait groups to ensure late assignment switch after all partitions are rebalanced.
for (Integer part : entry.getValue()) {
ctx.cache().context().affinity().addToWaitGroup(groupId(), part, topologyVersionFuture().initialVersion(), ideal.get(part));
}
}
} finally {
ctx.database().checkpointReadUnlock();
}
return res;
}
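The rule implemented above is: a node that currently reports a partition as OWNING, but is not among the owners holding the maximal update counter for that partition, is demoted back to MOVING and scheduled for rebalancing. A minimal, self-contained sketch of that demotion rule follows; the standalone helper and its Map-based inputs are illustrative assumptions, not Ignite API.

import java.util.*;

/** Sketch only (not Ignite API): demote owners that do not hold the maximal update counter. */
public class ResetOwnersSketch {
    /**
     * @param ownersByUpdCounters Partition -> node ids holding the maximal update counter.
     * @param currentOwners Partition -> node ids that currently report the partition as OWNING.
     * @return Node id -> partitions that must be switched back to MOVING and rebalanced.
     */
    static Map<UUID, Set<Integer>> demoteOutdatedOwners(Map<Integer, Set<UUID>> ownersByUpdCounters,
        Map<Integer, Set<UUID>> currentOwners) {
        Map<UUID, Set<Integer>> res = new HashMap<>();

        for (Map.Entry<Integer, Set<UUID>> e : ownersByUpdCounters.entrySet()) {
            int part = e.getKey();
            Set<UUID> maxCounterOwners = e.getValue();

            for (UUID owner : currentOwners.getOrDefault(part, Collections.emptySet())) {
                // An owner that lags behind the maximal counter cannot stay an owner: mark it MOVING.
                if (!maxCounterOwners.contains(owner))
                    res.computeIfAbsent(owner, n -> new HashSet<>()).add(part);
            }
        }

        return res;
    }
}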
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.MOVING in project ignite by apache.
The class GridDhtPartitionDemander, method addAssignments.
/**
* This method initiates a new rebalance process from the given {@code assignments} by creating a new rebalance
* future based on them. It cancels the previous rebalance future and sends a rebalance-started event.
* In case of a delayed rebalance, the method schedules a new one with the configured delay based on {@code lastExchangeFut}.
*
* @param assignments Assignments to process.
* @param force {@code True} if preload was requested by {@link ForceRebalanceExchangeTask}.
* @param rebalanceId Rebalance id generated from exchange thread.
* @param next The next rebalance routine in the chain.
* @param forcedRebFut External future for forced rebalance.
* @param compatibleRebFut Future for waiting for compatible rebalances.
*
* @return Rebalancing future or {@code null} to exclude an assignment from the chain.
*/
@Nullable
RebalanceFuture addAssignments(final GridDhtPreloaderAssignments assignments, boolean force, long rebalanceId, final RebalanceFuture next, @Nullable final GridCompoundFuture<Boolean, Boolean> forcedRebFut, GridCompoundFuture<Boolean, Boolean> compatibleRebFut) {
if (log.isDebugEnabled())
log.debug("Adding partition assignments: " + assignments);
assert force == (forcedRebFut != null);
long delay = grp.config().getRebalanceDelay();
if (delay == 0 || force) {
assert assignments != null;
final RebalanceFuture oldFut = rebalanceFut;
if (assignments.cancelled()) {
// Pending exchange.
if (log.isDebugEnabled())
log.debug("Rebalancing skipped due to cancelled assignments.");
return null;
}
if (assignments.isEmpty()) {
// Nothing to rebalance.
if (log.isDebugEnabled())
log.debug("Rebalancing skipped due to empty assignments.");
if (oldFut.isInitial())
oldFut.onDone(true);
else if (!oldFut.isDone())
oldFut.tryCancel();
((GridFutureAdapter) grp.preloader().syncFuture()).onDone();
return null;
}
// Check if ongoing rebalancing is compatible with a new assignment.
if (!force && (!oldFut.isDone() || oldFut.result()) && oldFut.compatibleWith(assignments)) {
if (!oldFut.isDone())
compatibleRebFut.add(oldFut);
return null;
}
// Cancel ongoing rebalancing.
if (!oldFut.isDone() && !oldFut.isInitial())
oldFut.tryCancel();
// Partition states cannot be changed from now on by previous incompatible rebalancing.
// Retain only moving partitions. Assignment can become empty as a result.
// Delayed partition owning happens in the exchange worker as well, so no race with delayed owning here.
assignments.retainMoving(grp.topology());
// Skip rebalanced group.
if (assignments.isEmpty())
return null;
final RebalanceFuture fut = new RebalanceFuture(grp, lastExchangeFut, assignments, log, rebalanceId, next, lastCancelledTime);
if (oldFut.isInitial())
fut.listen(f -> oldFut.onDone(f.result()));
if (forcedRebFut != null)
forcedRebFut.add(fut);
rebalanceFut = fut;
for (final GridCacheContext cctx : grp.caches()) {
if (cctx.statisticsEnabled()) {
final CacheMetricsImpl metrics = cctx.cache().metrics0();
metrics.clearRebalanceCounters();
for (GridDhtPartitionDemandMessage msg : assignments.values()) {
for (Integer partId : msg.partitions().fullSet()) metrics.onRebalancingKeysCountEstimateReceived(grp.topology().globalPartSizes().get(partId));
CachePartitionPartialCountersMap histMap = msg.partitions().historicalMap();
for (int i = 0; i < histMap.size(); i++) {
long from = histMap.initialUpdateCounterAt(i);
long to = histMap.updateCounterAt(i);
metrics.onRebalancingKeysCountEstimateReceived(to - from);
}
}
metrics.startRebalance(0);
}
}
fut.sendRebalanceStartedEvent();
return fut;
} else if (delay > 0) {
for (GridCacheContext cctx : grp.caches()) {
if (cctx.statisticsEnabled()) {
final CacheMetricsImpl metrics = cctx.cache().metrics0();
metrics.startRebalance(delay);
}
}
GridTimeoutObject obj = lastTimeoutObj.get();
if (obj != null)
ctx.time().removeTimeoutObject(obj);
final GridDhtPartitionsExchangeFuture exchFut = lastExchangeFut;
assert exchFut != null : "Delaying rebalance process without topology event.";
obj = new GridTimeoutObjectAdapter(delay) {
@Override
public void onTimeout() {
exchFut.listen(new CI1<IgniteInternalFuture<AffinityTopologyVersion>>() {
@Override
public void apply(IgniteInternalFuture<AffinityTopologyVersion> f) {
ctx.exchange().forceRebalance(exchFut.exchangeId());
}
});
}
};
lastTimeoutObj.set(obj);
ctx.time().addTimeoutObject(obj);
}
return null;
}
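The branches above reduce to a small decision: an immediate rebalance is started (or an ongoing compatible one is reused) when there is no configured delay or the request is forced, otherwise the rebalance is re-triggered after the delay via a timeout object. A simplified sketch of that decision, with made-up flags standing in for the real futures and assignments, is shown below.

/** Sketch only (not Ignite API): simplified outcome of the addAssignments decision flow. */
final class AddAssignmentsFlow {
    enum Outcome { SKIP, REUSE_COMPATIBLE, START_NEW, SCHEDULE_DELAYED }

    static Outcome decide(boolean force, long delayMs, boolean cancelled, boolean empty,
        boolean oldFutDone, boolean oldFutCompatible) {
        if (delayMs > 0 && !force)
            return Outcome.SCHEDULE_DELAYED; // Re-trigger the rebalance after the configured delay.

        if (cancelled || empty)
            return Outcome.SKIP; // Pending exchange or nothing to rebalance.

        // An ongoing compatible rebalance keeps running; the caller only waits for it.
        if (!force && !oldFutDone && oldFutCompatible)
            return Outcome.REUSE_COMPATIBLE;

        return Outcome.START_NEW; // Cancel the old future and start a new routine.
    }
}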
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.MOVING in project ignite by apache.
The class GridDhtPartitionDemander, method handleSupplyMessage.
/**
* Handles a supply message from {@code nodeId} with the specified {@code topicId}.
*
* A supply message contains entries to populate the rebalancing partitions.
*
* There is a cyclic process:
* Populate the rebalancing partitions with entries from the supply message.
* If not all partitions specified in {@link #rebalanceFut} were rebalanced or marked as missed,
* send a new demand message to request the next batch of entries.
*
* @param nodeId Node id.
* @param supplyMsg Supply message.
*/
public void handleSupplyMessage(final UUID nodeId, final GridDhtPartitionSupplyMessage supplyMsg) {
AffinityTopologyVersion topVer = supplyMsg.topologyVersion();
RebalanceFuture fut = rebalanceFut;
ClusterNode node = ctx.node(nodeId);
fut.cancelLock.readLock().lock();
try {
String errMsg = null;
if (fut.isDone())
errMsg = "rebalance completed";
else if (node == null)
errMsg = "supplier has left cluster";
else if (!rebalanceFut.isActual(supplyMsg.rebalanceId()))
errMsg = "topology changed";
if (errMsg != null) {
if (log.isDebugEnabled()) {
log.debug("Supply message has been ignored (" + errMsg + ") [" + demandRoutineInfo(nodeId, supplyMsg) + ']');
}
return;
}
if (log.isDebugEnabled())
log.debug("Received supply message [" + demandRoutineInfo(nodeId, supplyMsg) + ']');
// Check whether there was an error during the supply process.
Throwable msgExc = null;
final GridDhtPartitionTopology top = grp.topology();
if (supplyMsg.classError() != null)
msgExc = supplyMsg.classError();
else if (supplyMsg.error() != null)
msgExc = supplyMsg.error();
if (msgExc != null) {
GridDhtPartitionMap partMap = top.localPartitionMap();
Set<Integer> unstableParts = supplyMsg.infos().keySet().stream().filter(p -> partMap.get(p) == MOVING).collect(Collectors.toSet());
U.error(log, "Rebalancing routine has failed, some partitions could be unavailable for reading" + " [" + demandRoutineInfo(nodeId, supplyMsg) + ", unavailablePartitions=" + S.compact(unstableParts) + ']', msgExc);
fut.error(nodeId);
return;
}
fut.receivedBytes.addAndGet(supplyMsg.messageSize());
if (grp.sharedGroup()) {
for (GridCacheContext cctx : grp.caches()) {
if (cctx.statisticsEnabled()) {
long keysCnt = supplyMsg.keysForCache(cctx.cacheId());
if (keysCnt != -1)
cctx.cache().metrics0().onRebalancingKeysCountEstimateReceived(keysCnt);
// Cannot be calculated per cache.
cctx.cache().metrics0().onRebalanceBatchReceived(supplyMsg.messageSize());
}
}
} else {
GridCacheContext cctx = grp.singleCacheContext();
if (cctx.statisticsEnabled()) {
if (supplyMsg.estimatedKeysCount() != -1)
cctx.cache().metrics0().onRebalancingKeysCountEstimateReceived(supplyMsg.estimatedKeysCount());
cctx.cache().metrics0().onRebalanceBatchReceived(supplyMsg.messageSize());
}
}
try {
AffinityAssignment aff = grp.affinity().cachedAffinity(topVer);
// Preload.
for (Map.Entry<Integer, CacheEntryInfoCollection> e : supplyMsg.infos().entrySet()) {
int p = e.getKey();
if (aff.get(p).contains(ctx.localNode())) {
GridDhtLocalPartition part;
try {
part = top.localPartition(p, topVer, true);
} catch (GridDhtInvalidPartitionException err) {
assert !topVer.equals(top.lastTopologyChangeVersion());
if (log.isDebugEnabled()) {
log.debug("Failed to get partition for rebalancing [" + "grp=" + grp.cacheOrGroupName() + ", err=" + err + ", p=" + p + ", topVer=" + topVer + ", lastTopVer=" + top.lastTopologyChangeVersion() + ']');
}
continue;
}
assert part != null;
boolean last = supplyMsg.last().containsKey(p);
if (part.state() == MOVING) {
boolean reserved = part.reserve();
assert reserved : "Failed to reserve partition [igniteInstanceName=" + ctx.igniteInstanceName() + ", grp=" + grp.cacheOrGroupName() + ", part=" + part + ']';
part.beforeApplyBatch(last);
try {
long[] byteRcv = { 0 };
GridIterableAdapter<GridCacheEntryInfo> infosWrap = new GridIterableAdapter<>(new IteratorWrapper<GridCacheEntryInfo>(e.getValue().infos().iterator()) {
/**
* {@inheritDoc}
*/
@Override
public GridCacheEntryInfo nextX() throws IgniteCheckedException {
GridCacheEntryInfo i = super.nextX();
byteRcv[0] += i.marshalledSize(ctx.cacheObjectContext(i.cacheId()));
return i;
}
});
try {
if (grp.mvccEnabled())
mvccPreloadEntries(topVer, node, p, infosWrap);
else {
preloadEntries(topVer, part, infosWrap);
rebalanceFut.onReceivedKeys(p, e.getValue().infos().size(), node);
}
} catch (GridDhtInvalidPartitionException ignored) {
if (log.isDebugEnabled())
log.debug("Partition became invalid during rebalancing (will ignore): " + p);
}
fut.processed.get(p).increment();
fut.onReceivedBytes(p, byteRcv[0], node);
// If the message was the last one for this partition, take ownership.
if (last)
ownPartition(fut, p, nodeId, supplyMsg);
} finally {
part.release();
}
} else {
if (last)
fut.partitionDone(nodeId, p, false);
if (log.isDebugEnabled())
log.debug("Skipping rebalancing partition (state is not MOVING): " + '[' + demandRoutineInfo(nodeId, supplyMsg) + ", p=" + p + ']');
}
} else {
fut.partitionDone(nodeId, p, false);
if (log.isDebugEnabled())
log.debug("Skipping rebalancing partition (affinity changed): " + '[' + demandRoutineInfo(nodeId, supplyMsg) + ", p=" + p + ']');
}
}
// Only request partitions based on latest topology version.
for (Integer miss : supplyMsg.missed()) {
if (aff.get(miss).contains(ctx.localNode()))
fut.partitionMissed(nodeId, miss);
}
for (Integer miss : supplyMsg.missed()) fut.partitionDone(nodeId, miss, false);
GridDhtPartitionDemandMessage d = new GridDhtPartitionDemandMessage(supplyMsg.rebalanceId(), supplyMsg.topologyVersion(), grp.groupId());
d.timeout(grp.preloader().timeout());
if (!fut.isDone()) {
// Send demand message.
try {
ctx.io().sendOrderedMessage(node, d.topic(), d.convertIfNeeded(node.version()), grp.ioPolicy(), grp.preloader().timeout());
if (log.isDebugEnabled())
log.debug("Send next demand message [" + demandRoutineInfo(nodeId, supplyMsg) + "]");
} catch (ClusterTopologyCheckedException e) {
if (log.isDebugEnabled())
log.debug("Supplier has left [" + demandRoutineInfo(nodeId, supplyMsg) + ", errMsg=" + e.getMessage() + ']');
}
} else {
if (log.isDebugEnabled())
log.debug("Will not request next demand message [" + demandRoutineInfo(nodeId, supplyMsg) + ", rebalanceFuture=" + fut + ']');
}
} catch (IgniteSpiException | IgniteCheckedException e) {
fut.error(nodeId);
LT.error(log, e, "Error during rebalancing [" + demandRoutineInfo(nodeId, supplyMsg) + ", err=" + e + ']');
}
} finally {
fut.cancelLock.readLock().unlock();
}
}
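Per partition, the handler only preloads entries when the local copy is still in MOVING state and the current affinity still assigns the partition to this node; the last batch for a partition makes it eligible for owning, and a new demand message is sent while the rebalance future is not done. A minimal sketch of that per-partition decision, using a stand-in state enum rather than Ignite's types, follows.

/** Sketch only (not Ignite API): per-partition handling of a supply batch. */
final class SupplyBatchSketch {
    enum PartState { MOVING, OWNING, RENTING, EVICTED, LOST }

    /** @return {@code true} if the partition should be owned after this batch. */
    static boolean applyBatch(PartState locPartState, boolean affinityAssignsLocally, boolean lastBatch) {
        if (!affinityAssignsLocally)
            return false; // Affinity changed: this node no longer needs the partition.

        if (locPartState != PartState.MOVING)
            return false; // Only MOVING partitions accept rebalanced entries.

        // ... reserve the partition and preload the batch entries here ...

        return lastBatch; // The last batch for a partition triggers owning it.
    }
}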
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.MOVING in project ignite by apache.
The class SchemaIndexCachePartitionWorker, method processPartition.
/**
* Process partition.
*
* @throws IgniteCheckedException If failed.
*/
private void processPartition() throws IgniteCheckedException {
if (stop())
return;
checkCancelled();
boolean reserved = false;
GridDhtPartitionState partState = locPart.state();
if (partState != EVICTED)
reserved = (partState == OWNING || partState == MOVING || partState == LOST) && locPart.reserve();
if (!reserved)
return;
try {
GridCursor<? extends CacheDataRow> cursor = locPart.dataStore().cursor(cctx.cacheId(), null, null, KEY_ONLY);
boolean locked = false;
try {
int cntr = 0;
while (!stop() && cursor.next()) {
KeyCacheObject key = cursor.get().key();
if (!locked) {
cctx.shared().database().checkpointReadLock();
locked = true;
}
processKey(key);
if (++cntr % batchSize == 0) {
cctx.shared().database().checkpointReadUnlock();
locked = false;
}
cctx.cache().metrics0().addIndexRebuildKeyProcessed(1);
if (locPart.state() == RENTING)
break;
}
wrappedClo.addNumberProcessedKeys(cntr);
} finally {
if (locked)
cctx.shared().database().checkpointReadUnlock();
}
} finally {
locPart.release();
if (partsCnt.getAndUpdate(v -> v > 0 ? v - 1 : 0) > 0)
cctx.group().metrics().decrementIndexBuildCountPartitionsLeft();
}
}
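The loop above illustrates a common pattern: hold the checkpoint read lock only while a batch of keys is processed and release it every batchSize keys so checkpoints can proceed. A generic sketch of that batching pattern, using a plain ReentrantReadWriteLock instead of Ignite's database manager, is shown below.

import java.util.Iterator;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.function.Consumer;

/** Sketch only (not Ignite API): process keys under a read lock that is released between batches. */
final class BatchedLockProcessing {
    static <T> int processInBatches(Iterator<T> keys, ReentrantReadWriteLock checkpointLock,
        int batchSize, Consumer<T> action) {
        int cnt = 0;
        boolean locked = false;

        try {
            while (keys.hasNext()) {
                if (!locked) {
                    checkpointLock.readLock().lock(); // Block checkpoints only while a batch is processed.
                    locked = true;
                }

                action.accept(keys.next());

                if (++cnt % batchSize == 0) {
                    checkpointLock.readLock().unlock(); // Let a pending checkpoint run between batches.
                    locked = false;
                }
            }
        }
        finally {
            if (locked)
                checkpointLock.readLock().unlock();
        }

        return cnt;
    }
}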
Use of org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtPartitionState.MOVING in project ignite by apache.
The class EvictPartitionInLogTest, method testEvictPartByMovingState.
/**
* Test checks the presence of evicted partitions (MOVING state) in the log without duplicate partitions.
*
* @throws Exception If failed.
*/
@Test
public void testEvictPartByMovingState() throws Exception {
backups = 1;
IgniteEx node = startGrid();
Map<Integer, Collection<Integer>> parseParts = new ConcurrentHashMap<>();
LogListener logLsnr = logListener("clearing", parseParts, DEFAULT_CACHE_NAMES);
testLog.registerListener(logLsnr);
List<GridDhtLocalPartition> parts = of(DEFAULT_CACHE_NAMES).map(node::cache).map(GridCommonAbstractTest::internalCache0).flatMap(internalCache -> internalCache.context().topology().localPartitions().stream()).peek(p -> p.setState(MOVING)).collect(toList());
parts.subList(0, parts.size() - 1).forEach(GridDhtLocalPartition::clearAsync);
doSleep(500);
parts.get(parts.size() - 1).clearAsync();
check(logLsnr, parts, parseParts);
}
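The assertion in this test boils down to collecting the partition ids reported by the "clearing" log messages and verifying that none of them appears twice. A small stand-alone sketch of such a check follows; it is not the test's own LogListener helper, and the way partition ids are extracted from a log line is an assumption.

import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/** Sketch only (hypothetical parser): fail if a partition id appears twice in clearing log lines. */
final class ClearingLogCheck {
    private static final Pattern NUM = Pattern.compile("\\d+");

    static Set<Integer> collectClearedParts(List<String> clearingLines) {
        Set<Integer> seen = new LinkedHashSet<>();

        for (String line : clearingLines) {
            Matcher m = NUM.matcher(line);

            while (m.find()) {
                int part = Integer.parseInt(m.group());

                if (!seen.add(part))
                    throw new AssertionError("Duplicate partition in clearing log: " + part);
            }
        }

        return seen;
    }
}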