Use of org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionExchangeId in project ignite by apache.
The class TxCrossCacheMapOnInvalidTopologyTest, method doTestCrossCacheTxMapOnInvalidTopology.
/**
* Test scenario: a cross-cache transaction is started while a node leaves in the middle of rebalance; the first
* cache is fully rebalanced and the second cache is only partially rebalanced.
*
* For pessimistic transactions, the map request for the first cache triggers a client-compatible remap, and the
* map request for the second cache should use the new topology version.
*
* For optimistic transactions, a remap is enforced if the transaction has more than one mapping or if all
* enlisted caches have compatible assignments.
*
* Success criterion: the transaction finishes on the ideal topology version on all mapped nodes.
*
* A minimal sketch of the supply-message blocking pattern used here is shown after the method body.
*
* @param concurrency Concurrency.
* @param isolation Isolation.
*/
private void doTestCrossCacheTxMapOnInvalidTopology(TransactionConcurrency concurrency, TransactionIsolation isolation) throws Exception {
try {
IgniteEx crd = startGrid(0);
IgniteEx g1 = startGrid(1);
awaitPartitionMapExchange();
IgniteEx client = startClientGrid("client");
assertNotNull(client.cache(CACHE1));
assertNotNull(client.cache(CACHE2));
try (IgniteDataStreamer<Object, Object> streamer = crd.dataStreamer(CACHE1)) {
// Put 500 keys per partition.
for (int k = 0; k < PARTS_CNT * 500; k++) streamer.addData(k, new byte[10]);
}
try (IgniteDataStreamer<Object, Object> streamer = crd.dataStreamer(CACHE2)) {
// Put 500 keys per partition.
for (int k = 0; k < PARTS_CNT * 500; k++) streamer.addData(k, new byte[10]);
}
TestRecordingCommunicationSpi crdSpi = TestRecordingCommunicationSpi.spi(crd);
final AffinityTopologyVersion joinVer = new AffinityTopologyVersion(4, 0);
AffinityTopologyVersion leftVer = new AffinityTopologyVersion(5, 0);
AffinityTopologyVersion idealVer = new AffinityTopologyVersion(5, 1);
AtomicReference<Set<Integer>> full = new AtomicReference<>();
GridConcurrentSkipListSet<Integer> leftVerParts = new GridConcurrentSkipListSet<>();
crdSpi.blockMessages((node, m) -> {
if (m instanceof GridDhtPartitionSupplyMessage) {
GridDhtPartitionSupplyMessage msg = (GridDhtPartitionSupplyMessage) m;
// Allow full rebalance for cache 1 and system cache.
if (msg.groupId() != CU.cacheId(CACHE2))
return false;
// Allow only first batch for cache 2.
if (msg.topologyVersion().equals(joinVer)) {
if (full.get() == null) {
Map<Integer, Long> last = U.field(msg, "last");
full.set(last.keySet());
return false;
}
return true;
}
if (msg.topologyVersion().equals(leftVer)) {
Map<Integer, Long> last = U.field(msg, "last");
leftVerParts.addAll(last.keySet());
return true;
}
} else if (m instanceof GridDhtPartitionsFullMessage) {
GridDhtPartitionsFullMessage msg = (GridDhtPartitionsFullMessage) m;
// Delay full message for ideal topology switch.
GridDhtPartitionExchangeId exchId = msg.exchangeId();
if (exchId != null && exchId.topologyVersion().equals(idealVer))
return true;
}
return false;
});
TestRecordingCommunicationSpi g1Spi = TestRecordingCommunicationSpi.spi(g1);
g1Spi.blockMessages((node, msg) -> {
if (msg instanceof GridDhtPartitionSupplyMessage) {
GridDhtPartitionSupplyMessage m = (GridDhtPartitionSupplyMessage) msg;
return m.groupId() == CU.cacheId(CACHE2);
}
return false;
});
startGrid(2);
crdSpi.waitForBlocked();
g1Spi.waitForBlocked();
// Wait for partial owning.
assertTrue("Timed out while waiting for rebalance", GridTestUtils.waitForCondition(() -> {
// Await full rebalance for cache 1.
GridDhtPartitionTopology top0 = grid(2).cachex(CACHE1).context().topology();
for (int p = 0; p < PARTS_CNT; p++) {
if (top0.localPartition(p).state() != OWNING)
return false;
}
// Await partial rebalance for cache 2.
GridDhtPartitionTopology top1 = grid(2).cachex(CACHE2).context().topology();
for (Integer part : full.get()) {
if (top1.localPartition(part).state() != OWNING)
return false;
}
return true;
}, 10_000));
// At this point cache 1 is fully rebalanced and cache 2 is partially rebalanced.
// Stop supplier in the middle of rebalance.
g1.close();
// Wait for topologies and calculate required partitions.
grid(0).cachex(CACHE1).context().affinity().affinityReadyFuture(leftVer).get();
grid(2).cachex(CACHE1).context().affinity().affinityReadyFuture(leftVer).get();
grid(0).cachex(CACHE2).context().affinity().affinityReadyFuture(leftVer).get();
grid(2).cachex(CACHE2).context().affinity().affinityReadyFuture(leftVer).get();
AffinityAssignment assignment0 = grid(0).cachex(CACHE1).context().affinity().assignment(leftVer);
AffinityAssignment assignment = grid(0).cachex(CACHE2).context().affinity().assignment(leftVer);
// Search for a stable partition (cache 1) and a partition with an incompatible assignment (cache 2).
// Partition of cache 1 that is mapped to the same primary on both the late and the ideal topology.
int stablePart = -1;
// Partition of cache 2 that is mapped to different primaries on the late and the ideal topology.
int movingPart = -1;
for (int p = 0; p < assignment0.assignment().size(); p++) {
List<ClusterNode> curr = assignment0.assignment().get(p);
List<ClusterNode> ideal = assignment0.idealAssignment().get(p);
if (curr.equals(ideal) && curr.get(0).order() == 1) {
stablePart = p;
break;
}
}
assertFalse(stablePart == -1);
for (int p = 0; p < assignment.assignment().size(); p++) {
List<ClusterNode> curr = assignment.assignment().get(p);
List<ClusterNode> ideal = assignment.idealAssignment().get(p);
if (!curr.equals(ideal) && curr.get(0).order() == 1) {
movingPart = p;
break;
}
}
assertFalse(movingPart == -1);
TestRecordingCommunicationSpi.spi(client).blockMessages(new IgniteBiPredicate<ClusterNode, Message>() {
@Override
public boolean apply(ClusterNode node, Message msg) {
if (concurrency == PESSIMISTIC)
return msg instanceof GridNearLockRequest;
else
return msg instanceof GridNearTxPrepareRequest;
}
});
final int finalStablePart = stablePart;
final int finalMovingPart = movingPart;
IgniteInternalFuture<?> txFut = multithreadedAsync(() -> {
try (Transaction tx = client.transactions().txStart(concurrency, isolation)) {
// Will be mapped on the coordinator (order=1).
client.cache(CACHE1).put(finalStablePart, 0);
// The next request will remap to the ideal topology, which is not yet ready on any node except the coordinator.
client.cache(CACHE2).put(finalMovingPart, 0);
tx.commit();
}
}, 1, "tx-thread");
// Wait until all missing supply messages are blocked.
assertTrue(GridTestUtils.waitForCondition(() -> leftVerParts.size() == PARTS_CNT - full.get().size(), 5_000));
// Delay first lock request on late topology.
TestRecordingCommunicationSpi.spi(client).waitForBlocked();
// At this point only supply messages should be blocked.
// Unblock to continue rebalance and trigger ideal topology switch.
crdSpi.stopBlock(true, null, false, true);
// Wait until ideal topology is ready on crd.
crd.context().cache().context().exchange().affinityReadyFuture(idealVer).get(10_000);
// Other node must wait for full message.
assertFalse(GridTestUtils.waitForCondition(() -> grid(2).context().cache().context().exchange().affinityReadyFuture(idealVer).isDone(), 1_000));
// Map on unstable topology (PME is in progress on other node).
TestRecordingCommunicationSpi.spi(client).stopBlock();
// Capture local transaction.
IgniteInternalTx tx0 = client.context().cache().context().tm().activeTransactions().iterator().next();
// Expected behavior: tx must hang (both pessimistic and optimistic) because topology is not ready.
try {
txFut.get(3_000);
fail("TX must not complete");
} catch (IgniteFutureTimeoutCheckedException e) {
// Expected.
}
crdSpi.stopBlock();
txFut.get();
// Check transaction map version. Should be mapped on ideal topology.
assertEquals(tx0.topologyVersionSnapshot(), idealVer);
awaitPartitionMapExchange();
checkFutures();
} finally {
stopAllGrids();
}
}
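The blocking logic above is dense, so here is a minimal sketch, assuming the same test utilities shown in this listing (TestRecordingCommunicationSpi, GridDhtPartitionSupplyMessage, CU), of the block/wait/release pattern the test builds on. The helper name and its parameters are illustrative and not part of the test.

/** Minimal sketch: block supply messages of one cache group, wait for a blocked batch, then release it. */
private void blockAndReleaseSupply(IgniteEx supplier, String cacheName) throws Exception {
    TestRecordingCommunicationSpi spi = TestRecordingCommunicationSpi.spi(supplier);

    // Hold back only supply messages that belong to the given cache group.
    spi.blockMessages((node, msg) -> msg instanceof GridDhtPartitionSupplyMessage
        && ((GridDhtPartitionSupplyMessage)msg).groupId() == CU.cacheId(cacheName));

    // ... trigger rebalance here, for example by starting a new node ...

    // Wait until at least one supply batch is captured, then let rebalance continue.
    spi.waitForBlocked();
    spi.stopBlock();
}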
Use of org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionExchangeId in project ignite by apache.
The class IgniteClusterSnapshotSelfTest, method testClusterSnapshotOnMovingPartitionsCoordinatorLeft.
/**
* @throws Exception If failed.
*/
@Test
public void testClusterSnapshotOnMovingPartitionsCoordinatorLeft() throws Exception {
startGridsWithCache(2, dfltCacheCfg, CACHE_KEYS_RANGE);
for (Ignite grid : G.allGrids()) {
TestRecordingCommunicationSpi.spi(grid).blockMessages((node, msg) -> msg instanceof GridDhtPartitionSupplyMessage);
}
Ignite ignite = startGrid(2);
ignite.cluster().setBaselineTopology(ignite.cluster().topologyVersion());
TestRecordingCommunicationSpi.spi(grid(0)).waitForBlocked();
CountDownLatch latch = new CountDownLatch(G.allGrids().size());
IgniteInternalFuture<?> stopFut = GridTestUtils.runAsync(() -> {
try {
U.await(latch);
stopGrid(0);
} catch (IgniteInterruptedCheckedException e) {
fail("Must not fail here: " + e.getMessage());
}
});
Queue<T2<GridDhtPartitionExchangeId, Boolean>> exchFuts = new ConcurrentLinkedQueue<>();
for (Ignite ig : G.allGrids()) {
((IgniteEx) ig).context().cache().context().exchange().registerExchangeAwareComponent(new PartitionsExchangeAware() {
/** {@inheritDoc} */
@Override
public void onInitBeforeTopologyLock(GridDhtPartitionsExchangeFuture fut) {
if (!(fut.firstEvent() instanceof DiscoveryCustomEvent))
return;
try {
exchFuts.add(new T2<>(fut.exchangeId(), fut.rebalanced()));
latch.countDown();
stopFut.get();
} catch (IgniteCheckedException e) {
U.log(log, "Interrupted on coordinator: " + e.getMessage());
}
}
});
}
IgniteFuture<Void> fut = ignite.snapshot().createSnapshot(SNAPSHOT_NAME);
stopFut.get();
assertThrowsAnyCause(log, fut::get, IgniteException.class, "Snapshot creation has been finished with an error");
assertEquals("Snapshot futures expected: " + exchFuts, 3, exchFuts.size());
for (T2<GridDhtPartitionExchangeId, Boolean> exch : exchFuts) assertFalse("Snapshot `rebalanced` must be false with moving partitions: " + exch.get1(), exch.get2());
}
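The PartitionsExchangeAware hook used above is the usual way to observe exchange ids from a test. Below is a minimal sketch under the same assumptions (an IgniteEx node from this test framework); the helper name and the sink parameter are illustrative.

/** Minimal sketch: record the exchange id and the rebalanced flag of every exchange seen on a node. */
private void recordExchanges(IgniteEx node, Queue<T2<GridDhtPartitionExchangeId, Boolean>> sink) {
    node.context().cache().context().exchange().registerExchangeAwareComponent(new PartitionsExchangeAware() {
        /** {@inheritDoc} */
        @Override public void onInitBeforeTopologyLock(GridDhtPartitionsExchangeFuture fut) {
            // The exchange id identifies the discovery event and topology version that triggered the PME.
            sink.add(new T2<>(fut.exchangeId(), fut.rebalanced()));
        }
    });
}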
Use of org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionExchangeId in project ignite by apache.
The class GridCachePartitionExchangeManager, method onKernalStart0.
/** {@inheritDoc} */
@Override
protected void onKernalStart0(boolean reconnect) throws IgniteCheckedException {
super.onKernalStart0(reconnect);
ClusterNode loc = cctx.localNode();
long startTime = loc.metrics().getStartTime();
assert startTime > 0;
// Generate dummy discovery event for local node joining.
T2<DiscoveryEvent, DiscoCache> locJoin = cctx.discovery().localJoin();
DiscoveryEvent discoEvt = locJoin.get1();
DiscoCache discoCache = locJoin.get2();
GridDhtPartitionExchangeId exchId = initialExchangeId();
GridDhtPartitionsExchangeFuture fut = exchangeFuture(exchId, discoEvt, discoCache, null, null);
if (reconnect)
reconnectExchangeFut = new GridFutureAdapter<>();
exchWorker.addFirstExchangeFuture(fut);
if (!cctx.kernalContext().clientNode()) {
for (int cnt = 0; cnt < cctx.gridConfig().getRebalanceThreadPoolSize(); cnt++) {
final int idx = cnt;
cctx.io().addOrderedHandler(rebalanceTopic(cnt), new CI2<UUID, GridCacheMessage>() {
@Override
public void apply(final UUID id, final GridCacheMessage m) {
if (!enterBusy())
return;
try {
GridCacheContext cacheCtx = cctx.cacheContext(m.cacheId);
if (cacheCtx != null) {
if (m instanceof GridDhtPartitionSupplyMessage)
cacheCtx.preloader().handleSupplyMessage(idx, id, (GridDhtPartitionSupplyMessage) m);
else if (m instanceof GridDhtPartitionDemandMessage)
cacheCtx.preloader().handleDemandMessage(idx, id, (GridDhtPartitionDemandMessage) m);
else
U.error(log, "Unsupported message type: " + m.getClass().getName());
}
} finally {
leaveBusy();
}
}
});
}
}
new IgniteThread(cctx.igniteInstanceName(), "exchange-worker", exchWorker).start();
if (reconnect) {
fut.listen(new CI1<IgniteInternalFuture<AffinityTopologyVersion>>() {
@Override
public void apply(IgniteInternalFuture<AffinityTopologyVersion> fut) {
try {
fut.get();
for (GridCacheContext cacheCtx : cctx.cacheContexts()) cacheCtx.preloader().onInitialExchangeComplete(null);
reconnectExchangeFut.onDone();
} catch (IgniteCheckedException e) {
for (GridCacheContext cacheCtx : cctx.cacheContexts()) cacheCtx.preloader().onInitialExchangeComplete(e);
reconnectExchangeFut.onDone(e);
}
}
});
} else {
if (log.isDebugEnabled())
log.debug("Beginning to wait on local exchange future: " + fut);
boolean first = true;
while (true) {
try {
fut.get(cctx.preloadExchangeTimeout());
break;
} catch (IgniteFutureTimeoutCheckedException ignored) {
if (first) {
U.warn(log, "Failed to wait for initial partition map exchange. " + "Possible reasons are: " + U.nl() + " ^-- Transactions in deadlock." + U.nl() + " ^-- Long running transactions (ignore if this is the case)." + U.nl() + " ^-- Unreleased explicit locks.");
first = false;
} else
U.warn(log, "Still waiting for initial partition map exchange [fut=" + fut + ']');
} catch (IgniteNeedReconnectException e) {
throw e;
} catch (Exception e) {
if (fut.reconnectOnError(e))
throw new IgniteNeedReconnectException(cctx.localNode(), e);
throw e;
}
}
AffinityTopologyVersion nodeStartVer = new AffinityTopologyVersion(discoEvt.topologyVersion(), 0);
for (GridCacheContext cacheCtx : cctx.cacheContexts()) {
if (nodeStartVer.equals(cacheCtx.startTopologyVersion()))
cacheCtx.preloader().onInitialExchangeComplete(null);
}
if (log.isDebugEnabled())
log.debug("Finished waiting for initial exchange: " + fut.exchangeId());
}
}
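Both onKernalStart0() and the tests above wait on exchange-related futures before relying on a topology version. A minimal sketch of that wait pattern, assuming an IgniteEx node; the helper name and the 10 second timeout are illustrative.

/** Minimal sketch: block until affinity for the given topology version is ready on the node. */
private void awaitAffinity(IgniteEx node, AffinityTopologyVersion ver) throws IgniteCheckedException {
    // Completes once the exchange that produced this topology version has finished locally.
    node.context().cache().context().exchange().affinityReadyFuture(ver).get(10_000);
}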
Use of org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionExchangeId in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method initPartitions0.
/**
* @param exchFut Exchange future.
* @param updateSeq Update sequence.
*/
private void initPartitions0(GridDhtPartitionsExchangeFuture exchFut, long updateSeq) {
ClusterNode loc = cctx.localNode();
ClusterNode oldest = discoCache.oldestAliveServerNodeWithCache();
GridDhtPartitionExchangeId exchId = exchFut.exchangeId();
assert topVer.equals(exchFut.topologyVersion()) : "Invalid topology [topVer=" + topVer + ", cache=" + cctx.name() + ", futVer=" + exchFut.topologyVersion() + ", fut=" + exchFut + ']';
assert cctx.affinity().affinityTopologyVersion().equals(exchFut.topologyVersion()) : "Invalid affinity [topVer=" + cctx.affinity().affinityTopologyVersion() + ", cache=" + cctx.name() + ", futVer=" + exchFut.topologyVersion() + ", fut=" + exchFut + ']';
List<List<ClusterNode>> aff = cctx.affinity().assignments(exchFut.topologyVersion());
int num = cctx.affinity().partitions();
if (cctx.rebalanceEnabled()) {
boolean added = exchFut.cacheAddedOnExchange(cctx.cacheId(), cctx.receivedFrom());
boolean first = (loc.equals(oldest) && loc.id().equals(exchId.nodeId()) && exchId.isJoined()) || added;
if (first) {
assert exchId.isJoined() || added;
for (int p = 0; p < num; p++) {
if (localNode(p, aff)) {
GridDhtLocalPartition locPart = createPartition(p);
boolean owned = locPart.own();
assert owned : "Failed to own partition for oldest node [cacheName" + cctx.name() + ", part=" + locPart + ']';
if (log.isDebugEnabled())
log.debug("Owned partition for oldest node: " + locPart);
updateSeq = updateLocal(p, locPart.state(), updateSeq);
}
}
} else
createPartitions(aff, updateSeq);
} else {
// Rebalancing is disabled, so simply evict the partitions this node is not responsible for.
for (int p = 0; p < num; p++) {
GridDhtLocalPartition locPart = localPartition(p, topVer, false, false);
boolean belongs = localNode(p, aff);
if (locPart != null) {
if (!belongs) {
GridDhtPartitionState state = locPart.state();
if (state.active()) {
locPart.rent(false);
updateSeq = updateLocal(p, locPart.state(), updateSeq);
if (log.isDebugEnabled())
log.debug("Evicting partition with rebalancing disabled " + "(it does not belong to affinity): " + locPart);
}
} else
locPart.own();
} else if (belongs) {
locPart = createPartition(p);
locPart.own();
updateLocal(p, locPart.state(), updateSeq);
}
}
}
if (node2part != null && node2part.valid())
checkEvictions(updateSeq, aff);
updateRebalanceVersion(aff);
}
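The condition on the exchange id above decides whether the local node may own all partitions immediately. A minimal sketch of that check, extracted as an illustrative helper (not part of GridDhtPartitionTopologyImpl):

/** Minimal sketch: true if the local node is the oldest alive server and the exchange was caused by its own join. */
private static boolean ownsAllOnJoin(ClusterNode loc, ClusterNode oldest, GridDhtPartitionExchangeId exchId) {
    // nodeId() is the node whose discovery event started the exchange; isJoined() means that event was a join.
    return loc.equals(oldest) && loc.id().equals(exchId.nodeId()) && exchId.isJoined();
}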
Use of org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionExchangeId in project ignite by apache.
The class GridDhtPartitionTopologyImpl, method initPartitions0 (overload taking an explicit affinity version).
/**
* @param affVer Affinity version to use.
* @param exchFut Exchange future.
* @param updateSeq Update sequence.
* @return {@code True} if partitions must be refreshed.
*/
private boolean initPartitions0(AffinityTopologyVersion affVer, GridDhtPartitionsExchangeFuture exchFut, long updateSeq) {
List<List<ClusterNode>> aff = grp.affinity().readyAssignments(affVer);
boolean needRefresh = false;
if (grp.affinityNode()) {
ClusterNode loc = ctx.localNode();
ClusterNode oldest = discoCache.oldestAliveServerNode();
GridDhtPartitionExchangeId exchId = exchFut.exchangeId();
assert grp.affinity().lastVersion().equals(affVer) : "Invalid affinity [topVer=" + grp.affinity().lastVersion() + ", grp=" + grp.cacheOrGroupName() + ", affVer=" + affVer + ", fut=" + exchFut + ']';
int num = grp.affinity().partitions();
if (grp.rebalanceEnabled()) {
boolean added = exchFut.cacheGroupAddedOnExchange(grp.groupId(), grp.receivedFrom());
boolean first = added || (loc.equals(oldest) && loc.id().equals(exchId.nodeId()) && exchId.isJoined());
if (first) {
assert exchId.isJoined() || added;
for (int p = 0; p < num; p++) {
if (localNode(p, aff) || initLocalPartition(p, discoCache)) {
GridDhtLocalPartition locPart = createPartition(p);
if (grp.persistenceEnabled()) {
GridCacheDatabaseSharedManager db = (GridCacheDatabaseSharedManager) grp.shared().database();
locPart.restoreState(db.readPartitionState(grp, locPart.id()));
} else {
boolean owned = locPart.own();
assert owned : "Failed to own partition for oldest node [grp=" + grp.cacheOrGroupName() + ", part=" + locPart + ']';
if (log.isDebugEnabled())
log.debug("Owned partition for oldest node [grp=" + grp.cacheOrGroupName() + ", part=" + locPart + ']');
}
needRefresh = true;
updateSeq = updateLocal(p, locPart.state(), updateSeq, affVer);
}
}
} else
createPartitions(affVer, aff, updateSeq);
} else {
// Rebalancing is disabled, so simply evict the partitions this node is not responsible for.
for (int p = 0; p < num; p++) {
GridDhtLocalPartition locPart = localPartition0(p, affVer, false, true, false);
boolean belongs = localNode(p, aff);
if (locPart != null) {
if (!belongs) {
GridDhtPartitionState state = locPart.state();
if (state.active()) {
locPart.rent(false);
updateSeq = updateLocal(p, locPart.state(), updateSeq, affVer);
if (log.isDebugEnabled()) {
log.debug("Evicting partition with rebalancing disabled (it does not belong to " + "affinity) [grp=" + grp.cacheOrGroupName() + ", part=" + locPart + ']');
}
}
} else
locPart.own();
} else if (belongs) {
locPart = createPartition(p);
locPart.own();
updateLocal(p, locPart.state(), updateSeq, affVer);
}
}
}
}
updateRebalanceVersion(aff);
return needRefresh;
}