
Example 1 with IgniteRebalanceIterator

Use of org.apache.ignite.internal.processors.cache.IgniteRebalanceIterator in project ignite by apache.

From class GridDhtPartitionSupplier, method handleDemandMessage.

/**
 * For each demand message the method looks up (or creates) a supply context and starts iterating entries across the requested partitions.
 * Each entry from the iterator is placed into the prepared supply message.
 *
 * If the supply message size in bytes becomes greater than {@link IgniteConfiguration#getRebalanceBatchSize()},
 * the method sends the message to the demander node and saves the partial state of iterated entries to the supply context,
 * then restores the context when a new demand message with the same context id arrives.
 *
 * @param topicId Id of the topic used for the supply-demand communication.
 * @param nodeId Id of the node which sent the demand message.
 * @param demandMsg Demand message.
 */
public void handleDemandMessage(int topicId, UUID nodeId, GridDhtPartitionDemandMessage demandMsg) {
    assert demandMsg != null;
    assert nodeId != null;
    T3<UUID, Integer, AffinityTopologyVersion> contextId = new T3<>(nodeId, topicId, demandMsg.topologyVersion());
    if (demandMsg.rebalanceId() < 0) {
        // Demand node requested context cleanup.
        synchronized (scMap) {
            SupplyContext sctx = scMap.get(contextId);
            if (sctx != null && sctx.rebalanceId == -demandMsg.rebalanceId()) {
                clearContext(scMap.remove(contextId), log);
                if (log.isDebugEnabled())
                    log.debug("Supply context cleaned [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", supplyContext=" + sctx + "]");
            } else {
                if (log.isDebugEnabled())
                    log.debug("Stale supply context cleanup message [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", supplyContext=" + sctx + "]");
            }
            return;
        }
    }
    ClusterNode demanderNode = grp.shared().discovery().node(nodeId);
    if (demanderNode == null) {
        if (log.isDebugEnabled())
            log.debug("Demand message rejected (demander left cluster) [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
        return;
    }
    IgniteRebalanceIterator iter = null;
    SupplyContext sctx = null;
    Set<Integer> remainingParts = null;
    GridDhtPartitionSupplyMessage supplyMsg = new GridDhtPartitionSupplyMessage(demandMsg.rebalanceId(), grp.groupId(), demandMsg.topologyVersion(), grp.deploymentEnabled());
    try {
        synchronized (scMap) {
            sctx = scMap.remove(contextId);
            if (sctx != null && demandMsg.rebalanceId() < sctx.rebalanceId) {
                // Stale message, return context back and return.
                scMap.put(contextId, sctx);
                if (log.isDebugEnabled())
                    log.debug("Stale demand message [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", actualContext=" + sctx + "]");
                return;
            }
        }
        // Demand request should not contain empty partitions if no supply context is associated with it.
        if (sctx == null && (demandMsg.partitions() == null || demandMsg.partitions().isEmpty())) {
            if (log.isDebugEnabled())
                log.debug("Empty demand message (no context and partitions) [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
            return;
        }
        if (log.isDebugEnabled())
            log.debug("Demand message accepted [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
        assert !(sctx != null && !demandMsg.partitions().isEmpty());
        // Each thread should gain prefetched batches.
        long maxBatchesCnt = grp.preloader().batchesPrefetchCount() * grp.shared().gridConfig().getRebalanceThreadPoolSize();
        if (sctx == null) {
            if (log.isDebugEnabled())
                log.debug("Starting supplying rebalancing [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", fullPartitions=" + S.compact(demandMsg.partitions().fullSet()) + ", histPartitions=" + S.compact(demandMsg.partitions().historicalSet()) + "]");
        } else
            maxBatchesCnt = 1;
        if (sctx == null || sctx.iterator == null) {
            remainingParts = new HashSet<>(demandMsg.partitions().fullSet());
            CachePartitionPartialCountersMap histMap = demandMsg.partitions().historicalMap();
            for (int i = 0; i < histMap.size(); i++) {
                int p = histMap.partitionAt(i);
                remainingParts.add(p);
            }
            iter = grp.offheap().rebalanceIterator(demandMsg.partitions(), demandMsg.topologyVersion());
            for (Integer part : demandMsg.partitions().fullSet()) {
                if (iter.isPartitionMissing(part))
                    continue;
                GridDhtLocalPartition loc = top.localPartition(part, demandMsg.topologyVersion(), false);
                assert loc != null && loc.state() == GridDhtPartitionState.OWNING : "Partition should be in OWNING state: " + loc;
                supplyMsg.addEstimatedKeysCount(loc.dataStore().fullSize());
            }
            for (int i = 0; i < histMap.size(); i++) {
                int p = histMap.partitionAt(i);
                if (iter.isPartitionMissing(p))
                    continue;
                supplyMsg.addEstimatedKeysCount(histMap.updateCounterAt(i) - histMap.initialUpdateCounterAt(i));
            }
        } else {
            iter = sctx.iterator;
            remainingParts = sctx.remainingParts;
        }
        final int msgMaxSize = grp.preloader().batchSize();
        long batchesCnt = 0;
        CacheDataRow prevRow = null;
        while (iter.hasNext()) {
            CacheDataRow row = iter.peek();
            // Prevent mvcc entry history splitting into separate batches.
            boolean canFlushHistory = !grp.mvccEnabled() || prevRow != null && ((grp.sharedGroup() && row.cacheId() != prevRow.cacheId()) || !row.key().equals(prevRow.key()));
            if (canFlushHistory && supplyMsg.messageSize() >= msgMaxSize) {
                if (++batchesCnt >= maxBatchesCnt) {
                    saveSupplyContext(contextId, iter, remainingParts, demandMsg.rebalanceId());
                    reply(topicId, demanderNode, demandMsg, supplyMsg, contextId);
                    return;
                } else {
                    if (!reply(topicId, demanderNode, demandMsg, supplyMsg, contextId))
                        return;
                    supplyMsg = new GridDhtPartitionSupplyMessage(demandMsg.rebalanceId(), grp.groupId(), demandMsg.topologyVersion(), grp.deploymentEnabled());
                }
            }
            row = iter.next();
            prevRow = row;
            int part = row.partition();
            GridDhtLocalPartition loc = top.localPartition(part, demandMsg.topologyVersion(), false);
            assert (loc != null && loc.state() == OWNING && loc.reservations() > 0) || iter.isPartitionMissing(part) : "Partition should be in OWNING state and have at least 1 reservation: " + loc;
            if (iter.isPartitionMissing(part) && remainingParts.contains(part)) {
                supplyMsg.missed(part);
                remainingParts.remove(part);
                if (grp.eventRecordable(EVT_CACHE_REBALANCE_PART_MISSED))
                    grp.addRebalanceMissEvent(part);
                if (log.isDebugEnabled())
                    log.debug("Requested partition is marked as missing [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", p=" + part + "]");
                continue;
            }
            if (!remainingParts.contains(part))
                continue;
            GridCacheEntryInfo info = extractEntryInfo(row);
            if (info == null)
                continue;
            supplyMsg.addEntry0(part, iter.historical(part), info, grp.shared(), grp.cacheObjectContext());
            if (iter.isPartitionDone(part)) {
                supplyMsg.last(part, loc.updateCounter());
                remainingParts.remove(part);
                if (grp.eventRecordable(EVT_CACHE_REBALANCE_PART_SUPPLIED))
                    grp.addRebalanceSupplyEvent(part);
            }
        }
        Iterator<Integer> remainingIter = remainingParts.iterator();
        while (remainingIter.hasNext()) {
            int p = remainingIter.next();
            if (iter.isPartitionDone(p)) {
                GridDhtLocalPartition loc = top.localPartition(p, demandMsg.topologyVersion(), false);
                assert loc != null : "Supply partition is gone: grp=" + grp.cacheOrGroupName() + ", p=" + p;
                supplyMsg.last(p, loc.updateCounter());
                remainingIter.remove();
                if (grp.eventRecordable(EVT_CACHE_REBALANCE_PART_SUPPLIED))
                    grp.addRebalanceSupplyEvent(p);
            } else if (iter.isPartitionMissing(p)) {
                supplyMsg.missed(p);
                remainingIter.remove();
                if (grp.eventRecordable(EVT_CACHE_REBALANCE_PART_MISSED))
                    grp.addRebalanceMissEvent(p);
            }
        }
        assert remainingParts.isEmpty() : "Partitions after rebalance should be either done or missing: " + remainingParts;
        if (sctx != null)
            clearContext(sctx, log);
        else
            iter.close();
        reply(topicId, demanderNode, demandMsg, supplyMsg, contextId);
        if (log.isInfoEnabled())
            log.info("Finished supplying rebalancing [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + "]");
    } catch (Throwable t) {
        if (iter != null && !iter.isClosed()) {
            try {
                iter.close();
            } catch (IgniteCheckedException e) {
                t.addSuppressed(e);
            }
        }
        if (grp.shared().kernalContext().isStopping())
            return;
        // Sending supply messages with error requires new protocol.
        boolean sendErrMsg = demanderNode.version().compareTo(GridDhtPartitionSupplyMessageV2.AVAILABLE_SINCE) >= 0;
        if (t instanceof IgniteSpiException) {
            if (log.isDebugEnabled())
                log.debug("Failed to send message to node (current node is stopping?) [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ", msg=" + t.getMessage() + ']');
            sendErrMsg = false;
        } else
            U.error(log, "Failed to continue supplying [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t);
        try {
            if (sctx != null)
                clearContext(sctx, log);
        } catch (Throwable t1) {
            U.error(log, "Failed to cleanup supplying context [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t1);
        }
        if (!sendErrMsg)
            return;
        boolean fallbackToFullRebalance = X.hasCause(t, IgniteHistoricalIteratorException.class);
        try {
            GridDhtPartitionSupplyMessage errMsg;
            if (fallbackToFullRebalance) {
                // Mark the last checkpoint as not applicable for WAL rebalance.
                grp.shared().database().lastCheckpointInapplicableForWalRebalance(grp.groupId());
                // Mark all remaining partitions as missed to trigger full rebalance.
                if (iter == null && F.isEmpty(remainingParts)) {
                    remainingParts = new HashSet<>(demandMsg.partitions().fullSet());
                    remainingParts.addAll(demandMsg.partitions().historicalSet());
                }
                for (int p : Optional.ofNullable(remainingParts).orElseGet(Collections::emptySet))
                    supplyMsg.missed(p);
                errMsg = supplyMsg;
            } else {
                errMsg = new GridDhtPartitionSupplyMessageV2(demandMsg.rebalanceId(), grp.groupId(), demandMsg.topologyVersion(), grp.deploymentEnabled(), t);
            }
            reply(topicId, demanderNode, demandMsg, errMsg, contextId);
        } catch (Throwable t1) {
            U.error(log, "Failed to send supply error message [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t1);
        }
        // When falling back to full rebalance, the demander will re-request the missed partitions
        // instead of triggering failure handler.
        if (!fallbackToFullRebalance) {
            grp.shared().kernalContext().failure().process(new FailureContext(FailureType.CRITICAL_ERROR, new IgniteCheckedException("Failed to continue supplying [" + supplyRoutineInfo(topicId, nodeId, demandMsg) + ']', t)));
        }
    }
}
Also used : IgniteCheckedException(org.apache.ignite.IgniteCheckedException) FailureContext(org.apache.ignite.failure.FailureContext) GridDhtLocalPartition(org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) UUID(java.util.UUID) T3(org.apache.ignite.internal.util.typedef.T3) HashSet(java.util.HashSet) ClusterNode(org.apache.ignite.cluster.ClusterNode) CacheDataRow(org.apache.ignite.internal.processors.cache.persistence.CacheDataRow) GridCacheEntryInfo(org.apache.ignite.internal.processors.cache.GridCacheEntryInfo) AffinityTopologyVersion(org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion) IgniteRebalanceIterator(org.apache.ignite.internal.processors.cache.IgniteRebalanceIterator)
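The batching protocol described in the Javadoc above is driven by public configuration knobs: getRebalanceBatchSize() caps the size of a single supply message, and the batches prefetch count multiplied by the rebalance thread pool size yields the maxBatchesCnt limit computed in the method. A minimal sketch of setting these knobs (the class name is mine and the values are illustrative, not Ignite defaults):

import org.apache.ignite.Ignite;
import org.apache.ignite.Ignition;
import org.apache.ignite.configuration.IgniteConfiguration;

public class RebalanceBatchingConfigSketch {
    public static void main(String[] args) {
        IgniteConfiguration cfg = new IgniteConfiguration();

        // Upper bound (in bytes) for a single supply message; once exceeded, the supplier
        // sends the current batch and saves its iteration position in the supply context.
        cfg.setRebalanceBatchSize(512 * 1024);

        // Batches generated ahead of demand; multiplied by the rebalance thread pool size
        // this gives the maxBatchesCnt limit used in handleDemandMessage() above.
        cfg.setRebalanceBatchesPrefetchCount(4);
        cfg.setRebalanceThreadPoolSize(2);

        try (Ignite ignite = Ignition.start(cfg)) {
            // Caches started on this node will be rebalanced with the settings above.
        }
    }
}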

Example 2 with IgniteRebalanceIterator

Use of org.apache.ignite.internal.processors.cache.IgniteRebalanceIterator in project ignite by apache.

From class WalRecoveryTxLogicalRecordsTest, method testHistoricalRebalanceIterator.

/**
 * @throws Exception if failed.
 */
@Test
public void testHistoricalRebalanceIterator() throws Exception {
    System.setProperty(IgniteSystemProperties.IGNITE_PDS_WAL_REBALANCE_THRESHOLD, "0");
    extraCcfg = new CacheConfiguration(CACHE_NAME + "2");
    extraCcfg.setAffinity(new RendezvousAffinityFunction(false, PARTS));
    Ignite ignite = startGrid();
    try {
        ignite.cluster().active(true);
        GridCacheDatabaseSharedManager dbMgr = (GridCacheDatabaseSharedManager) ((IgniteEx) ignite).context().cache().context().database();
        dbMgr.waitForCheckpoint("test");
        // This number depends on wal history size.
        int entries = 25;
        IgniteCache<Integer, Integer> cache = ignite.cache(CACHE_NAME);
        IgniteCache<Integer, Integer> cache2 = ignite.cache(CACHE_NAME + "2");
        for (int i = 0; i < entries; i++) {
            // Put to partition 0.
            cache.put(i * PARTS, i * PARTS);
            // Put to partition 1.
            cache.put(i * PARTS + 1, i * PARTS + 1);
            // Put to another cache.
            cache2.put(i, i);
            dbMgr.waitForCheckpoint("test");
        }
        for (int i = 0; i < entries; i++) {
            assertEquals((Integer) (i * PARTS), cache.get(i * PARTS));
            assertEquals((Integer) (i * PARTS + 1), cache.get(i * PARTS + 1));
            assertEquals((Integer) (i), cache2.get(i));
        }
        CacheGroupContext grp = ((IgniteEx) ignite).context().cache().cacheGroup(CU.cacheId(CACHE_NAME));
        IgniteCacheOffheapManager offh = grp.offheap();
        AffinityTopologyVersion topVer = grp.affinity().lastVersion();
        IgniteDhtDemandedPartitionsMap map;
        for (int i = 0; i < entries; i++) {
            map = new IgniteDhtDemandedPartitionsMap();
            map.addHistorical(0, i, entries, PARTS);
            WALPointer ptr = reserveWalPointerForIterator(grp.shared());
            try (IgniteRebalanceIterator it = offh.rebalanceIterator(map, topVer)) {
                assertNotNull(it);
                assertTrue("Not historical for iteration: " + i, it.historical(0));
                for (int j = i; j < entries; j++) {
                    assertTrue("i=" + i + ", j=" + j, it.hasNextX());
                    CacheDataRow row = it.next();
                    assertEquals(j * PARTS, (int) row.key().value(grp.cacheObjectContext(), false));
                    assertEquals(j * PARTS, (int) row.value().value(grp.cacheObjectContext(), false));
                }
                assertFalse(it.hasNext());
            } finally {
                releaseWalPointerForIterator(grp.shared(), ptr);
            }
            map = new IgniteDhtDemandedPartitionsMap();
            map.addHistorical(1, i, entries, PARTS);
            ptr = reserveWalPointerForIterator(grp.shared());
            try (IgniteRebalanceIterator it = offh.rebalanceIterator(map, topVer)) {
                assertNotNull(it);
                assertTrue("Not historical for iteration: " + i, it.historical(1));
                for (int j = i; j < entries; j++) {
                    assertTrue(it.hasNextX());
                    CacheDataRow row = it.next();
                    assertEquals(j * PARTS + 1, (int) row.key().value(grp.cacheObjectContext(), false));
                    assertEquals(j * PARTS + 1, (int) row.value().value(grp.cacheObjectContext(), false));
                }
                assertFalse(it.hasNext());
            } finally {
                releaseWalPointerForIterator(grp.shared(), ptr);
            }
        }
        stopAllGrids();
        // Check that iterator is valid after restart.
        ignite = startGrid();
        ignite.cluster().active(true);
        grp = ((IgniteEx) ignite).context().cache().cacheGroup(CU.cacheId(CACHE_NAME));
        offh = grp.offheap();
        topVer = grp.affinity().lastVersion();
        for (int i = 0; i < entries; i++) {
            long start = System.currentTimeMillis();
            map = new IgniteDhtDemandedPartitionsMap();
            map.addHistorical(0, i, entries, PARTS);
            WALPointer ptr = reserveWalPointerForIterator(grp.shared());
            try (IgniteRebalanceIterator it = offh.rebalanceIterator(map, topVer)) {
                long end = System.currentTimeMillis();
                info("Time to get iterator: " + (end - start));
                assertTrue("Not historical for iteration: " + i, it.historical(0));
                assertNotNull(it);
                start = System.currentTimeMillis();
                for (int j = i; j < entries; j++) {
                    assertTrue("i=" + i + ", j=" + j, it.hasNextX());
                    CacheDataRow row = it.next();
                    assertEquals(j * PARTS, (int) row.key().value(grp.cacheObjectContext(), false));
                    assertEquals(j * PARTS, (int) row.value().value(grp.cacheObjectContext(), false));
                }
                end = System.currentTimeMillis();
                info("Time to iterate: " + (end - start));
                assertFalse(it.hasNext());
            } finally {
                releaseWalPointerForIterator(grp.shared(), ptr);
            }
            map = new IgniteDhtDemandedPartitionsMap();
            map.addHistorical(1, i, entries, PARTS);
            ptr = reserveWalPointerForIterator(grp.shared());
            try (IgniteRebalanceIterator it = offh.rebalanceIterator(map, topVer)) {
                assertNotNull(it);
                assertTrue("Not historical for iteration: " + i, it.historical(1));
                for (int j = i; j < entries; j++) {
                    assertTrue(it.hasNextX());
                    CacheDataRow row = it.next();
                    assertEquals(j * PARTS + 1, (int) row.key().value(grp.cacheObjectContext(), false));
                    assertEquals(j * PARTS + 1, (int) row.value().value(grp.cacheObjectContext(), false));
                }
                assertFalse(it.hasNext());
            } finally {
                releaseWalPointerForIterator(grp.shared(), ptr);
            }
        }
    } finally {
        stopAllGrids();
        System.clearProperty(IgniteSystemProperties.IGNITE_PDS_WAL_REBALANCE_THRESHOLD);
    }
}
Also used : CacheDataRow(org.apache.ignite.internal.processors.cache.persistence.CacheDataRow) GridCacheDatabaseSharedManager(org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager) AffinityTopologyVersion(org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion) IgniteRebalanceIterator(org.apache.ignite.internal.processors.cache.IgniteRebalanceIterator) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IgniteDhtDemandedPartitionsMap(org.apache.ignite.internal.processors.cache.distributed.dht.preloader.IgniteDhtDemandedPartitionsMap) IgniteCacheOffheapManager(org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManager) IgniteEx(org.apache.ignite.internal.IgniteEx) RendezvousAffinityFunction(org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction) Ignite(org.apache.ignite.Ignite) CacheGroupContext(org.apache.ignite.internal.processors.cache.CacheGroupContext) WALPointer(org.apache.ignite.internal.processors.cache.persistence.wal.WALPointer) CacheConfiguration(org.apache.ignite.configuration.CacheConfiguration) GridCommonAbstractTest(org.apache.ignite.testframework.junits.common.GridCommonAbstractTest) Test(org.junit.Test)
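The "Put to partition 0 / 1" comments in the test above rely on how RendezvousAffinityFunction maps Integer keys: the partition is derived from the key's hash, which for Integer is the value itself, so i * PARTS lands in partition 0 and i * PARTS + 1 in partition 1. A hedged sketch of a helper that makes this assumption explicit (the method name is hypothetical; Affinity is org.apache.ignite.cache.affinity.Affinity):

/**
 * Hypothetical helper, not part of the test above: asserts the key-to-partition mapping
 * that testHistoricalRebalanceIterator relies on.
 *
 * @param ignite Ignite instance.
 * @param cacheName Cache name (CACHE_NAME in the test).
 * @param parts Partition count (PARTS in the test).
 */
private void checkKeyToPartitionMapping(Ignite ignite, String cacheName, int parts) {
    Affinity<Integer> aff = ignite.affinity(cacheName);

    for (int i = 0; i < 25; i++) {
        // With RendezvousAffinityFunction an Integer key's partition follows its hash,
        // so i * parts maps to partition 0 and i * parts + 1 maps to partition 1.
        assertEquals(0, aff.partition(i * parts));
        assertEquals(1, aff.partition(i * parts + 1));
    }
}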

Example 3 with IgniteRebalanceIterator

Use of org.apache.ignite.internal.processors.cache.IgniteRebalanceIterator in project ignite by apache.

From class WalRecoveryTxLogicalRecordsTest, method rows.

/**
 * Collects the rows that a historical rebalance iterator returns for the given partition
 * and update counter range.
 *
 * @param ignite Ignite instance.
 * @param part Partition.
 * @param from From update counter.
 * @param to To update counter.
 * @return Rows returned by the iterator.
 * @throws IgniteCheckedException If iteration fails.
 */
private List<CacheDataRow> rows(Ignite ignite, int part, long from, long to) throws IgniteCheckedException {
    CacheGroupContext grp = ((IgniteEx) ignite).context().cache().cacheGroup(CU.cacheId(CACHE_NAME));
    IgniteCacheOffheapManager offh = grp.offheap();
    AffinityTopologyVersion topVer = grp.affinity().lastVersion();
    IgniteDhtDemandedPartitionsMap map = new IgniteDhtDemandedPartitionsMap();
    map.addHistorical(part, from, to, PARTS);
    List<CacheDataRow> rows = new ArrayList<>();
    WALPointer ptr = reserveWalPointerForIterator(grp.shared());
    try (IgniteRebalanceIterator it = offh.rebalanceIterator(map, topVer)) {
        assertNotNull(it);
        while (it.hasNextX()) rows.add(it.next());
    } finally {
        releaseWalPointerForIterator(grp.shared(), ptr);
    }
    return rows;
}
Also used : CacheDataRow(org.apache.ignite.internal.processors.cache.persistence.CacheDataRow) IgniteDhtDemandedPartitionsMap(org.apache.ignite.internal.processors.cache.distributed.dht.preloader.IgniteDhtDemandedPartitionsMap) IgniteCacheOffheapManager(org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManager) AffinityTopologyVersion(org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion) IgniteEx(org.apache.ignite.internal.IgniteEx) ArrayList(java.util.ArrayList) IgniteRebalanceIterator(org.apache.ignite.internal.processors.cache.IgniteRebalanceIterator) CacheGroupContext(org.apache.ignite.internal.processors.cache.CacheGroupContext) WALPointer(org.apache.ignite.internal.processors.cache.persistence.wal.WALPointer)
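A hypothetical call site for the rows(...) helper above, in the same test-class register (the partition number and counter values are illustrative; info(...) is the test framework's logging method):

// Collect the rows that historical (WAL-based) rebalance would supply for partition 0
// in the given update-counter range, then log them.
List<CacheDataRow> recovered = rows(ignite, 0, 10, 20);

for (CacheDataRow row : recovered)
    info("Recovered row: part=" + row.partition() + ", key=" + row.key() + ", ver=" + row.version());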

Aggregations

AffinityTopologyVersion (org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion): 3
IgniteRebalanceIterator (org.apache.ignite.internal.processors.cache.IgniteRebalanceIterator): 3
CacheDataRow (org.apache.ignite.internal.processors.cache.persistence.CacheDataRow): 3
IgniteEx (org.apache.ignite.internal.IgniteEx): 2
CacheGroupContext (org.apache.ignite.internal.processors.cache.CacheGroupContext): 2
IgniteCacheOffheapManager (org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManager): 2
IgniteDhtDemandedPartitionsMap (org.apache.ignite.internal.processors.cache.distributed.dht.preloader.IgniteDhtDemandedPartitionsMap): 2
WALPointer (org.apache.ignite.internal.processors.cache.persistence.wal.WALPointer): 2
ArrayList (java.util.ArrayList): 1
HashSet (java.util.HashSet): 1
UUID (java.util.UUID): 1
AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 1
Ignite (org.apache.ignite.Ignite): 1
IgniteCheckedException (org.apache.ignite.IgniteCheckedException): 1
RendezvousAffinityFunction (org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction): 1
ClusterNode (org.apache.ignite.cluster.ClusterNode): 1
CacheConfiguration (org.apache.ignite.configuration.CacheConfiguration): 1
FailureContext (org.apache.ignite.failure.FailureContext): 1
GridCacheEntryInfo (org.apache.ignite.internal.processors.cache.GridCacheEntryInfo): 1
GridDhtLocalPartition (org.apache.ignite.internal.processors.cache.distributed.dht.topology.GridDhtLocalPartition): 1