Search in sources :

Example 1 with CommunicationFailureResolver

use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.

the class GridDiscoveryManager method initCommunicationErrorResolveConfiguration.

/**
 * Validates the communication failure resolver settings of the given configuration and, when no resolver
 * is configured, installs a default one if both SPIs allow it.
 * <p>
 * If a resolver is explicitly configured, both the communication SPI and the discovery SPI must pass the
 * {@code supportsCommunicationErrorResolve} check, otherwise an exception is thrown. If no resolver is
 * configured and both SPIs pass that check, a {@link DefaultCommunicationFailureResolver} is set on
 * the configuration.
 *
 * @param cfg Configuration.
 * @throws IgniteCheckedException If configuration is not valid.
 */
public static void initCommunicationErrorResolveConfiguration(IgniteConfiguration cfg) throws IgniteCheckedException {
    CommunicationFailureResolver rslvr = cfg.getCommunicationFailureResolver();
    CommunicationSpi commSpi = cfg.getCommunicationSpi();
    DiscoverySpi discoverySpi = cfg.getDiscoverySpi();
    if (rslvr != null) {
        // An explicitly configured resolver requires support from both SPIs.
        if (!supportsCommunicationErrorResolve(commSpi)) {
            throw new IgniteCheckedException("CommunicationFailureResolver is configured, but CommunicationSpi does not support communication " + "problem resolve: " + commSpi.getClass().getName());
        }
        if (!supportsCommunicationErrorResolve(discoverySpi)) {
            throw new IgniteCheckedException("CommunicationFailureResolver is configured, but DiscoverySpi does not support communication " + "problem resolve: " + discoverySpi.getClass().getName());
        }
    } else {
        // No resolver configured: fall back to the default one only when both SPIs support it.
        if (supportsCommunicationErrorResolve(commSpi) && supportsCommunicationErrorResolve(discoverySpi)) {
            cfg.setCommunicationFailureResolver(new DefaultCommunicationFailureResolver());
        }
    }
}
Also used : TcpCommunicationSpi(org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi) CommunicationSpi(org.apache.ignite.spi.communication.CommunicationSpi) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) DefaultCommunicationFailureResolver(org.apache.ignite.configuration.DefaultCommunicationFailureResolver) CommunicationFailureResolver(org.apache.ignite.configuration.CommunicationFailureResolver) DiscoverySpi(org.apache.ignite.spi.discovery.DiscoverySpi) TcpDiscoverySpi(org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi) DefaultCommunicationFailureResolver(org.apache.ignite.configuration.DefaultCommunicationFailureResolver)

Example 2 with CommunicationFailureResolver

use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.

the class ZookeeperDiscoveryImpl method onCommunicationErrorResolveStatusReceived.

/**
 * Handles the moment when communication status results for the current communication error resolve
 * request are available: reads the per-node results, decides (via the configured
 * {@link CommunicationFailureResolver}, if needed) which nodes should be stopped, saves the result and
 * adds a finish message to the discovery events.
 *
 * @param rtState Runtime state.
 * @throws Exception If failed.
 */
private void onCommunicationErrorResolveStatusReceived(final ZkRuntimeState rtState) throws Exception {
    ZkDiscoveryEventsData evtsData = rtState.evtsData;
    UUID futId = evtsData.communicationErrorResolveFutureId();
    if (log.isInfoEnabled())
        log.info("Received communication status from all nodes [reqId=" + futId + ']');
    assert futId != null;
    String futPath = zkPaths.distributedFutureBasePath(futId);
    List<ClusterNode> initialNodes = rtState.commErrProcNodes;
    assert initialNodes != null;
    // Clear the stored node list in the runtime state; the local reference is used below.
    rtState.commErrProcNodes = null;
    List<ClusterNode> topSnapshot = rtState.top.topologySnapshot();
    Map<UUID, BitSet> nodesRes = U.newHashMap(topSnapshot.size());
    Exception err = null;
    // Read the stored result of every node in the topology snapshot. Node errors are accumulated as
    // suppressed exceptions of a single error; otherwise the node's communication state is remembered.
    for (ClusterNode node : topSnapshot) {
        byte[] stateBytes = ZkDistributedCollectDataFuture.readNodeResult(futPath, rtState.zkClient, node.order());
        ZkCommunicationErrorNodeState nodeState = unmarshalZip(stateBytes);
        if (nodeState.err != null) {
            if (err == null)
                err = new Exception("Failed to resolve communication error.");
            err.addSuppressed(nodeState.err);
        } else {
            assert nodeState.commState != null;
            nodesRes.put(node.id(), nodeState.commState);
        }
    }
    // Captured before evtsData.topVer is incremented for killed nodes below; this captured value is the
    // one passed to the finish message and to the event data.
    long topVer = evtsData.topVer;
    GridLongList killedNodesList = null;
    if (err == null) {
        // The resolver is consulted only if at least one node's state fails the connectivity check.
        boolean fullyConnected = true;
        for (Map.Entry<UUID, BitSet> e : nodesRes.entrySet()) {
            if (!checkFullyConnected(e.getValue(), initialNodes, rtState.top)) {
                fullyConnected = false;
                break;
            }
        }
        if (fullyConnected) {
            if (log.isInfoEnabled()) {
                log.info("Finish communication error resolve process automatically, there are no " + "communication errors [reqId=" + futId + ']');
            }
        } else {
            CommunicationFailureResolver rslvr = spi.ignite().configuration().getCommunicationFailureResolver();
            if (rslvr != null) {
                if (log.isInfoEnabled()) {
                    log.info("Call communication error resolver [reqId=" + futId + ", rslvr=" + rslvr.getClass().getSimpleName() + ']');
                }
                ZkCommunicationFailureContext ctx = new ZkCommunicationFailureContext(((IgniteKernal) spi.ignite()).context().cache().context(), topSnapshot, initialNodes, nodesRes);
                try {
                    rslvr.resolve(ctx);
                    Set<ClusterNode> killedNodes = ctx.killedNodes();
                    if (killedNodes != null) {
                        if (log.isInfoEnabled()) {
                            log.info("Communication error resolver forced nodes stop [reqId=" + futId + ", killNodeCnt=" + killedNodes.size() + ", nodeIds=" + U.nodeIds(killedNodes) + ']');
                        }
                        killedNodesList = new GridLongList(killedNodes.size());
                        for (ClusterNode killedNode : killedNodes) {
                            killedNodesList.add(((ZookeeperClusterNode) killedNode).internalId());
                            // Topology version in the events data is advanced once per killed node.
                            evtsData.topVer++;
                        }
                    }
                } catch (Exception e) {
                    // Resolver failure is logged and carried in the result created below.
                    err = e;
                    U.error(log, "Failed to resolve communication error with configured resolver [reqId=" + futId + ']', e);
                }
            }
        }
    }
    // Reset the resolve future id in the events data.
    evtsData.communicationErrorResolveFutureId(null);
    ZkCommunicationErrorResolveResult res = new ZkCommunicationErrorResolveResult(killedNodesList, err);
    ZkCommunicationErrorResolveFinishMessage msg = new ZkCommunicationErrorResolveFinishMessage(futId, topVer);
    msg.res = res;
    // Store the marshalled result under the future's result path.
    ZkDistributedCollectDataFuture.saveResult(zkPaths.distributedFutureResultPath(futId), rtState.zkClient, marshalZip(res));
    evtsData.evtIdGen++;
    // Wrap the finish message into a custom event attributed to the local node and add it to the events data.
    ZkDiscoveryCustomEventData evtData = new ZkDiscoveryCustomEventData(evtsData.evtIdGen, 0L, topVer, locNode.id(), msg, null);
    evtData.resolvedMsg = msg;
    evtsData.addEvent(rtState.top.nodesByOrder.values(), evtData);
    saveAndProcessNewEvents();
    // Need re-check alive nodes in case join was delayed.
    rtState.zkClient.getChildrenAsync(zkPaths.aliveNodesDir, rtState.watcher, rtState.watcher);
}
Also used : ClusterNode(org.apache.ignite.cluster.ClusterNode) CommunicationFailureResolver(org.apache.ignite.configuration.CommunicationFailureResolver) BitSet(java.util.BitSet) GridLongList(org.apache.ignite.internal.util.GridLongList) IgniteClientDisconnectedException(org.apache.ignite.IgniteClientDisconnectedException) IgniteClientDisconnectedCheckedException(org.apache.ignite.internal.IgniteClientDisconnectedCheckedException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteException(org.apache.ignite.IgniteException) IgniteFutureTimeoutCheckedException(org.apache.ignite.internal.IgniteFutureTimeoutCheckedException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) DataFormatException(java.util.zip.DataFormatException) IgniteInterruptedException(org.apache.ignite.IgniteInterruptedException) KeeperException(org.apache.zookeeper.KeeperException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) UUID(java.util.UUID) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap)

Example 3 with CommunicationFailureResolver

use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.

the class ZookeeperDiscoveryCommunicationFailureTest method testCommunicationFailureResolve_CachesInfo1.

/**
 * Verifies, through {@code checkResolverCachesInfo}, the caches information expected for the resolver
 * after each step of a scenario that creates caches and starts/stops nodes (including the node with index 0).
 *
 * @throws Exception If failed.
 */
@Test
public void testCommunicationFailureResolve_CachesInfo1() throws Exception {
    testCommSpi = true;
    sesTimeout = 5000;

    final CacheInfoCommunicationFailureResolver resolver = new CacheInfoCommunicationFailureResolver();

    commFailureRslvr = new IgniteOutClosure<CommunicationFailureResolver>() {
        @Override
        public CommunicationFailureResolver apply() {
            return resolver;
        }
    };

    // Two nodes, only the default cache.
    startGrids(2);
    awaitPartitionMapExchange();

    Map<String, T3<Integer, Integer, Integer>> expected = new HashMap<>();
    expected.put(DEFAULT_CACHE_NAME, new T3<>(RendezvousAffinityFunction.DFLT_PARTITION_COUNT, 0, 1));
    checkResolverCachesInfo(ignite(0), expected);

    // Prepare three more caches with different backups / cache modes / partition counts.
    CacheConfiguration oneBackupCfg = new CacheConfiguration("c1");
    oneBackupCfg.setBackups(1);
    oneBackupCfg.setAffinity(new RendezvousAffinityFunction(false, 64));

    CacheConfiguration twoBackupsCfg = new CacheConfiguration("c2");
    twoBackupsCfg.setBackups(2);
    twoBackupsCfg.setAffinity(new RendezvousAffinityFunction(false, 128));

    CacheConfiguration replicatedCfg = new CacheConfiguration("c3");
    replicatedCfg.setCacheMode(CacheMode.REPLICATED);
    replicatedCfg.setAffinity(new RendezvousAffinityFunction(false, 256));

    List<CacheConfiguration> cacheCfgs = new ArrayList<>();
    cacheCfgs.add(oneBackupCfg);
    cacheCfgs.add(twoBackupsCfg);
    cacheCfgs.add(replicatedCfg);

    ignite(0).createCaches(cacheCfgs);

    expected.put("c1", new T3<>(64, 1, 2));
    expected.put("c2", new T3<>(128, 2, 2));
    expected.put("c3", new T3<>(256, 1, 2));
    checkResolverCachesInfo(ignite(0), expected);

    // Grow the topology to four nodes.
    startGrid(2);
    startGrid(3);
    awaitPartitionMapExchange();

    expected.put("c2", new T3<>(128, 2, 3));
    expected.put("c3", new T3<>(256, 1, 4));
    checkResolverCachesInfo(ignite(0), expected);

    // Cache with a node filter built from the instance names of nodes 0 and 1.
    CacheConfiguration<Object, Object> filteredCfg = new CacheConfiguration<>("c4");
    filteredCfg.setCacheMode(CacheMode.PARTITIONED);
    filteredCfg.setBackups(0);
    filteredCfg.setAffinity(new RendezvousAffinityFunction(false, 256));
    filteredCfg.setNodeFilter(new TestCacheNodeExcludingFilter(getTestIgniteInstanceName(0), getTestIgniteInstanceName(1)));

    ignite(2).createCache(filteredCfg);

    expected.put("c4", new T3<>(256, 0, 1));
    checkResolverCachesInfo(ignite(0), expected);

    // Stop current coordinator, check new coordinator will initialize required caches information.
    stopGrid(0);
    awaitPartitionMapExchange();

    expected.put("c3", new T3<>(256, 1, 3));
    checkResolverCachesInfo(ignite(1), expected);

    // Bring node 0 back.
    startGrid(0);

    expected.put("c3", new T3<>(256, 1, 4));
    checkResolverCachesInfo(ignite(1), expected);

    // Stop node 1 and verify on node 3.
    stopGrid(1);

    expected.put("c3", new T3<>(256, 1, 3));
    checkResolverCachesInfo(ignite(3), expected);
}
Also used : TestCacheNodeExcludingFilter(org.apache.ignite.internal.processors.cache.distributed.TestCacheNodeExcludingFilter) CommunicationFailureResolver(org.apache.ignite.configuration.CommunicationFailureResolver) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) RendezvousAffinityFunction(org.apache.ignite.cache.affinity.rendezvous.RendezvousAffinityFunction) CacheConfiguration(org.apache.ignite.configuration.CacheConfiguration) T3(org.apache.ignite.internal.util.typedef.T3) Test(org.junit.Test)

Example 4 with CommunicationFailureResolver

use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.

the class ZookeeperDiscoveryCommunicationFailureTest method testCommunicationFailureResolve_CachesInfo2.

/**
 * Starts two nodes while supply messages for cache {@code c1} from the first node are blocked, triggers
 * communication failure resolve and checks that the resolver recorded the first node as the single owner
 * of every partition of {@code c1}; then unblocks messages and verifies the caches information.
 *
 * @throws Exception If failed.
 */
@Test
public void testCommunicationFailureResolve_CachesInfo2() throws Exception {
    testCommSpi = true;
    sesTimeout = 5000;

    final CacheInfoCommunicationFailureResolver resolver = new CacheInfoCommunicationFailureResolver();

    commFailureRslvr = new IgniteOutClosure<CommunicationFailureResolver>() {
        @Override
        public CommunicationFailureResolver apply() {
            return resolver;
        }
    };

    Ignite firstSrv = startGrid(0);

    CacheConfiguration<Object, Object> cacheCfg = new CacheConfiguration<>("c1");
    cacheCfg.setBackups(1);

    firstSrv.createCache(cacheCfg);

    // Block rebalance to make sure node0 will be the only owner.
    TestRecordingCommunicationSpi.spi(firstSrv).blockMessages(new IgniteBiPredicate<ClusterNode, Message>() {
        @Override
        public boolean apply(ClusterNode node, Message msg) {
            if (!(msg instanceof GridDhtPartitionSupplyMessage))
                return false;

            return ((GridDhtPartitionSupplyMessage) msg).groupId() == CU.cacheId("c1");
        }
    });

    startGrid(1);

    U.sleep(1000);

    ZookeeperDiscoverySpi discoSpi = spi(firstSrv);

    resolver.latch = new CountDownLatch(1);

    ZkTestCommunicationSpi.testSpi(firstSrv).initCheckResult(2, 0);

    // Trigger the resolve process against the first remote node.
    discoSpi.resolveCommunicationFailure(discoSpi.getRemoteNodes().iterator().next(), new Exception("test"));

    assertTrue(resolver.latch.await(10, SECONDS));

    List<List<ClusterNode>> ownersByPart = resolver.ownersMap.get("c1");

    ClusterNode firstSrvNode = firstSrv.cluster().localNode();

    // Every partition must have exactly one owner, and it must be the first node.
    for (int part = 0; part < RendezvousAffinityFunction.DFLT_PARTITION_COUNT; part++) {
        List<ClusterNode> partOwners = ownersByPart.get(part);

        assertEquals(1, partOwners.size());
        assertEquals(firstSrvNode, partOwners.get(0));
    }

    TestRecordingCommunicationSpi.spi(firstSrv).stopBlock();

    awaitPartitionMapExchange();

    Map<String, T3<Integer, Integer, Integer>> expected = new HashMap<>();
    expected.put(DEFAULT_CACHE_NAME, new T3<>(RendezvousAffinityFunction.DFLT_PARTITION_COUNT, 0, 1));
    expected.put("c1", new T3<>(RendezvousAffinityFunction.DFLT_PARTITION_COUNT, 1, 2));

    checkResolverCachesInfo(firstSrv, expected);
}
Also used : ClusterNode(org.apache.ignite.cluster.ClusterNode) GridDhtPartitionSupplyMessage(org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionSupplyMessage) Message(org.apache.ignite.plugin.extensions.communication.Message) CommunicationFailureResolver(org.apache.ignite.configuration.CommunicationFailureResolver) HashMap(java.util.HashMap) ZookeeperDiscoverySpi(org.apache.ignite.spi.discovery.zk.ZookeeperDiscoverySpi) CountDownLatch(java.util.concurrent.CountDownLatch) GridDhtPartitionSupplyMessage(org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionSupplyMessage) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) Ignite(org.apache.ignite.Ignite) ArrayList(java.util.ArrayList) List(java.util.List) CacheConfiguration(org.apache.ignite.configuration.CacheConfiguration) T3(org.apache.ignite.internal.util.typedef.T3) Test(org.junit.Test)

Aggregations

CommunicationFailureResolver (org.apache.ignite.configuration.CommunicationFailureResolver)4 HashMap (java.util.HashMap)3 ArrayList (java.util.ArrayList)2 IgniteCheckedException (org.apache.ignite.IgniteCheckedException)2 ClusterNode (org.apache.ignite.cluster.ClusterNode)2 CacheConfiguration (org.apache.ignite.configuration.CacheConfiguration)2 ClusterTopologyCheckedException (org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)2 T3 (org.apache.ignite.internal.util.typedef.T3)2 IgniteSpiException (org.apache.ignite.spi.IgniteSpiException)2 Test (org.junit.Test)2 BitSet (java.util.BitSet)1 List (java.util.List)1 Map (java.util.Map)1 TreeMap (java.util.TreeMap)1 UUID (java.util.UUID)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 CountDownLatch (java.util.concurrent.CountDownLatch)1 DataFormatException (java.util.zip.DataFormatException)1 Ignite (org.apache.ignite.Ignite)1 IgniteClientDisconnectedException (org.apache.ignite.IgniteClientDisconnectedException)1