Search in sources :

Example 6 with IgniteSpiOperationTimeoutException

use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.

the class TcpCommunicationSpi method createTcpClient.

/**
 * Establishes a TCP connection to the remote node and returns a communication client for it.
 * <p>
 * The addresses returned by {@code nodeAddresses(node)} are tried in order until a client is created.
 * A loopback address whose port equals the locally bound TCP port is skipped. For every attempt a new
 * blocking {@link SocketChannel} is opened, connected and handshaked; on any failure the channel is closed
 * and, if the recovery descriptor was reserved, the descriptor is released.
 *
 * @param node Remote node.
 * @param connIdx Connection index.
 * @return Client, or {@code null} if the recovery descriptor could not be reserved, the handshake reported
 *      {@code ALREADY_CONNECTED}, or all attempts failed and the collected errors have a
 *      {@link SocketTimeoutException}, {@code HandshakeTimeoutException} or
 *      {@code IgniteSpiOperationTimeoutException} among their causes.
 * @throws IgniteCheckedException If failed.
 */
protected GridCommunicationClient createTcpClient(ClusterNode node, int connIdx) throws IgniteCheckedException {
    LinkedHashSet<InetSocketAddress> addrs = nodeAddresses(node);
    GridCommunicationClient client = null;
    IgniteCheckedException errs = null;
    // Declared outside the address loop, so the generic reconnect budget is shared by all addresses.
    int connectAttempts = 1;
    for (InetSocketAddress addr : addrs) {
        long connTimeout0 = connTimeout;
        int attempt = 1;
        IgniteSpiOperationTimeoutHelper timeoutHelper = new IgniteSpiOperationTimeoutHelper(this, !node.isClient());
        int lastWaitingTimeout = 1;
        while (client == null) {
            // Reconnection on handshake timeout.
            if (addr.getAddress().isLoopbackAddress() && addr.getPort() == boundTcpPort) {
                if (log.isDebugEnabled())
                    log.debug("Skipping local address [addr=" + addr
                        + ", locAddrs=" + node.attribute(createSpiAttributeName(ATTR_ADDRS))
                        + ", node=" + node + ']');
                break;
            }
            boolean needWait = false;
            try {
                SocketChannel ch = SocketChannel.open();
                ConnectionKey connKey;
                GridNioRecoveryDescriptor recoveryDesc;
                try {
                    ch.configureBlocking(true);
                    ch.socket().setTcpNoDelay(tcpNoDelay);
                    ch.socket().setKeepAlive(true);
                    if (sockRcvBuf > 0)
                        ch.socket().setReceiveBufferSize(sockRcvBuf);
                    if (sockSndBuf > 0)
                        ch.socket().setSendBufferSize(sockSndBuf);
                    if (getSpiContext().node(node.id()) == null)
                        throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
                    connKey = new ConnectionKey(node.id(), connIdx, -1);
                    recoveryDesc = outRecoveryDescriptor(node, connKey);
                    if (!recoveryDesc.reserve()) {
                        U.closeQuiet(ch);
                        return null;
                    }
                } catch (Throwable e) {
                    // The cleanup in the 'finally' below is not active yet, so without this handler a failure
                    // while configuring the socket or obtaining the descriptor would leak the open channel.
                    U.closeQuiet(ch);
                    throw e;
                }
                // Primitive on purpose: the value is compared with '==' against the status constants below.
                long rcvCnt;
                Map<Integer, Object> meta = new HashMap<>();
                GridSslMeta sslMeta = null;
                try {
                    ch.socket().connect(addr, (int) timeoutHelper.nextTimeoutChunk(connTimeout));
                    if (isSslEnabled()) {
                        meta.put(SSL_META.ordinal(), sslMeta = new GridSslMeta());
                        SSLEngine sslEngine = ignite.configuration().getSslContextFactory().create().createSSLEngine();
                        sslEngine.setUseClientMode(true);
                        sslMeta.sslEngine(sslEngine);
                    }
                    Integer handshakeConnIdx = connIdx;
                    rcvCnt = safeTcpHandshake(ch, recoveryDesc, node.id(),
                        timeoutHelper.nextTimeoutChunk(connTimeout0), sslMeta, handshakeConnIdx);
                    if (rcvCnt == ALREADY_CONNECTED) {
                        return null;
                    } else if (rcvCnt == NODE_STOPPING) {
                        throw new ClusterTopologyCheckedException("Remote node started stop procedure: " + node.id());
                    } else if (rcvCnt == NEED_WAIT) {
                        // The 'finally' below performs the actual delay before the next loop iteration.
                        needWait = true;
                        continue;
                    }
                    meta.put(CONN_IDX_META, connKey);
                    // The descriptor was reserved above, so it is known to be non-null here.
                    recoveryDesc.onHandshake(rcvCnt);
                    meta.put(GridNioServer.RECOVERY_DESC_META_KEY, recoveryDesc);
                    GridNioSession ses = nioSrvr.createSession(ch, meta, false, null).get();
                    client = new GridTcpNioCommunicationClient(connIdx, ses, log);
                } finally {
                    if (client == null) {
                        U.closeQuiet(ch);
                        recoveryDesc.release();
                        if (needWait) {
                            // The delay doubles on every NEED_WAIT response until it is no longer below 60000.
                            if (lastWaitingTimeout < 60000)
                                lastWaitingTimeout *= 2;
                            U.sleep(lastWaitingTimeout);
                        }
                    }
                }
            } catch (HandshakeTimeoutException | IgniteSpiOperationTimeoutException e) {
                if (client != null) {
                    client.forceClose();
                    client = null;
                }
                if (failureDetectionTimeoutEnabled() && (e instanceof HandshakeTimeoutException
                    || X.hasCause(e, SocketException.class) || timeoutHelper.checkFailureTimeoutReached(e))) {
                    String msg = "Handshake timed out (failure detection timeout is reached) "
                        + "[failureDetectionTimeout=" + failureDetectionTimeout() + ", addr=" + addr + ']';
                    onException(msg, e);
                    if (log.isDebugEnabled())
                        log.debug(msg);
                    if (errs == null)
                        errs = new IgniteCheckedException("Failed to connect to node (is node still alive?). "
                            + "Make sure that each ComputeTask and cache Transaction has a timeout set "
                            + "in order to prevent parties from waiting forever in case of network issues "
                            + "[nodeId=" + node.id() + ", addrs=" + addrs + ']');
                    errs.addSuppressed(new IgniteCheckedException("Failed to connect to address: " + addr, e));
                    break;
                }
                assert !failureDetectionTimeoutEnabled();
                onException("Handshake timed out (will retry with increased timeout) [timeout=" + connTimeout0
                    + ", addr=" + addr + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Handshake timed out (will retry with increased timeout) [timeout=" + connTimeout0
                        + ", addr=" + addr + ", err=" + e + ']');
                if (attempt == reconCnt || connTimeout0 > maxConnTimeout) {
                    U.warn(log, "Handshake timedout (will stop attempts to perform the handshake) "
                        + "[node=" + node.id() + ", timeout=" + connTimeout0 + ", maxConnTimeout=" + maxConnTimeout
                        + ", attempt=" + attempt + ", reconCnt=" + reconCnt + ", err=" + e.getMessage()
                        + ", addr=" + addr + ']');
                    if (errs == null)
                        errs = new IgniteCheckedException("Failed to connect to node (is node still alive?). "
                            + "Make sure that each ComputeTask and cache Transaction has a timeout set "
                            + "in order to prevent parties from waiting forever in case of network issues "
                            + "[nodeId=" + node.id() + ", addrs=" + addrs + ']');
                    errs.addSuppressed(new IgniteCheckedException("Failed to connect to address: " + addr, e));
                    break;
                } else {
                    // Retry the same address with a doubled handshake timeout.
                    attempt++;
                    connTimeout0 *= 2;
                }
            } catch (ClusterTopologyCheckedException e) {
                throw e;
            } catch (Exception e) {
                if (client != null) {
                    client.forceClose();
                    client = null;
                }
                onException("Client creation failed [addr=" + addr + ", err=" + e + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Client creation failed [addr=" + addr + ", err=" + e + ']');
                boolean failureDetThrReached = timeoutHelper.checkFailureTimeoutReached(e);
                if (enableTroubleshootingLog)
                    U.error(log, "Failed to establish connection to a remote node [node=" + node + ", addr=" + addr
                        + ", connectAttempts=" + connectAttempts
                        + ", failureDetThrReached=" + failureDetThrReached + ']', e);
                if (failureDetThrReached)
                    LT.warn(log, "Connect timed out (consider increasing 'failureDetectionTimeout' "
                        + "configuration property) [addr=" + addr
                        + ", failureDetectionTimeout=" + failureDetectionTimeout() + ']');
                else if (X.hasCause(e, SocketTimeoutException.class))
                    LT.warn(log, "Connect timed out (consider increasing 'connTimeout' "
                        + "configuration property) [addr=" + addr + ", connTimeout=" + connTimeout + ']');
                if (errs == null)
                    errs = new IgniteCheckedException("Failed to connect to node (is node still alive?). "
                        + "Make sure that each ComputeTask and cache Transaction has a timeout set "
                        + "in order to prevent parties from waiting forever in case of network issues "
                        + "[nodeId=" + node.id() + ", addrs=" + addrs + ']');
                errs.addSuppressed(new IgniteCheckedException("Failed to connect to address "
                    + "[addr=" + addr + ", err=" + e.getMessage() + ']', e));
                // Reconnect for the second time, if connection is not established.
                if (!failureDetThrReached && connectAttempts < 5
                    && (X.hasCause(e, ConnectException.class, HandshakeException.class, SocketTimeoutException.class))) {
                    U.sleep(200);
                    connectAttempts++;
                    continue;
                }
                break;
            }
        }
        if (client != null)
            break;
    }
    if (client == null) {
        assert errs != null;
        if (X.hasCause(errs, ConnectException.class))
            LT.warn(log, "Failed to connect to a remote node " + "(make sure that destination node is alive and "
                + "operating system firewall is disabled on local and remote hosts) " + "[addrs=" + addrs + ']');
        if (enableForcibleNodeKill) {
            if (getSpiContext().node(node.id()) != null
                && (CU.clientNode(node) || !CU.clientNode(getLocalNode())) && connectionError(errs)) {
                String msg = "TcpCommunicationSpi failed to establish connection to node, node will be dropped from "
                    + "cluster [" + "rmtNode=" + node + ']';
                if (enableTroubleshootingLog)
                    U.error(log, msg, errs);
                else
                    U.warn(log, msg);
                getSpiContext().failNode(node.id(), "TcpCommunicationSpi failed to establish connection to node ["
                    + "rmtNode=" + node + ", errs=" + errs
                    + ", connectErrs=" + Arrays.toString(errs.getSuppressed()) + ']');
            }
        }
        // Timeout-type failures are not rethrown; the method returns null in that case.
        if (!X.hasCause(errs, SocketTimeoutException.class, HandshakeTimeoutException.class,
            IgniteSpiOperationTimeoutException.class))
            throw errs;
    }
    return client;
}
Also used : SocketChannel(java.nio.channels.SocketChannel) SocketException(java.net.SocketException) GridNioSession(org.apache.ignite.internal.util.nio.GridNioSession) HashMap(java.util.HashMap) InetSocketAddress(java.net.InetSocketAddress) SSLEngine(javax.net.ssl.SSLEngine) GridSslMeta(org.apache.ignite.internal.util.nio.ssl.GridSslMeta) GridTcpNioCommunicationClient(org.apache.ignite.internal.util.nio.GridTcpNioCommunicationClient) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) ConnectException(java.net.ConnectException) GridCommunicationClient(org.apache.ignite.internal.util.nio.GridCommunicationClient) IpcEndpoint(org.apache.ignite.internal.util.ipc.IpcEndpoint) IpcSharedMemoryServerEndpoint(org.apache.ignite.internal.util.ipc.shmem.IpcSharedMemoryServerEndpoint) IgniteClientDisconnectedException(org.apache.ignite.IgniteClientDisconnectedException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) SSLException(javax.net.ssl.SSLException) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) SocketException(java.net.SocketException) SocketTimeoutException(java.net.SocketTimeoutException) IgniteInterruptedCheckedException(org.apache.ignite.internal.IgniteInterruptedCheckedException) ConnectException(java.net.ConnectException) IpcOutOfSystemResourcesException(org.apache.ignite.internal.util.ipc.shmem.IpcOutOfSystemResourcesException) IOException(java.io.IOException) IgniteClientDisconnectedCheckedException(org.apache.ignite.internal.IgniteClientDisconnectedCheckedException) IgniteException(org.apache.ignite.IgniteException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) IgniteSpiOperationTimeoutHelper(org.apache.ignite.spi.IgniteSpiOperationTimeoutHelper) 
SocketTimeoutException(java.net.SocketTimeoutException) GridNioRecoveryDescriptor(org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor) IgniteSpiTimeoutObject(org.apache.ignite.spi.IgniteSpiTimeoutObject) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)

Example 7 with IgniteSpiOperationTimeoutException

use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.

the class TcpCommunicationSpi method createShmemClient.

/**
 * Creates a shared-memory communication client and performs the handshake with the remote node over it.
 * <p>
 * Client construction is retried once when it fails with a {@link ConnectException} cause and the failure
 * timeout has not been reached. A timed-out handshake is retried with a doubled timeout until either
 * {@code reconCnt} attempts were made or the timeout exceeds {@code maxConnTimeout}; when failure detection
 * timeout is enabled the timeout is rethrown instead of being retried.
 *
 * @param node Node.
 * @param connIdx Connection index.
 * @param port Port.
 * @return Client.
 * @throws IgniteCheckedException If failed.
 */
@Nullable
private GridCommunicationClient createShmemClient(ClusterNode node, int connIdx, Integer port) throws IgniteCheckedException {
    int handshakeTry = 1;
    int connectTry = 1;
    long handshakeTimeout = connTimeout;
    IgniteSpiOperationTimeoutHelper timeouts = new IgniteSpiOperationTimeoutHelper(this, !node.isClient());

    for (;;) {
        GridCommunicationClient shmemClient;

        try {
            shmemClient = new GridShmemCommunicationClient(
                connIdx,
                metricsLsnr,
                port,
                timeouts.nextTimeoutChunk(connTimeout),
                log,
                getSpiContext().messageFormatter());
        } catch (IgniteCheckedException e) {
            // Reconnect for the second time, if connection is not established.
            boolean retryConnect = !timeouts.checkFailureTimeoutReached(e)
                && connectTry < 2
                && X.hasCause(e, ConnectException.class);

            if (!retryConnect)
                throw e;

            connectTry++;

            continue;
        }

        try {
            safeShmemHandshake(shmemClient, node.id(), timeouts.nextTimeoutChunk(handshakeTimeout));

            return shmemClient;
        } catch (HandshakeTimeoutException | IgniteSpiOperationTimeoutException e) {
            shmemClient.forceClose();

            if (failureDetectionTimeoutEnabled()
                && (e instanceof HandshakeTimeoutException || timeouts.checkFailureTimeoutReached(e))) {
                if (log.isDebugEnabled())
                    log.debug("Handshake timed out (failure threshold reached) [failureDetectionTimeout="
                        + failureDetectionTimeout() + ", err=" + e.getMessage() + ", client=" + shmemClient + ']');

                throw e;
            }

            assert !failureDetectionTimeoutEnabled();

            if (log.isDebugEnabled())
                log.debug("Handshake timed out (will retry with increased timeout) [timeout=" + handshakeTimeout
                    + ", err=" + e.getMessage() + ", client=" + shmemClient + ']');

            boolean outOfAttempts = handshakeTry == reconCnt || handshakeTimeout > maxConnTimeout;

            if (outOfAttempts) {
                if (log.isDebugEnabled())
                    log.debug("Handshake timedout (will stop attempts to perform the handshake) "
                        + "[timeout=" + handshakeTimeout + ", maxConnTimeout=" + maxConnTimeout
                        + ", attempt=" + handshakeTry + ", reconCnt=" + reconCnt
                        + ", err=" + e.getMessage() + ", client=" + shmemClient + ']');

                throw e;
            }

            // Fall through to the next loop iteration with a doubled handshake timeout.
            handshakeTry++;
            handshakeTimeout *= 2;
        } catch (IgniteCheckedException | RuntimeException | Error e) {
            if (log.isDebugEnabled())
                log.debug("Caught exception (will close client) [err=" + e.getMessage()
                    + ", client=" + shmemClient + ']');

            shmemClient.forceClose();

            throw e;
        }
    }
}
Also used : IgniteSpiOperationTimeoutHelper(org.apache.ignite.spi.IgniteSpiOperationTimeoutHelper) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) GridShmemCommunicationClient(org.apache.ignite.internal.util.nio.GridShmemCommunicationClient) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) GridCommunicationClient(org.apache.ignite.internal.util.nio.GridCommunicationClient) IpcEndpoint(org.apache.ignite.internal.util.ipc.IpcEndpoint) IpcSharedMemoryServerEndpoint(org.apache.ignite.internal.util.ipc.shmem.IpcSharedMemoryServerEndpoint) Nullable(org.jetbrains.annotations.Nullable)

Example 8 with IgniteSpiOperationTimeoutException

use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.

the class GridNioServerWrapper method createNioSession.

/**
 * Returns the established TCP/IP connection between the current node and remote server. A handshake process of
 * negotiation between two communicating nodes will be performed before the {@link GridNioSession} is created.
 * <p>
 * The handshaking process consists of these steps:
 *
 * <ol>
 * <li>The local node opens a new {@link SocketChannel} in the <em>blocking</em> mode.</li>
 * <li>The local node calls {@link SocketChannel#connect(SocketAddress)} to remote node.</li>
 * <li>The remote GridNioAcceptWorker thread accepts new connection.</li>
 * <li>The remote node sends back the {@link NodeIdMessage}.</li>
 * <li>The local node reads NodeIdMessage from created channel.</li>
 * <li>The local node sends the {@link HandshakeMessage2} to remote.</li>
 * <li>The remote node processes {@link HandshakeMessage2} in {@link GridNioServerListener#onMessage(GridNioSession,
 * Object)}.</li>
 * <li>The remote node sends back the {@link RecoveryLastReceivedMessage}.</li>
 * </ol>
 *
 * The handshaking process ends.
 * </p>
 * <p>
 * <em>Note.</em> The {@link HandshakeTimeoutObject} is created to control execution timeout during the
 * whole handshaking process.
 * </p>
 * <p>
 * A {@code NodeUnreachableException} is thrown when the local node is a server, the calling thread is not an
 * {@code IgniteDiscoveryThread}, the remote node is a client, and either the client runs in
 * 'forceClientToServerConnections' mode or all of its non-skipped addresses were recorded as failed
 * (the latter is not checked when paired connections are configured locally and enabled on the remote node).
 * </p>
 *
 * @param node Remote node identifier to connect with.
 * @param connIdx Connection index based on configured {@link ConnectionPolicy}.
 * @return A {@link GridNioSession} connection representation, or {@code null} if the recovery descriptor could
 *      not be reserved or the handshake reported {@code ALREADY_CONNECTED}.
 * @throws IgniteCheckedException If establish connection fails.
 */
public GridNioSession createNioSession(ClusterNode node, int connIdx) throws IgniteCheckedException {
    boolean locNodeIsSrv = !locNodeSupplier.get().isClient() && !locNodeSupplier.get().isDaemon();
    if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
        if (node.isClient() && forceClientToServerConnections(node)) {
            String msg = "Failed to connect to node " + node.id() + " because it is started"
                + " in 'forceClientToServerConnections' mode; inverse connection will be requested.";
            throw new NodeUnreachableException(msg);
        }
    }
    Collection<InetSocketAddress> addrs = nodeAddresses(node, cfg.filterReachableAddresses(), attrs, locNodeSupplier);
    GridNioSession ses = null;
    IgniteCheckedException errs = null;
    // Overall time budget given to the timeout strategy created for each address.
    long totalTimeout;
    if (cfg.failureDetectionTimeoutEnabled())
        totalTimeout = node.isClient() ? stateProvider.clientFailureDetectionTimeout() : cfg.failureDetectionTimeout();
    else {
        totalTimeout = ExponentialBackoffTimeoutStrategy.totalBackoffTimeout(cfg.connectionTimeout(),
            cfg.maxConnectionTimeout(), cfg.reconCount());
    }
    Set<InetSocketAddress> failedAddrsSet = new HashSet<>();
    int skippedAddrs = 0;
    for (InetSocketAddress addr : addrs) {
        if (addr.isUnresolved()) {
            failedAddrsSet.add(addr);
            continue;
        }
        TimeoutStrategy connTimeoutStgy = new ExponentialBackoffTimeoutStrategy(totalTimeout,
            cfg.failureDetectionTimeoutEnabled() ? DFLT_INITIAL_TIMEOUT : cfg.connectionTimeout(),
            cfg.maxConnectionTimeout());
        while (ses == null) {
            // Reconnection on handshake timeout.
            if (stopping)
                throw new IgniteSpiException("Node is stopping.");
            if (isLocalNodeAddress(addr)) {
                if (log.isDebugEnabled())
                    log.debug("Skipping local address [addr=" + addr + ", locAddrs=" + node.attribute(attrs.addresses())
                        + ", node=" + node + ']');
                skippedAddrs++;
                break;
            }
            long timeout = 0;
            connectGate.enter();
            try {
                if (nodeGetter.apply(node.id()) == null)
                    throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
                SocketChannel ch = socketChannelFactory.get();
                ConnectionKey connKey;
                GridNioRecoveryDescriptor recoveryDesc;
                try {
                    ch.configureBlocking(true);
                    ch.socket().setTcpNoDelay(cfg.tcpNoDelay());
                    ch.socket().setKeepAlive(true);
                    if (cfg.socketReceiveBuffer() > 0)
                        ch.socket().setReceiveBufferSize(cfg.socketReceiveBuffer());
                    if (cfg.socketSendBuffer() > 0)
                        ch.socket().setSendBufferSize(cfg.socketSendBuffer());
                    connKey = new ConnectionKey(node.id(), connIdx, -1);
                    recoveryDesc = outRecoveryDescriptor(node, connKey);
                    assert recoveryDesc != null : "Recovery descriptor not found [connKey=" + connKey
                        + ", rmtNode=" + node.id() + ']';
                    if (!recoveryDesc.reserve()) {
                        U.closeQuiet(ch);
                        // Ensure the session is closed.
                        GridNioSession sesFromRecovery = recoveryDesc.session();
                        if (sesFromRecovery != null) {
                            while (sesFromRecovery.closeTime() == 0) sesFromRecovery.close();
                        }
                        return null;
                    }
                } catch (Throwable e) {
                    // The cleanup in the 'finally' below is not active yet, so without this handler a failure
                    // while configuring the socket or obtaining the descriptor would leak the open channel.
                    U.closeQuiet(ch);
                    throw e;
                }
                long rcvCnt;
                Map<Integer, Object> meta = new HashMap<>();
                GridSslMeta sslMeta = null;
                try {
                    timeout = connTimeoutStgy.nextTimeout();
                    ch.socket().connect(addr, (int) timeout);
                    if (nodeGetter.apply(node.id()) == null)
                        throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
                    if (stateProvider.isSslEnabled()) {
                        meta.put(SSL_META.ordinal(), sslMeta = new GridSslMeta());
                        SSLEngine sslEngine = stateProvider.createSSLEngine();
                        sslEngine.setUseClientMode(true);
                        sslMeta.sslEngine(sslEngine);
                    }
                    ClusterNode locNode = locNodeSupplier.get();
                    if (locNode == null)
                        throw new IgniteCheckedException("Local node has not been started or "
                            + "fully initialized [isStopping=" + stateProvider.isStopping() + ']');
                    timeout = connTimeoutStgy.nextTimeout(timeout);
                    rcvCnt = safeTcpHandshake(ch, node.id(), timeout, sslMeta,
                        new HandshakeMessage2(locNode.id(), recoveryDesc.incrementConnectCount(),
                            recoveryDesc.received(), connIdx));
                    if (rcvCnt == ALREADY_CONNECTED)
                        return null;
                    else if (rcvCnt == NODE_STOPPING) {
                        // Safe to remap on remote node stopping.
                        throw new ClusterTopologyCheckedException("Remote node started stop procedure: " + node.id());
                    } else if (rcvCnt == UNKNOWN_NODE)
                        throw new IgniteCheckedException("Remote node does not observe current node "
                            + "in topology : " + node.id());
                    else if (rcvCnt == NEED_WAIT) {
                        // scenarios with delayed client node join.
                        if (log.isDebugEnabled())
                            log.debug("NEED_WAIT received, handshake after delay [node = " + node
                                + ", outOfTopologyDelay = " + DFLT_NEED_WAIT_DELAY + "ms]");
                        U.sleep(DFLT_NEED_WAIT_DELAY);
                        continue;
                    } else if (rcvCnt < 0)
                        throw new IgniteCheckedException("Unsupported negative receivedCount [rcvCnt=" + rcvCnt
                            + ", senderNode=" + node + ']');
                    recoveryDesc.onHandshake(rcvCnt);
                    meta.put(CONSISTENT_ID_META, node.consistentId());
                    meta.put(CONN_IDX_META, connKey);
                    meta.put(GridNioServer.RECOVERY_DESC_META_KEY, recoveryDesc);
                    ses = nioSrv.createSession(ch, meta, false, null).get();
                } finally {
                    if (ses == null) {
                        U.closeQuiet(ch);
                        // The descriptor was asserted non-null and reserved above.
                        recoveryDesc.release();
                    }
                }
            } catch (IgniteSpiOperationTimeoutException e) {
                // Handshake is timed out.
                if (ses != null) {
                    ses.close();
                    ses = null;
                }
                eRegistrySupplier.get().onException("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy="
                    + connTimeoutStgy + ", addr=" + addr + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy="
                        + connTimeoutStgy + ", addr=" + addr + ", err=" + e + ']');
                if (connTimeoutStgy.checkTimeout()) {
                    U.warn(log, "Handshake timed out (will stop attempts to perform the handshake) "
                        + "[node=" + node.id() + ", connTimeoutStrategy=" + connTimeoutStgy
                        + ", err=" + e.getMessage() + ", addr=" + addr
                        + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled()
                        + ", timeout=" + timeout + ']');
                    String msg = "Failed to connect to node (is node still alive?). "
                        + "Make sure that each ComputeTask and cache Transaction has a timeout set "
                        + "in order to prevent parties from waiting forever in case of network issues "
                        + "[nodeId=" + node.id() + ", addrs=" + addrs + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                    else
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
                    break;
                }
            } catch (ClusterTopologyCheckedException e) {
                throw e;
            } catch (Exception e) {
                // Most probably IO error on socket connect or handshake.
                if (ses != null) {
                    ses.close();
                    ses = null;
                }
                eRegistrySupplier.get().onException("Client creation failed [addr=" + addr + ", err=" + e + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Client creation failed [addr=" + addr + ", err=" + e + ']');
                if (X.hasCause(e, "Too many open files", SocketException.class))
                    throw new IgniteTooManyOpenFilesException(e);
                // check if timeout occured in case of unrecoverable exception
                if (connTimeoutStgy.checkTimeout()) {
                    U.warn(log, "Connection timed out (will stop attempts to perform the connect) "
                        + "[node=" + node.id() + ", connTimeoutStgy=" + connTimeoutStgy
                        + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled()
                        + ", timeout=" + timeout + ", err=" + e.getMessage() + ", addr=" + addr + ']');
                    String msg = "Failed to connect to node (is node still alive?). "
                        + "Make sure that each ComputeTask and cache Transaction has a timeout set "
                        + "in order to prevent parties from waiting forever in case of network issues "
                        + "[nodeId=" + node.id() + ", addrs=" + addrs + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                    else
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
                    break;
                }
                // Inverse communication protocol works only for client nodes.
                if (node.isClient() && isNodeUnreachableException(e))
                    failedAddrsSet.add(addr);
                if (isRecoverableException(e))
                    U.sleep(DFLT_RECONNECT_DELAY);
                else {
                    String msg = "Failed to connect to node due to unrecoverable exception (is node still alive?). "
                        + "Make sure that each ComputeTask and cache Transaction has a timeout set "
                        + "in order to prevent parties from waiting forever in case of network issues "
                        + "[nodeId=" + node.id() + ", addrs=" + addrs + ", err= " + e + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                    else
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
                    break;
                }
            } finally {
                connectGate.leave();
            }
            // Reached only when the iteration neither broke out of the loop nor continued early.
            CommunicationWorker commWorker0 = commWorker;
            if (commWorker0 != null && commWorker0.runner() == Thread.currentThread())
                commWorker0.updateHeartbeat();
        }
        if (ses != null)
            break;
    }
    if (ses == null) {
        // inverse connection so no point in throwing NodeUnreachableException
        if (!cfg.usePairedConnections() || !Boolean.TRUE.equals(node.attribute(attrs.pairedConnection()))) {
            if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
                if (node.isClient() && (addrs.size() - skippedAddrs == failedAddrsSet.size())) {
                    String msg = "Failed to connect to all addresses of node " + node.id() + ": " + failedAddrsSet
                        + "; inverse connection will be requested.";
                    throw new NodeUnreachableException(msg);
                }
            }
        }
        processSessionCreationError(node, addrs, errs == null ? new IgniteCheckedException("No session found") : errs);
    }
    return ses;
}
Also used : SocketChannel(java.nio.channels.SocketChannel) SocketException(java.net.SocketException) GridNioSession(org.apache.ignite.internal.util.nio.GridNioSession) HashMap(java.util.HashMap) InetSocketAddress(java.net.InetSocketAddress) SSLEngine(javax.net.ssl.SSLEngine) GridSslMeta(org.apache.ignite.internal.util.nio.ssl.GridSslMeta) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) HashSet(java.util.HashSet) ExponentialBackoffTimeoutStrategy(org.apache.ignite.spi.ExponentialBackoffTimeoutStrategy) ClusterNode(org.apache.ignite.cluster.ClusterNode) HandshakeMessage2(org.apache.ignite.spi.communication.tcp.messages.HandshakeMessage2) ExponentialBackoffTimeoutStrategy(org.apache.ignite.spi.ExponentialBackoffTimeoutStrategy) TimeoutStrategy(org.apache.ignite.spi.TimeoutStrategy) IgniteDiscoveryThread(org.apache.ignite.spi.discovery.IgniteDiscoveryThread) CommunicationTcpUtils.handshakeTimeoutException(org.apache.ignite.spi.communication.tcp.internal.CommunicationTcpUtils.handshakeTimeoutException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) SSLException(javax.net.ssl.SSLException) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) SocketException(java.net.SocketException) SocketTimeoutException(java.net.SocketTimeoutException) IOException(java.io.IOException) CommunicationTcpUtils.isRecoverableException(org.apache.ignite.spi.communication.tcp.internal.CommunicationTcpUtils.isRecoverableException) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) 
AtomicInteger(java.util.concurrent.atomic.AtomicInteger) GridNioRecoveryDescriptor(org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)

Example 9 with IgniteSpiOperationTimeoutException

use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project gridgain by gridgain.

the class GridNioServerWrapper method createNioSession.

/**
 * Returns the established TCP/IP connection between the current node and remote server. A handshake process of
 * negotiation between two communicating nodes will be performed before the {@link GridNioSession} is created.
 * <p>
 * The handshaking process consists of the following steps:
 *
 * <ol>
 * <li>The local node opens a new {@link SocketChannel} in the <em>blocking</em> mode.</li>
 * <li>The local node calls {@link SocketChannel#connect(SocketAddress)} to remote node.</li>
 * <li>The remote GridNioAcceptWorker thread accepts new connection.</li>
 * <li>The remote node sends back the {@link NodeIdMessage}.</li>
 * <li>The local node reads NodeIdMessage from created channel.</li>
 * <li>The local node sends the {@link HandshakeMessage2} to remote.</li>
 * <li>The remote node processes {@link HandshakeMessage2} in {@link GridNioServerListener#onMessage(GridNioSession,
 * Object)}.</li>
 * <li>The remote node sends back the {@link RecoveryLastReceivedMessage}.</li>
 * </ol>
 *
 * The handshaking process ends.
 * </p>
 * <p>
 * <em>Note.</em> The {@link HandshakeTimeoutObject} is created to control execution timeout during the
 * whole handshaking process.
 * </p>
 * <p>
 * Every address of the remote node is tried in turn. For a single address the connect/handshake is retried in a
 * loop until a session is created, the timeout strategy reports that the time budget is exhausted, or an
 * unrecoverable error occurs; in the latter two cases the error is accumulated and the next address is tried.
 * </p>
 *
 * @param node Remote node identifier to connect with.
 * @param connIdx Connection index based on configured {@link ConnectionPolicy}.
 * @return A {@link GridNioSession} connection representation. {@code null} is returned when the recovery
 *      descriptor for the connection cannot be reserved or when the handshake answers {@code ALREADY_CONNECTED};
 *      {@code null} may also be returned if no address succeeded and {@code processSessionCreationError} returns
 *      normally (its behaviour is not visible here).
 * @throws IgniteCheckedException If establish connection fails.
 */
public GridNioSession createNioSession(ClusterNode node, int connIdx) throws IgniteCheckedException {
    boolean locNodeIsSrv = !locNodeSupplier.get().isClient() && !locNodeSupplier.get().isDaemon();
    // A server node (outside of a discovery thread) does not open a direct connection to a client that forces
    // client-to-server connections; the exception message states that an inverse connection will be requested.
    if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
        if (node.isClient() && forceClientToServerConnections(node)) {
            String msg = "Failed to connect to node " + node.id() + " because it is started" + " in 'forceClientToServerConnections' mode; inverse connection will be requested.";
            throw new NodeUnreachableException(msg);
        }
    }
    Collection<InetSocketAddress> addrs = nodeAddresses(node, cfg.filterReachableAddresses(), attrs, locNodeSupplier);
    GridNioSession ses = null;
    // Accumulates per-address failures: the first one becomes the main exception, later ones are suppressed.
    IgniteCheckedException errs = null;
    // Overall time budget handed to the per-address timeout strategy.
    long totalTimeout;
    if (cfg.failureDetectionTimeoutEnabled())
        totalTimeout = node.isClient() ? stateProvider.clientFailureDetectionTimeout() : cfg.failureDetectionTimeout();
    else {
        totalTimeout = ExponentialBackoffTimeoutStrategy.totalBackoffTimeout(cfg.connectionTimeout(), cfg.maxConnectionTimeout(), cfg.reconCount());
    }
    // Addresses considered unreachable (unresolved, or failed with a "node unreachable" error for a client node).
    Set<InetSocketAddress> failedAddrsSet = new HashSet<>();
    // Number of addresses skipped because they point to the local node.
    int skippedAddrs = 0;
    for (InetSocketAddress addr : addrs) {
        if (addr.isUnresolved()) {
            failedAddrsSet.add(addr);
            continue;
        }
        // A fresh timeout strategy is created for every address.
        TimeoutStrategy connTimeoutStgy = new ExponentialBackoffTimeoutStrategy(totalTimeout, cfg.failureDetectionTimeoutEnabled() ? DFLT_INITIAL_TIMEOUT : cfg.connectionTimeout(), cfg.maxConnectionTimeout());
        // Retry loop for the current address: 'break' moves on to the next address.
        while (ses == null) {
            // Reconnection on handshake timeout.
            if (stopping)
                throw new GridNioException("Failed to create session, server is stopped.");
            if (isLocalNodeAddress(addr)) {
                if (log.isDebugEnabled())
                    log.debug("Skipping local address [addr=" + addr + ", locAddrs=" + node.attribute(attrs.addresses()) + ", node=" + node + ']');
                skippedAddrs++;
                break;
            }
            // Last timeout value obtained from the strategy; only used in the warning messages below.
            long timeout = 0;
            connectGate.enter();
            try {
                if (nodeGetter.apply(node.id()) == null)
                    throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
                // NOTE(review): 'ch' is closed only by the inner try/finally below (or on failed reserve()); if one
                // of the configuration calls before that inner try throws, the channel does not appear to be
                // closed here - confirm whether this is intended.
                SocketChannel ch = socketChannelFactory.get();
                ch.configureBlocking(true);
                ch.socket().setTcpNoDelay(cfg.tcpNoDelay());
                ch.socket().setKeepAlive(true);
                if (cfg.socketReceiveBuffer() > 0)
                    ch.socket().setReceiveBufferSize(cfg.socketReceiveBuffer());
                if (cfg.socketSendBuffer() > 0)
                    ch.socket().setSendBufferSize(cfg.socketSendBuffer());
                ConnectionKey connKey = new ConnectionKey(node.id(), connIdx, -1);
                GridNioRecoveryDescriptor recoveryDesc = outRecoveryDescriptor(node, connKey);
                assert recoveryDesc != null : "Recovery descriptor not found [connKey=" + connKey + ", rmtNode=" + node.id() + ']';
                // The descriptor could not be reserved: drop the new channel and give up with a null result.
                if (!recoveryDesc.reserve()) {
                    U.closeQuiet(ch);
                    // Ensure the session is closed.
                    GridNioSession sesFromRecovery = recoveryDesc.session();
                    if (sesFromRecovery != null) {
                        // Keep calling close() until the session reports a non-zero close time.
                        while (sesFromRecovery.closeTime() == 0) sesFromRecovery.close();
                    }
                    return null;
                }
                long rcvCnt;
                Map<Integer, Object> meta = new HashMap<>();
                GridSslMeta sslMeta = null;
                try {
                    if (stopping)
                        throw new GridNioException("Failed to create session, server is stopped.");
                    timeout = connTimeoutStgy.nextTimeout();
                    ch.socket().connect(addr, (int) timeout);
                    // Re-check topology after the (blocking) connect.
                    if (nodeGetter.apply(node.id()) == null)
                        throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
                    if (stateProvider.isSslEnabled()) {
                        meta.put(SSL_META.ordinal(), sslMeta = new GridSslMeta());
                        SSLEngine sslEngine = stateProvider.createSSLEngine();
                        sslEngine.setUseClientMode(true);
                        sslMeta.sslEngine(sslEngine);
                    }
                    ClusterNode locNode = locNodeSupplier.get();
                    if (locNode == null)
                        throw new IgniteCheckedException("Local node has not been started or " + "fully initialized [isStopping=" + stateProvider.isStopping() + ']');
                    timeout = connTimeoutStgy.nextTimeout(timeout);
                    // The handshake result is either a non-negative received-messages count or one of the
                    // special marker constants handled below.
                    rcvCnt = safeTcpHandshake(ch, node.id(), timeout, sslMeta, new HandshakeMessage2(locNode.id(), recoveryDesc.incrementConnectCount(), recoveryDesc.received(), connIdx));
                    if (rcvCnt == ALREADY_CONNECTED) {
                        // No new session is created; the finally block below closes 'ch' and releases the descriptor.
                        return null;
                    } else if (rcvCnt == NODE_STOPPING) {
                        // Safe to remap on remote node stopping.
                        throw new ClusterTopologyCheckedException("Remote node started stop procedure: " + node.id());
                    } else if (rcvCnt == UNKNOWN_NODE)
                        throw new IgniteCheckedException("Remote node does not observe current node " + "in topology : " + node.id());
                    else if (rcvCnt == NEED_WAIT) {
                        // Possible in scenarios with delayed client node join: sleep for DFLT_NEED_WAIT_DELAY and
                        // retry the same address. The 'continue' still runs both finally blocks but skips the
                        // heartbeat update at the end of the loop body.
                        if (log.isDebugEnabled())
                            log.debug("NEED_WAIT received, handshake after delay [node = " + node + ", outOfTopologyDelay = " + DFLT_NEED_WAIT_DELAY + "ms]");
                        U.sleep(DFLT_NEED_WAIT_DELAY);
                        continue;
                    } else if (rcvCnt < 0)
                        throw new IgniteCheckedException("Unsupported negative receivedCount [rcvCnt=" + rcvCnt + ", senderNode=" + node + ']');
                    recoveryDesc.onHandshake(rcvCnt);
                    meta.put(CONSISTENT_ID_META, node.consistentId());
                    meta.put(CONN_IDX_META, connKey);
                    meta.put(GridNioServer.RECOVERY_DESC_META_KEY, recoveryDesc);
                    ses = nioSrv.createSession(ch, meta, false, null).get();
                } finally {
                    // Any exit without a created session (exception, return, continue) closes the channel and
                    // releases the reserved recovery descriptor.
                    if (ses == null) {
                        U.closeQuiet(ch);
                        if (recoveryDesc != null)
                            recoveryDesc.release();
                    }
                }
            } catch (IgniteSpiOperationTimeoutException e) {
                // Handshake is timed out.
                // Defensive cleanup: 'ses' is only assigned by the last statement of the inner try above.
                if (ses != null) {
                    ses.close();
                    ses = null;
                }
                eRegistrySupplier.get().onException("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy=" + connTimeoutStgy + ", addr=" + addr + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy=" + connTimeoutStgy + ", addr=" + addr + ", err=" + e + ']');
                // Time budget for this address is exhausted: record the error and move on to the next address.
                // Otherwise the loop retries the same address.
                if (connTimeoutStgy.checkTimeout()) {
                    U.warn(log, "Handshake timed out (will stop attempts to perform the handshake) " + "[node=" + node.id() + ", connTimeoutStrategy=" + connTimeoutStgy + ", err=" + e.getMessage() + ", addr=" + addr + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled() + ", timeout=" + timeout + ']');
                    String msg = "Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                    else
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
                    break;
                }
            } catch (ClusterTopologyCheckedException e) {
                // Topology errors are propagated to the caller as is, without retries.
                throw e;
            } catch (Exception e) {
                // Most probably IO error on socket connect or handshake.
                if (ses != null) {
                    ses.close();
                    ses = null;
                }
                eRegistrySupplier.get().onException("Client creation failed [addr=" + addr + ", err=" + e + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Client creation failed [addr=" + addr + ", err=" + e + ']');
                if (X.hasCause(e, "Too many open files", SocketException.class))
                    throw new IgniteTooManyOpenFilesException(e);
                // check if timeout occurred in case of unrecoverable exception
                if (connTimeoutStgy.checkTimeout()) {
                    U.warn(log, "Connection timed out (will stop attempts to perform the connect) " + "[node=" + node.id() + ", connTimeoutStgy=" + connTimeoutStgy + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled() + ", timeout=" + timeout + ", err=" + e.getMessage() + ", addr=" + addr + ']');
                    String msg = "Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                    else
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
                    break;
                }
                // Inverse communication protocol works only for client nodes.
                if (node.isClient() && isNodeUnreachableException(e))
                    failedAddrsSet.add(addr);
                // Recoverable errors: pause for DFLT_RECONNECT_DELAY and retry the same address.
                // Unrecoverable errors: record the failure and move on to the next address.
                if (isRecoverableException(e))
                    U.sleep(DFLT_RECONNECT_DELAY);
                else {
                    String msg = "Failed to connect to node due to unrecoverable exception (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ", err= " + e + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                    else
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
                    break;
                }
            } finally {
                // Always paired with connectGate.enter() above.
                connectGate.leave();
            }
            // When running on the communication worker's own thread, report progress between attempts.
            CommunicationWorker commWorker0 = commWorker;
            if (commWorker0 != null && commWorker0.runner() == Thread.currentThread())
                commWorker0.updateHeartbeat();
        }
        // A session was created for this address: no need to try the remaining ones.
        if (ses != null)
            break;
    }
    if (ses == null) {
        // The NodeUnreachableException path is skipped when paired connections are enabled locally and the remote
        // node advertises them too (original note: inverse connection so no point in throwing
        // NodeUnreachableException).
        if (!cfg.usePairedConnections() || !Boolean.TRUE.equals(node.attribute(attrs.pairedConnection()))) {
            if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
                // Every address that was not skipped as local ended up in the failed set.
                if (node.isClient() && (addrs.size() - skippedAddrs == failedAddrsSet.size())) {
                    String msg = "Failed to connect to all addresses of node " + node.id() + ": " + failedAddrsSet + "; inverse connection will be requested.";
                    throw new NodeUnreachableException(msg);
                }
            }
        }
        processSessionCreationError(node, addrs, errs == null ? new IgniteCheckedException("No session found") : errs);
    }
    return ses;
}
Also used : SocketChannel(java.nio.channels.SocketChannel) SocketException(java.net.SocketException) GridNioSession(org.apache.ignite.internal.util.nio.GridNioSession) HashMap(java.util.HashMap) InetSocketAddress(java.net.InetSocketAddress) SSLEngine(javax.net.ssl.SSLEngine) GridSslMeta(org.apache.ignite.internal.util.nio.ssl.GridSslMeta) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) HashSet(java.util.HashSet) ExponentialBackoffTimeoutStrategy(org.apache.ignite.spi.ExponentialBackoffTimeoutStrategy) ClusterNode(org.apache.ignite.cluster.ClusterNode) HandshakeMessage2(org.apache.ignite.spi.communication.tcp.messages.HandshakeMessage2) ExponentialBackoffTimeoutStrategy(org.apache.ignite.spi.ExponentialBackoffTimeoutStrategy) TimeoutStrategy(org.apache.ignite.spi.TimeoutStrategy) IgniteDiscoveryThread(org.apache.ignite.spi.discovery.IgniteDiscoveryThread) CommunicationTcpUtils.handshakeTimeoutException(org.apache.ignite.spi.communication.tcp.internal.CommunicationTcpUtils.handshakeTimeoutException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) SSLException(javax.net.ssl.SSLException) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) SocketException(java.net.SocketException) SocketTimeoutException(java.net.SocketTimeoutException) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) IOException(java.io.IOException) CommunicationTcpUtils.isRecoverableException(org.apache.ignite.spi.communication.tcp.internal.CommunicationTcpUtils.isRecoverableException) 
GridNioException(org.apache.ignite.internal.util.nio.GridNioException) GridNioException(org.apache.ignite.internal.util.nio.GridNioException) GridNioRecoveryDescriptor(org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)

Example 10 with IgniteSpiOperationTimeoutException

use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project gridgain by gridgain.

the class TcpDiscoverySpi method openSocket.

/**
 * Opens a connection from the given socket to the remote address and, once connected, writes
 * {@code U.IGNITE_HEADER} to it.
 * <p>
 * If the remote address is unresolved, it is resolved by host name first. On an {@link IOException} or
 * {@link IgniteSpiOperationTimeoutException} the socket is closed quietly and the exception is rethrown.
 *
 * @param sock Socket bound to a local host address.
 * @param remAddr Remote address.
 * @param timeoutHelper Timeout helper supplying the timeout for the connect and for the header write.
 * @return The same socket instance, connected.
 * @throws IOException If failed.
 * @throws IgniteSpiOperationTimeoutException In case of timeout.
 */
protected Socket openSocket(Socket sock, InetSocketAddress remAddr, IgniteSpiOperationTimeoutHelper timeoutHelper) throws IOException, IgniteSpiOperationTimeoutException {
    assert remAddr != null;
    try {
        // Resolve the host name only when the caller passed an unresolved address.
        InetSocketAddress target = remAddr;
        if (remAddr.isUnresolved()) {
            InetAddress lookedUp = InetAddress.getByName(remAddr.getHostName());
            target = new InetSocketAddress(lookedUp, remAddr.getPort());
        }
        assert target.getAddress() != null;
        // Each step asks the helper for its own timeout chunk.
        int connectTimeout = (int) timeoutHelper.nextTimeoutChunk(sockTimeout);
        sock.connect(target, connectTimeout);
        writeToSocket(sock, null, U.IGNITE_HEADER, timeoutHelper.nextTimeoutChunk(sockTimeout));
        return sock;
    } catch (IOException | IgniteSpiOperationTimeoutException failure) {
        // Do not hand a half-opened socket back to the caller.
        if (sock != null) {
            U.closeQuiet(sock);
        }
        throw failure;
    }
}
Also used : InetSocketAddress(java.net.InetSocketAddress) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) IOException(java.io.IOException) InetAddress(java.net.InetAddress)

Aggregations

IgniteSpiOperationTimeoutException (org.apache.ignite.spi.IgniteSpiOperationTimeoutException)10 IOException (java.io.IOException)8 InetSocketAddress (java.net.InetSocketAddress)7 IgniteCheckedException (org.apache.ignite.IgniteCheckedException)6 SocketTimeoutException (java.net.SocketTimeoutException)5 IgniteSpiOperationTimeoutHelper (org.apache.ignite.spi.IgniteSpiOperationTimeoutHelper)5 SocketException (java.net.SocketException)3 SocketChannel (java.nio.channels.SocketChannel)3 HashMap (java.util.HashMap)3 SSLEngine (javax.net.ssl.SSLEngine)3 SSLException (javax.net.ssl.SSLException)3 ClusterNode (org.apache.ignite.cluster.ClusterNode)3 ClusterTopologyCheckedException (org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)3 IpcEndpoint (org.apache.ignite.internal.util.ipc.IpcEndpoint)3 IpcSharedMemoryServerEndpoint (org.apache.ignite.internal.util.ipc.shmem.IpcSharedMemoryServerEndpoint)3 GridCommunicationClient (org.apache.ignite.internal.util.nio.GridCommunicationClient)3 GridNioRecoveryDescriptor (org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor)3 GridNioSession (org.apache.ignite.internal.util.nio.GridNioSession)3 GridSslMeta (org.apache.ignite.internal.util.nio.ssl.GridSslMeta)3 IgniteSpiException (org.apache.ignite.spi.IgniteSpiException)3