Search in sources :

Example 1 with IgniteTooManyOpenFilesException

use of org.apache.ignite.internal.IgniteTooManyOpenFilesException in project ignite by apache.

the class CommunicationWorker method processDisconnect.

 * @param sesInfo Disconnected session information.
// Handles a disconnected session: takes its recovery descriptor and tries to reserve a client
// for the same node and connection index, logging or registering whatever failure occurs.
// NOTE(review): this listing is a truncated excerpt. Several closing braces and call arguments
// (for example the argument of nodeGetter.apply) are missing, so it does not compile as shown.
private void processDisconnect(DisconnectedSessionInfo sesInfo) {
    GridNioRecoveryDescriptor recoveryDesc = sesInfo.recoveryDescription();
    ClusterNode node = recoveryDesc.node();
    // NOTE(review): the rest of this liveness check and its body are cut off in the excerpt.
    if (!recoveryDesc.nodeAlive(nodeGetter.apply(
    try {
        if (log.isDebugEnabled())
            log.debug("Recovery reconnect [rmtNode=" + recoveryDesc.node().id() + ']');
        // The reserved client is not used further in the visible part of the method.
        GridCommunicationClient client = clientPool.reserveClient(node, sesInfo.connectionIndex());
    } catch (ClusterTopologyCheckedException e) {
        // Only logged at debug level; nothing is rethrown or registered on this path.
        if (log.isDebugEnabled())
            log.debug("Recovery reconnect failed, node stopping [rmtNode=" + recoveryDesc.node().id() + ']');
    } catch (IgniteTooManyOpenFilesException e) {
        // Reported to the exception registry and then propagated to the caller.
        eRegistrySupplier.get().onException(e.getMessage(), e);
        throw e;
    } catch (IgniteCheckedException | IgniteException e) {
        try {
            // Distinguishes "will retry" from "node left" by a liveness check combined with a ping.
            // NOTE(review): both call argument lists are truncated in this excerpt.
            if (recoveryDesc.nodeAlive(nodeGetter.apply( && pingNode.apply( {
                if (log.isDebugEnabled()) {
                    log.debug("Recovery reconnect failed, will retry " + "[rmtNode=" + recoveryDesc.node().id() + ", err=" + e + ']');
            } else {
                if (log.isDebugEnabled()) {
                    log.debug("Recovery reconnect failed, " + "node left [rmtNode=" + recoveryDesc.node().id() + ", err=" + e + ']');
                // In the "node left" branch the failure is also handed to the exception registry.
                eRegistrySupplier.get().onException("Recovery reconnect failed, node left [rmtNode=" + recoveryDesc.node().id() + "]", e);
        } catch (IgniteClientDisconnectedException ignored) {
            // Deliberately ignored apart from the debug message below.
            if (log.isDebugEnabled())
                log.debug("Failed to ping node, client disconnected.");
Also used : ClusterNode(org.apache.ignite.cluster.ClusterNode) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteException(org.apache.ignite.IgniteException) IgniteClientDisconnectedException(org.apache.ignite.IgniteClientDisconnectedException) GridNioRecoveryDescriptor(org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor) GridCommunicationClient(org.apache.ignite.internal.util.nio.GridCommunicationClient) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException)

Example 2 with IgniteTooManyOpenFilesException

use of org.apache.ignite.internal.IgniteTooManyOpenFilesException in project ignite by apache.

the class ConnectionClientPool method reserveClient.

 * Returns existing or just created client to node.
 * @param node Node to which client should be open.
 * @param connIdx Connection index.
 * @return The existing or just created client.
 * @throws IgniteCheckedException Thrown if any exception occurs.
// Returns a client for the given node and connection index, creating one when none is cached.
// The outer loop repeats until client.reserve() succeeds on the obtained client.
// NOTE(review): this listing is a truncated excerpt. Closing braces, some statement bodies and
// several expressions (for example the initializer of nodeId) are missing.
public GridCommunicationClient reserveClient(ClusterNode node, int connIdx) throws IgniteCheckedException {
    assert node != null;
    assert (connIdx >= 0 && connIdx < cfg.connectionsPerNode()) || !(cfg.usePairedConnections() && usePairedConnections(node, attrs.pairedConnection())) : connIdx;
    // Client-to-client case: refuse to send when the target's port attribute equals DISABLED_CLIENT_PORT.
    if (locNodeSupplier.get().isClient()) {
        if (node.isClient()) {
            if (DISABLED_CLIENT_PORT.equals(node.attribute(attrs.port())))
                throw new IgniteSpiException("Cannot send message to the client node with no server socket opened.");
    // NOTE(review): the initializer expression is missing in this excerpt.
    UUID nodeId =;
    if (log.isDebugEnabled())
        log.debug("The node client is going to reserve a connection [nodeId=" + + ", connIdx=" + connIdx + "]");
    while (true) {
        // Look up the cached client for this connection index, if any.
        GridCommunicationClient[] curClients = clients.get(nodeId);
        GridCommunicationClient client = curClients != null && connIdx < curClients.length ? curClients[connIdx] : null;
        if (client == null) {
            if (stopping)
                throw new IgniteSpiException("Node is stopping.");
            // Do not allow concurrent connects.
            GridFutureAdapter<GridCommunicationClient> fut = new ConnectFuture();
            ConnectionKey connKey = new ConnectionKey(nodeId, connIdx, -1);
            // putIfAbsent returns null only for the thread that registered its future for connKey.
            GridFutureAdapter<GridCommunicationClient> oldFut = clientFuts.putIfAbsent(connKey, fut);
            if (oldFut == null) {
                try {
                    // Re-read the cache after registering the future, since the first lookup may be stale.
                    GridCommunicationClient[] curClients0 = clients.get(nodeId);
                    GridCommunicationClient client0 = curClients0 != null && connIdx < curClients0.length ? curClients0[connIdx] : null;
                    if (client0 == null) {
                        client0 = createCommunicationClient(node, connIdx);
                        if (client0 != null) {
                            addNodeClient(node, connIdx, client0);
                            if (client0 instanceof GridTcpNioCommunicationClient) {
                                GridTcpNioCommunicationClient tcpClient = ((GridTcpNioCommunicationClient) client0);
                                // A non-zero close time means the session was closed right after creation:
                                // drop the client from the cache and clear the local reference.
                                if (tcpClient.session().closeTime() > 0 && removeNodeClient(nodeId, client0)) {
                                    if (log.isDebugEnabled()) {
                                        log.debug("Session was closed after client creation, will retry " + "[node=" + node + ", client=" + client0 + ']');
                                    client0 = null;
                        } else {
                            // No client was created: fail if the node getter no longer returns the node.
                            // NOTE(review): the argument of nodeGetter.apply is missing in this excerpt.
                            if (nodeGetter.apply( == null)
                                throw new ClusterTopologyCheckedException("Failed to send message " + "(node left topology): " + node);
                } catch (NodeUnreachableException e) {
                    // The helper returns the future that this thread waits on below.
                    fut = handleUnreachableNodeException(node, connIdx, fut, e);
                } catch (Throwable e) {
                    // These throwable types are rethrown unchanged.
                    // NOTE(review): the handling of any other throwable is cut off in this excerpt.
                    if (e instanceof NodeUnreachableException)
                        throw e;
                    if (e instanceof IgniteTooManyOpenFilesException)
                        throw e;
                    if (e instanceof Error)
                        throw (Error) e;
                } finally {
                    // Two-argument remove: the entry is removed only if it still maps to fut.
                    clientFuts.remove(connKey, fut);
            } else
                fut = oldFut;
            // Wait in slices of one third of the worker-blocked timeout when a registry is present,
            // otherwise one third of the connection timeout.
            long clientReserveWaitTimeout = registry != null ? registry.getSystemWorkerBlockedTimeout() / 3 : cfg.connectionTimeout() / 3;
            // NOTE(review): starts from the current wall-clock time and is only ever increased, yet it is
            // logged below as "waitingTime ... ms" - looks like an absolute timestamp rather than an elapsed
            // duration; confirm the intent.
            long currTimeout = System.currentTimeMillis();
            // This cycle will eventually quit when future is completed by concurrent thread reserving client.
            while (true) {
                try {
                    client = fut.get(clientReserveWaitTimeout, TimeUnit.MILLISECONDS);
                } catch (IgniteFutureTimeoutCheckedException ignored) {
                    currTimeout += clientReserveWaitTimeout;
                    if (log.isDebugEnabled()) {
                        log.debug("Still waiting for reestablishing connection to node " + "[nodeId=" + + ", waitingTime=" + currTimeout + "ms]");
                    if (registry != null) {
                        // Looks up the worker registered under the current thread's name.
                        // NOTE(review): what is done with the worker is cut off in this excerpt.
                        GridWorker wrkr = registry.worker(Thread.currentThread().getName());
                        if (wrkr != null)
            if (client == null) {
                if (clusterStateProvider.isLocalNodeDisconnected())
                    throw new IgniteCheckedException("Unable to create TCP client due to local node disconnecting.");
            // The node getter no longer returns the destination node: drop the client and fail.
            if (nodeGetter.apply(nodeId) == null) {
                if (removeNodeClient(nodeId, client))
                throw new IgniteSpiException("Destination node is not in topology: " +;
        assert connIdx == client.connectionIndex() : client;
        // A successful reservation ends the loop; otherwise the client is removed and the loop repeats.
        if (client.reserve())
            return client;
            // Client has just been closed by idle worker. Help it and try again.
            removeNodeClient(nodeId, client);
Also used : GridCommunicationClient(org.apache.ignite.internal.util.nio.GridCommunicationClient) GridTcpNioCommunicationClient(org.apache.ignite.internal.util.nio.GridTcpNioCommunicationClient) GridWorker(org.apache.ignite.internal.util.worker.GridWorker) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteFutureTimeoutCheckedException(org.apache.ignite.internal.IgniteFutureTimeoutCheckedException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) UUID(java.util.UUID) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)

Example 3 with IgniteTooManyOpenFilesException

use of org.apache.ignite.internal.IgniteTooManyOpenFilesException in project ignite by apache.

the class GridNioServerWrapper method createNioSession.

 * Returns the established TCP/IP connection between the current node and remote server. A handshake process of
 * negotiation between two communicating nodes will be performed before the {@link GridNioSession} is created.
 * <p>
 * The handshaking process consists of these steps:
 * <ol>
 * <li>The local node opens a new {@link SocketChannel} in the <em>blocking</em> mode.</li>
 * <li>The local node calls {@link SocketChannel#connect(SocketAddress)} to remote node.</li>
 * <li>The remote GridNioAcceptWorker thread accepts new connection.</li>
 * <li>The remote node sends back the {@link NodeIdMessage}.</li>
 * <li>The local node reads NodeIdMessage from created channel.</li>
 * <li>The local node sends the {@link HandshakeMessage2} to remote.</li>
 * <li>The remote node processes {@link HandshakeMessage2} in {@link GridNioServerListener#onMessage(GridNioSession,
 * Object)}.</li>
 * <li>The remote node sends back the {@link RecoveryLastReceivedMessage}.</li>
 * </ol>
 * The handshaking process ends.
 * </p>
 * <p>
 * <em>Note.</em> The {@link HandshakeTimeoutObject} is created to control execution timeout during the
 * whole handshaking process.
 * </p>
 * @param node Remote node identifier to connect with.
 * @param connIdx Connection index based on configured {@link ConnectionPolicy}.
 * @return A {@link GridNioSession} connection representation.
 * @throws IgniteCheckedException If establishing the connection fails.
// Tries the addresses of the given node one by one: connects a socket channel, performs the handshake
// and creates a NIO session, retrying per address with timeouts taken from a timeout strategy.
// Returns the created session; the visible code also returns null on some early-exit paths.
// NOTE(review): this listing is a truncated excerpt. Closing braces, several statement bodies and
// many call arguments are missing, so the exact nesting cannot be confirmed from here.
public GridNioSession createNioSession(ClusterNode node, int connIdx) throws IgniteCheckedException {
    // The local node counts as a server when it is neither a client nor a daemon.
    boolean locNodeIsSrv = !locNodeSupplier.get().isClient() && !locNodeSupplier.get().isDaemon();
    if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
        // A server outside a discovery thread does not connect to a client in forced client-to-server mode.
        if (node.isClient() && forceClientToServerConnections(node)) {
            String msg = "Failed to connect to node " + + " because it is started" + " in 'forceClientToServerConnections' mode; inverse connection will be requested.";
            throw new NodeUnreachableException(msg);
    Collection<InetSocketAddress> addrs = nodeAddresses(node, cfg.filterReachableAddresses(), attrs, locNodeSupplier);
    GridNioSession ses = null;
    // Accumulates connection failures; later failures are attached as suppressed exceptions.
    IgniteCheckedException errs = null;
    long totalTimeout;
    // Total budget: the failure detection timeout (client-specific for client nodes) when enabled,
    // otherwise the total backoff timeout computed from the connection settings.
    if (cfg.failureDetectionTimeoutEnabled())
        totalTimeout = node.isClient() ? stateProvider.clientFailureDetectionTimeout() : cfg.failureDetectionTimeout();
    else {
        totalTimeout = ExponentialBackoffTimeoutStrategy.totalBackoffTimeout(cfg.connectionTimeout(), cfg.maxConnectionTimeout(), cfg.reconCount());
    // NOTE(review): no update of failedAddrsSet or skippedAddrs is visible in this excerpt.
    Set<InetSocketAddress> failedAddrsSet = new HashSet<>();
    int skippedAddrs = 0;
    for (InetSocketAddress addr : addrs) {
        // NOTE(review): the handling of unresolved addresses is cut off in this excerpt.
        if (addr.isUnresolved()) {
        // A fresh timeout strategy is created for every address.
        TimeoutStrategy connTimeoutStgy = new ExponentialBackoffTimeoutStrategy(totalTimeout, cfg.failureDetectionTimeoutEnabled() ? DFLT_INITIAL_TIMEOUT : cfg.connectionTimeout(), cfg.maxConnectionTimeout());
        while (ses == null) {
            // Reconnection on handshake timeout.
            if (stopping)
                throw new IgniteSpiException("Node is stopping.");
            if (isLocalNodeAddress(addr)) {
                if (log.isDebugEnabled())
                    log.debug("Skipping local address [addr=" + addr + ", locAddrs=" + node.attribute(attrs.addresses()) + ", node=" + node + ']');
            long timeout = 0;
            try {
                // NOTE(review): the argument of nodeGetter.apply is missing in this excerpt.
                if (nodeGetter.apply( == null)
                    throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
                SocketChannel ch = socketChannelFactory.get();
                // NOTE(review): the bodies of the two buffer-size checks below are cut off in this excerpt.
                if (cfg.socketReceiveBuffer() > 0)
                if (cfg.socketSendBuffer() > 0)
                ConnectionKey connKey = new ConnectionKey(, connIdx, -1);
                GridNioRecoveryDescriptor recoveryDesc = outRecoveryDescriptor(node, connKey);
                assert recoveryDesc != null : "Recovery descriptor not found [connKey=" + connKey + ", rmtNode=" + + ']';
                // If the descriptor cannot be reserved, close its session (if any) and return null.
                if (!recoveryDesc.reserve()) {
                    // Ensure the session is closed.
                    GridNioSession sesFromRecovery = recoveryDesc.session();
                    if (sesFromRecovery != null) {
                        // Keeps calling close() until the session reports a non-zero close time.
                        while (sesFromRecovery.closeTime() == 0) sesFromRecovery.close();
                    return null;
                long rcvCnt;
                Map<Integer, Object> meta = new HashMap<>();
                GridSslMeta sslMeta = null;
                try {
                    timeout = connTimeoutStgy.nextTimeout();
                    // Connect with the current timeout, narrowed to int.
                    ch.socket().connect(addr, (int) timeout);
                    // Re-check the node after the connect call returns.
                    if (nodeGetter.apply( == null)
                        throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
                    if (stateProvider.isSslEnabled()) {
                        // The SSL meta object is stored in the session meta map under SSL_META.ordinal().
                        meta.put(SSL_META.ordinal(), sslMeta = new GridSslMeta());
                        SSLEngine sslEngine = stateProvider.createSSLEngine();
                    ClusterNode locNode = locNodeSupplier.get();
                    if (locNode == null)
                        throw new IgniteCheckedException("Local node has not been started or " + "fully initialized [isStopping=" + stateProvider.isStopping() + ']');
                    timeout = connTimeoutStgy.nextTimeout(timeout);
                    // Perform the handshake; the returned value is either one of the special codes checked
                    // below or a received-messages count.
                    rcvCnt = safeTcpHandshake(ch,, timeout, sslMeta, new HandshakeMessage2(, recoveryDesc.incrementConnectCount(), recoveryDesc.received(), connIdx));
                    if (rcvCnt == ALREADY_CONNECTED)
                        return null;
                    else if (rcvCnt == NODE_STOPPING) {
                        // Safe to remap on remote node stopping.
                        throw new ClusterTopologyCheckedException("Remote node started stop procedure: " +;
                    } else if (rcvCnt == UNKNOWN_NODE)
                        throw new IgniteCheckedException("Remote node does not observe current node " + "in topology : " +;
                    else if (rcvCnt == NEED_WAIT) {
                        // scenarios with delayed client node join.
                        // NOTE(review): only the debug log is visible; the rest of this branch is cut off.
                        if (log.isDebugEnabled())
                            log.debug("NEED_WAIT received, handshake after delay [node = " + node + ", outOfTopologyDelay = " + DFLT_NEED_WAIT_DELAY + "ms]");
                    } else if (rcvCnt < 0)
                        throw new IgniteCheckedException("Unsupported negative receivedCount [rcvCnt=" + rcvCnt + ", senderNode=" + node + ']');
                    // Attach the consistent id, connection key and recovery descriptor to the session meta.
                    meta.put(CONSISTENT_ID_META, node.consistentId());
                    meta.put(CONN_IDX_META, connKey);
                    meta.put(GridNioServer.RECOVERY_DESC_META_KEY, recoveryDesc);
                    ses = nioSrv.createSession(ch, meta, false, null).get();
                } finally {
                    // NOTE(review): the cleanup performed when no session was created is cut off here.
                    if (ses == null) {
                        if (recoveryDesc != null)
            } catch (IgniteSpiOperationTimeoutException e) {
                // Handshake is timed out.
                if (ses != null) {
                    ses = null;
                eRegistrySupplier.get().onException("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy=" + connTimeoutStgy + ", addr=" + addr + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy=" + connTimeoutStgy + ", addr=" + addr + ", err=" + e + ']');
                // When the strategy's checkTimeout() returns true: warn and record the failure.
                if (connTimeoutStgy.checkTimeout()) {
                    U.warn(log, "Handshake timed out (will stop attempts to perform the handshake) " + "[node=" + + ", connTimeoutStrategy=" + connTimeoutStgy + ", err=" + e.getMessage() + ", addr=" + addr + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled() + ", timeout=" + timeout + ']');
                    String msg = "Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + + ", addrs=" + addrs + ']';
                    // NOTE(review): presumably an "else" precedes addSuppressed; as shown it would run on
                    // the freshly created exception as well - confirm against the full source.
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
            } catch (ClusterTopologyCheckedException e) {
                // Rethrown unchanged instead of falling into the generic handler below.
                throw e;
            } catch (Exception e) {
                // Most probably IO error on socket connect or handshake.
                if (ses != null) {
                    ses = null;
                eRegistrySupplier.get().onException("Client creation failed [addr=" + addr + ", err=" + e + ']', e);
                if (log.isDebugEnabled())
                    log.debug("Client creation failed [addr=" + addr + ", err=" + e + ']');
                // A SocketException cause mentioning "Too many open files" is rethrown as a dedicated exception type.
                if (X.hasCause(e, "Too many open files", SocketException.class))
                    throw new IgniteTooManyOpenFilesException(e);
                // Check whether the timeout has occurred in case of an unrecoverable exception.
                if (connTimeoutStgy.checkTimeout()) {
                    U.warn(log, "Connection timed out (will stop attempts to perform the connect) " + "[node=" + + ", connTimeoutStgy=" + connTimeoutStgy + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled() + ", timeout=" + timeout + ", err=" + e.getMessage() + ", addr=" + addr + ']');
                    String msg = "Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + + ", addrs=" + addrs + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
                // Inverse communication protocol works only for client nodes.
                // NOTE(review): the bodies of the next two conditions are cut off in this excerpt.
                if (node.isClient() && isNodeUnreachableException(e))
                if (isRecoverableException(e))
                else {
                    String msg = "Failed to connect to node due to unrecoverable exception (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + + ", addrs=" + addrs + ", err= " + e + ']';
                    if (errs == null)
                        errs = new IgniteCheckedException(msg, e);
                        errs.addSuppressed(new IgniteCheckedException(msg, e));
            } finally {
            // NOTE(review): the action taken when running on the communication worker's own thread is cut off.
            CommunicationWorker commWorker0 = commWorker;
            if (commWorker0 != null && commWorker0.runner() == Thread.currentThread())
        if (ses != null)
    if (ses == null) {
        // inverse connection so no point in throwing NodeUnreachableException
        if (!cfg.usePairedConnections() || !Boolean.TRUE.equals(node.attribute(attrs.pairedConnection()))) {
            if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
                // Every address that was not skipped has failed: report the client node as unreachable.
                if (node.isClient() && (addrs.size() - skippedAddrs == failedAddrsSet.size())) {
                    String msg = "Failed to connect to all addresses of node " + + ": " + failedAddrsSet + "; inverse connection will be requested.";
                    throw new NodeUnreachableException(msg);
        // Hand the accumulated error (or a generic "No session found" error) to the error handler.
        processSessionCreationError(node, addrs, errs == null ? new IgniteCheckedException("No session found") : errs);
    return ses;
Also used : SocketChannel(java.nio.channels.SocketChannel) SocketException( GridNioSession(org.apache.ignite.internal.util.nio.GridNioSession) HashMap(java.util.HashMap) InetSocketAddress( SSLEngine( GridSslMeta(org.apache.ignite.internal.util.nio.ssl.GridSslMeta) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) HashSet(java.util.HashSet) ExponentialBackoffTimeoutStrategy(org.apache.ignite.spi.ExponentialBackoffTimeoutStrategy) ClusterNode(org.apache.ignite.cluster.ClusterNode) HandshakeMessage2(org.apache.ignite.spi.communication.tcp.messages.HandshakeMessage2) ExponentialBackoffTimeoutStrategy(org.apache.ignite.spi.ExponentialBackoffTimeoutStrategy) TimeoutStrategy(org.apache.ignite.spi.TimeoutStrategy) IgniteDiscoveryThread(org.apache.ignite.spi.discovery.IgniteDiscoveryThread) CommunicationTcpUtils.handshakeTimeoutException(org.apache.ignite.spi.communication.tcp.internal.CommunicationTcpUtils.handshakeTimeoutException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) SSLException( IgniteSpiOperationTimeoutException(org.apache.ignite.spi.IgniteSpiOperationTimeoutException) IgniteSpiException(org.apache.ignite.spi.IgniteSpiException) SocketException( SocketTimeoutException( IOException( CommunicationTcpUtils.isRecoverableException(org.apache.ignite.spi.communication.tcp.internal.CommunicationTcpUtils.isRecoverableException) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) GridNioRecoveryDescriptor(org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor) 
ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)


IgniteCheckedException (org.apache.ignite.IgniteCheckedException)3 IgniteTooManyOpenFilesException (org.apache.ignite.internal.IgniteTooManyOpenFilesException)3 ClusterTopologyCheckedException (org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)3 ClusterNode (org.apache.ignite.cluster.ClusterNode)2 GridCommunicationClient (org.apache.ignite.internal.util.nio.GridCommunicationClient)2 GridNioRecoveryDescriptor (org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor)2 IgniteSpiException (org.apache.ignite.spi.IgniteSpiException)2 IOException ( InetSocketAddress ( SocketException ( SocketTimeoutException ( SocketChannel (java.nio.channels.SocketChannel)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 UUID (java.util.UUID)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 SSLEngine ( SSLException ( IgniteClientDisconnectedException (org.apache.ignite.IgniteClientDisconnectedException)1 IgniteException (org.apache.ignite.IgniteException)1