Use of org.apache.ignite.spi.communication.tcp.messages.HandshakeMessage2 in the Apache Ignite project.
Class TxDeadlockOnEntryToStringTest, method resolve.
/**
 * Method-reference implementation of {@link DependencyResolver}.
 * <p>
 * An {@link InboundConnectionHandler} is replaced by a proxy that forwards every overridden callback to the
 * original handler. The only difference is in {@code onMessage}: while the {@code rejectHandshake} flag is set,
 * an incoming {@link HandshakeMessage2} is not forwarded; instead the flag is cleared and the session is closed.
 * Instances of any other type are returned untouched.
 *
 * @param instance Delegated instance.
 * @return Proxy around {@code instance} if it is an {@link InboundConnectionHandler}, otherwise {@code instance}.
 */
private <T> T resolve(T instance) {
    // Nothing to decorate: hand the instance back as is.
    if (!(instance instanceof InboundConnectionHandler))
        return instance;

    InboundConnectionHandler delegate = (InboundConnectionHandler) instance;

    // The proxy's own constructor arguments are placeholders: every overridden method goes to the delegate.
    InboundConnectionHandler proxy = new InboundConnectionHandler(null, null, null, null, null, null, null, null, null, null, null, null, null, false, null, null) {
        // --- Wiring ---

        @Override
        public void setNioSrvWrapper(GridNioServerWrapper nioSrvWrapper) {
            delegate.setNioSrvWrapper(nioSrvWrapper);
        }

        @Override
        public void setClientPool(ConnectionClientPool pool) {
            delegate.setClientPool(pool);
        }

        @Override
        public void communicationWorker(CommunicationWorker commWorker) {
            delegate.communicationWorker(commWorker);
        }

        @Override
        public void metricsListener(@Nullable TcpCommunicationMetricsListener metricsLsnr) {
            delegate.metricsListener(metricsLsnr);
        }

        // --- Session callbacks ---

        @Override
        public void onConnected(GridNioSession ses) {
            delegate.onConnected(ses);
        }

        @Override
        public void onDisconnected(GridNioSession ses, @Nullable Exception e) {
            delegate.onDisconnected(ses, e);
        }

        @Override
        public void onMessage(GridNioSession ses, Message msg) {
            boolean reject = rejectHandshake.get() && msg instanceof HandshakeMessage2;

            if (!reject) {
                delegate.onMessage(ses, msg);

                return;
            }

            // Drop this handshake: clear the flag and close the session instead of delegating.
            rejectHandshake.set(false);

            ses.close();
        }

        @Override
        public void onMessageSent(GridNioSession ses, Message msg) {
            delegate.onMessageSent(ses, msg);
        }

        @Override
        public void onSessionWriteTimeout(GridNioSession ses) {
            delegate.onSessionWriteTimeout(ses);
        }

        @Override
        public void onSessionIdleTimeout(GridNioSession ses) {
            delegate.onSessionIdleTimeout(ses);
        }

        // --- Failure handling and shutdown ---

        @Override
        public void onFailure(FailureType failureType, Throwable failure) {
            delegate.onFailure(failureType, failure);
        }

        @Override
        public void stop() {
            delegate.stop();
        }
    };

    return (T) proxy;
}
Use of org.apache.ignite.spi.communication.tcp.messages.HandshakeMessage2 in the Apache Ignite project.
Class TcpCommunicationSpi, method safeTcpHandshake.
/**
 * Performs handshake in timeout-safe way.
 * <p>
 * A {@link HandshakeTimeoutObject} for the channel is registered before any I/O starts, with a deadline of
 * {@code timeout} milliseconds from now, and is cancelled in the {@code finally} block. If the cancellation
 * does not succeed, whatever the handshake produced (a result or an exception) is discarded and a
 * {@link HandshakeTimeoutException} is thrown instead.
 * <p>
 * Order of operations on the channel:
 * <ol>
 *     <li>When SSL is enabled, the SSL handshake is performed first.</li>
 *     <li>The remote node ID message is read and compared with {@code rmtNodeId}.</li>
 *     <li>The {@code U.IGNITE_HEADER} bytes are written.</li>
 *     <li>With a recovery descriptor: a {@link HandshakeMessage} (or {@link HandshakeMessage2} when
 *     {@code handshakeConnIdx} is given) is written and the remote {@link RecoveryLastReceivedMessage} is
 *     read. Without one: only the local node ID message is written.</li>
 * </ol>
 *
 * @param ch Socket channel.
 * @param recovery Recovery descriptor if use recovery handshake, otherwise {@code null}.
 * @param rmtNodeId Remote node.
 * @param timeout Timeout for handshake, in milliseconds.
 * @param sslMeta Session meta. Must not be {@code null} when SSL is enabled.
 * @param handshakeConnIdx Non null connection index if need send it in handshake.
 * @return Counter read from the remote recovery response, or {@code 0} when {@code recovery} is {@code null}.
 *      A value of {@code -1} is only logged here ("connection rejected") and returned to the caller.
 * @throws IgniteCheckedException If handshake failed or wasn't completed within timeout.
 */
@SuppressWarnings("ThrowFromFinallyBlock")
private long safeTcpHandshake(
    SocketChannel ch,
    @Nullable GridNioRecoveryDescriptor recovery,
    UUID rmtNodeId,
    long timeout,
    GridSslMeta sslMeta,
    @Nullable Integer handshakeConnIdx
) throws IgniteCheckedException {
    // Parameterized rather than raw, so the diamond on the right-hand side is type-checked.
    HandshakeTimeoutObject<SocketChannel> timeoutObj =
        new HandshakeTimeoutObject<>(ch, U.currentTimeMillis() + timeout);

    addTimeoutObject(timeoutObj);

    long rcvCnt = 0;

    try {
        BlockingSslHandler sslHnd = null;

        ByteBuffer buf;

        // Step 1: obtain the bytes of the remote node ID message.
        if (isSslEnabled()) {
            assert sslMeta != null;

            sslHnd = new BlockingSslHandler(sslMeta.sslEngine(), ch, directBuf, ByteOrder.nativeOrder(), log);

            if (!sslHnd.handshake())
                throw new HandshakeException("SSL handshake is not completed.");

            ByteBuffer handBuff = sslHnd.applicationBuffer();

            if (handBuff.remaining() < NodeIdMessage.MESSAGE_FULL_SIZE) {
                // The application buffer holds less than a full node ID message: read from the channel and decode.
                buf = ByteBuffer.allocate(1000);

                int read = ch.read(buf);

                if (read == -1)
                    throw new HandshakeException("Failed to read remote node ID (connection closed).");

                buf.flip();

                buf = sslHnd.decode(buf);
            }
            else
                buf = handBuff;
        }
        else {
            buf = ByteBuffer.allocate(NodeIdMessage.MESSAGE_FULL_SIZE);

            // Keep reading until the whole fixed-size message has arrived.
            for (int i = 0; i < NodeIdMessage.MESSAGE_FULL_SIZE; ) {
                int read = ch.read(buf);

                if (read == -1)
                    throw new HandshakeException("Failed to read remote node ID (connection closed).");

                i += read;
            }
        }

        // Step 2: the node ID follows the message type prefix; it must match the node we meant to reach.
        UUID rmtNodeId0 = U.bytesToUuid(buf.array(), Message.DIRECT_TYPE_SIZE);

        if (!rmtNodeId.equals(rmtNodeId0))
            throw new HandshakeException("Remote node ID is not as expected [expected=" + rmtNodeId + ", rcvd=" + rmtNodeId0 + ']');
        else if (log.isDebugEnabled())
            log.debug("Received remote node ID: " + rmtNodeId0);

        // Step 3: send the Ignite header.
        if (isSslEnabled()) {
            assert sslHnd != null;

            ch.write(sslHnd.encrypt(ByteBuffer.wrap(U.IGNITE_HEADER)));
        }
        else
            ch.write(ByteBuffer.wrap(U.IGNITE_HEADER));

        ClusterNode locNode = getLocalNode();

        if (locNode == null)
            throw new IgniteCheckedException("Local node has not been started or " + "fully initialized [isStopping=" + getSpiContext().isStopping() + ']');

        if (recovery != null) {
            // Step 4a: send the recovery handshake message.
            HandshakeMessage msg;

            int msgSize = HandshakeMessage.MESSAGE_FULL_SIZE;

            if (handshakeConnIdx != null) {
                msg = new HandshakeMessage2(locNode.id(), recovery.incrementConnectCount(), recovery.received(), handshakeConnIdx);

                // NOTE(review): presumably the 4 extra bytes carry the int connection index - confirm.
                msgSize += 4;
            }
            else
                msg = new HandshakeMessage(locNode.id(), recovery.incrementConnectCount(), recovery.received());

            if (log.isDebugEnabled())
                log.debug("Writing handshake message [locNodeId=" + locNode.id() + ", rmtNode=" + rmtNodeId + ", msg=" + msg + ']');

            buf = ByteBuffer.allocate(msgSize);

            buf.order(ByteOrder.nativeOrder());

            boolean written = msg.writeTo(buf, null);

            assert written;

            buf.flip();

            if (isSslEnabled()) {
                assert sslHnd != null;

                ch.write(sslHnd.encrypt(buf));
            }
            else
                ch.write(buf);

            // Step 5: wait for the remote recovery response and extract the counter from it.
            if (log.isDebugEnabled())
                log.debug("Waiting for handshake [rmtNode=" + rmtNodeId + ']');

            if (isSslEnabled()) {
                assert sslHnd != null;

                buf = ByteBuffer.allocate(1000);

                buf.order(ByteOrder.nativeOrder());

                ByteBuffer decode = ByteBuffer.allocate(2 * buf.capacity());

                decode.order(ByteOrder.nativeOrder());

                // Accumulate decoded bytes until at least one full response message is available.
                for (int i = 0; i < RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE; ) {
                    int read = ch.read(buf);

                    if (read == -1)
                        throw new HandshakeException("Failed to read remote node recovery handshake " + "(connection closed).");

                    buf.flip();

                    ByteBuffer decode0 = sslHnd.decode(buf);

                    i += decode0.remaining();

                    decode = appendAndResizeIfNeeded(decode, decode0);

                    buf.clear();
                }

                decode.flip();

                rcvCnt = decode.getLong(Message.DIRECT_TYPE_SIZE);

                // Bytes decoded beyond the response message are handed over through the SSL meta.
                if (decode.limit() > RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE) {
                    decode.position(RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE);

                    sslMeta.decodedBuffer(decode);
                }

                ByteBuffer inBuf = sslHnd.inputBuffer();

                // Likewise for data still sitting in the handler's input buffer.
                if (inBuf.position() > 0)
                    sslMeta.encodedBuffer(inBuf);
            }
            else {
                buf = ByteBuffer.allocate(RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE);

                buf.order(ByteOrder.nativeOrder());

                for (int i = 0; i < RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE; ) {
                    int read = ch.read(buf);

                    if (read == -1)
                        throw new HandshakeException("Failed to read remote node recovery handshake " + "(connection closed).");

                    i += read;
                }

                rcvCnt = buf.getLong(Message.DIRECT_TYPE_SIZE);
            }

            if (log.isDebugEnabled())
                log.debug("Received handshake message [rmtNode=" + rmtNodeId + ", rcvCnt=" + rcvCnt + ']');

            if (rcvCnt == -1) {
                if (log.isDebugEnabled())
                    log.debug("Connection rejected, will retry client creation [rmtNode=" + rmtNodeId + ']');
            }
        }
        else {
            // Step 4b: no recovery descriptor - only the local node ID message is sent, nothing is read back.
            if (isSslEnabled()) {
                assert sslHnd != null;

                ch.write(sslHnd.encrypt(ByteBuffer.wrap(NodeIdMessage.nodeIdBytesWithType(safeLocalNodeId()))));
            }
            else
                ch.write(ByteBuffer.wrap(NodeIdMessage.nodeIdBytesWithType(safeLocalNodeId())));
        }
    }
    catch (IOException e) {
        if (log.isDebugEnabled())
            log.debug("Failed to read from channel: " + e);

        throw new IgniteCheckedException("Failed to read from channel.", e);
    }
    finally {
        if (timeoutObj.cancel())
            removeTimeoutObject(timeoutObj);
        else {
            // Ignoring whatever happened after timeout - reporting only timeout event.
            throw new HandshakeTimeoutException(new IgniteSpiOperationTimeoutException("Failed to perform handshake due to timeout " + "(consider increasing 'connectionTimeout' configuration property)."));
        }
    }

    return rcvCnt;
}
Use of org.apache.ignite.spi.communication.tcp.messages.HandshakeMessage2 in the Apache Ignite project.
Class GridNioServerWrapper, method createNioSession.
/**
 * Returns the established TCP/IP connection between the current node and remote server. A handshake process of
 * negotiation between two communicating nodes will be performed before the {@link GridNioSession} is created.
 * <p>
 * The handshaking process consists of these steps:
 * </p>
 * <ol>
 * <li>The local node opens a new {@link SocketChannel} in the <em>blocking</em> mode.</li>
 * <li>The local node calls {@link SocketChannel#connect(SocketAddress)} to remote node.</li>
 * <li>The remote GridNioAcceptWorker thread accepts new connection.</li>
 * <li>The remote node sends back the {@link NodeIdMessage}.</li>
 * <li>The local node reads NodeIdMessage from created channel.</li>
 * <li>The local node sends the {@link HandshakeMessage2} to remote.</li>
 * <li>The remote node processes {@link HandshakeMessage2} in {@link GridNioServerListener#onMessage(GridNioSession,
 * Object)}.</li>
 * <li>The remote node sends back the {@link RecoveryLastReceivedMessage}.</li>
 * </ol>
 * <p>
 * The handshaking process ends.
 * </p>
 * <p>
 * <em>Note.</em> The {@link HandshakeTimeoutObject} is created to control execution timeout during the
 * whole handshaking process.
 * </p>
 * <p>
 * The node's addresses are tried one after another. For each resolved, non-local address the connect and
 * handshake are retried with timeouts taken from an {@link ExponentialBackoffTimeoutStrategy} until a session
 * is created, the strategy reports that its timeout is exhausted, or an unrecoverable error occurs.
 * </p>
 *
 * @param node Remote node identifier to connect with.
 * @param connIdx Connection index based on configured {@link ConnectionPolicy}.
 * @return A {@link GridNioSession} connection representation, or {@code null} when the recovery descriptor
 *      could not be reserved or the remote node answered {@code ALREADY_CONNECTED}.
 * @throws IgniteCheckedException If establish connection fails.
 */
public GridNioSession createNioSession(ClusterNode node, int connIdx) throws IgniteCheckedException {
// The local node is treated as a server when it is neither a client nor a daemon.
boolean locNodeIsSrv = !locNodeSupplier.get().isClient() && !locNodeSupplier.get().isDaemon();
// A server (outside the discovery thread) does not connect to a client node for which
// forceClientToServerConnections(node) holds; the exception signals that instead.
if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
if (node.isClient() && forceClientToServerConnections(node)) {
String msg = "Failed to connect to node " + node.id() + " because it is started" + " in 'forceClientToServerConnections' mode; inverse connection will be requested.";
throw new NodeUnreachableException(msg);
}
}
Collection<InetSocketAddress> addrs = nodeAddresses(node, cfg.filterReachableAddresses(), attrs, locNodeSupplier);
GridNioSession ses = null;
// Collects connection failures; the first one is the primary error, later ones are attached as suppressed.
IgniteCheckedException errs = null;
// Overall time budget handed to the per-address timeout strategy.
long totalTimeout;
if (cfg.failureDetectionTimeoutEnabled())
totalTimeout = node.isClient() ? stateProvider.clientFailureDetectionTimeout() : cfg.failureDetectionTimeout();
else {
totalTimeout = ExponentialBackoffTimeoutStrategy.totalBackoffTimeout(cfg.connectionTimeout(), cfg.maxConnectionTimeout(), cfg.reconCount());
}
// Addresses that are unresolved or failed with a "node unreachable" error; compared against addrs below.
Set<InetSocketAddress> failedAddrsSet = new HashSet<>();
// Number of addresses skipped because they belong to the local node.
int skippedAddrs = 0;
for (InetSocketAddress addr : addrs) {
if (addr.isUnresolved()) {
failedAddrsSet.add(addr);
continue;
}
// A fresh timeout strategy is created for every address.
TimeoutStrategy connTimeoutStgy = new ExponentialBackoffTimeoutStrategy(totalTimeout, cfg.failureDetectionTimeoutEnabled() ? DFLT_INITIAL_TIMEOUT : cfg.connectionTimeout(), cfg.maxConnectionTimeout());
// Retry loop for the current address; left via 'break', 'return', an exception, or once a session exists.
while (ses == null) {
// Reconnection on handshake timeout.
if (stopping)
throw new IgniteSpiException("Node is stopping.");
if (isLocalNodeAddress(addr)) {
if (log.isDebugEnabled())
log.debug("Skipping local address [addr=" + addr + ", locAddrs=" + node.attribute(attrs.addresses()) + ", node=" + node + ']');
skippedAddrs++;
break;
}
long timeout = 0;
// Every attempt runs between connectGate.enter() and the matching leave() in the outer 'finally'.
connectGate.enter();
try {
if (nodeGetter.apply(node.id()) == null)
throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
// Open and configure a blocking channel for this attempt.
SocketChannel ch = socketChannelFactory.get();
ch.configureBlocking(true);
ch.socket().setTcpNoDelay(cfg.tcpNoDelay());
ch.socket().setKeepAlive(true);
if (cfg.socketReceiveBuffer() > 0)
ch.socket().setReceiveBufferSize(cfg.socketReceiveBuffer());
if (cfg.socketSendBuffer() > 0)
ch.socket().setSendBufferSize(cfg.socketSendBuffer());
ConnectionKey connKey = new ConnectionKey(node.id(), connIdx, -1);
GridNioRecoveryDescriptor recoveryDesc = outRecoveryDescriptor(node, connKey);
assert recoveryDesc != null : "Recovery descriptor not found [connKey=" + connKey + ", rmtNode=" + node.id() + ']';
// The descriptor could not be reserved: drop the new channel, close the session the descriptor
// still references (if any) and give up without creating a session.
if (!recoveryDesc.reserve()) {
U.closeQuiet(ch);
// Ensure the session is closed.
GridNioSession sesFromRecovery = recoveryDesc.session();
if (sesFromRecovery != null) {
// close() is repeated until the session reports a non-zero close time.
while (sesFromRecovery.closeTime() == 0) sesFromRecovery.close();
}
return null;
}
long rcvCnt;
// Session meta attributes passed to nioSrv.createSession(...) below.
Map<Integer, Object> meta = new HashMap<>();
GridSslMeta sslMeta = null;
try {
timeout = connTimeoutStgy.nextTimeout();
ch.socket().connect(addr, (int) timeout);
// Re-check topology: the node may have left while the socket was connecting.
if (nodeGetter.apply(node.id()) == null)
throw new ClusterTopologyCheckedException("Failed to send message (node left topology): " + node);
if (stateProvider.isSslEnabled()) {
meta.put(SSL_META.ordinal(), sslMeta = new GridSslMeta());
SSLEngine sslEngine = stateProvider.createSSLEngine();
sslEngine.setUseClientMode(true);
sslMeta.sslEngine(sslEngine);
}
ClusterNode locNode = locNodeSupplier.get();
if (locNode == null)
throw new IgniteCheckedException("Local node has not been started or " + "fully initialized [isStopping=" + stateProvider.isStopping() + ']');
timeout = connTimeoutStgy.nextTimeout(timeout);
rcvCnt = safeTcpHandshake(ch, node.id(), timeout, sslMeta, new HandshakeMessage2(locNode.id(), recoveryDesc.incrementConnectCount(), recoveryDesc.received(), connIdx));
// The handshake result is either one of the special codes handled below or a non-negative counter.
if (rcvCnt == ALREADY_CONNECTED)
return null;
else if (rcvCnt == NODE_STOPPING) {
// Safe to remap on remote node stopping.
throw new ClusterTopologyCheckedException("Remote node started stop procedure: " + node.id());
} else if (rcvCnt == UNKNOWN_NODE)
throw new IgniteCheckedException("Remote node does not observe current node " + "in topology : " + node.id());
else if (rcvCnt == NEED_WAIT) {
// scenarios with delayed client node join.
if (log.isDebugEnabled())
log.debug("NEED_WAIT received, handshake after delay [node = " + node + ", outOfTopologyDelay = " + DFLT_NEED_WAIT_DELAY + "ms]");
U.sleep(DFLT_NEED_WAIT_DELAY);
// Retry the same address; both 'finally' blocks still run before the next iteration.
continue;
} else if (rcvCnt < 0)
throw new IgniteCheckedException("Unsupported negative receivedCount [rcvCnt=" + rcvCnt + ", senderNode=" + node + ']');
// Handshake succeeded: pass the counter to the recovery descriptor and create the NIO session.
recoveryDesc.onHandshake(rcvCnt);
meta.put(CONSISTENT_ID_META, node.consistentId());
meta.put(CONN_IDX_META, connKey);
meta.put(GridNioServer.RECOVERY_DESC_META_KEY, recoveryDesc);
ses = nioSrv.createSession(ch, meta, false, null).get();
} finally {
// No session was created on this attempt (failure, early return or retry):
// close the channel and release the reserved descriptor.
if (ses == null) {
U.closeQuiet(ch);
if (recoveryDesc != null)
recoveryDesc.release();
}
}
} catch (IgniteSpiOperationTimeoutException e) {
// Handshake is timed out.
if (ses != null) {
ses.close();
ses = null;
}
eRegistrySupplier.get().onException("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy=" + connTimeoutStgy + ", addr=" + addr + ']', e);
if (log.isDebugEnabled())
log.debug("Handshake timed out (will retry with increased timeout) [connTimeoutStrategy=" + connTimeoutStgy + ", addr=" + addr + ", err=" + e + ']');
// When the strategy reports the timeout as exhausted, record the error and move on to the next
// address; otherwise the loop retries this address.
if (connTimeoutStgy.checkTimeout()) {
U.warn(log, "Handshake timed out (will stop attempts to perform the handshake) " + "[node=" + node.id() + ", connTimeoutStrategy=" + connTimeoutStgy + ", err=" + e.getMessage() + ", addr=" + addr + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled() + ", timeout=" + timeout + ']');
String msg = "Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ']';
if (errs == null)
errs = new IgniteCheckedException(msg, e);
else
errs.addSuppressed(new IgniteCheckedException(msg, e));
break;
}
} catch (ClusterTopologyCheckedException e) {
// Topology errors are propagated to the caller unchanged.
throw e;
} catch (Exception e) {
// Most probably IO error on socket connect or handshake.
if (ses != null) {
ses.close();
ses = null;
}
eRegistrySupplier.get().onException("Client creation failed [addr=" + addr + ", err=" + e + ']', e);
if (log.isDebugEnabled())
log.debug("Client creation failed [addr=" + addr + ", err=" + e + ']');
if (X.hasCause(e, "Too many open files", SocketException.class))
throw new IgniteTooManyOpenFilesException(e);
// Check if timeout occurred in case of unrecoverable exception.
if (connTimeoutStgy.checkTimeout()) {
U.warn(log, "Connection timed out (will stop attempts to perform the connect) " + "[node=" + node.id() + ", connTimeoutStgy=" + connTimeoutStgy + ", failureDetectionTimeoutEnabled=" + cfg.failureDetectionTimeoutEnabled() + ", timeout=" + timeout + ", err=" + e.getMessage() + ", addr=" + addr + ']');
String msg = "Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ']';
if (errs == null)
errs = new IgniteCheckedException(msg, e);
else
errs.addSuppressed(new IgniteCheckedException(msg, e));
break;
}
// Inverse communication protocol works only for client nodes.
if (node.isClient() && isNodeUnreachableException(e))
failedAddrsSet.add(addr);
// Recoverable errors are retried on the same address after a delay; anything else is recorded
// and ends the attempts for this address.
if (isRecoverableException(e))
U.sleep(DFLT_RECONNECT_DELAY);
else {
String msg = "Failed to connect to node due to unrecoverable exception (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ", err= " + e + ']';
if (errs == null)
errs = new IgniteCheckedException(msg, e);
else
errs.addSuppressed(new IgniteCheckedException(msg, e));
break;
}
} finally {
connectGate.leave();
}
// When running on the communication worker's own thread, update its heartbeat between attempts.
CommunicationWorker commWorker0 = commWorker;
if (commWorker0 != null && commWorker0.runner() == Thread.currentThread())
commWorker0.updateHeartbeat();
}
// A session was created for this address: no need to try the remaining ones.
if (ses != null)
break;
}
if (ses == null) {
// inverse connection so no point in throwing NodeUnreachableException
if (!cfg.usePairedConnections() || !Boolean.TRUE.equals(node.attribute(attrs.pairedConnection()))) {
if (!(Thread.currentThread() instanceof IgniteDiscoveryThread) && locNodeIsSrv) {
// Every address that was not skipped as local ended up in failedAddrsSet.
if (node.isClient() && (addrs.size() - skippedAddrs == failedAddrsSet.size())) {
String msg = "Failed to connect to all addresses of node " + node.id() + ": " + failedAddrsSet + "; inverse connection will be requested.";
throw new NodeUnreachableException(msg);
}
}
}
// NOTE(review): presumably this throws; if it can return normally, the method returns null below - confirm.
processSessionCreationError(node, addrs, errs == null ? new IgniteCheckedException("No session found") : errs);
}
return ses;
}
Aggregations