use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.
the class TcpDiscoverySpi method openSocket.
/**
 * Establishes a connection to the given remote address over the supplied socket and, once the
 * connection is up, writes {@code U.IGNITE_HEADER} to it.
 * <p>
 * If the remote address is unresolved, it is resolved by host name before connecting. On an
 * {@link IOException} or {@link IgniteSpiOperationTimeoutException} the socket is closed quietly and
 * the same exception is rethrown to the caller.
 *
 * @param sock Socket bound to a local host address.
 * @param remAddr Remote address.
 * @param timeoutHelper Timeout helper.
 * @return Connected socket (the same instance that was passed in).
 * @throws IOException If failed.
 * @throws IgniteSpiOperationTimeoutException In case of timeout.
 */
protected Socket openSocket(Socket sock, InetSocketAddress remAddr, IgniteSpiOperationTimeoutHelper timeoutHelper) throws IOException, IgniteSpiOperationTimeoutException {
    assert remAddr != null;

    try {
        InetSocketAddress target = remAddr;

        // Resolve the host name explicitly when the caller handed over an unresolved address.
        if (remAddr.isUnresolved()) {
            InetAddress rslvdHost = InetAddress.getByName(remAddr.getHostName());

            target = new InetSocketAddress(rslvdHost, remAddr.getPort());
        }

        assert target.getAddress() != null;

        int connTimeoutChunk = (int) timeoutHelper.nextTimeoutChunk(sockTimeout);

        sock.connect(target, connTimeoutChunk);

        // A fresh timeout chunk is requested for the header write.
        writeToSocket(sock, null, U.IGNITE_HEADER, timeoutHelper.nextTimeoutChunk(sockTimeout));

        return sock;
    }
    catch (IOException | IgniteSpiOperationTimeoutException e) {
        // Do not leak the socket on failure; the original error is propagated unchanged.
        if (sock != null)
            U.closeQuiet(sock);

        throw e;
    }
}
use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.
the class TcpDiscoveryNetworkIssuesTest method testServerGetsSegmentedOnBecomeDangling.
/**
 * Test scenario: a node in the middle of the ring (let's call it IllN) experiences network issues:
 * its previous node cannot see it, and IllN itself cannot see the two nodes in front of it.
 *
 * IllN is considered failed by the other nodes in the topology, but IllN manages to connect to the
 * topology and sends StatusCheckMessage with a non-empty failedNodes collection.
 *
 * Expected outcome: IllN eventually segments from the topology, other healthy nodes work normally.
 *
 * @see <a href="https://issues.apache.org/jira/browse/IGNITE-11364">IGNITE-11364</a>
 * for more details about the actual bug.
 */
@Test
public void testServerGetsSegmentedOnBecomeDangling() throws Exception {
    usePortFromNodeName = true;
    connectionRecoveryTimeout = 0;

    AtomicBoolean netDown = new AtomicBoolean(false);

    IgniteEx firstNode = startGrid(NODE_0_NAME);
    IgniteEx secondNode = startGrid(NODE_1_NAME);

    // The SPI below is installed only for the next started node (the "ill" one) and is reset afterwards.
    specialSpi = new TcpDiscoverySpi() {
        @Override
        protected int readReceipt(Socket sock, long timeout) throws IOException {
            boolean blocked = netDown.get() && sock.getPort() == NODE_3_PORT;

            if (blocked)
                throw new SocketTimeoutException("Read timed out");

            return super.readReceipt(sock, timeout);
        }

        @Override
        protected Socket openSocket(InetSocketAddress sockAddr, IgniteSpiOperationTimeoutHelper timeoutHelper) throws IOException, IgniteSpiOperationTimeoutException {
            boolean blocked = netDown.get() && sockAddr.getPort() == NODE_4_PORT;

            if (blocked)
                throw new SocketTimeoutException("connect timed out");

            return super.openSocket(sockAddr, timeoutHelper);
        }
    };

    Ignite illNode = startGrid(NODE_2_NAME);

    AtomicBoolean segmented = new AtomicBoolean(false);

    // Returning false from the listener unsubscribes it after the first segmentation event.
    illNode.events().localListen(evt -> {
        segmented.set(true);

        return false;
    }, EVT_NODE_SEGMENTED);

    specialSpi = null;

    startGrid(NODE_3_NAME);
    startGrid(NODE_4_NAME);
    startGrid(NODE_5_NAME);

    breakDiscoConnectionToNext(secondNode);

    netDown.set(true);

    GridTestUtils.waitForCondition(segmented::get, 10_000);

    assertTrue(segmented.get());

    Map failed = getFailedNodesCollection(firstNode);

    String failedMsg = String.format("Failed nodes is expected to be empty, but contains %s nodes.", failed.size());

    assertTrue(failedMsg, failed.isEmpty());
}
use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.
the class TcpCommunicationSpi method safeTcpHandshake.
/**
* Performs handshake in timeout-safe way.
* <p>
* Sequence implemented below: read the remote node ID from the channel and verify it against
* {@code rmtNodeId}, write {@code U.IGNITE_HEADER}, write either a handshake message (when
* {@code recovery} is given) or the local node ID, and - only when {@code recovery} is given -
* read the recovery reply and extract the received-count value from it.
* <p>
* A timeout object is registered for the whole duration of the method. If it can no longer be
* cancelled when the method finishes, a {@code HandshakeTimeoutException} is thrown from the
* {@code finally} block and replaces any other outcome (normal return or another exception).
*
* @param ch Socket channel.
* @param recovery Recovery descriptor if use recovery handshake, otherwise {@code null}.
* @param rmtNodeId Remote node.
* @param timeout Timeout for handshake.
* @param sslMeta Session meta.
* @param handshakeConnIdx Non null connection index if need send it in handshake.
* @throws IgniteCheckedException If handshake failed or wasn't completed within timeout.
* @return Handshake response ({@code 0} when {@code recovery} is {@code null}, because no reply is read then).
*/
@SuppressWarnings("ThrowFromFinallyBlock")
private long safeTcpHandshake(SocketChannel ch, @Nullable GridNioRecoveryDescriptor recovery, UUID rmtNodeId, long timeout, GridSslMeta sslMeta, @Nullable Integer handshakeConnIdx) throws IgniteCheckedException {
// Timeout object is built from the channel and the absolute time (now + timeout).
// NOTE(review): presumably it closes the channel when it fires - confirm in HandshakeTimeoutObject.
HandshakeTimeoutObject obj = new HandshakeTimeoutObject<>(ch, U.currentTimeMillis() + timeout);
addTimeoutObject(obj);
long rcvCnt = 0;
try {
BlockingSslHandler sslHnd = null;
ByteBuffer buf;
// Step 1: obtain the bytes of the remote node ID message.
if (isSslEnabled()) {
assert sslMeta != null;
sslHnd = new BlockingSslHandler(sslMeta.sslEngine(), ch, directBuf, ByteOrder.nativeOrder(), log);
if (!sslHnd.handshake())
throw new HandshakeException("SSL handshake is not completed.");
// The SSL handshake may already have left application data in the handler's buffer.
ByteBuffer handBuff = sslHnd.applicationBuffer();
if (handBuff.remaining() < NodeIdMessage.MESSAGE_FULL_SIZE) {
// Not enough application data yet: do a single channel read and decode it.
buf = ByteBuffer.allocate(1000);
int read = ch.read(buf);
if (read == -1)
throw new HandshakeException("Failed to read remote node ID (connection closed).");
buf.flip();
buf = sslHnd.decode(buf);
} else
buf = handBuff;
} else {
// Plain connection: keep reading until the full node ID message has arrived.
buf = ByteBuffer.allocate(NodeIdMessage.MESSAGE_FULL_SIZE);
for (int i = 0; i < NodeIdMessage.MESSAGE_FULL_SIZE; ) {
int read = ch.read(buf);
if (read == -1)
throw new HandshakeException("Failed to read remote node ID (connection closed).");
i += read;
}
}
// Step 2: verify that the peer is the node we intended to connect to.
// NOTE(review): second argument is presumably the offset that skips the message type prefix - confirm.
UUID rmtNodeId0 = U.bytesToUuid(buf.array(), Message.DIRECT_TYPE_SIZE);
if (!rmtNodeId.equals(rmtNodeId0))
throw new HandshakeException("Remote node ID is not as expected [expected=" + rmtNodeId + ", rcvd=" + rmtNodeId0 + ']');
else if (log.isDebugEnabled())
log.debug("Received remote node ID: " + rmtNodeId0);
// Step 3: send the Ignite header (encrypted when SSL is enabled).
if (isSslEnabled()) {
assert sslHnd != null;
ch.write(sslHnd.encrypt(ByteBuffer.wrap(U.IGNITE_HEADER)));
} else
ch.write(ByteBuffer.wrap(U.IGNITE_HEADER));
ClusterNode locNode = getLocalNode();
if (locNode == null)
throw new IgniteCheckedException("Local node has not been started or " + "fully initialized [isStopping=" + getSpiContext().isStopping() + ']');
// Step 4: send our own identification - a handshake message with recovery counters,
// or just the local node ID when no recovery descriptor is used.
if (recovery != null) {
HandshakeMessage msg;
int msgSize = HandshakeMessage.MESSAGE_FULL_SIZE;
if (handshakeConnIdx != null) {
// The message variant carrying the connection index needs 4 extra bytes in the buffer.
msg = new HandshakeMessage2(locNode.id(), recovery.incrementConnectCount(), recovery.received(), handshakeConnIdx);
msgSize += 4;
} else {
msg = new HandshakeMessage(locNode.id(), recovery.incrementConnectCount(), recovery.received());
}
if (log.isDebugEnabled())
log.debug("Writing handshake message [locNodeId=" + locNode.id() + ", rmtNode=" + rmtNodeId + ", msg=" + msg + ']');
buf = ByteBuffer.allocate(msgSize);
buf.order(ByteOrder.nativeOrder());
boolean written = msg.writeTo(buf, null);
assert written;
buf.flip();
if (isSslEnabled()) {
assert sslHnd != null;
ch.write(sslHnd.encrypt(buf));
} else
ch.write(buf);
} else {
if (isSslEnabled()) {
assert sslHnd != null;
ch.write(sslHnd.encrypt(ByteBuffer.wrap(NodeIdMessage.nodeIdBytesWithType(safeLocalNodeId()))));
} else
ch.write(ByteBuffer.wrap(NodeIdMessage.nodeIdBytesWithType(safeLocalNodeId())));
}
// Step 5 (recovery handshake only): read the remote reply and extract the received count.
if (recovery != null) {
if (log.isDebugEnabled())
log.debug("Waiting for handshake [rmtNode=" + rmtNodeId + ']');
if (isSslEnabled()) {
assert sslHnd != null;
buf = ByteBuffer.allocate(1000);
buf.order(ByteOrder.nativeOrder());
ByteBuffer decode = ByteBuffer.allocate(2 * buf.capacity());
decode.order(ByteOrder.nativeOrder());
// Accumulate decoded bytes until at least one full reply message is available.
for (int i = 0; i < RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE; ) {
int read = ch.read(buf);
if (read == -1)
throw new HandshakeException("Failed to read remote node recovery handshake " + "(connection closed).");
buf.flip();
ByteBuffer decode0 = sslHnd.decode(buf);
i += decode0.remaining();
decode = appendAndResizeIfNeeded(decode, decode0);
buf.clear();
}
decode.flip();
rcvCnt = decode.getLong(Message.DIRECT_TYPE_SIZE);
// Decoded bytes beyond the reply message are handed over to the SSL meta instead of being dropped.
if (decode.limit() > RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE) {
decode.position(RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE);
sslMeta.decodedBuffer(decode);
}
// Same for data still sitting in the SSL handler's input buffer.
ByteBuffer inBuf = sslHnd.inputBuffer();
if (inBuf.position() > 0)
sslMeta.encodedBuffer(inBuf);
} else {
buf = ByteBuffer.allocate(RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE);
buf.order(ByteOrder.nativeOrder());
for (int i = 0; i < RecoveryLastReceivedMessage.MESSAGE_FULL_SIZE; ) {
int read = ch.read(buf);
if (read == -1)
throw new HandshakeException("Failed to read remote node recovery handshake " + "(connection closed).");
i += read;
}
rcvCnt = buf.getLong(Message.DIRECT_TYPE_SIZE);
}
if (log.isDebugEnabled())
log.debug("Received handshake message [rmtNode=" + rmtNodeId + ", rcvCnt=" + rcvCnt + ']');
// A value of -1 is only logged here; it is still returned to the caller as is.
if (rcvCnt == -1) {
if (log.isDebugEnabled())
log.debug("Connection rejected, will retry client creation [rmtNode=" + rmtNodeId + ']');
}
}
} catch (IOException e) {
if (log.isDebugEnabled())
log.debug("Failed to read from channel: " + e);
throw new IgniteCheckedException("Failed to read from channel.", e);
} finally {
// cancel() returning false is treated as "the timeout has already happened".
boolean cancelled = obj.cancel();
if (cancelled)
removeTimeoutObject(obj);
// Ignoring whatever happened after timeout - reporting only timeout event.
// Throwing from finally is deliberate (see @SuppressWarnings above): it overrides any pending exception or return.
if (!cancelled)
throw new HandshakeTimeoutException(new IgniteSpiOperationTimeoutException("Failed to perform handshake due to timeout " + "(consider increasing 'connectionTimeout' configuration property)."));
}
return rcvCnt;
}
use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.
the class TcpCommunicationSpi method createTcpClient.
/**
* Establish TCP connection to remote node and returns client.
* <p>
* The node's addresses are tried one by one; for each address the connect + handshake sequence
* is retried according to the rules commented inline. The first successfully created client wins.
*
* @param node Remote node.
* @param connIdx Connection index.
* @return Client, or {@code null} when the recovery descriptor could not be reserved, when the
* handshake reported {@code ALREADY_CONNECTED}, or when no client was created and the collected
* errors have a timeout exception among their causes.
* @throws IgniteCheckedException If failed.
*/
protected GridCommunicationClient createTcpClient(ClusterNode node, int connIdx) throws IgniteCheckedException {
LinkedHashSet<InetSocketAddress> addrs = nodeAddresses(node);
GridCommunicationClient client = null;
// Aggregated error; per-address failures are attached to it as suppressed exceptions.
IgniteCheckedException errs = null;
// Declared outside the address loop, so this counter is shared by all addresses.
int connectAttempts = 1;
for (InetSocketAddress addr : addrs) {
// Handshake timeout, attempt counter, timeout helper and wait back-off all start fresh for every address.
long connTimeout0 = connTimeout;
int attempt = 1;
IgniteSpiOperationTimeoutHelper timeoutHelper = new IgniteSpiOperationTimeoutHelper(this, !node.isClient());
int lastWaitingTimeout = 1;
while (client == null) {
// Reconnection on handshake timeout.
// A loopback address with our own bound port is skipped (moves on to the next address).
if (addr.getAddress().isLoopbackAddress() && addr.getPort() == boundTcpPort) {
if (log.isDebugEnabled())
log.debug("Skipping local address [addr=" + addr + ", locAddrs=" + node.attribute(createSpiAttributeName(ATTR_ADDRS)) + ", node=" + node + ']');
break;
}
boolean needWait = false;
try {
SocketChannel ch = SocketChannel.open();
ch.configureBlocking(true);
ch.socket().setTcpNoDelay(tcpNoDelay);
ch.socket().setKeepAlive(true);
if (sockRcvBuf > 0)
ch.socket().setReceiveBufferSize(sockRcvBuf);
if (sockSndBuf > 0)
ch.socket().setSendBufferSize(sockSndBuf);
// Give up immediately if the SPI context no longer knows the node.
if (getSpiContext().node(node.id()) == null) {
U.closeQuiet(ch);
throw new ClusterTopologyCheckedException("Failed to send message " + "(node left topology): " + node);
}
ConnectionKey connKey = new ConnectionKey(node.id(), connIdx, -1);
GridNioRecoveryDescriptor recoveryDesc = outRecoveryDescriptor(node, connKey);
// Failing to reserve the descriptor ends the whole method with a null client.
if (!recoveryDesc.reserve()) {
U.closeQuiet(ch);
return null;
}
Long rcvCnt;
Map<Integer, Object> meta = new HashMap<>();
GridSslMeta sslMeta = null;
try {
// Note: connect uses a chunk of the base connTimeout, while the handshake below uses connTimeout0,
// which may have been doubled by previous timed-out attempts.
ch.socket().connect(addr, (int) timeoutHelper.nextTimeoutChunk(connTimeout));
if (isSslEnabled()) {
meta.put(SSL_META.ordinal(), sslMeta = new GridSslMeta());
SSLEngine sslEngine = ignite.configuration().getSslContextFactory().create().createSSLEngine();
sslEngine.setUseClientMode(true);
sslMeta.sslEngine(sslEngine);
}
Integer handshakeConnIdx = connIdx;
rcvCnt = safeTcpHandshake(ch, recoveryDesc, node.id(), timeoutHelper.nextTimeoutChunk(connTimeout0), sslMeta, handshakeConnIdx);
// Special handshake results: stop silently, fail, or back off and retry the same address.
if (rcvCnt == ALREADY_CONNECTED) {
return null;
} else if (rcvCnt == NODE_STOPPING) {
throw new ClusterTopologyCheckedException("Remote node started stop procedure: " + node.id());
} else if (rcvCnt == NEED_WAIT) {
needWait = true;
continue;
}
meta.put(CONN_IDX_META, connKey);
if (recoveryDesc != null) {
recoveryDesc.onHandshake(rcvCnt);
meta.put(GridNioServer.RECOVERY_DESC_META_KEY, recoveryDesc);
}
GridNioSession ses = nioSrvr.createSession(ch, meta, false, null).get();
client = new GridTcpNioCommunicationClient(connIdx, ses, log);
} finally {
// Runs on every exit path (return, continue, exception): clean up unless a client was created.
if (client == null) {
U.closeQuiet(ch);
if (recoveryDesc != null)
recoveryDesc.release();
// Back-off for NEED_WAIT: the wait is doubled while it is below 60000, then slept.
// NOTE(review): U.sleep argument is presumably in milliseconds - confirm.
if (needWait) {
if (lastWaitingTimeout < 60000)
lastWaitingTimeout *= 2;
U.sleep(lastWaitingTimeout);
}
}
}
} catch (HandshakeTimeoutException | IgniteSpiOperationTimeoutException e) {
if (client != null) {
client.forceClose();
client = null;
}
// With failure detection enabled there is no retry with a larger timeout:
// the error is recorded and the next address is tried.
if (failureDetectionTimeoutEnabled() && (e instanceof HandshakeTimeoutException || X.hasCause(e, SocketException.class) || timeoutHelper.checkFailureTimeoutReached(e))) {
String msg = "Handshake timed out (failure detection timeout is reached) " + "[failureDetectionTimeout=" + failureDetectionTimeout() + ", addr=" + addr + ']';
onException(msg, e);
if (log.isDebugEnabled())
log.debug(msg);
if (errs == null)
errs = new IgniteCheckedException("Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ']');
errs.addSuppressed(new IgniteCheckedException("Failed to connect to address: " + addr, e));
break;
}
assert !failureDetectionTimeoutEnabled();
onException("Handshake timed out (will retry with increased timeout) [timeout=" + connTimeout0 + ", addr=" + addr + ']', e);
if (log.isDebugEnabled())
log.debug("Handshake timed out (will retry with increased timeout) [timeout=" + connTimeout0 + ", addr=" + addr + ", err=" + e + ']');
// Without failure detection: retry the same address with a doubled handshake timeout
// until the attempt limit or the maximum timeout is reached.
if (attempt == reconCnt || connTimeout0 > maxConnTimeout) {
U.warn(log, "Handshake timedout (will stop attempts to perform the handshake) " + "[node=" + node.id() + ", timeout=" + connTimeout0 + ", maxConnTimeout=" + maxConnTimeout + ", attempt=" + attempt + ", reconCnt=" + reconCnt + ", err=" + e.getMessage() + ", addr=" + addr + ']');
if (errs == null)
errs = new IgniteCheckedException("Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ']');
errs.addSuppressed(new IgniteCheckedException("Failed to connect to address: " + addr, e));
break;
} else {
attempt++;
connTimeout0 *= 2;
// Continue loop.
}
} catch (ClusterTopologyCheckedException e) {
// Topology errors are never retried or aggregated - they go straight to the caller.
throw e;
} catch (Exception e) {
if (client != null) {
client.forceClose();
client = null;
}
onException("Client creation failed [addr=" + addr + ", err=" + e + ']', e);
if (log.isDebugEnabled())
log.debug("Client creation failed [addr=" + addr + ", err=" + e + ']');
boolean failureDetThrReached = timeoutHelper.checkFailureTimeoutReached(e);
if (enableTroubleshootingLog)
U.error(log, "Failed to establish connection to a remote node [node=" + node + ", addr=" + addr + ", connectAttempts=" + connectAttempts + ", failureDetThrReached=" + failureDetThrReached + ']', e);
if (failureDetThrReached)
LT.warn(log, "Connect timed out (consider increasing 'failureDetectionTimeout' " + "configuration property) [addr=" + addr + ", failureDetectionTimeout=" + failureDetectionTimeout() + ']');
else if (X.hasCause(e, SocketTimeoutException.class))
LT.warn(log, "Connect timed out (consider increasing 'connTimeout' " + "configuration property) [addr=" + addr + ", connTimeout=" + connTimeout + ']');
if (errs == null)
errs = new IgniteCheckedException("Failed to connect to node (is node still alive?). " + "Make sure that each ComputeTask and cache Transaction has a timeout set " + "in order to prevent parties from waiting forever in case of network issues " + "[nodeId=" + node.id() + ", addrs=" + addrs + ']');
errs.addSuppressed(new IgniteCheckedException("Failed to connect to address " + "[addr=" + addr + ", err=" + e.getMessage() + ']', e));
// Reconnect for the second time, if connection is not established.
// Retries the same address after a short sleep while connectAttempts (shared across addresses) is below 5
// and the failure has a connect/handshake/socket-timeout cause.
if (!failureDetThrReached && connectAttempts < 5 && (X.hasCause(e, ConnectException.class, HandshakeException.class, SocketTimeoutException.class))) {
U.sleep(200);
connectAttempts++;
continue;
}
break;
}
}
// Stop iterating addresses as soon as one of them produced a client.
if (client != null)
break;
}
if (client == null) {
assert errs != null;
if (X.hasCause(errs, ConnectException.class))
LT.warn(log, "Failed to connect to a remote node " + "(make sure that destination node is alive and " + "operating system firewall is disabled on local and remote hosts) " + "[addrs=" + addrs + ']');
// Optionally ask the SPI context to fail the unreachable node. This is done only when the node is still
// known to the context, the remote node is a client or the local node is not a client,
// and connectionError(errs) holds.
if (enableForcibleNodeKill) {
if (getSpiContext().node(node.id()) != null && (CU.clientNode(node) || !CU.clientNode(getLocalNode())) && connectionError(errs)) {
String msg = "TcpCommunicationSpi failed to establish connection to node, node will be dropped from " + "cluster [" + "rmtNode=" + node + ']';
if (enableTroubleshootingLog)
U.error(log, msg, errs);
else
U.warn(log, msg);
getSpiContext().failNode(node.id(), "TcpCommunicationSpi failed to establish connection to node [" + "rmtNode=" + node + ", errs=" + errs + ", connectErrs=" + Arrays.toString(errs.getSuppressed()) + ']');
}
}
// Errors caused by timeouts are not thrown: the method falls through and returns null instead.
if (!X.hasCause(errs, SocketTimeoutException.class, HandshakeTimeoutException.class, IgniteSpiOperationTimeoutException.class))
throw errs;
}
return client;
}
use of org.apache.ignite.spi.IgniteSpiOperationTimeoutException in project ignite by apache.
the class TcpCommunicationSpi method createShmemClient.
/**
 * Creates a shared-memory communication client for the given node and performs the handshake on it.
 * <p>
 * Client creation is retried once when the failure has a {@code ConnectException} cause and the
 * failure timeout has not been reached. A timed-out handshake is retried with a doubled timeout
 * (only when failure detection timeout is not enabled) until either {@code reconCnt} attempts were
 * made or the timeout exceeded {@code maxConnTimeout}. Whenever the handshake fails, the client is
 * force-closed before retrying or rethrowing.
 *
 * @param node Node.
 * @param connIdx Connection index.
 * @param port Port.
 * @return Client.
 * @throws IgniteCheckedException If failed.
 */
@Nullable
private GridCommunicationClient createShmemClient(ClusterNode node, int connIdx, Integer port) throws IgniteCheckedException {
    int handshakeTry = 1;
    int connTry = 1;
    long handshakeTimeout = connTimeout;

    IgniteSpiOperationTimeoutHelper timeoutHelper = new IgniteSpiOperationTimeoutHelper(this, !node.isClient());

    for (;;) {
        GridCommunicationClient client;

        try {
            client = new GridShmemCommunicationClient(connIdx, metricsLsnr, port, timeoutHelper.nextTimeoutChunk(connTimeout), log, getSpiContext().messageFormatter());
        }
        catch (IgniteCheckedException e) {
            // Reconnect for the second time, if connection is not established
            // (and only while the failure timeout has not been reached).
            boolean retry = !timeoutHelper.checkFailureTimeoutReached(e) && connTry < 2 && X.hasCause(e, ConnectException.class);

            if (!retry)
                throw e;

            connTry++;

            continue;
        }

        try {
            safeShmemHandshake(client, node.id(), timeoutHelper.nextTimeoutChunk(handshakeTimeout));

            return client;
        }
        catch (HandshakeTimeoutException | IgniteSpiOperationTimeoutException e) {
            client.forceClose();

            if (failureDetectionTimeoutEnabled() && (e instanceof HandshakeTimeoutException || timeoutHelper.checkFailureTimeoutReached(e))) {
                if (log.isDebugEnabled())
                    log.debug("Handshake timed out (failure threshold reached) [failureDetectionTimeout=" + failureDetectionTimeout() + ", err=" + e.getMessage() + ", client=" + client + ']');

                throw e;
            }

            assert !failureDetectionTimeoutEnabled();

            if (log.isDebugEnabled())
                log.debug("Handshake timed out (will retry with increased timeout) [timeout=" + handshakeTimeout + ", err=" + e.getMessage() + ", client=" + client + ']');

            boolean exhausted = handshakeTry == reconCnt || handshakeTimeout > maxConnTimeout;

            if (exhausted) {
                if (log.isDebugEnabled())
                    log.debug("Handshake timedout (will stop attempts to perform the handshake) " + "[timeout=" + handshakeTimeout + ", maxConnTimeout=" + maxConnTimeout + ", attempt=" + handshakeTry + ", reconCnt=" + reconCnt + ", err=" + e.getMessage() + ", client=" + client + ']');

                throw e;
            }

            // Next loop iteration creates a new client and retries with a doubled handshake timeout.
            handshakeTry++;
            handshakeTimeout *= 2;
        }
        catch (IgniteCheckedException | RuntimeException | Error e) {
            if (log.isDebugEnabled())
                log.debug("Caught exception (will close client) [err=" + e.getMessage() + ", client=" + client + ']');

            client.forceClose();

            throw e;
        }
    }
}
Aggregations