Use of org.apache.ignite.spi.IgniteSpiException in the Apache Ignite project.
Class ServerImpl, method spiStart.
/**
 * {@inheritDoc}
 * <p>
 * Resets the SPI state to {@code DISCONNECTED}, derives the connection-check timings from the effective
 * exchange timeout, starts the ring message worker and the TCP server thread, registers or validates the
 * IP finder addresses, optionally starts the statistics printer, joins the topology and, for a shared
 * IP finder, starts the IP finder cleaner.
 *
 * @param igniteInstanceName Ignite instance name. Not read by this method; the utility pool is named from
 *      {@code spi.ignite().name()}.
 * @throws IgniteSpiException If debug mode is on while the info log level is disabled, if a non-shared
 *      IP finder has no registered addresses, or if joining the topology fails.
 */
@Override
public void spiStart(String igniteInstanceName) throws IgniteSpiException {
    synchronized (mux) {
        spiState = DISCONNECTED;
    }

    lastRingMsgReceivedTime = 0;
    lastRingMsgSentTime = 0;

    // Fundamental timeout value for actions related to connection check.
    connCheckTick = effectiveExchangeTimeout() / 3;

    // Since the time of the last sent message is taken into account, the interval should be quite short to
    // leave a sufficient share of the failure detection timeout for sending the message and receiving its
    // acknowledgement.
    connCheckInterval = Math.min(connCheckTick, MAX_CON_CHECK_INTERVAL);

    utilityPool = new IgniteThreadPoolExecutor("disco-pool", spi.ignite().name(), 0, 4, 2000, new LinkedBlockingQueue<>());

    if (debugMode) {
        // Debug mode is refused outright when the info level is disabled.
        if (!log.isInfoEnabled())
            throw new IgniteSpiException("Info log level should be enabled for TCP discovery to work " + "in debug mode.");

        debugLogQ = new ConcurrentLinkedDeque<>();

        U.quietAndWarn(log, "TCP discovery SPI is configured in debug mode.");
    }

    // Clear addresses collections.
    fromAddrs.clear();
    noResAddrs.clear();

    msgWorker = new RingMessageWorker(log);
    msgWorkerThread = new MessageWorkerDiscoveryThread(msgWorker, log);
    msgWorkerThread.start();

    // The TCP server is created only once; an existing instance is reused.
    if (tcpSrvr == null)
        tcpSrvr = new TcpServer(log);

    spi.initLocalNode(tcpSrvr.port, true);

    if (spi.locNodeAddrs.size() > 1 && log.isDebugEnabled()) {
        if (spi.failureDetectionTimeoutEnabled()) {
            log.debug("This node " + spi.locNode.id() + " has " + spi.locNodeAddrs.size() + " TCP " + "addresses. Note that TcpDiscoverySpi.failureDetectionTimeout works per address sequentially. " + "Setting of several addresses can prolong detection of current node failure.");
        } else {
            // Fixed typo in the message: "TPC" -> "TCP" (consistent with the branch above).
            log.debug("This node " + spi.locNode.id() + " has " + spi.locNodeAddrs.size() + " TCP " + "addresses. With exception of connRecoveryTimeout, timeouts and setting like sockTimeout, " + "ackTimeout, reconCnt in TcpDiscoverySpi work per address sequentially. Setting of several " + "addresses can prolong detection of current node failure.");
        }
    }

    locNode = spi.locNode;

    // Start TCP server thread after local node is initialized.
    new TcpServerThread(tcpSrvr, log).start();

    ring.localNode(locNode);

    if (spi.ipFinder.isShared())
        registerLocalNodeAddress();
    else {
        // A non-shared IP finder must already know at least one address.
        if (F.isEmpty(spi.ipFinder.getRegisteredAddresses()))
            throw new IgniteSpiException("Non-shared IP finder must have IP addresses specified in " + "TcpDiscoveryIpFinder.getRegisteredAddresses() configuration property " + "(specify list of IP addresses in configuration).");

        ipFinderHasLocAddr = spi.ipFinderHasLocalAddress();
    }

    if (spi.getStatisticsPrintFrequency() > 0 && log.isInfoEnabled()) {
        statsPrinter = new StatisticsPrinter();
        statsPrinter.start();
    }

    joinTopology();

    // Local node order 1 means this node is first in the topology; tag the worker thread name with "crd".
    if (locNode.order() == 1)
        U.enhanceThreadName(msgWorkerThread, "crd");

    if (spi.ipFinder.isShared()) {
        ipFinderCleaner = new IpFinderCleaner();
        ipFinderCleaner.start();
    }

    spi.printStartInfo();
}
Use of org.apache.ignite.spi.IgniteSpiException in the Apache Ignite project.
Class ServerImpl, method sendJoinRequestMessage.
/**
* Tries to deliver the join request to one of the addresses returned by {@code spi.resolvedAddresses()}.
* Addresses are tried in the iteration order of that collection and the method returns as soon as one
* of them answers {@code RES_OK}. A full pass over the addresses is repeated after
* {@code spi.getReconnectDelay()} when some response asked for a retry, or when the IP finder is not
* shared and does not contain the local address.
*
* @param joinMsg Join request message.
* @return {@code true} if some node answered {@code RES_OK}; {@code false} if there are no addresses to
*      try, or a pass finished without a reason to retry.
* @throws IgniteSpiException If a node answered {@code RES_JOIN_IMPOSSIBLE}, if {@code spi.joinTimeout}
*      elapsed without a successful connection (non-shared IP finder without the local address only), or
*      if the thread was interrupted while sleeping between passes.
*/
private boolean sendJoinRequestMessage(TcpDiscoveryJoinRequestMessage joinMsg) throws IgniteSpiException {
// Time when join process started. Zero means "not started"; see the join-timeout check below.
long joinStartNanos = 0;
// Each iteration is one full pass over the currently resolved addresses.
while (true) {
Collection<InetSocketAddress> addrs = spi.resolvedAddresses();
// Nothing to connect to: report that the request was not sent.
if (F.isEmpty(addrs))
return false;
boolean retry = false;
boolean joinImpossible = false;
// Send failures of this pass; used to build the cause of the join-timeout exception.
Collection<Exception> errs = new ArrayList<>();
for (InetSocketAddress addr : addrs) {
try {
IgniteSpiOperationTimeoutHelper timeoutHelper = new IgniteSpiOperationTimeoutHelper(spi, true);
Integer res;
try {
// Serialize version is switched to 1 only for the duration of the send and restored afterwards.
SecurityUtils.serializeVersion(1);
res = sendMessageDirectly(joinMsg, addr, timeoutHelper);
} finally {
SecurityUtils.restoreDefaultSerializeVersion();
}
assert res != null;
// The address answered, so it is no longer tracked as non-responsive.
noResAddrs.remove(addr);
// Reset the join-timeout clock only for responses other than RES_WAIT and RES_CONTINUE_JOIN,
// otherwise two CONNECTING nodes can stuck in infinite loop sending join reqs to each other forever
if (res != RES_WAIT && res != RES_CONTINUE_JOIN)
joinStartNanos = 0;
switch(res) {
case RES_WAIT:
// Concurrent startup, try sending join request again or wait if no success.
retry = true;
break;
case RES_OK:
if (log.isDebugEnabled())
log.debug("Join request message has been sent to address [addr=" + addr + ", req=" + joinMsg + ']');
// Join request sending succeeded, wait for response from topology.
return true;
case RES_JOIN_IMPOSSIBLE:
// Handled after the try/catch below, where the exception is thrown.
joinImpossible = true;
break;
default:
// Concurrent startup, try next node.
if (res == RES_CONTINUE_JOIN) {
// A retry is requested only if the address is not in fromAddrs.
if (!fromAddrs.contains(addr))
retry = true;
} else {
if (log.isDebugEnabled())
log.debug("Unexpected response to join request: " + res);
retry = true;
}
break;
}
} catch (IgniteSpiException e) {
errs.add(e);
// NOTE(review): onException() is invoked only when debug logging is enabled, and 'ioe' may be null
// here; sendMessageDirectly() calls onException() unconditionally - confirm this is intended.
if (log.isDebugEnabled()) {
IOException ioe = X.cause(e, IOException.class);
log.debug("Failed to send join request message [addr=" + addr + ", msg=" + (ioe != null ? ioe.getMessage() : e.getMessage()) + ']');
onException("Failed to send join request message [addr=" + addr + ", msg=" + (ioe != null ? ioe.getMessage() : e.getMessage()) + ']', ioe);
}
// Remember the address as non-responsive.
noResAddrs.add(addr);
}
// Abort the whole join as soon as any node reports that joining is impossible.
if (joinImpossible)
throw new IgniteSpiException("Impossible to continue join, " + "check if local discovery and communication ports " + "are not blocked with firewall [addr=" + addr + ", req=" + joinMsg + ", discoLocalPort=" + spi.getLocalPort() + ", discoLocalPortRange=" + spi.getLocalPortRange() + ']');
}
if (retry) {
if (log.isDebugEnabled())
log.debug("Concurrent discovery SPI start has been detected (local node should wait).");
try {
U.sleep(spi.getReconnectDelay());
} catch (IgniteInterruptedCheckedException e) {
throw new IgniteSpiException("Thread has been interrupted.", e);
}
} else if (!spi.ipFinder.isShared() && !ipFinderHasLocAddr) {
// Non-shared IP finder that does not list the local address: keep trying, bounded by
// spi.joinTimeout when it is positive.
IgniteCheckedException e = null;
if (!errs.isEmpty()) {
e = new IgniteCheckedException("Multiple connection attempts failed.");
for (Exception err : errs) e.addSuppressed(err);
}
// 'e' is null when no send failed in this pass.
if (X.hasCause(e, ConnectException.class)) {
LT.warn(log, "Failed to connect to any address from IP finder " + "(make sure IP finder addresses are correct and firewalls are disabled on all host machines): " + toOrderedList(addrs), true);
}
if (spi.joinTimeout > 0) {
// The first such pass only starts the clock; later passes compare against the timeout.
if (joinStartNanos == 0)
joinStartNanos = System.nanoTime();
else if (U.millisSinceNanos(joinStartNanos) > spi.joinTimeout)
throw new IgniteSpiException("Failed to connect to any address from IP finder within join timeout " + "(make sure IP finder addresses are correct, and operating system firewalls are disabled " + "on all host machines, or consider increasing 'joinTimeout' configuration property): " + addrs, e);
}
try {
U.sleep(spi.getReconnectDelay());
} catch (IgniteInterruptedCheckedException ex) {
throw new IgniteSpiException("Thread has been interrupted.", ex);
}
} else
// No retry requested and no reason to keep polling: give up on sending.
break;
}
return false;
}
Use of org.apache.ignite.spi.IgniteSpiException in the Apache Ignite project.
Class ServerImpl, method sendMessageDirectly.
/**
 * Establishes a connection to an address, performs the discovery handshake, sends the message and returns
 * the receipt read from the recipient.
 * <p>
 * Failed attempts are retried in a loop according to the failure kind (SSL error, stream corruption,
 * connection not opened, socket timeout). The loop is left without a result when retries are exhausted,
 * when the handshake response comes from the local node itself, or, for join requests, when it comes from
 * a node in the failed list.
 *
 * @param msg Message to send.
 * @param addr Address to send message to.
 * @param timeoutHelper Operation timeout helper.
 * @return Receipt read from the recipient, or {@code RES_OK} if a join request was written to the socket
 *      but its receipt could not be read. No code path visible in this method returns {@code null}.
 * @throws IgniteSpiException If the loop was left without a receipt and no join request had been written;
 *      the errors collected so far are passed to {@code U.exceptionWithSuppressed} to build the cause.
 * @throws IgniteException On the third failure that has an {@code SSLException} in its cause chain.
 */
@Nullable
private Integer sendMessageDirectly(TcpDiscoveryAbstractMessage msg, InetSocketAddress addr, IgniteSpiOperationTimeoutHelper timeoutHelper) throws IgniteSpiException {
    assert msg != null;
    assert addr != null;

    // Created lazily on the first failure.
    Collection<Throwable> errs = null;

    long ackTimeout0 = spi.getAckTimeout();

    int connectAttempts = 1;
    int sslConnectAttempts = 3;

    boolean joinReqSent;

    UUID locNodeId = getLocalNodeId();

    int reconCnt = 0;

    while (true) {
        // Need to set to false on each new iteration,
        // since remote node may leave in the middle of the first iteration.
        joinReqSent = false;

        boolean openSock = false;

        Socket sock = null;

        try {
            sock = spi.openSocket(addr, timeoutHelper);

            openSock = true;

            TcpDiscoveryHandshakeRequest req = new TcpDiscoveryHandshakeRequest(locNodeId);

            // Handshake.
            spi.writeToSocket(sock, req, timeoutHelper.nextTimeoutChunk(spi.getSocketTimeout()));

            TcpDiscoveryHandshakeResponse res = spi.readMessage(sock, null, timeoutHelper.nextTimeoutChunk(ackTimeout0));

            if (msg instanceof TcpDiscoveryJoinRequestMessage) {
                boolean ignore = false;

                // The only way to know is passing flag directly with handshake response.
                if (!res.isDiscoveryDataPacketCompression())
                    ((TcpDiscoveryJoinRequestMessage) msg).gridDiscoveryData().unzipData(log);

                synchronized (mux) {
                    for (TcpDiscoveryNode failedNode : failedNodes.keySet()) {
                        if (failedNode.id().equals(res.creatorNodeId())) {
                            if (log.isDebugEnabled())
                                log.debug("Ignore response from node from failed list: " + res);

                            ignore = true;

                            break;
                        }
                    }
                }

                if (ignore)
                    break;
            }

            // The address points back at this node: nothing to send.
            if (locNodeId.equals(res.creatorNodeId())) {
                if (log.isDebugEnabled())
                    log.debug("Handshake response from local node: " + res);

                break;
            }

            // Send message. Only the write itself is timed for the statistics below.
            long tsNanos = System.nanoTime();

            spi.writeToSocket(sock, msg, timeoutHelper.nextTimeoutChunk(spi.getSocketTimeout()));

            long tsNanos0 = System.nanoTime();

            if (debugMode)
                debugLog(msg, "Message has been sent directly to address [msg=" + msg + ", addr=" + addr + ", rmtNodeId=" + res.creatorNodeId() + ']');

            if (log.isDebugEnabled())
                log.debug("Message has been sent directly to address [msg=" + msg + ", addr=" + addr + ", rmtNodeId=" + res.creatorNodeId() + ']');

            // Connection has been established, but
            // join request may not be unmarshalled on remote host.
            // E.g. due to class not found issue.
            joinReqSent = msg instanceof TcpDiscoveryJoinRequestMessage;

            int receipt = spi.readReceipt(sock, timeoutHelper.nextTimeoutChunk(ackTimeout0));

            spi.stats.onMessageSent(msg, U.nanosToMillis(tsNanos0 - tsNanos));

            return receipt;
        } catch (ClassCastException e) {
            // Presumably raised when the handshake reply is not a TcpDiscoveryHandshakeResponse - verify.
            // The error is recorded and the attempt is repeated.
            if (log.isDebugEnabled())
                U.error(log, "Class cast exception on direct send: " + addr, e);

            onException("Class cast exception on direct send: " + addr, e);

            if (errs == null)
                errs = new ArrayList<>();

            errs.add(e);
        } catch (IOException | IgniteCheckedException e) {
            if (log.isDebugEnabled())
                log.error("Exception on direct send: " + e.getMessage(), e);

            onException("Exception on direct send: " + e.getMessage(), e);

            if (errs == null)
                errs = new ArrayList<>();

            errs.add(e);

            if (X.hasCause(e, SSLException.class)) {
                // Give up on the third SSL-related failure.
                if (--sslConnectAttempts == 0)
                    throw new IgniteException("Unable to establish secure connection. " + "Was remote cluster configured with SSL? [rmtAddr=" + addr + ", errMsg=\"" + e.getMessage() + "\"]", e);

                continue;
            }

            if (X.hasCause(e, StreamCorruptedException.class)) {
                // StreamCorruptedException could be caused by remote node failover
                if (connectAttempts < 2) {
                    connectAttempts++;

                    continue;
                }

                if (log.isDebugEnabled())
                    log.debug("Connect failed with StreamCorruptedException, skip address: " + addr);

                break;
            }

            if (spi.failureDetectionTimeoutEnabled() && timeoutHelper.checkFailureTimeoutReached(e))
                break;

            // Without failure detection timeout the number of attempts is bounded by the reconnect count.
            if (!spi.failureDetectionTimeoutEnabled() && ++reconCnt == spi.getReconnectCount())
                break;

            if (!openSock) {
                // Reconnect for the second time, if connection is not established.
                if (connectAttempts < 2) {
                    connectAttempts++;

                    continue;
                }

                // Don't retry if we can not establish connection.
                break;
            }

            // On a socket timeout double the acknowledgement timeout; stop if checkAckTimeout rejects it.
            if (!spi.failureDetectionTimeoutEnabled() && (e instanceof SocketTimeoutException || X.hasCause(e, SocketTimeoutException.class))) {
                ackTimeout0 *= 2;

                if (!checkAckTimeout(ackTimeout0))
                    break;
            }
        } finally {
            U.closeQuiet(sock);
        }
    }

    if (joinReqSent) {
        // Log message fixed: the value actually returned is RES_OK, not RES_WAIT.
        if (log.isDebugEnabled())
            log.debug("Join request has been sent, but receipt has not been read (returning RES_OK).");

        // The join request was written but not acknowledged; report success to the caller.
        return RES_OK;
    }

    throw new IgniteSpiException("Failed to send message to address [addr=" + addr + ", msg=" + msg + ']', U.exceptionWithSuppressed("Failed to send message to address " + "[addr=" + addr + ", msg=" + msg + ']', errs));
}
Use of org.apache.ignite.spi.IgniteSpiException in the Apache Ignite project.
Class ServerImpl, method joinTopology.
/**
* Tries to join this node to topology.
* <p>
* Moves the SPI to {@code CONNECTING}, builds a join request from the local node and the collected
* exchange data, and repeats the following until the node is connected: send the join request, then wait
* up to {@code spi.netTimeout} for the SPI state to leave {@code CONNECTING}. If the request could not be
* sent to anyone, the local node resets the ring and becomes the first node of a new topology (order 1).
*
* @throws IgniteSpiException If the join was rejected (duplicate ID, authentication failure, check
*      failure, ring failure, loopback mismatch), if the thread was interrupted while waiting, or if the
*      first-node initialization task failed.
*/
private void joinTopology() throws IgniteSpiException {
synchronized (mux) {
assert spiState == CONNECTING || spiState == DISCONNECTED;
spiState = CONNECTING;
}
SecurityCredentials locCred = (SecurityCredentials) locNode.getAttributes().get(IgniteNodeAttributes.ATTR_SECURITY_CREDENTIALS);
// Tracks whether local authentication has already been performed, to avoid doing it twice below.
boolean auth = false;
if (spi.nodeAuth != null && spi.nodeAuth.isGlobalNodeAuthentication()) {
localAuthentication(locCred);
auth = true;
}
// Marshal credentials for backward compatibility and security.
marshalCredentials(locNode, locCred);
DiscoveryDataPacket discoveryData = spi.collectExchangeData(new DiscoveryDataPacket(getLocalNodeId()));
TcpDiscoveryJoinRequestMessage joinReqMsg = new TcpDiscoveryJoinRequestMessage(locNode, discoveryData);
// Attach a tracing span tagged with the local node ID and consistent ID to the join request.
joinReqMsg.spanContainer().span(tracing.create(TraceableMessagesTable.traceName(joinReqMsg.getClass())).addTag(SpanTags.tag(SpanTags.EVENT_NODE, SpanTags.ID), () -> locNode.id().toString()).addTag(SpanTags.tag(SpanTags.EVENT_NODE, SpanTags.CONSISTENT_ID), () -> locNode.consistentId().toString()).addLog(() -> "Created"));
tracing.messages().beforeSend(joinReqMsg);
// Each iteration is one join attempt: send the request, then wait for the state to change.
while (true) {
if (!sendJoinRequestMessage(joinReqMsg)) {
if (log.isDebugEnabled())
log.debug("Join request message has not been sent (local node is the first in the topology).");
if (!auth && spi.nodeAuth != null)
localAuthentication(locCred);
// TODO IGNITE-11272
// The reset below is submitted to the message worker as a task rather than run on this thread.
FutureTask<Void> fut = msgWorker.addTask(new FutureTask<Void>() {
@Override
protected Void body() {
// Drop state left from any previous topology.
pendingCustomMsgs.clear();
msgWorker.pendingMsgs.reset(null, null, null);
msgWorker.next = null;
failedNodes.clear();
leavingNodes.clear();
failedNodesMsgSent.clear();
locNode.attributes().remove(IgniteNodeAttributes.ATTR_SECURITY_CREDENTIALS);
// The local node becomes the first node: order 1, topology version 1.
locNode.order(1);
locNode.internalOrder(1);
spi.gridStartTime = U.currentTimeMillis();
locNode.visible(true);
ring.clear();
ring.topologyVersion(1);
synchronized (mux) {
topHist.clear();
spiState = CONNECTED;
// Wake up threads waiting on mux for the state change.
mux.notifyAll();
}
notifyDiscovery(EVT_NODE_JOINED, 1, locNode, joinReqMsg.spanContainer());
return null;
}
});
try {
// Block until the worker has executed the task.
fut.get();
} catch (IgniteCheckedException e) {
throw new IgniteSpiException(e);
}
msgWorker.nullifyDiscoData();
break;
}
if (log.isDebugEnabled())
log.debug("Join request message has been sent (waiting for coordinator response).");
synchronized (mux) {
long timeout = spi.netTimeout;
long thresholdNanos = System.nanoTime() + U.millisToNanos(timeout);
// Wait until the state leaves CONNECTING or the network timeout elapses; the remaining time is
// recomputed after every wake-up.
while (spiState == CONNECTING && timeout > 0) {
try {
mux.wait(timeout);
timeout = U.nanosToMillis(thresholdNanos - System.nanoTime());
} catch (InterruptedException e) {
// Restore the interrupt flag before converting to the SPI exception.
Thread.currentThread().interrupt();
throw new IgniteSpiException("Thread has been interrupted.", e);
}
}
// Dispatch on the resulting state: success, a terminal rejection, or another join attempt.
if (spiState == CONNECTED)
break;
else if (spiState == DUPLICATE_ID)
throw spi.duplicateIdError((TcpDiscoveryDuplicateIdMessage) joinRes.get());
else if (spiState == AUTH_FAILED)
throw spi.authenticationFailedError((TcpDiscoveryAuthFailedMessage) joinRes.get());
else if (spiState == CHECK_FAILED)
throw spi.checkFailedError((TcpDiscoveryCheckFailedMessage) joinRes.get());
else if (spiState == RING_FAILED) {
throw new IgniteSpiException("Unable to connect to next nodes in a ring, it seems local node is " + "experiencing connectivity issues or the rest of the cluster is undergoing massive restarts. " + "Failing local node join to avoid case when one node fails a big part of cluster. To disable" + " this behavior set TcpDiscoverySpi.setConnectionRecoveryTimeout() to 0. " + "[connRecoveryTimeout=" + spi.connRecoveryTimeout + ", effectiveConnRecoveryTimeout=" + spi.getEffectiveConnectionRecoveryTimeout() + ']');
} else if (spiState == LOOPBACK_PROBLEM) {
TcpDiscoveryLoopbackProblemMessage msg = (TcpDiscoveryLoopbackProblemMessage) joinRes.get();
// Name in the message which side (local or remote) uses the loopback address.
boolean locHostLoopback = spi.locHost.isLoopbackAddress();
String firstNode = locHostLoopback ? "local" : "remote";
String secondNode = locHostLoopback ? "remote" : "local";
throw new IgniteSpiException("Failed to add node to topology because " + firstNode + " node is configured to use loopback address, but " + secondNode + " node is not " + "(consider changing 'localAddress' configuration parameter) " + "[locNodeAddrs=" + U.addressesAsString(locNode) + ", rmtNodeAddrs=" + U.addressesAsString(msg.addresses(), msg.hostNames()) + ", creatorNodeId=" + msg.creatorNodeId() + ']');
} else
// Any other state: warn and repeat the join attempt.
LT.warn(log, "Node has not been connected to topology and will repeat join process. " + "Check remote nodes logs for possible error messages. " + "Note that large topology may require significant time to start. " + "Increase 'TcpDiscoverySpi.networkTimeout' configuration property " + "if getting this message on the starting nodes [networkTimeout=" + spi.netTimeout + ']');
}
}
assert locNode.order() != 0;
assert locNode.internalOrder() != 0;
if (log.isDebugEnabled())
log.debug("Discovery SPI has been connected to topology with order: " + locNode.internalOrder());
// Close the tracing span opened for the join request.
joinReqMsg.spanContainer().span().addTag(SpanTags.tag(SpanTags.NODE, SpanTags.ORDER), () -> String.valueOf(locNode.order())).addLog(() -> "Joined to ring").end();
}
Use of org.apache.ignite.spi.IgniteSpiException in the Apache Ignite project.
Class TcpDiscoverySpi, method resolvedAddresses.
/**
 * Fetches the addresses registered in the IP finder and prepares them for connection attempts: addresses
 * rejected by the configured address filter are skipped, unresolved addresses are resolved by host name,
 * and addresses contained in the local node address collection are left out. An address whose host cannot
 * be resolved is still returned as is. Unless randomization is disabled, the result is shuffled.
 * <p>
 * While the IP finder keeps failing, the lookup is retried every {@code getReconnectDelay()} milliseconds.
 * The retries stop once the applicable timeout has elapsed ({@code netTimeout} for a client whose SPI state
 * is "connected", {@code joinTimeout} otherwise); a non-positive timeout never expires.
 *
 * @return Resolved addresses without the local node addresses (potentially empty but never null).
 * @throws org.apache.ignite.spi.IgniteSpiException If the thread is interrupted while waiting to retry.
 */
protected Collection<InetSocketAddress> resolvedAddresses() throws IgniteSpiException {
    // Moment the lookup began; the retry deadline is measured from here.
    final long startNanos = System.nanoTime();

    List<InetSocketAddress> resolvedAddrs = new ArrayList<>();

    long maxWaitMs;

    if (isClientMode() && impl.getSpiState().equalsIgnoreCase("connected"))
        maxWaitMs = netTimeout;
    else
        maxWaitMs = joinTimeout;

    Collection<InetSocketAddress> registered;

    // Poll the IP finder until it answers or the deadline passes.
    for (;;) {
        try {
            registered = registeredAddresses();

            break;
        }
        catch (IgniteSpiException e) {
            LT.error(log, e, "Failed to get registered addresses from IP finder " + "(retrying every " + getReconnectDelay() + "ms;" + " change 'reconnectDelay' to configure the frequency of retries) " + "[maxTimeout=" + maxWaitMs + "]", true);
        }

        boolean deadlinePassed = maxWaitMs > 0 && U.millisSinceNanos(startNanos) > maxWaitMs;

        if (deadlinePassed) {
            LT.warn(log, "Unable to get registered addresses from IP finder, timeout is reached " + "(consider increasing 'joinTimeout' for join process or 'netTimeout' for reconnection) " + "[joinTimeout=" + joinTimeout + ", netTimeout=" + netTimeout + "]");

            // Nothing was obtained: continue with the (still empty) result list.
            registered = resolvedAddrs;

            break;
        }

        try {
            U.sleep(getReconnectDelay());
        }
        catch (IgniteInterruptedCheckedException e) {
            throw new IgniteSpiException("Thread has been interrupted.", e);
        }
    }

    for (InetSocketAddress addr : registered) {
        assert addr != null;

        try {
            boolean accepted = addressFilter == null || addressFilter.apply(addr);

            if (accepted) {
                InetSocketAddress candidate = addr;

                if (addr.isUnresolved())
                    candidate = new InetSocketAddress(InetAddress.getByName(addr.getHostName()), addr.getPort());

                boolean local = locNodeAddrs != null && locNodeAddrs.contains(candidate);

                if (!local)
                    resolvedAddrs.add(candidate);
            }
        }
        catch (UnknownHostException ignored) {
            LT.warn(log, "Failed to resolve address from IP finder (host is unknown): " + addr);

            // Keep the unresolved address in the result anyway.
            resolvedAddrs.add(addr);
        }
    }

    if (!(resolvedAddrs.isEmpty() || skipAddrsRandomization))
        Collections.shuffle(resolvedAddrs);

    return resolvedAddrs;
}
Aggregations