use of com.cloud.legacymodel.exceptions.ActiveFencingException in project cosmic by MissionCriticalCloud.
the class ClusterManagerImpl method peerScan.
private void peerScan() throws ActiveFencingException {
final Date cutTime = DateUtil.currentGMTTime();
final Profiler profiler = new Profiler();
profiler.start();
final Profiler profilerQueryActiveList = new Profiler();
profilerQueryActiveList.start();
final List<ManagementServerHostVO> currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - HeartbeatThreshold.value()));
profilerQueryActiveList.stop();
final Profiler profilerSyncClusterInfo = new Profiler();
profilerSyncClusterInfo.start();
final List<ManagementServerHostVO> removedNodeList = new ArrayList<>();
final List<ManagementServerHostVO> invalidatedNodeList = new ArrayList<>();
if (_mshostId != null) {
if (_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) {
final String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain " + "situation";
s_logger.error(msg);
throw new ActiveFencingException(msg);
}
// only if we have already attached to cluster, will we start to check leaving nodes
for (final Map.Entry<Long, ManagementServerHostVO> entry : _activePeers.entrySet()) {
final ManagementServerHostVO current = getInListById(entry.getKey(), currentList);
if (current == null) {
if (entry.getKey().longValue() != _mshostId.longValue()) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node left, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
}
removedNodeList.add(entry.getValue());
}
} else {
if (current.getRunid() == 0) {
if (entry.getKey().longValue() != _mshostId.longValue()) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node left because of invalidated session, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
}
invalidatedNodeList.add(entry.getValue());
}
} else {
if (entry.getValue().getRunid() != current.getRunid()) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node left and rejoined quickly, id:" + entry.getKey() + ", nodeIP:" + entry.getValue().getServiceIP());
}
entry.getValue().setRunid(current.getRunid());
}
}
}
}
}
profilerSyncClusterInfo.stop();
final Profiler profilerInvalidatedNodeList = new Profiler();
profilerInvalidatedNodeList.start();
// process invalidated node list
if (invalidatedNodeList.size() > 0) {
for (final ManagementServerHostVO mshost : invalidatedNodeList) {
_activePeers.remove(mshost.getId());
try {
JmxUtil.unregisterMBean("ClusterManager", "Node " + mshost.getId());
} catch (final Exception e) {
s_logger.warn("Unable to deregiester cluster node from JMX monitoring due to exception " + e.toString());
}
}
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, invalidatedNodeList));
}
profilerInvalidatedNodeList.stop();
final Profiler profilerRemovedList = new Profiler();
profilerRemovedList.start();
// process removed node list
final Iterator<ManagementServerHostVO> it = removedNodeList.iterator();
while (it.hasNext()) {
final ManagementServerHostVO mshost = it.next();
if (!pingManagementNode(mshost)) {
s_logger.warn("Management node " + mshost.getId() + " is detected inactive by timestamp and also not pingable");
_activePeers.remove(mshost.getId());
try {
JmxUtil.unregisterMBean("ClusterManager", "Node " + mshost.getId());
} catch (final Exception e) {
s_logger.warn("Unable to deregiester cluster node from JMX monitoring due to exception " + e.toString());
}
} else {
s_logger.info("Management node " + mshost.getId() + " is detected inactive by timestamp but is pingable");
it.remove();
}
}
if (removedNodeList.size() > 0) {
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, removedNodeList));
}
profilerRemovedList.stop();
final List<ManagementServerHostVO> newNodeList = new ArrayList<>();
for (final ManagementServerHostVO mshost : currentList) {
if (!_activePeers.containsKey(mshost.getId())) {
_activePeers.put(mshost.getId(), mshost);
if (s_logger.isDebugEnabled()) {
s_logger.debug("Detected management node joined, id:" + mshost.getId() + ", nodeIP:" + mshost.getServiceIP());
}
newNodeList.add(mshost);
try {
JmxUtil.registerMBean("ClusterManager", "Node " + mshost.getId(), new ClusterManagerMBeanImpl(this, mshost));
} catch (final Exception e) {
s_logger.warn("Unable to regiester cluster node into JMX monitoring due to exception " + ExceptionUtil.toString(e));
}
}
}
if (newNodeList.size() > 0) {
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeAdded, newNodeList));
}
profiler.stop();
if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Peer scan takes too long to finish. profiler: " + profiler.toString() + ", profilerQueryActiveList: " + profilerQueryActiveList.toString() + ", profilerSyncClusterInfo: " + profilerSyncClusterInfo.toString() + ", profilerInvalidatedNodeList: " + profilerInvalidatedNodeList.toString() + ", profilerRemovedList: " + profilerRemovedList.toString());
}
}
}
use of com.cloud.legacymodel.exceptions.ActiveFencingException in project cosmic by MissionCriticalCloud.
the class ClusterManagerImpl method getHeartbeatTask.
private Runnable getHeartbeatTask() {
return new ManagedContextRunnable() {
@Override
protected void runInContext() {
final TransactionLegacy txn = TransactionLegacy.open("ClusterHeartbeat");
try {
final Profiler profiler = new Profiler();
final Profiler profilerHeartbeatUpdate = new Profiler();
final Profiler profilerPeerScan = new Profiler();
try {
profiler.start();
profilerHeartbeatUpdate.start();
txn.transitToUserManagedConnection(getHeartbeatConnection());
if (s_logger.isTraceEnabled()) {
s_logger.trace("Cluster manager heartbeat update, id:" + _mshostId);
}
_mshostDao.update(_mshostId, _runId, DateUtil.currentGMTTime());
profilerHeartbeatUpdate.stop();
profilerPeerScan.start();
if (s_logger.isTraceEnabled()) {
s_logger.trace("Cluster manager peer-scan, id:" + _mshostId);
}
if (!_peerScanInited) {
_peerScanInited = true;
initPeerScan();
}
peerScan();
profilerPeerScan.stop();
} catch (final SQLException e) {
s_logger.error("Unexpected exception in cluster heartbeat", e);
if (isRootCauseConnectionRelated(e.getCause())) {
invalidHeartbeatConnection();
}
} finally {
profiler.stop();
if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
if (s_logger.isDebugEnabled()) {
s_logger.debug("Management server heartbeat takes too long to finish. profiler: " + profiler.toString() + ", profilerHeartbeatUpdate: " + profilerHeartbeatUpdate.toString() + ", profilerPeerScan: " + profilerPeerScan.toString());
}
}
}
} catch (final CloudRuntimeException e) {
s_logger.error("Runtime DB exception ", e.getCause());
if (e.getCause() instanceof ClusterInvalidSessionException) {
s_logger.error("Invalid cluster session found, fence it");
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
}
if (isRootCauseConnectionRelated(e.getCause())) {
invalidHeartbeatConnection();
}
} catch (final ActiveFencingException e) {
queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
} finally {
txn.transitToAutoManagedConnection(TransactionLegacy.CLOUD_DB);
txn.close("ClusterHeartbeat");
}
}
};
}
Aggregations