Search in sources :

Example 1 with ActiveFencingException

use of com.cloud.legacymodel.exceptions.ActiveFencingException in project cosmic by MissionCriticalCloud.

In the class ClusterManagerImpl, method peerScan.

/**
 * Reconciles the in-memory peer map ({@code _activePeers}) with the list of management servers
 * returned by {@code _mshostDao.getActiveList(...)} and queues the resulting notifications.
 *
 * <p>The scan runs in these steps, each timed by its own {@link Profiler}:
 * <ol>
 *   <li>Query the active list, using "now minus {@code HeartbeatThreshold}" as the cut-off time.</li>
 *   <li>If this server already has an id, fence itself when any peer has recorded it as
 *       {@code Down}; otherwise classify every known peer as removed (missing from the active
 *       list), invalidated (run id of 0) or rejoined (run id changed).</li>
 *   <li>Drop invalidated peers and queue a {@code nodeRemoved} notification for them.</li>
 *   <li>Ping each removed candidate; only peers that do not answer are dropped and reported
 *       through a {@code nodeRemoved} notification.</li>
 *   <li>Add every active server not yet known as a peer and queue a {@code nodeAdded}
 *       notification.</li>
 * </ol>
 *
 * @throws ActiveFencingException if at least one peer reports this management server as down
 */
private void peerScan() throws ActiveFencingException {
    final Date cutTime = DateUtil.currentGMTTime();
    final Profiler profiler = new Profiler();
    profiler.start();

    final Profiler profilerQueryActiveList = new Profiler();
    profilerQueryActiveList.start();
    final List<ManagementServerHostVO> currentList = _mshostDao.getActiveList(new Date(cutTime.getTime() - HeartbeatThreshold.value()));
    profilerQueryActiveList.stop();

    final Profiler profilerSyncClusterInfo = new Profiler();
    profilerSyncClusterInfo.start();
    final List<ManagementServerHostVO> removedNodeList = new ArrayList<>();
    final List<ManagementServerHostVO> invalidatedNodeList = new ArrayList<>();
    if (_mshostId != null) {
        if (_mshostPeerDao.countStateSeenInPeers(_mshostId, _runId, ManagementServerHost.State.Down) > 0) {
            final String msg = "We have detected that at least one management server peer reports that this management server is down, perform active fencing to avoid split-brain " + "situation";
            s_logger.error(msg);
            throw new ActiveFencingException(msg);
        }
        // only if we have already attached to cluster, will we start to check leaving nodes
        for (final Map.Entry<Long, ManagementServerHostVO> entry : _activePeers.entrySet()) {
            final Long peerId = entry.getKey();
            final ManagementServerHostVO peer = entry.getValue();
            final ManagementServerHostVO current = getInListById(peerId, currentList);
            if (current == null) {
                // Peer is missing from the active list; this server never reports itself as removed.
                if (peerId.longValue() != _mshostId.longValue()) {
                    if (s_logger.isDebugEnabled()) {
                        s_logger.debug("Detected management node left, id:" + peerId + ", nodeIP:" + peer.getServiceIP());
                    }
                    removedNodeList.add(peer);
                }
            } else if (current.getRunid() == 0) {
                // A run id of 0 is treated as an invalidated session.
                if (peerId.longValue() != _mshostId.longValue()) {
                    if (s_logger.isDebugEnabled()) {
                        s_logger.debug("Detected management node left because of invalidated session, id:" + peerId + ", nodeIP:" + peer.getServiceIP());
                    }
                    invalidatedNodeList.add(peer);
                }
            } else if (peer.getRunid() != current.getRunid()) {
                // Same node id but a different run id: adopt the new run id on the cached entry.
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("Detected management node left and rejoined quickly, id:" + peerId + ", nodeIP:" + peer.getServiceIP());
                }
                peer.setRunid(current.getRunid());
            }
        }
    }
    profilerSyncClusterInfo.stop();

    final Profiler profilerInvalidatedNodeList = new Profiler();
    profilerInvalidatedNodeList.start();
    // process invalidated node list
    if (!invalidatedNodeList.isEmpty()) {
        for (final ManagementServerHostVO mshost : invalidatedNodeList) {
            _activePeers.remove(mshost.getId());
            unregisterPeerFromJmx(mshost);
        }
        queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, invalidatedNodeList));
    }
    profilerInvalidatedNodeList.stop();

    final Profiler profilerRemovedList = new Profiler();
    profilerRemovedList.start();
    // process removed node list
    final Iterator<ManagementServerHostVO> it = removedNodeList.iterator();
    while (it.hasNext()) {
        final ManagementServerHostVO mshost = it.next();
        if (!pingManagementNode(mshost)) {
            s_logger.warn("Management node " + mshost.getId() + " is detected inactive by timestamp and also not pingable");
            _activePeers.remove(mshost.getId());
            unregisterPeerFromJmx(mshost);
        } else {
            // Still pingable: keep it as a peer and take it out of the list that gets notified.
            s_logger.info("Management node " + mshost.getId() + " is detected inactive by timestamp but is pingable");
            it.remove();
        }
    }
    if (!removedNodeList.isEmpty()) {
        queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved, removedNodeList));
    }
    profilerRemovedList.stop();

    // process newly joined nodes
    final List<ManagementServerHostVO> newNodeList = new ArrayList<>();
    for (final ManagementServerHostVO mshost : currentList) {
        if (!_activePeers.containsKey(mshost.getId())) {
            _activePeers.put(mshost.getId(), mshost);
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Detected management node joined, id:" + mshost.getId() + ", nodeIP:" + mshost.getServiceIP());
            }
            newNodeList.add(mshost);
            try {
                JmxUtil.registerMBean("ClusterManager", "Node " + mshost.getId(), new ClusterManagerMBeanImpl(this, mshost));
            } catch (final Exception e) {
                // JMX registration is best-effort; the peer stays in the active map regardless.
                s_logger.warn("Unable to register cluster node into JMX monitoring due to exception " + ExceptionUtil.toString(e));
            }
        }
    }
    if (!newNodeList.isEmpty()) {
        queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeAdded, newNodeList));
    }

    profiler.stop();
    if (profiler.getDurationInMillis() >= HeartbeatInterval.value() && s_logger.isDebugEnabled()) {
        s_logger.debug("Peer scan takes too long to finish. profiler: " + profiler.toString() + ", profilerQueryActiveList: " + profilerQueryActiveList.toString() + ", profilerSyncClusterInfo: " + profilerSyncClusterInfo.toString() + ", profilerInvalidatedNodeList: " + profilerInvalidatedNodeList.toString() + ", profilerRemovedList: " + profilerRemovedList.toString());
    }
}

/**
 * Removes the JMX MBean registered for the given peer under "ClusterManager" / "Node &lt;id&gt;".
 * Failures are logged as warnings and otherwise ignored, so peer bookkeeping is never interrupted.
 *
 * @param mshost the peer whose MBean should be unregistered
 */
private void unregisterPeerFromJmx(final ManagementServerHostVO mshost) {
    try {
        JmxUtil.unregisterMBean("ClusterManager", "Node " + mshost.getId());
    } catch (final Exception e) {
        s_logger.warn("Unable to deregister cluster node from JMX monitoring due to exception " + e.toString());
    }
}
Also used : ArrayList(java.util.ArrayList) Date(java.util.Date) SQLNonTransientException(java.sql.SQLNonTransientException) ActiveFencingException(com.cloud.legacymodel.exceptions.ActiveFencingException) RemoteException(java.rmi.RemoteException) ConfigurationException(javax.naming.ConfigurationException) SQLException(java.sql.SQLException) ConnectException(java.net.ConnectException) SQLRecoverableException(java.sql.SQLRecoverableException) IOException(java.io.IOException) CloudRuntimeException(com.cloud.legacymodel.exceptions.CloudRuntimeException) Profiler(com.cloud.utils.Profiler) ActiveFencingException(com.cloud.legacymodel.exceptions.ActiveFencingException) Map(java.util.Map) HashMap(java.util.HashMap)

Example 2 with ActiveFencingException

use of com.cloud.legacymodel.exceptions.ActiveFencingException in project cosmic by MissionCriticalCloud.

In the class ClusterManagerImpl, method getHeartbeatTask.

/**
 * Builds the heartbeat runnable for this management server.
 *
 * <p>Each run opens a {@code TransactionLegacy} named "ClusterHeartbeat", switches it to the
 * connection supplied by {@code getHeartbeatConnection()}, writes the current GMT time for this
 * server through {@code _mshostDao.update(...)}, initialises peer scanning on the first run and
 * then calls {@code peerScan()}. The transaction is always handed back to the auto-managed
 * {@code CLOUD_DB} connection and closed, whatever the outcome.
 *
 * <p>A {@code nodeIsolated} notification is queued when {@code peerScan()} throws
 * {@link ActiveFencingException}, or when a {@link CloudRuntimeException} is caused by a
 * {@code ClusterInvalidSessionException}.
 *
 * @return the runnable performing one heartbeat update plus peer scan per invocation
 */
private Runnable getHeartbeatTask() {
    return new ManagedContextRunnable() {

        @Override
        protected void runInContext() {
            final TransactionLegacy txn = TransactionLegacy.open("ClusterHeartbeat");
            try {
                final Profiler profiler = new Profiler();
                final Profiler profilerHeartbeatUpdate = new Profiler();
                final Profiler profilerPeerScan = new Profiler();
                try {
                    profiler.start();
                    profilerHeartbeatUpdate.start();
                    txn.transitToUserManagedConnection(getHeartbeatConnection());
                    if (s_logger.isTraceEnabled()) {
                        s_logger.trace("Cluster manager heartbeat update, id:" + _mshostId);
                    }
                    _mshostDao.update(_mshostId, _runId, DateUtil.currentGMTTime());
                    profilerHeartbeatUpdate.stop();
                    profilerPeerScan.start();
                    if (s_logger.isTraceEnabled()) {
                        s_logger.trace("Cluster manager peer-scan, id:" + _mshostId);
                    }
                    // The flag is set before initPeerScan() runs, so initialisation is attempted only once.
                    if (!_peerScanInited) {
                        _peerScanInited = true;
                        initPeerScan();
                    }
                    peerScan();
                    profilerPeerScan.stop();
                } catch (final SQLException e) {
                    s_logger.error("Unexpected exception in cluster heartbeat", e);
                    // NOTE(review): only the cause chain is inspected here, not e itself - confirm
                    // isRootCauseConnectionRelated() does not need to see the SQLException as well.
                    if (isRootCauseConnectionRelated(e.getCause())) {
                        invalidHeartbeatConnection();
                    }
                } finally {
                    profiler.stop();
                    if (profiler.getDurationInMillis() >= HeartbeatInterval.value() && s_logger.isDebugEnabled()) {
                        s_logger.debug("Management server heartbeat takes too long to finish. profiler: " + profiler.toString() + ", profilerHeartbeatUpdate: " + profilerHeartbeatUpdate.toString() + ", profilerPeerScan: " + profilerPeerScan.toString());
                    }
                }
            } catch (final CloudRuntimeException e) {
                // Log the exception itself (its cause chain is included) so the message and stack
                // trace are not lost when the exception carries no cause.
                s_logger.error("Runtime DB exception ", e);
                if (e.getCause() instanceof ClusterInvalidSessionException) {
                    s_logger.error("Invalid cluster session found, fence it");
                    queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
                }
                if (isRootCauseConnectionRelated(e.getCause())) {
                    invalidHeartbeatConnection();
                }
            } catch (final ActiveFencingException e) {
                // A peer reports this server as down: announce isolation so listeners can react.
                queueNotification(new ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeIsolated));
            } finally {
                txn.transitToAutoManagedConnection(TransactionLegacy.CLOUD_DB);
                txn.close("ClusterHeartbeat");
            }
        }
    };
}
Also used : TransactionLegacy(com.cloud.utils.db.TransactionLegacy) ManagedContextRunnable(com.cloud.common.managed.context.ManagedContextRunnable) Profiler(com.cloud.utils.Profiler) SQLException(java.sql.SQLException) CloudRuntimeException(com.cloud.legacymodel.exceptions.CloudRuntimeException) ActiveFencingException(com.cloud.legacymodel.exceptions.ActiveFencingException)

Aggregations

ActiveFencingException (com.cloud.legacymodel.exceptions.ActiveFencingException): 2, CloudRuntimeException (com.cloud.legacymodel.exceptions.CloudRuntimeException): 2, Profiler (com.cloud.utils.Profiler): 2, SQLException (java.sql.SQLException): 2, ManagedContextRunnable (com.cloud.common.managed.context.ManagedContextRunnable): 1, TransactionLegacy (com.cloud.utils.db.TransactionLegacy): 1, IOException (java.io.IOException): 1, ConnectException (java.net.ConnectException): 1, RemoteException (java.rmi.RemoteException): 1, SQLNonTransientException (java.sql.SQLNonTransientException): 1, SQLRecoverableException (java.sql.SQLRecoverableException): 1, ArrayList (java.util.ArrayList): 1, Date (java.util.Date): 1, HashMap (java.util.HashMap): 1, Map (java.util.Map): 1, ConfigurationException (javax.naming.ConfigurationException): 1