Search in sources :

Example 1 with HostStatus

use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.

the class AgentManagerImpl method handleDisconnectWithInvestigation.

/**
 * Handles a disconnect of the given agent, investigating the host first when the
 * pending event would move the host to {@link HostStatus#Alert}.
 *
 * <p>If the host's next status for {@code event} is Alert, {@link #investigate} is asked for
 * the host's actual state and the event is adjusted accordingly (host down, agent disconnected,
 * waited too long) before the disconnect is processed. In all other cases the disconnect is
 * processed with the event as given.
 *
 * @param attache the agent attache whose host is disconnecting
 * @param event the event that triggered the disconnect; may be replaced by a more specific
 *              event depending on the investigation result
 * @return {@code false} when nothing was done (state undetermined and still within the alert
 *         wait time, host found to be up, or host disconnected but still within the wait time);
 *         {@code true} once the disconnect has been processed
 */
protected boolean handleDisconnectWithInvestigation(final AgentAttache attache, Event event) {
    final long hostId = attache.getId();
    HostVO host = this._hostDao.findById(hostId);
    if (host != null) {
        HostStatus nextStatus = null;
        try {
            nextStatus = host.getStatus().getNextStatus(event);
        } catch (final NoTransitionException ne) {
            // The host may already be in a status (e.g. Down, Alert, Removed) that has no
            // transition for this event. This is tolerated: nextStatus stays null and the
            // investigation below is skipped.
            s_logger.debug("Caught exception while getting agent's next status", ne);
        }
        if (nextStatus == HostStatus.Alert) {
            /* OK, we are going to the bad status, let's see what happened */
            s_logger.info("Investigating why host " + hostId + " has disconnected with event " + event);
            HostStatus determinedState = investigate(attache);
            // if state cannot be determined do nothing and bail out
            if (determinedState == null) {
                // NOTE(review): ">> 10" divides the millisecond clock by 1024, presumably to
                // approximate seconds for comparison with getLastPinged() - confirm the unit.
                if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > this.AlertWait.value()) {
                    s_logger.warn("Agent " + hostId + " state cannot be determined for more than " + this.AlertWait + "(" + this.AlertWait.value() + ") seconds, will go to Alert state");
                    determinedState = HostStatus.Alert;
                } else {
                    s_logger.warn("Agent " + hostId + " state cannot be determined, do nothing");
                    return false;
                }
            }
            final HostStatus currentStatus = host.getStatus();
            s_logger.info("The agent from host " + hostId + " state determined is " + determinedState);
            if (determinedState == HostStatus.Down) {
                final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
                s_logger.error(message);
                if (host.getType() != HostType.SecondaryStorage && host.getType() != HostType.ConsoleProxy) {
                    this._alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
                }
                event = Event.HostDown;
            } else if (determinedState == HostStatus.Up) {
                /* Got ping response from host, bring it back */
                s_logger.info("Agent is determined to be up and running");
                agentStatusTransitTo(host, Event.Ping, this._nodeId);
                return false;
            } else if (determinedState == HostStatus.Disconnected) {
                s_logger.warn("Agent is disconnected but the host is still up: " + host.getId() + "-" + host.getName());
                if (currentStatus == HostStatus.Disconnected) {
                    if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > this.AlertWait.value()) {
                        s_logger.warn("Host " + host.getId() + " has been disconnected past the wait time it should be disconnected.");
                        event = Event.WaitedTooLong;
                    } else {
                        s_logger.debug("Host " + host.getId() + " has been determined to be disconnected but it hasn't passed the wait time yet.");
                        return false;
                    }
                } else if (currentStatus == HostStatus.Up) {
                    final Zone zone = this._zoneRepository.findById(host.getDataCenterId()).orElse(null);
                    final HostPodVO podVO = this._podDao.findById(host.getPodId());
                    // Either lookup may come back empty; fall back to placeholders instead of
                    // failing with a NullPointerException while building the alert text.
                    final String zoneName = zone != null ? zone.getName() : "NO ZONE";
                    final String podName = podVO != null ? podVO.getName() : "NO POD";
                    final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + zoneName + ", pod: " + podName;
                    if (host.getType() != HostType.SecondaryStorage && host.getType() != HostType.ConsoleProxy) {
                        this._alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + hostDesc, "If the agent for host [" + hostDesc + "] is not restarted within " + this.AlertWait + " seconds, host will go to Alert state");
                    }
                    event = Event.AgentDisconnected;
                }
            } else {
                // if we end up here we are in alert state, send an alert
                final Zone zone = this._zoneRepository.findById(host.getDataCenterId()).orElse(null);
                final HostPodVO podVO = this._podDao.findById(host.getPodId());
                // Guard the zone lookup the same way the pod lookup is guarded.
                final String zoneName = zone != null ? zone.getName() : "NO ZONE";
                final String podName = podVO != null ? podVO.getName() : "NO POD";
                final String hostDesc = "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + zoneName + ", pod: " + podName;
                this._alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host in ALERT state, " + hostDesc, "In availability zone " + host.getDataCenterId() + ", host is in alert state: " + host.getId() + "-" + host.getName());
            }
        } else {
            s_logger.debug("The next status of agent " + host.getId() + " is not Alert, no need to investigate what happened");
        }
    }
    handleDisconnectWithoutInvestigation(attache, event, true, true);
    // Re-read the host: its status may have changed while the disconnect was processed.
    host = this._hostDao.findById(hostId);
    if (host != null && host.getStatus() == HostStatus.Down) {
        // The host ended up Down, so schedule a restart of its VMs through the HA manager.
        this._haMgr.scheduleRestartForVmsOnHost(host, true);
    }
    return true;
}
Also used : Zone(com.cloud.db.model.Zone) NoTransitionException(com.cloud.legacymodel.exceptions.NoTransitionException) HostStatus(com.cloud.legacymodel.dc.HostStatus) HostPodVO(com.cloud.dc.HostPodVO) HostVO(com.cloud.host.HostVO)

Example 2 with HostStatus

use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.

the class AgentManagerImpl method investigate.

/**
 * Determines the current status of the host behind the given agent.
 *
 * <p>Hosts that cannot be found, have no type, or are of a virtual type are reported as
 * {@link HostStatus#Alert} without being probed. Otherwise a {@code CheckHealthCommand} is sent
 * to the host; a successful answer yields {@link HostStatus#Up}, anything else defers to the
 * HA manager's investigation.
 *
 * @param agent the agent attache identifying the host to check
 * @return the determined status; whatever the HA manager returns when the health check fails
 */
protected HostStatus investigate(final AgentAttache agent) {
    final Long agentHostId = agent.getId();
    final HostVO hostRecord = this._hostDao.findById(agentHostId);

    // Nothing to probe for unknown, untyped or virtual hosts.
    if (hostRecord == null || hostRecord.getType() == null || hostRecord.getType().isVirtual()) {
        return HostStatus.Alert;
    }

    if (s_logger.isDebugEnabled()) {
        s_logger.debug("checking if agent (" + agentHostId + ") is alive");
    }

    final Answer healthAnswer = easySend(agentHostId, new CheckHealthCommand());
    final boolean healthy = healthAnswer != null && healthAnswer.getResult();
    if (!healthy) {
        // No usable answer from the agent itself; let the HA manager decide.
        return this._haMgr.investigate(agentHostId);
    }

    if (s_logger.isDebugEnabled()) {
        s_logger.debug("agent (" + agentHostId + ") responded to checkHeathCommand, reporting that agent is " + HostStatus.Up);
    }
    return HostStatus.Up;
}
Also used : AgentControlAnswer(com.cloud.legacymodel.communication.answer.AgentControlAnswer) StartupAnswer(com.cloud.legacymodel.communication.answer.StartupAnswer) PingAnswer(com.cloud.legacymodel.communication.answer.PingAnswer) ReadyAnswer(com.cloud.legacymodel.communication.answer.ReadyAnswer) Answer(com.cloud.legacymodel.communication.answer.Answer) UnsupportedAnswer(com.cloud.legacymodel.communication.answer.UnsupportedAnswer) HostStatus(com.cloud.legacymodel.dc.HostStatus) HostVO(com.cloud.host.HostVO) CheckHealthCommand(com.cloud.legacymodel.communication.command.CheckHealthCommand)

Example 3 with HostStatus

use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.

the class AgentManagerImpl method handleDisconnectWithoutInvestigation.

/**
 * Processes an agent disconnect without probing the host's actual state.
 *
 * <p>Works out the status the host will move to for {@code event}, deregisters the agent
 * attache with that status and, if requested, records the disconnect for the host.
 *
 * @param attache the agent attache that is disconnecting
 * @param event the event that triggered the disconnect
 * @param transitState whether to call {@code disconnectAgent} for the host after the attache
 *                     has been removed
 * @param removeAgent not read by this method; the attache is always removed
 * @return always {@code true}
 * @throws CloudRuntimeException if the host's current status has no transition for {@code event}
 */
protected boolean handleDisconnectWithoutInvestigation(final AgentAttache attache, final Event event, final boolean transitState, final boolean removeAgent) {
    final long hostId = attache.getId();
    s_logger.info("Host " + hostId + " is disconnecting with event " + event);
    final HostStatus nextStatus;
    final HostVO host = this._hostDao.findById(hostId);
    if (host == null) {
        s_logger.warn("Can't find host with " + hostId);
        nextStatus = HostStatus.Removed;
    } else {
        final HostStatus currentStatus = host.getStatus();
        if (currentStatus == HostStatus.Down || currentStatus == HostStatus.Alert || currentStatus == HostStatus.Removed) {
            // Already in one of these statuses: keep it rather than looking up a transition.
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Host " + hostId + " is already " + currentStatus);
            }
            nextStatus = currentStatus;
        } else {
            try {
                nextStatus = currentStatus.getNextStatus(event);
            } catch (final NoTransitionException e) {
                final String err = "Cannot find next status for " + event + " as current status is " + currentStatus + " for agent " + hostId;
                // Log the original exception so the cause is not lost when rethrowing.
                s_logger.debug(err, e);
                throw new CloudRuntimeException(err);
            }
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("The next status of agent " + hostId + " is " + nextStatus + ", current status is " + currentStatus);
            }
        }
    }
    if (s_logger.isDebugEnabled()) {
        s_logger.debug("Deregistering link for " + hostId + " with state " + nextStatus);
    }
    // This calls the removeAgent(...) method, not the boolean parameter of the same name.
    removeAgent(attache, nextStatus);
    // update the DB
    if (host != null && transitState) {
        disconnectAgent(host, event, this._nodeId);
    }
    return true;
}
Also used : CloudRuntimeException(com.cloud.legacymodel.exceptions.CloudRuntimeException) NoTransitionException(com.cloud.legacymodel.exceptions.NoTransitionException) HostStatus(com.cloud.legacymodel.dc.HostStatus) HostVO(com.cloud.host.HostVO)

Example 4 with HostStatus

use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.

the class HighAvailabilityManagerImpl method investigate.

/**
 * Asks each configured investigator in turn for the state of the given host and returns the
 * first definite answer.
 *
 * @param hostId id of the host to investigate
 * @return {@link HostStatus#Alert} if the host cannot be found; otherwise the first non-null
 *         status reported by an investigator, or {@code null} if none could determine it
 */
@Override
public HostStatus investigate(final long hostId) {
    final HostVO target = _hostDao.findById(hostId);
    if (target == null) {
        return HostStatus.Alert;
    }

    for (final Investigator candidate : investigators) {
        final HostStatus verdict = candidate.isAgentAlive(target);
        if (verdict == null) {
            // This investigator has no opinion; try the next one.
            if (s_logger.isDebugEnabled()) {
                s_logger.debug(candidate.getName() + " unable to determine the state of the host.  Moving on.");
            }
            continue;
        }
        if (s_logger.isDebugEnabled()) {
            s_logger.debug(candidate.getName() + " was able to determine host " + hostId + " is in " + verdict.toString());
        }
        return verdict;
    }

    // No investigator produced an answer.
    return null;
}
Also used : HostStatus(com.cloud.legacymodel.dc.HostStatus) HostVO(com.cloud.host.HostVO)

Example 5 with HostStatus

use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.

the class ManagementIPSystemVMInvestigator method isVmAlive.

/**
 * Determines whether a system VM is alive by pinging its management IP address from other
 * hosts in the same pod.
 *
 * <p>For every management nic that has an IPv4 address, each host returned by
 * {@code findHostByPod} is asked to test that address. If the VM answers it is reported alive.
 * If it does not answer but the VM's own host does, the VM is reported dead. Any other outcome
 * is inconclusive.
 *
 * @param vm the virtual machine to check
 * @param host not read by this method; the VM's host is looked up from {@code vm.getHostId()}
 * @return {@code true} if the VM answered a ping; {@code false} if the VM did not answer but
 *         its host did
 * @throws UnknownVM if the state cannot be determined: the VM is not a system VM, has no host
 *         id, its host cannot be found, it has no management nic, or no probe was conclusive
 */
@Override
public boolean isVmAlive(final VirtualMachine vm, final Host host) throws UnknownVM {
    if (!vm.getType().isUsedBySystem()) {
        s_logger.debug("Not a System Vm, unable to determine state of " + vm + " returning null");
        // Stop here as the log message states, instead of falling through and probing anyway.
        throw new UnknownVM();
    }
    if (s_logger.isDebugEnabled()) {
        s_logger.debug("Testing if " + vm + " is alive");
    }
    if (vm.getHostId() == null) {
        s_logger.debug("There's no host id for " + vm);
        throw new UnknownVM();
    }
    final HostVO vmHost = _hostDao.findById(vm.getHostId());
    if (vmHost == null) {
        s_logger.debug("Unable to retrieve the host by using id " + vm.getHostId());
        throw new UnknownVM();
    }
    final List<? extends Nic> nics = _networkMgr.getNicsForTraffic(vm.getId(), TrafficType.Management);
    if (nics.isEmpty()) {
        if (s_logger.isDebugEnabled()) {
            s_logger.debug("Unable to find a management nic, cannot ping this system VM, unable to determine state of " + vm + " returning null");
        }
        throw new UnknownVM();
    }
    for (final Nic nic : nics) {
        if (nic.getIPv4Address() == null) {
            continue;
        }
        // get the data center IP address, find a host on the pod, use that host to ping the data center IP address
        final List<Long> otherHosts = findHostByPod(vmHost.getPodId(), vm.getHostId());
        for (final Long otherHost : otherHosts) {
            final HostStatus vmState = testIpAddress(otherHost, nic.getIPv4Address());
            assert vmState != null;
            // In case of HostStatus.Unknown, next host will be tried
            if (vmState == HostStatus.Up) {
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("successfully pinged vm's private IP (" + vm.getPrivateIpAddress() + "), returning that the VM is up");
                }
                return true;
            } else if (vmState == HostStatus.Down) {
                // We can't ping the VM directly...if we can ping the host, then report the VM down.
                // If we can't ping the host, then we don't have enough information.
                final HostStatus vmHostState = testIpAddress(otherHost, vmHost.getPrivateIpAddress());
                assert vmHostState != null;
                if (vmHostState == HostStatus.Up) {
                    if (s_logger.isDebugEnabled()) {
                        s_logger.debug("successfully pinged vm's host IP (" + vmHost.getPrivateIpAddress() + "), but could not ping VM, returning that the VM is down");
                    }
                    return false;
                }
            }
        }
    }
    if (s_logger.isDebugEnabled()) {
        s_logger.debug("unable to determine state of " + vm + " returning null");
    }
    throw new UnknownVM();
}
Also used : Nic(com.cloud.legacymodel.network.Nic) HostStatus(com.cloud.legacymodel.dc.HostStatus) HostVO(com.cloud.host.HostVO)

Aggregations

HostStatus (com.cloud.legacymodel.dc.HostStatus)7 HostVO (com.cloud.host.HostVO)6 Answer (com.cloud.legacymodel.communication.answer.Answer)3 NoTransitionException (com.cloud.legacymodel.exceptions.NoTransitionException)3 AgentControlAnswer (com.cloud.legacymodel.communication.answer.AgentControlAnswer)2 PingAnswer (com.cloud.legacymodel.communication.answer.PingAnswer)2 ReadyAnswer (com.cloud.legacymodel.communication.answer.ReadyAnswer)2 StartupAnswer (com.cloud.legacymodel.communication.answer.StartupAnswer)2 UnsupportedAnswer (com.cloud.legacymodel.communication.answer.UnsupportedAnswer)2 CloudRuntimeException (com.cloud.legacymodel.exceptions.CloudRuntimeException)2 Zone (com.cloud.db.model.Zone)1 HostPodVO (com.cloud.dc.HostPodVO)1 CheckHealthCommand (com.cloud.legacymodel.communication.command.CheckHealthCommand)1 CheckOnHostCommand (com.cloud.legacymodel.communication.command.CheckOnHostCommand)1 Host (com.cloud.legacymodel.dc.Host)1 AgentUnavailableException (com.cloud.legacymodel.exceptions.AgentUnavailableException)1 ConnectionException (com.cloud.legacymodel.exceptions.ConnectionException)1 HypervisorVersionChangedException (com.cloud.legacymodel.exceptions.HypervisorVersionChangedException)1 NioConnectionException (com.cloud.legacymodel.exceptions.NioConnectionException)1 OperationTimedoutException (com.cloud.legacymodel.exceptions.OperationTimedoutException)1