use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.
the class AgentManagerImpl method handleDisconnectWithInvestigation.
/**
 * Handles an agent disconnect, investigating the host first when the event would move it to
 * {@code HostStatus.Alert}.
 *
 * <p>When the next status for {@code event} is Alert, {@link #investigate(AgentAttache)} is asked for
 * the host's state and the event is adjusted accordingly (HostDown, WaitedTooLong or
 * AgentDisconnected) before the disconnect is processed. If the host is afterwards found in
 * {@code HostStatus.Down}, a restart of its VMs is scheduled through the HA manager.
 *
 * @param attache the attache of the agent that disconnected
 * @param event   the event that caused the disconnect; may be replaced as a result of the investigation
 * @return {@code false} when nothing was done (state could not be determined within the alert wait,
 * the host turned out to be up, or a disconnected host has not yet exceeded the alert wait);
 * {@code true} once the disconnect has been processed
 */
protected boolean handleDisconnectWithInvestigation(final AgentAttache attache, Event event) {
    final long hostId = attache.getId();
    HostVO host = this._hostDao.findById(hostId);
    if (host != null) {
        HostStatus nextStatus = null;
        try {
            nextStatus = host.getStatus().getNextStatus(event);
        } catch (final NoTransitionException ne) {
            // The current status has no transition for this event (e.g. the host is already Down, Alert
            // or Removed). nextStatus stays null, so no investigation is performed below.
            s_logger.debug("Caught exception while getting agent's next status", ne);
        }
        if (nextStatus == HostStatus.Alert) {
            /* OK, we are going to the bad status, let's see what happened */
            s_logger.info("Investigating why host " + hostId + " has disconnected with event " + event);
            HostStatus determinedState = investigate(attache);
            // if state cannot be determined do nothing and bail out
            if (determinedState == null) {
                // ">> 10" divides the millisecond clock by 1024 before comparing with the last ping value
                if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > this.AlertWait.value()) {
                    s_logger.warn("Agent " + hostId + " state cannot be determined for more than " + this.AlertWait + "(" + this.AlertWait.value() + ") seconds, will go to Alert state");
                    determinedState = HostStatus.Alert;
                } else {
                    s_logger.warn("Agent " + hostId + " state cannot be determined, do nothing");
                    return false;
                }
            }
            final HostStatus currentStatus = host.getStatus();
            s_logger.info("The agent from host " + hostId + " state determined is " + determinedState);
            if (determinedState == HostStatus.Down) {
                final String message = "Host is down: " + host.getId() + "-" + host.getName() + ". Starting HA on the VMs";
                s_logger.error(message);
                if (host.getType() != HostType.SecondaryStorage && host.getType() != HostType.ConsoleProxy) {
                    this._alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host down, " + host.getId(), message);
                }
                event = Event.HostDown;
            } else if (determinedState == HostStatus.Up) {
                /* Got ping response from host, bring it back */
                s_logger.info("Agent is determined to be up and running");
                agentStatusTransitTo(host, Event.Ping, this._nodeId);
                return false;
            } else if (determinedState == HostStatus.Disconnected) {
                s_logger.warn("Agent is disconnected but the host is still up: " + host.getId() + "-" + host.getName());
                if (currentStatus == HostStatus.Disconnected) {
                    if ((System.currentTimeMillis() >> 10) - host.getLastPinged() > this.AlertWait.value()) {
                        s_logger.warn("Host " + host.getId() + " has been disconnected past the wait time it should be disconnected.");
                        event = Event.WaitedTooLong;
                    } else {
                        s_logger.debug("Host " + host.getId() + " has been determined to be disconnected but it hasn't passed the wait time yet.");
                        return false;
                    }
                } else if (currentStatus == HostStatus.Up) {
                    final String hostDesc = describeHostForAlert(host);
                    if (host.getType() != HostType.SecondaryStorage && host.getType() != HostType.ConsoleProxy) {
                        this._alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host disconnected, " + hostDesc, "If the agent for host [" + hostDesc + "] is not restarted within " + this.AlertWait + " seconds, host will go to Alert state");
                    }
                    event = Event.AgentDisconnected;
                }
            } else {
                // if we end up here we are in alert state, send an alert
                final String hostDesc = describeHostForAlert(host);
                this._alertMgr.sendAlert(AlertManager.AlertType.ALERT_TYPE_HOST, host.getDataCenterId(), host.getPodId(), "Host in ALERT state, " + hostDesc, "In availability zone " + host.getDataCenterId() + ", host is in alert state: " + host.getId() + "-" + host.getName());
            }
        } else {
            s_logger.debug("The next status of agent " + host.getId() + " is not Alert, no need to investigate what happened");
        }
    }
    handleDisconnectWithoutInvestigation(attache, event, true, true);
    // Maybe the host magically reappeared?
    host = this._hostDao.findById(hostId);
    if (host != null && host.getStatus() == HostStatus.Down) {
        this._haMgr.scheduleRestartForVmsOnHost(host, true);
    }
    return true;
}

/**
 * Builds the host description used in alert subjects. A zone or pod record that cannot be found is
 * reported with a placeholder name instead of failing with a NullPointerException, so the alert is
 * still sent.
 */
private String describeHostForAlert(final HostVO host) {
    final Zone zone = this._zoneRepository.findById(host.getDataCenterId()).orElse(null);
    final HostPodVO podVO = this._podDao.findById(host.getPodId());
    final String zoneName = zone != null ? zone.getName() : "NO ZONE";
    final String podName = podVO != null ? podVO.getName() : "NO POD";
    return "name: " + host.getName() + " (id:" + host.getId() + "), availability zone: " + zoneName + ", pod: " + podName;
}
use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.
the class AgentManagerImpl method investigate.
/**
 * Determines the status of the host behind the given agent.
 *
 * <p>A host that cannot be found, has no type, or has a virtual type is reported as
 * {@code HostStatus.Alert}. Otherwise a {@code CheckHealthCommand} is sent; a successful answer
 * yields {@code HostStatus.Up}, anything else defers to the HA manager's investigation.
 *
 * @param agent the attache of the agent to investigate
 * @return the determined status, or whatever the HA manager's investigation returns
 */
protected HostStatus investigate(final AgentAttache agent) {
    final Long agentHostId = agent.getId();
    final HostVO hostVO = this._hostDao.findById(agentHostId);

    // Nothing to probe: unknown host, untyped host or virtual host type.
    if (hostVO == null || hostVO.getType() == null || hostVO.getType().isVirtual()) {
        return HostStatus.Alert;
    }

    if (s_logger.isDebugEnabled()) {
        s_logger.debug("checking if agent (" + agentHostId + ") is alive");
    }

    final Answer healthAnswer = easySend(agentHostId, new CheckHealthCommand());
    if (healthAnswer == null || !healthAnswer.getResult()) {
        // No usable health-check answer; hand over to the HA manager.
        return this._haMgr.investigate(agentHostId);
    }

    if (s_logger.isDebugEnabled()) {
        s_logger.debug("agent (" + agentHostId + ") responded to checkHeathCommand, reporting that agent is " + HostStatus.Up);
    }
    return HostStatus.Up;
}
use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.
the class AgentManagerImpl method handleDisconnectWithoutInvestigation.
/**
 * Processes an agent disconnect without probing the host first: works out the status the host moves
 * to, deregisters the agent and, when requested, records the event for the host via
 * {@code disconnectAgent}.
 *
 * @param attache      the attache of the disconnecting agent
 * @param event        the event that caused the disconnect
 * @param transitState whether {@code disconnectAgent} should be called for a host that still exists
 * @param removeAgent  not read by this method body
 * @return always {@code true}
 * @throws CloudRuntimeException if the host's current status has no transition for {@code event}
 */
protected boolean handleDisconnectWithoutInvestigation(final AgentAttache attache, final Event event, final boolean transitState, final boolean removeAgent) {
    final long hostId = attache.getId();
    s_logger.info("Host " + hostId + " is disconnecting with event " + event);
    HostStatus nextStatus = null;
    final HostVO host = this._hostDao.findById(hostId);
    if (host == null) {
        s_logger.warn("Can't find host with " + hostId);
        nextStatus = HostStatus.Removed;
    } else {
        final HostStatus currentStatus = host.getStatus();
        if (currentStatus == HostStatus.Down || currentStatus == HostStatus.Alert || currentStatus == HostStatus.Removed) {
            // These statuses are kept as they are; no transition lookup is attempted.
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("Host " + hostId + " is already " + currentStatus);
            }
            nextStatus = currentStatus;
        } else {
            try {
                nextStatus = currentStatus.getNextStatus(event);
            } catch (final NoTransitionException e) {
                final String err = "Cannot find next status for " + event + " as current status is " + currentStatus + " for agent " + hostId;
                s_logger.debug(err);
                // Keep the original exception as the cause so the failed transition stays traceable.
                throw new CloudRuntimeException(err, e);
            }
            if (s_logger.isDebugEnabled()) {
                s_logger.debug("The next status of agent " + hostId + " is " + nextStatus + ", current status is " + currentStatus);
            }
        }
    }
    if (s_logger.isDebugEnabled()) {
        s_logger.debug("Deregistering link for " + hostId + " with state " + nextStatus);
    }
    removeAgent(attache, nextStatus);
    // update the DB
    if (host != null && transitState) {
        disconnectAgent(host, event, this._nodeId);
    }
    return true;
}
use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.
the class HighAvailabilityManagerImpl method investigate.
/**
 * Asks each configured investigator in turn whether the host's agent is alive and returns the first
 * non-null verdict.
 *
 * @param hostId id of the host to investigate
 * @return {@code HostStatus.Alert} when the host cannot be found, the first status an investigator
 * reports, or {@code null} when no investigator could determine the state
 */
@Override
public HostStatus investigate(final long hostId) {
    final HostVO targetHost = _hostDao.findById(hostId);
    if (targetHost == null) {
        return HostStatus.Alert;
    }

    for (final Investigator candidate : investigators) {
        final HostStatus verdict = candidate.isAgentAlive(targetHost);
        if (verdict == null) {
            // This investigator has no opinion; try the next one.
            if (s_logger.isDebugEnabled()) {
                s_logger.debug(candidate.getName() + " unable to determine the state of the host. Moving on.");
            }
            continue;
        }
        if (s_logger.isDebugEnabled()) {
            s_logger.debug(candidate.getName() + " was able to determine host " + hostId + " is in " + verdict.toString());
        }
        return verdict;
    }

    // Either there are no investigators or none of them produced a verdict.
    return null;
}
use of com.cloud.legacymodel.dc.HostStatus in project cosmic by MissionCriticalCloud.
the class ManagementIPSystemVMInvestigator method isVmAlive.
/**
 * Determines whether a system VM is alive by pinging its management IP address from other hosts in
 * the pod of the VM's host.
 *
 * <p>If a ping of the VM succeeds the VM is reported alive. If the VM does not answer but its host
 * does (pinged from the same other host), the VM is reported down. In every other case the state is
 * unknown.
 *
 * @param vm   the VM to check
 * @param host not read by this method body; the VM's host is looked up from {@code vm.getHostId()}
 * @return {@code true} if the VM answered a ping, {@code false} if the VM did not answer while its
 * host did
 * @throws UnknownVM if the VM is not a system VM, has no host, has no management nic, or its state
 * could not be determined from any of the other hosts
 */
@Override
public boolean isVmAlive(final VirtualMachine vm, final Host host) throws UnknownVM {
    if (!vm.getType().isUsedBySystem()) {
        s_logger.debug("Not a System Vm, unable to determine state of " + vm + " returning null");
        // This investigator only handles system VMs; signal "unknown" instead of falling through.
        throw new UnknownVM();
    }
    if (s_logger.isDebugEnabled()) {
        s_logger.debug("Testing if " + vm + " is alive");
    }
    if (vm.getHostId() == null) {
        s_logger.debug("There's no host id for " + vm);
        throw new UnknownVM();
    }
    final HostVO vmHost = _hostDao.findById(vm.getHostId());
    if (vmHost == null) {
        s_logger.debug("Unable to retrieve the host by using id " + vm.getHostId());
        throw new UnknownVM();
    }
    final List<? extends Nic> nics = _networkMgr.getNicsForTraffic(vm.getId(), TrafficType.Management);
    if (nics.isEmpty()) {
        if (s_logger.isDebugEnabled()) {
            s_logger.debug("Unable to find a management nic, cannot ping this system VM, unable to determine state of " + vm + " returning null");
        }
        throw new UnknownVM();
    }
    for (final Nic nic : nics) {
        if (nic.getIPv4Address() == null) {
            continue;
        }
        // get the data center IP address, find a host on the pod, use that host to ping the data center IP address
        final List<Long> otherHosts = findHostByPod(vmHost.getPodId(), vm.getHostId());
        for (final Long otherHost : otherHosts) {
            final HostStatus vmState = testIpAddress(otherHost, nic.getIPv4Address());
            assert vmState != null;
            // In case of HostStatus.Unknown, next host will be tried
            if (vmState == HostStatus.Up) {
                if (s_logger.isDebugEnabled()) {
                    s_logger.debug("successfully pinged vm's private IP (" + vm.getPrivateIpAddress() + "), returning that the VM is up");
                }
                return true;
            } else if (vmState == HostStatus.Down) {
                // We can't ping the VM directly...if we can ping the host, then report the VM down.
                // If we can't ping the host, then we don't have enough information.
                final HostStatus vmHostState = testIpAddress(otherHost, vmHost.getPrivateIpAddress());
                assert vmHostState != null;
                if (vmHostState == HostStatus.Up) {
                    if (s_logger.isDebugEnabled()) {
                        s_logger.debug("successfully pinged vm's host IP (" + vmHost.getPrivateIpAddress() + "), but could not ping VM, returning that the VM is down");
                    }
                    return false;
                }
            }
        }
    }
    if (s_logger.isDebugEnabled()) {
        s_logger.debug("unable to determine state of " + vm + " returning null");
    }
    throw new UnknownVM();
}
Aggregations