use of com.yahoo.vespa.orchestrator.status.HostStatus in project vespa by vespa-engine.
the class MetricsReporter method updateNodeMetrics.
/**
 * Reports the metrics of a single node.
 *
 * All metrics are tagged with a context holding the node's state and hostname; when the node is
 * allocated, the context additionally carries tenant, application and cluster dimensions, and the
 * restart-generation and wanted-version metrics are reported as well.
 *
 * @param node           the node to report metrics for
 * @param servicesByHost service instances keyed by host name; a host without an entry is reported
 *                       with {@code numberOfServices} 0 and no per-status service metrics
 */
private void updateNodeMetrics(Node node, Map<HostName, List<ServiceInstance>> servicesByHost) {
    final Metric.Context context;
    Optional<Allocation> maybeAllocation = node.allocation();
    if (!maybeAllocation.isPresent()) {
        // Unallocated node: only state and host dimensions are available.
        context = getContextAt("state", node.state().name(), "host", node.hostname());
    } else {
        Allocation nodeAllocation = maybeAllocation.get();
        ApplicationId owner = nodeAllocation.owner();
        context = getContextAt(
                "state", node.state().name(),
                "host", node.hostname(),
                "tenantName", owner.tenant().value(),
                "applicationId", owner.serializedForm().replace(':', '.'),
                "app", toApp(owner),
                "clustertype", nodeAllocation.membership().cluster().type().name(),
                "clusterid", nodeAllocation.membership().cluster().id().value());

        // Restart generations: a restart is pending while current lags behind wanted.
        long restartWanted = nodeAllocation.restartGeneration().wanted();
        metric.set("wantedRestartGeneration", restartWanted, context);
        long restartCurrent = nodeAllocation.restartGeneration().current();
        metric.set("currentRestartGeneration", restartCurrent, context);
        boolean restartPending = restartCurrent < restartWanted;
        metric.set("wantToRestart", restartPending ? 1 : 0, context);

        // Wanted Vespa version, and whether the node has reached it.
        Version targetVersion = nodeAllocation.membership().cluster().vespaVersion();
        double targetVersionNumber = getVersionAsNumber(targetVersion);
        metric.set("wantedVespaVersion", targetVersionNumber, context);
        boolean onTargetVersion = node.status().vespaVersion()
                .filter(version -> version.equals(targetVersion))
                .isPresent();
        metric.set("wantToChangeVespaVersion", onTargetVersion ? 0 : 1, context);
    }

    // Node repo checks for !isEmpty(), so let's do that here too.
    node.status().vespaVersion()
            .filter(version -> !version.isEmpty())
            .ifPresent(version -> {
                double versionNumber = getVersionAsNumber(version);
                metric.set("currentVespaVersion", versionNumber, context);
            });

    // Reboot generations: a reboot is pending while current lags behind wanted.
    long rebootWanted = node.status().reboot().wanted();
    metric.set("wantedRebootGeneration", rebootWanted, context);
    long rebootCurrent = node.status().reboot().current();
    metric.set("currentRebootGeneration", rebootCurrent, context);
    boolean rebootPending = rebootCurrent < rebootWanted;
    metric.set("wantToReboot", rebootPending ? 1 : 0, context);

    // Status flags, each reported as 1 (set) or 0 (not set).
    metric.set("wantToRetire", node.status().wantToRetire() ? 1 : 0, context);
    metric.set("wantToDeprovision", node.status().wantToDeprovision() ? 1 : 0, context);
    metric.set("hardwareFailure", node.status().hardwareFailureDescription().isPresent() ? 1 : 0, context);
    metric.set("hardwareDivergence", node.status().hardwareDivergence().isPresent() ? 1 : 0, context);

    try {
        HostStatus orchestratorStatus = orchestrator.getNodeStatus(new HostName(node.hostname()));
        boolean mayBeDown = HostStatus.ALLOWED_TO_BE_DOWN == orchestratorStatus;
        metric.set("allowedToBeDown", mayBeDown ? 1 : 0, context);
    } catch (HostNameNotFoundException e) {
        // Ignore: the metric is simply not reported when the orchestrator does not know the host.
    }

    List<ServiceInstance> hostServices = servicesByHost.get(new HostName(node.hostname()));
    long numberOfServices = 0;
    if (hostServices != null) {
        Map<ServiceStatus, Long> countByStatus = hostServices.stream()
                .collect(Collectors.groupingBy(ServiceInstance::serviceStatus, Collectors.counting()));
        numberOfServices = countByStatus.values().stream().mapToLong(Long::longValue).sum();

        metric.set("numberOfServicesUp", countByStatus.getOrDefault(ServiceStatus.UP, 0L), context);
        metric.set("numberOfServicesNotChecked", countByStatus.getOrDefault(ServiceStatus.NOT_CHECKED, 0L), context);
        long downCount = countByStatus.getOrDefault(ServiceStatus.DOWN, 0L);
        metric.set("numberOfServicesDown", downCount, context);
        metric.set("someServicesDown", (downCount > 0 ? 1 : 0), context);

        boolean consideredBad = NodeFailer.badNode(hostServices);
        metric.set("nodeFailerBadNode", (consideredBad ? 1 : 0), context);

        // Reported only when the host has a service entry, like the other metrics in this branch.
        boolean hasDownEvent = node.history().event(History.Event.Type.down).isPresent();
        metric.set("downInNodeRepo", (hasDownEvent ? 1 : 0), context);
    }
    metric.set("numberOfServices", numberOfServices, context);
}
use of com.yahoo.vespa.orchestrator.status.HostStatus in project vespa by vespa-engine.
the class ApplicationApiImplTest method verifyUpConditionWith.
/**
 * Creates a one-node storage cluster whose host and service have the given statuses, and asserts
 * whether that host is listed among the up storage nodes.
 *
 * @param hostStatus    status assigned to the single host
 * @param serviceStatus status assigned to the single storage-node service instance on that host
 * @param expectUp      true if the host is expected to be reported as an up storage node
 */
private void verifyUpConditionWith(HostStatus hostStatus, ServiceStatus serviceStatus, boolean expectUp) {
    HostName host = modelUtils.createNode("host1", hostStatus);

    ApplicationInstance application = modelUtils.createApplicationInstance(
            Arrays.asList(
                    modelUtils.createServiceCluster(
                            "cluster-1",
                            VespaModelUtil.STORAGENODE_SERVICE_TYPE,
                            Arrays.asList(modelUtils.createServiceInstance("config-id-1", host, serviceStatus)))));
    ApplicationApiImpl api = modelUtils.createApplicationApiImpl(application, host);

    List<HostName> expected;
    if (expectUp) {
        expected = Arrays.asList(host);
    } else {
        expected = new ArrayList<>();
    }

    List<HostName> actual = api.getUpStorageNodesInGroupInClusterOrder()
            .stream()
            .map(node -> node.hostName())
            .collect(Collectors.toList());

    assertEquals(expected, actual);
}
use of com.yahoo.vespa.orchestrator.status.HostStatus in project vespa by vespa-engine.
the class OrchestratorImpl method resume.
/**
 * Resumes the given host: after a fixed delay, and while holding the application instance lock,
 * releases the host's suspension grant unless the host is already in NO_REMARKS or the application
 * instance status is something other than NO_REMARKS.
 *
 * @param hostName the host to resume
 * @throws HostStateChangeDeniedException declared by this method; see the policy for when it is raised
 * @throws HostNameNotFoundException      declared by this method; see getApplicationInstance
 */
@Override
public void resume(HostName hostName) throws HostStateChangeDeniedException, HostNameNotFoundException {
    /*
     * If the host has been in ALLOWED_TO_BE_DOWN, its services may recently have been stopped (and,
     * presumably, started again). Service monitoring may not yet have noticed that they were stopped
     * and can therefore wrongly report them as up even though they have not finished initializing
     * and are not ready to serve. Reporting both host and services as up in that situation causes a
     * race where services on other hosts may be stopped prematurely. Waiting here gives service
     * monitoring time to catch up. We don't want to do the delay with the lock held, and the host
     * status service's locking does not offer anything like condition variables or Object.wait(),
     * so the delay is done up front, before the lock is taken.
     */
    sleep(serviceMonitorConvergenceLatencySeconds, TimeUnit.SECONDS);

    ApplicationInstance appInstance = getApplicationInstance(hostName);
    try (MutableStatusRegistry registry = statusService.lockApplicationInstance_forCurrentThreadOnly(appInstance.reference())) {
        HostStatus hostStatus = registry.getHostStatus(hostName);
        // Nothing to do when the host already has no remarks.
        if (hostStatus != HostStatus.NO_REMARKS) {
            ApplicationInstanceStatus applicationStatus =
                    statusService.forApplicationInstance(appInstance.reference()).getApplicationInstanceStatus();
            // The grant is only released while the application instance itself has no remarks.
            if (ApplicationInstanceStatus.NO_REMARKS == applicationStatus) {
                policy.releaseSuspensionGrant(appInstance, hostName, registry);
            }
        }
    }
}
use of com.yahoo.vespa.orchestrator.status.HostStatus in project vespa by vespa-engine.
the class OrchestratorImpl method getHost.
/**
 * Builds a {@code Host} for the given host name, holding its status, the reference of the
 * application instance it belongs to, and that application's service instances on the host.
 *
 * @param hostName the host to describe
 * @return the host description
 * @throws HostNameNotFoundException declared by this method; see getApplicationInstance
 */
@Override
public Host getHost(HostName hostName) throws HostNameNotFoundException {
    ApplicationInstance owner = getApplicationInstance(hostName);

    // Collect the service instances, across all clusters of the application, that run on this host.
    List<ServiceInstance> servicesOnHost = owner.serviceClusters()
            .stream()
            .flatMap(serviceCluster -> serviceCluster.serviceInstances().stream())
            .filter(instance -> hostName.equals(instance.hostName()))
            .collect(Collectors.toList());

    return new Host(hostName, getNodeStatus(owner.reference(), hostName), owner.reference(), servicesOnHost);
}
use of com.yahoo.vespa.orchestrator.status.HostStatus in project vespa by vespa-engine.
the class HostResource method patch.
/**
 * Applies a patch request to a host. When the request carries a state, the host is set to that
 * state through the orchestrator; a request without a state changes nothing.
 *
 * @param hostNameString name of the host to patch
 * @param request        the patch request; only its {@code state} field is read here
 * @return a response whose description is "ok"
 * @throws BadRequestException          if the requested state is not a valid {@code HostStatus} name
 * @throws NotFoundException            if the orchestrator does not know the host
 * @throws InternalServerErrorException if the orchestrator fails to set the state for another reason
 */
@Override
public PatchHostResponse patch(String hostNameString, PatchHostRequest request) {
    HostName hostName = new HostName(hostNameString);

    String requestedState = request.state;
    if (requestedState != null) {
        final HostStatus newStatus;
        try {
            newStatus = HostStatus.valueOf(requestedState);
        } catch (IllegalArgumentException ignored) {
            // The exception itself carries nothing beyond the bad name, which is put in the message.
            throw new BadRequestException("Bad state in request: '" + requestedState + "'");
        }

        try {
            orchestrator.setNodeStatus(hostName, newStatus);
        } catch (HostNameNotFoundException notFound) {
            log.log(LogLevel.INFO, "Host not found: " + hostName);
            throw new NotFoundException(notFound);
        } catch (OrchestrationException failure) {
            String message = "Failed to set " + hostName + " to " + newStatus + ": " + failure.getMessage();
            log.log(LogLevel.INFO, message, failure);
            throw new InternalServerErrorException(message);
        }
    }

    PatchHostResponse ok = new PatchHostResponse();
    ok.description = "ok";
    return ok;
}
Aggregations