Use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.
The class NodeFailer, method throttle().
/**
* Returns true if node failing should be throttled
*/
private boolean throttle(Node node) {
if (throttlePolicy == ThrottlePolicy.disabled)
return false;
Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
List<Node> nodes = nodeRepository().getNodes();
long recentlyFailedNodes = nodes.stream()
        .map(n -> n.history().event(History.Event.Type.failed))
        .filter(Optional::isPresent)
        .map(Optional::get)
        .filter(failedEvent -> failedEvent.at().isAfter(startOfThrottleWindow))
        .count();
int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail, throttlePolicy.minimumAllowedToFail);
boolean throttle = allowedFailedNodes < recentlyFailedNodes || (allowedFailedNodes == recentlyFailedNodes && !node.type().isDockerHost());
if (throttle) {
log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), throttlePolicy.toHumanReadableString()));
}
metric.set("nodeFailThrottling", throttle ? 1 : 0, null);
return throttle;
}
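The throttle() method reads throttlePolicy.throttleWindow, fractionAllowedToFail and minimumAllowedToFail, and compares against ThrottlePolicy.disabled. A minimal sketch of how such an enum could be shaped is shown below; the field names mirror the usages above, but the "hosted" constant and its concrete limits are illustrative assumptions, not Vespa's actual policy.

import java.time.Duration;

// Illustrative sketch only: field names mirror the usages in throttle() above,
// but the "hosted" constant and its concrete limits are assumptions.
enum ThrottlePolicy {

    hosted(Duration.ofHours(1), 0.02, 2),
    disabled(Duration.ZERO, 0, 0);

    final Duration throttleWindow;
    final double fractionAllowedToFail;
    final int minimumAllowedToFail;

    ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) {
        this.throttleWindow = throttleWindow;
        this.fractionAllowedToFail = fractionAllowedToFail;
        this.minimumAllowedToFail = minimumAllowedToFail;
    }

    String toHumanReadableString() {
        return String.format("Max %.0f%% of nodes, but always at least %d, can be failed within %s",
                             fractionAllowedToFail * 100, minimumAllowedToFail, throttleWindow);
    }
}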
Use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.
The class NodeRetirer, method retireUnallocated().
/**
* Retires unallocated nodes by moving them directly to parked.
* Returns true iff there are no unallocated nodes left that match the retirement policy
*/
boolean retireUnallocated() {
try (Mutex lock = nodeRepository().lockUnallocated()) {
List<Node> allNodes = nodeRepository().getNodes(NodeType.tenant);
Map<Flavor, Map<Node.State, Long>> numSpareNodesByFlavorByState = getNumberOfNodesByFlavorByNodeState(allNodes);
flavorSpareChecker.updateReadyAndActiveCountsByFlavor(numSpareNodesByFlavorByState);
long numFlavorsWithUnsuccessfullyRetiredNodes = allNodes.stream()
        .filter(node -> node.state() == Node.State.ready)
        .filter(node -> retirementPolicy.shouldRetire(node).isPresent())
        .collect(Collectors.groupingBy(Node::flavor, Collectors.toSet()))
        .entrySet().stream()
        .filter(entry -> {
Set<Node> nodesThatShouldBeRetiredForFlavor = entry.getValue();
for (Iterator<Node> iter = nodesThatShouldBeRetiredForFlavor.iterator(); iter.hasNext(); ) {
Node nodeToRetire = iter.next();
if (!flavorSpareChecker.canRetireUnallocatedNodeWithFlavor(nodeToRetire.flavor()))
break;
retirementPolicy.shouldRetire(nodeToRetire).ifPresent(reason -> {
nodeRepository().write(nodeToRetire.with(nodeToRetire.status().withWantToDeprovision(true)));
nodeRepository().park(nodeToRetire.hostname(), Agent.NodeRetirer, reason);
iter.remove();
});
}
if (!nodesThatShouldBeRetiredForFlavor.isEmpty()) {
String commaSeparatedHostnames = nodesThatShouldBeRetiredForFlavor.stream().map(Node::hostname).collect(Collectors.joining(", "));
log.info(String.format("Failed to retire %s, wanted to retire %d nodes (%s), but there are no spare nodes left.", entry.getKey(), nodesThatShouldBeRetiredForFlavor.size(), commaSeparatedHostnames));
}
return !nodesThatShouldBeRetiredForFlavor.isEmpty();
}).count();
return numFlavorsWithUnsuccessfullyRetiredNodes == 0;
}
}
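retireUnallocated() relies on retirementPolicy.shouldRetire(node) returning an Optional that carries a human-readable reason when the node should go, which is what makes the ifPresent(reason -> ...) chaining above work. A hypothetical policy could look like the sketch below; the FlavorSetRetirementPolicy name and its rule are assumptions for illustration, only Node::flavor is taken from the snippet.

import java.util.Optional;
import java.util.Set;
import com.yahoo.config.provision.Flavor;
import com.yahoo.vespa.hosted.provision.Node;

// Hypothetical sketch: shouldRetire() returns a reason when the node should be retired,
// and Optional.empty() otherwise, matching how retireUnallocated() consumes it.
interface RetirementPolicy {
    Optional<String> shouldRetire(Node node);
}

// Example policy (assumed, for illustration): retire nodes whose flavor is being phased out.
class FlavorSetRetirementPolicy implements RetirementPolicy {

    private final Set<Flavor> flavorsToRetire;

    FlavorSetRetirementPolicy(Set<Flavor> flavorsToRetire) {
        this.flavorsToRetire = flavorsToRetire;
    }

    @Override
    public Optional<String> shouldRetire(Node node) {
        return flavorsToRetire.contains(node.flavor())
                ? Optional.of("Flavor " + node.flavor() + " is being phased out")
                : Optional.empty();
    }
}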
Use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.
The class RetiredExpirer, method maintain().
@Override
protected void maintain() {
List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream()
        .filter(node -> node.allocation().isPresent())
        .filter(node -> node.allocation().get().membership().retired())
        .collect(Collectors.groupingBy(node -> node.allocation().get().owner()));
for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) {
ApplicationId application = entry.getKey();
List<Node> retiredNodes = entry.getValue();
try {
Optional<Deployment> deployment = deployer.deployFromLocalActive(application);
// this will be done at another config server
if (!deployment.isPresent())
continue;
List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList());
if (nodesToRemove.isEmpty()) {
continue;
}
nodeRepository().setRemovable(application, nodesToRemove);
deployment.get().activate();
String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
} catch (RuntimeException e) {
String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", "));
log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application + ": " + nodeList, e);
}
}
}
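The first statement of maintain() does the heavy lifting: it narrows active nodes to those whose allocation is marked retired and buckets them by owning application. Pulled out as a helper, the same stream reads like the sketch below (the class and method names are made up; the calls are the ones used above).

import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import com.yahoo.config.provision.ApplicationId;
import com.yahoo.vespa.hosted.provision.Node;

final class RetiredNodeGrouping {

    // Same stream as in maintain(): the isPresent() filter guarantees that the
    // allocation().get() calls in the later stages cannot throw.
    static Map<ApplicationId, List<Node>> retiredNodesByOwner(List<Node> activeNodes) {
        return activeNodes.stream()
                .filter(node -> node.allocation().isPresent())
                .filter(node -> node.allocation().get().membership().retired())
                .collect(Collectors.groupingBy(node -> node.allocation().get().owner()));
    }
}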
Use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.
The class RetiredExpirer, method canRemove().
/**
* Checks if the node can be removed:
* if the node is a docker host, it will only be removed if it has no children,
* or all its children are parked or failed.
* Otherwise, a removal is allowed if either of these is true:
* - The node has been in state {@link History.Event.Type#retired} for longer than {@link #retiredExpiry}
* - Orchestrator allows it
*/
private boolean canRemove(Node node) {
if (node.type().isDockerHost()) {
return nodeRepository().getChildNodes(node.hostname()).stream()
        .allMatch(child -> child.state() == Node.State.parked || child.state() == Node.State.failed);
}
Optional<Instant> timeOfRetiredEvent = node.history().event(History.Event.Type.retired).map(History.Event::at);
Optional<Instant> retireAfter = timeOfRetiredEvent.map(retiredEvent -> retiredEvent.plus(retiredExpiry));
boolean shouldRetireNowBecauseExpired = retireAfter.map(time -> time.isBefore(clock.instant())).orElse(false);
if (shouldRetireNowBecauseExpired) {
return true;
}
try {
orchestrator.acquirePermissionToRemove(new HostName(node.hostname()));
return true;
} catch (OrchestrationException e) {
log.info("Did not get permission to remove retired " + node + ": " + e.getMessage());
return false;
}
}
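The expiry branch of canRemove() is a plain java.time pattern: take the timestamp of the retired event, add the grace period, and compare against the injected clock. The standalone illustration below uses made-up names; only the Instant/Duration/Clock arithmetic is the point.

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.util.Optional;

final class ExpiryCheck {

    // Mirrors the retireAfter/shouldRetireNowBecauseExpired logic in canRemove():
    // an absent event time means "not expired", matching orElse(false).
    static boolean hasExpired(Optional<Instant> eventTime, Duration expiry, Clock clock) {
        return eventTime.map(at -> at.plus(expiry).isBefore(clock.instant()))
                        .orElse(false);
    }

    public static void main(String[] args) {
        Clock clock = Clock.systemUTC();
        Optional<Instant> retiredAt = Optional.of(clock.instant().minus(Duration.ofHours(5)));
        System.out.println(hasExpired(retiredAt, Duration.ofHours(4), clock));       // true: the 4h grace period has passed
        System.out.println(hasExpired(Optional.empty(), Duration.ofHours(4), clock)); // false: no retired event recorded
    }
}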
Use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.
The class MetricsReporterTest, method test_registered_metric().
@Test
public void test_registered_metric() throws Exception {
NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default");
Curator curator = new MockCurator();
NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, Clock.systemUTC(), Zone.defaultZone(),
                                                   new MockNameResolver().mockAnyLookup(),
                                                   new DockerImage("docker-registry.domain.tld:8080/dist/vespa"), true);
Node node = nodeRepository.createNode("openStackId", "hostname", Optional.empty(), nodeFlavors.getFlavorOrThrow("default"), NodeType.tenant);
nodeRepository.addNodes(Collections.singletonList(node));
Node hostNode = nodeRepository.createNode("openStackId2", "parent", Optional.empty(), nodeFlavors.getFlavorOrThrow("default"), NodeType.proxy);
nodeRepository.addNodes(Collections.singletonList(hostNode));
Map<String, Number> expectedMetrics = new HashMap<>();
expectedMetrics.put("hostedVespa.provisionedHosts", 1L);
expectedMetrics.put("hostedVespa.parkedHosts", 0L);
expectedMetrics.put("hostedVespa.readyHosts", 0L);
expectedMetrics.put("hostedVespa.reservedHosts", 0L);
expectedMetrics.put("hostedVespa.activeHosts", 0L);
expectedMetrics.put("hostedVespa.inactiveHosts", 0L);
expectedMetrics.put("hostedVespa.dirtyHosts", 0L);
expectedMetrics.put("hostedVespa.failedHosts", 0L);
expectedMetrics.put("hostedVespa.docker.totalCapacityDisk", 0.0);
expectedMetrics.put("hostedVespa.docker.totalCapacityMem", 0.0);
expectedMetrics.put("hostedVespa.docker.totalCapacityCpu", 0.0);
expectedMetrics.put("hostedVespa.docker.freeCapacityDisk", 0.0);
expectedMetrics.put("hostedVespa.docker.freeCapacityMem", 0.0);
expectedMetrics.put("hostedVespa.docker.freeCapacityCpu", 0.0);
expectedMetrics.put("wantedRebootGeneration", 0L);
expectedMetrics.put("currentRebootGeneration", 0L);
expectedMetrics.put("wantToReboot", 0);
expectedMetrics.put("wantToRetire", 0);
expectedMetrics.put("wantToDeprovision", 0);
expectedMetrics.put("hardwareFailure", 0);
expectedMetrics.put("hardwareDivergence", 0);
expectedMetrics.put("allowedToBeDown", 0);
expectedMetrics.put("numberOfServices", 0L);
Orchestrator orchestrator = mock(Orchestrator.class);
ServiceMonitor serviceMonitor = mock(ServiceMonitor.class);
when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS);
ServiceModel serviceModel = mock(ServiceModel.class);
when(serviceMonitor.getServiceModelSnapshot()).thenReturn(serviceModel);
when(serviceModel.getServiceInstancesByHostName()).thenReturn(Collections.emptyMap());
TestMetric metric = new TestMetric();
MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, Duration.ofMinutes(1), new JobControl(nodeRepository.database()));
metricsReporter.maintain();
assertEquals(expectedMetrics, metric.values);
}
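The assertion at the end compares metric.values against the expected map in one go, which suggests a metric test double that simply records the last value set per key. A minimal sketch of such a double, assuming its shape rather than quoting Vespa's TestMetric, and ignoring the context argument:

import java.util.HashMap;
import java.util.Map;

// Assumed shape of a metric test double: set(...) records the latest value per
// metric name so a test can assert on the whole map at once, as done above.
class RecordingMetric {

    final Map<String, Number> values = new HashMap<>();

    public void set(String key, Number value, Object context) {
        values.put(key, value);
    }
}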