
Example 1 with NodeRepository

use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.

the class NodeFailer method throttle.

/**
 * Returns true if node failing should be throttled
 */
private boolean throttle(Node node) {
    if (throttlePolicy == ThrottlePolicy.disabled)
        return false;
    Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
    List<Node> nodes = nodeRepository().getNodes();
    long recentlyFailedNodes = nodes.stream()
            .map(n -> n.history().event(History.Event.Type.failed))
            .filter(Optional::isPresent)
            .map(Optional::get)
            .filter(failedEvent -> failedEvent.at().isAfter(startOfThrottleWindow))
            .count();
    int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail,
                                            throttlePolicy.minimumAllowedToFail);
    // Throttle when over the limit; when exactly at the limit, Docker hosts are still allowed to fail
    boolean throttle = allowedFailedNodes < recentlyFailedNodes ||
            (allowedFailedNodes == recentlyFailedNodes && !node.type().isDockerHost());
    if (throttle) {
        log.info(String.format("Want to fail node %s, but throttling is in effect: %s", node.hostname(), throttlePolicy.toHumanReadableString()));
    }
    metric.set("nodeFailThrottling", throttle ? 1 : 0, null);
    return throttle;
}
Also used: Metric(com.yahoo.jdisc.Metric) Deployer(com.yahoo.config.provision.Deployer) Collectors.counting(java.util.stream.Collectors.counting) HashMap(java.util.HashMap) Orchestrator(com.yahoo.vespa.orchestrator.Orchestrator) ServiceStatus(com.yahoo.vespa.applicationmodel.ServiceStatus) Node(com.yahoo.vespa.hosted.provision.Node) ConfigserverConfig(com.yahoo.cloud.config.ConfigserverConfig) ArrayList(java.util.ArrayList) Level(java.util.logging.Level) NodeRepository(com.yahoo.vespa.hosted.provision.NodeRepository) History(com.yahoo.vespa.hosted.provision.node.History) ApplicationIdNotFoundException(com.yahoo.vespa.orchestrator.ApplicationIdNotFoundException) Duration(java.time.Duration) Map(java.util.Map) HostLivenessTracker(com.yahoo.config.provision.HostLivenessTracker) ServiceInstance(com.yahoo.vespa.applicationmodel.ServiceInstance) ServiceMonitor(com.yahoo.vespa.service.monitor.ServiceMonitor) Deployment(com.yahoo.config.provision.Deployment) NodeType(com.yahoo.config.provision.NodeType) Instant(java.time.Instant) Logger(java.util.logging.Logger) ApplicationInstanceStatus(com.yahoo.vespa.orchestrator.status.ApplicationInstanceStatus) Collectors(java.util.stream.Collectors) Mutex(com.yahoo.transaction.Mutex) List(java.util.List) Agent(com.yahoo.vespa.hosted.provision.node.Agent) Clock(java.time.Clock) Optional(java.util.Optional)
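
The method refers to a ThrottlePolicy value that is not shown on this page. Below is a minimal sketch of what such a policy could look like, inferred purely from the fields used above (throttleWindow, fractionAllowedToFail, minimumAllowedToFail, toHumanReadableString); the constants are illustrative, not the values Vespa ships with.

import java.time.Duration;

// Hypothetical sketch, not the actual Vespa enum: fields and methods are
// inferred from their usage in throttle() above.
public enum ThrottlePolicy {

    disabled(Duration.ZERO, 0, 0),
    hosted(Duration.ofDays(1), 0.01, 2);   // example values only

    public final Duration throttleWindow;        // how far back to count 'failed' events
    public final double fractionAllowedToFail;   // fraction of all nodes allowed to fail in the window
    public final int minimumAllowedToFail;       // lower bound so small zones can still fail nodes

    ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) {
        this.throttleWindow = throttleWindow;
        this.fractionAllowedToFail = fractionAllowedToFail;
        this.minimumAllowedToFail = minimumAllowedToFail;
    }

    public String toHumanReadableString() {
        return String.format("max %.0f%% of nodes, but at least %d, may fail over a window of %s",
                             fractionAllowedToFail * 100, minimumAllowedToFail, throttleWindow);
    }
}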

Example 2 with NodeRepository

use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.

the class NodeRetirer method retireUnallocated.

/**
 * Retires unallocated nodes by moving them directly to parked.
 * Returns true iff there are no unallocated nodes left that match the retirement policy
 */
boolean retireUnallocated() {
    try (Mutex lock = nodeRepository().lockUnallocated()) {
        List<Node> allNodes = nodeRepository().getNodes(NodeType.tenant);
        Map<Flavor, Map<Node.State, Long>> numSpareNodesByFlavorByState = getNumberOfNodesByFlavorByNodeState(allNodes);
        flavorSpareChecker.updateReadyAndActiveCountsByFlavor(numSpareNodesByFlavorByState);
        // Group ready nodes that should retire by flavor; a flavor counts as unsuccessful
        // if any of its nodes could not be retired for lack of spares
        long numFlavorsWithUnsuccessfullyRetiredNodes = allNodes.stream()
                .filter(node -> node.state() == Node.State.ready)
                .filter(node -> retirementPolicy.shouldRetire(node).isPresent())
                .collect(Collectors.groupingBy(Node::flavor, Collectors.toSet()))
                .entrySet().stream()
                .filter(entry -> {
            Set<Node> nodesThatShouldBeRetiredForFlavor = entry.getValue();
            for (Iterator<Node> iter = nodesThatShouldBeRetiredForFlavor.iterator(); iter.hasNext(); ) {
                Node nodeToRetire = iter.next();
                if (!flavorSpareChecker.canRetireUnallocatedNodeWithFlavor(nodeToRetire.flavor()))
                    break;
                retirementPolicy.shouldRetire(nodeToRetire).ifPresent(reason -> {
                    nodeRepository().write(nodeToRetire.with(nodeToRetire.status().withWantToDeprovision(true)));
                    nodeRepository().park(nodeToRetire.hostname(), Agent.NodeRetirer, reason);
                    iter.remove();
                });
            }
            if (!nodesThatShouldBeRetiredForFlavor.isEmpty()) {
                String commaSeparatedHostnames = nodesThatShouldBeRetiredForFlavor.stream().map(Node::hostname).collect(Collectors.joining(", "));
                log.info(String.format("Failed to retire %s, wanted to retire %d nodes (%s), but there are no spare nodes left.", entry.getKey(), nodesThatShouldBeRetiredForFlavor.size(), commaSeparatedHostnames));
            }
            return !nodesThatShouldBeRetiredForFlavor.isEmpty();
        }).count();
        return numFlavorsWithUnsuccessfullyRetiredNodes == 0;
    }
}
Also used: Deployer(com.yahoo.config.provision.Deployer) FlavorSpareChecker(com.yahoo.vespa.hosted.provision.provisioning.FlavorSpareChecker) RetirementPolicy(com.yahoo.vespa.hosted.provision.maintenance.retire.RetirementPolicy) Iterator(java.util.Iterator) ApplicationId(com.yahoo.config.provision.ApplicationId) Deployment(com.yahoo.config.provision.Deployment) NodeType(com.yahoo.config.provision.NodeType) Collection(java.util.Collection) ClusterSpec(com.yahoo.config.provision.ClusterSpec) Set(java.util.Set) HashMap(java.util.HashMap) Node(com.yahoo.vespa.hosted.provision.Node) Logger(java.util.logging.Logger) Collectors(java.util.stream.Collectors) NodeRepository(com.yahoo.vespa.hosted.provision.NodeRepository) Mutex(com.yahoo.transaction.Mutex) List(java.util.List) Stream(java.util.stream.Stream) Agent(com.yahoo.vespa.hosted.provision.node.Agent) Flavor(com.yahoo.config.provision.Flavor) Duration(java.time.Duration) Map(java.util.Map) LogLevel(com.yahoo.log.LogLevel) Optional(java.util.Optional)
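
retireUnallocated() drives everything through retirementPolicy.shouldRetire(node). Judging from the calls above, the policy returns the retirement reason as an Optional, which doubles as the yes/no decision. A hedged sketch of that contract follows; the real interface lives in com.yahoo.vespa.hosted.provision.maintenance.retire and may differ.

import java.util.Optional;

import com.yahoo.vespa.hosted.provision.Node;

// Sketch inferred from usage: shouldRetire() acts both as the decision and as
// the source of the human-readable reason passed to park().
public interface RetirementPolicy {

    /** Returns the retirement reason if the node should be retired, empty otherwise */
    Optional<String> shouldRetire(Node node);
}

Returning the reason inside the Optional keeps decision and explanation in one call, which is why the code above can write retirementPolicy.shouldRetire(nodeToRetire).ifPresent(reason -> ...).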

Example 3 with NodeRepository

use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.

the class RetiredExpirer method maintain.

@Override
protected void maintain() {
    List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
    Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream()
            .filter(node -> node.allocation().isPresent())
            .filter(node -> node.allocation().get().membership().retired())
            .collect(Collectors.groupingBy(node -> node.allocation().get().owner()));
    for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) {
        ApplicationId application = entry.getKey();
        List<Node> retiredNodes = entry.getValue();
        try {
            Optional<Deployment> deployment = deployer.deployFromLocalActive(application);
            // if the application is not deployed from this config server,
            // the redeployment will be done by another config server
            if (!deployment.isPresent())
                continue;
            List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList());
            if (nodesToRemove.isEmpty()) {
                continue;
            }
            nodeRepository().setRemovable(application, nodesToRemove);
            deployment.get().activate();
            String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
        } catch (RuntimeException e) {
            String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application + ": " + nodeList, e);
        }
    }
}
Also used: OrchestrationException(com.yahoo.vespa.orchestrator.OrchestrationException) Deployer(com.yahoo.config.provision.Deployer) ApplicationId(com.yahoo.config.provision.ApplicationId) Deployment(com.yahoo.config.provision.Deployment) NodeType(com.yahoo.config.provision.NodeType) Orchestrator(com.yahoo.vespa.orchestrator.Orchestrator) Node(com.yahoo.vespa.hosted.provision.Node) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Level(java.util.logging.Level) NodeRepository(com.yahoo.vespa.hosted.provision.NodeRepository) List(java.util.List) History(com.yahoo.vespa.hosted.provision.node.History) HostName(com.yahoo.vespa.applicationmodel.HostName) Duration(java.time.Duration) Map(java.util.Map) Clock(java.time.Clock) Optional(java.util.Optional)
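
maintain() only needs two things from the deployment layer: a deployment built from the currently active application package, and a way to activate it after the nodes have been marked removable. A minimal sketch of that contract, inferred from the calls above; the real Deployer and Deployment interfaces in com.yahoo.config.provision carry more methods.

import java.util.Optional;

import com.yahoo.config.provision.ApplicationId;

// Sketch of the two interfaces maintain() depends on; inferred from usage,
// not a copy of the real com.yahoo.config.provision definitions.
interface Deployer {

    /** Returns a deployment from the locally active package, or empty if this
     *  config server does not host the application */
    Optional<Deployment> deployFromLocalActive(ApplicationId application);
}

interface Deployment {

    /** Activates this deployment; nodes previously marked removable are deactivated */
    void activate();
}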

Example 4 with NodeRepository

use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.

the class RetiredExpirer method canRemove.

/**
 * Checks whether the node can be removed:
 * if the node is a Docker host, it will only be removed if it has no children,
 * or all its children are parked or failed.
 * Otherwise, removal is allowed if either of these is true:
 * - The node has been in state {@link History.Event.Type#retired} for longer than {@link #retiredExpiry}
 * - The orchestrator allows it
 */
private boolean canRemove(Node node) {
    if (node.type().isDockerHost()) {
        return nodeRepository().getChildNodes(node.hostname()).stream()
                .allMatch(child -> child.state() == Node.State.parked ||
                                   child.state() == Node.State.failed);
    }
    Optional<Instant> timeOfRetiredEvent = node.history().event(History.Event.Type.retired).map(History.Event::at);
    Optional<Instant> retireAfter = timeOfRetiredEvent.map(retiredEvent -> retiredEvent.plus(retiredExpiry));
    boolean shouldRetireNowBecauseExpired = retireAfter.map(time -> time.isBefore(clock.instant())).orElse(false);
    if (shouldRetireNowBecauseExpired) {
        return true;
    }
    try {
        orchestrator.acquirePermissionToRemove(new HostName(node.hostname()));
        return true;
    } catch (OrchestrationException e) {
        log.info("Did not get permission to remove retired " + node + ": " + e.getMessage());
        return false;
    }
}
Also used: OrchestrationException(com.yahoo.vespa.orchestrator.OrchestrationException) Deployer(com.yahoo.config.provision.Deployer) ApplicationId(com.yahoo.config.provision.ApplicationId) Deployment(com.yahoo.config.provision.Deployment) NodeType(com.yahoo.config.provision.NodeType) Orchestrator(com.yahoo.vespa.orchestrator.Orchestrator) Node(com.yahoo.vespa.hosted.provision.Node) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Level(java.util.logging.Level) NodeRepository(com.yahoo.vespa.hosted.provision.NodeRepository) List(java.util.List) History(com.yahoo.vespa.hosted.provision.node.History) HostName(com.yahoo.vespa.applicationmodel.HostName) Duration(java.time.Duration) Map(java.util.Map) Clock(java.time.Clock) Optional(java.util.Optional)
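
The expiry branch is plain Instant arithmetic: the node may be removed once the time of the 'retired' event plus retiredExpiry lies in the past. A self-contained illustration with a fixed clock; all names and values here are local to the example, not Vespa API.

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.ZoneOffset;

public class RetiredExpiryExample {

    public static void main(String[] args) {
        Duration retiredExpiry = Duration.ofHours(12);               // illustrative expiry
        Instant retiredAt = Instant.parse("2018-01-01T00:00:00Z");   // time of the 'retired' event
        Clock clock = Clock.fixed(Instant.parse("2018-01-01T13:00:00Z"), ZoneOffset.UTC);

        // Same check as canRemove(): retiredAt + retiredExpiry must be in the past
        boolean expired = retiredAt.plus(retiredExpiry).isBefore(clock.instant());
        System.out.println("expired = " + expired);                  // prints: expired = true
    }
}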

Example 5 with NodeRepository

use of com.yahoo.vespa.hosted.provision.NodeRepository in project vespa by vespa-engine.

the class MetricsReporterTest method test_registered_metric.

@Test
public void test_registered_metric() throws Exception {
    NodeFlavors nodeFlavors = FlavorConfigBuilder.createDummies("default");
    Curator curator = new MockCurator();
    NodeRepository nodeRepository = new NodeRepository(nodeFlavors, curator, Clock.systemUTC(), Zone.defaultZone(),
                                                       new MockNameResolver().mockAnyLookup(),
                                                       new DockerImage("docker-registry.domain.tld:8080/dist/vespa"), true);
    Node node = nodeRepository.createNode("openStackId", "hostname", Optional.empty(), nodeFlavors.getFlavorOrThrow("default"), NodeType.tenant);
    nodeRepository.addNodes(Collections.singletonList(node));
    Node hostNode = nodeRepository.createNode("openStackId2", "parent", Optional.empty(), nodeFlavors.getFlavorOrThrow("default"), NodeType.proxy);
    nodeRepository.addNodes(Collections.singletonList(hostNode));
    Map<String, Number> expectedMetrics = new HashMap<>();
    expectedMetrics.put("hostedVespa.provisionedHosts", 1L);
    expectedMetrics.put("hostedVespa.parkedHosts", 0L);
    expectedMetrics.put("hostedVespa.readyHosts", 0L);
    expectedMetrics.put("hostedVespa.reservedHosts", 0L);
    expectedMetrics.put("hostedVespa.activeHosts", 0L);
    expectedMetrics.put("hostedVespa.inactiveHosts", 0L);
    expectedMetrics.put("hostedVespa.dirtyHosts", 0L);
    expectedMetrics.put("hostedVespa.failedHosts", 0L);
    expectedMetrics.put("hostedVespa.docker.totalCapacityDisk", 0.0);
    expectedMetrics.put("hostedVespa.docker.totalCapacityMem", 0.0);
    expectedMetrics.put("hostedVespa.docker.totalCapacityCpu", 0.0);
    expectedMetrics.put("hostedVespa.docker.freeCapacityDisk", 0.0);
    expectedMetrics.put("hostedVespa.docker.freeCapacityMem", 0.0);
    expectedMetrics.put("hostedVespa.docker.freeCapacityCpu", 0.0);
    expectedMetrics.put("wantedRebootGeneration", 0L);
    expectedMetrics.put("currentRebootGeneration", 0L);
    expectedMetrics.put("wantToReboot", 0);
    expectedMetrics.put("wantToRetire", 0);
    expectedMetrics.put("wantToDeprovision", 0);
    expectedMetrics.put("hardwareFailure", 0);
    expectedMetrics.put("hardwareDivergence", 0);
    expectedMetrics.put("allowedToBeDown", 0);
    expectedMetrics.put("numberOfServices", 0L);
    Orchestrator orchestrator = mock(Orchestrator.class);
    ServiceMonitor serviceMonitor = mock(ServiceMonitor.class);
    when(orchestrator.getNodeStatus(any())).thenReturn(HostStatus.NO_REMARKS);
    ServiceModel serviceModel = mock(ServiceModel.class);
    when(serviceMonitor.getServiceModelSnapshot()).thenReturn(serviceModel);
    when(serviceModel.getServiceInstancesByHostName()).thenReturn(Collections.emptyMap());
    TestMetric metric = new TestMetric();
    MetricsReporter metricsReporter = new MetricsReporter(nodeRepository, metric, orchestrator, serviceMonitor, Duration.ofMinutes(1), new JobControl(nodeRepository.database()));
    metricsReporter.maintain();
    assertEquals(expectedMetrics, metric.values);
}
Also used: MockNameResolver(com.yahoo.vespa.hosted.provision.testutils.MockNameResolver) HashMap(java.util.HashMap) Node(com.yahoo.vespa.hosted.provision.Node) JobControl(com.yahoo.vespa.hosted.provision.maintenance.JobControl) Curator(com.yahoo.vespa.curator.Curator) MockCurator(com.yahoo.vespa.curator.mock.MockCurator) Orchestrator(com.yahoo.vespa.orchestrator.Orchestrator) ServiceMonitor(com.yahoo.vespa.service.monitor.ServiceMonitor) NodeFlavors(com.yahoo.config.provision.NodeFlavors) ServiceModel(com.yahoo.vespa.service.monitor.ServiceModel) MetricsReporter(com.yahoo.vespa.hosted.provision.maintenance.MetricsReporter) NodeRepository(com.yahoo.vespa.hosted.provision.NodeRepository) DockerImage(com.yahoo.config.provision.DockerImage) Test(org.junit.Test)
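
The assertion compares against metric.values, so TestMetric is evidently a recording implementation of com.yahoo.jdisc.Metric. A minimal sketch of such a helper, assuming it keeps only the latest value per key; the real test utility may also track contexts or history.

import java.util.LinkedHashMap;
import java.util.Map;

import com.yahoo.jdisc.Metric;

// Sketch of a recording Metric for tests; stores the latest value per key.
class TestMetric implements Metric {

    final Map<String, Number> values = new LinkedHashMap<>();

    @Override
    public void set(String key, Number val, Context ctx) {
        values.put(key, val);                       // keep only the most recent value
    }

    @Override
    public void add(String key, Number val, Context ctx) {
        Number previous = values.getOrDefault(key, 0);
        values.put(key, previous.doubleValue() + val.doubleValue());
    }

    @Override
    public Context createContext(Map<String, ?> properties) {
        return null;                                // contexts are ignored in this sketch
    }
}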

Aggregations

NodeRepository (com.yahoo.vespa.hosted.provision.NodeRepository): 13 usages
Node (com.yahoo.vespa.hosted.provision.Node): 11 usages
ApplicationId (com.yahoo.config.provision.ApplicationId): 8 usages
Duration (java.time.Duration): 7 usages
List (java.util.List): 7 usages
Collectors (java.util.stream.Collectors): 7 usages
Deployer (com.yahoo.config.provision.Deployer): 6 usages
NodeType (com.yahoo.config.provision.NodeType): 6 usages
Orchestrator (com.yahoo.vespa.orchestrator.Orchestrator): 6 usages
Map (java.util.Map): 6 usages
Optional (java.util.Optional): 6 usages
Deployment (com.yahoo.config.provision.Deployment): 5 usages
DockerImage (com.yahoo.config.provision.DockerImage): 5 usages
MockNameResolver (com.yahoo.vespa.hosted.provision.testutils.MockNameResolver): 5 usages
HashMap (java.util.HashMap): 5 usages
Curator (com.yahoo.vespa.curator.Curator): 4 usages
MockCurator (com.yahoo.vespa.curator.mock.MockCurator): 4 usages
Agent (com.yahoo.vespa.hosted.provision.node.Agent): 4 usages
ServiceMonitor (com.yahoo.vespa.service.monitor.ServiceMonitor): 4 usages
Test (org.junit.Test): 4 usages