Example 16 with Node

use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.

the class NodeFailerTest method failing_docker_hosts.

@Test
public void failing_docker_hosts() {
    NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(7);
    // For a day all nodes work so nothing happens
    for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
        tester.clock.advance(Duration.ofMinutes(interval));
        tester.allNodesMakeAConfigRequestExcept();
        tester.failer.run();
        assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
        assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
    }
    // Select the first host that has two active nodes
    String downHost1 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
    tester.serviceMonitor.setHostDown(downHost1);
    // nothing happens during the first 45 minutes
    for (int minutes = 0; minutes < 45; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(0, tester.deployer.redeployments);
        assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
        assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
    }
    tester.clock.advance(Duration.ofMinutes(30));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(2 + 1, tester.deployer.redeployments);
    assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(10, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
    // Now let's fail an active tenant node
    Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0);
    tester.serviceMonitor.setHostDown(downTenant1.hostname());
    // nothing happens during the entire day because of the failure throttling
    for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(interval));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size());
    }
    tester.clock.advance(Duration.ofMinutes(30));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(3 + 1, tester.deployer.redeployments);
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
    // Let's fail another host; make sure it is not the host that downTenant1 is a child of
    String downHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
    tester.serviceMonitor.setHostDown(downHost2);
    tester.failer.run();
    tester.clock.advance(Duration.ofMinutes(90));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(5 + 2, tester.deployer.redeployments);
    assertEquals(7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(6, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
    // We have only 5 hosts remaining, so if we fail another host, we should only be able to redeploy app1's
    // node, while app2's should remain
    String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
    tester.serviceMonitor.setHostDown(downHost3);
    tester.failer.run();
    tester.clock.advance(Duration.ofDays(1));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(6 + 2, tester.deployer.redeployments);
    assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
}
Also used : Node(com.yahoo.vespa.hosted.provision.Node) Test(org.junit.Test)
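
The test above relies on a helper, selectFirstParentHostWithNActiveNodesExcept, whose implementation is not shown. Below is a minimal sketch of what such a helper could look like, inferred from how the test uses it; the actual NodeFailTester helper may differ. It assumes the usual java.util imports plus NodeRepository, NodeType and Node, and returns the hostname of a parent host that currently has exactly n active tenant children, skipping any hostnames passed as exceptions.

private static String selectFirstParentHostWithNActiveNodesExcept(NodeRepository nodeRepository, int n, String... except) {
    // Hypothetical sketch: group active tenant nodes by parent hostname (in encounter order)
    // and return the first parent with exactly n children that is not excluded.
    Set<String> excluded = new HashSet<>(Arrays.asList(except));
    return nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream()
            .filter(node -> node.parentHostname().isPresent())
            .collect(Collectors.groupingBy(node -> node.parentHostname().get(), LinkedHashMap::new, Collectors.counting()))
            .entrySet().stream()
            .filter(entry -> entry.getValue() == n)
            .map(Map.Entry::getKey)
            .filter(hostname -> !excluded.contains(hostname))
            .findFirst()
            .orElseThrow(() -> new IllegalStateException("No parent host with " + n + " active children"));
}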

Example 17 with Node

use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.

the class NodeFailerTest method node_failing.

@Test
public void node_failing() throws InterruptedException {
    NodeFailTester tester = NodeFailTester.withTwoApplications();
    // For a day all nodes work so nothing happens
    for (int minutes = 0; minutes < 24 * 60; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(0, tester.deployer.redeployments);
        assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
        assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    }
    // Hardware failures are detected on two ready nodes, which are then failed
    Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2);
    Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3);
    tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog"))));
    tester.nodeRepository.write(readyFail2.with(readyFail2.status().withHardwareFailureDescription(Optional.of("disk_smart"))));
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    tester.failer.run();
    assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
    assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());
    String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
    String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
    tester.serviceMonitor.setHostDown(downHost1);
    tester.serviceMonitor.setHostDown(downHost2);
    // nothing happens during the first 45 minutes
    for (int minutes = 0; minutes < 45; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(0, tester.deployer.redeployments);
        assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
        assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    }
    tester.serviceMonitor.setHostUp(downHost1);
    // downHost2 should now be failed and replaced, but not downHost1
    tester.clock.advance(Duration.ofDays(1));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(1, tester.deployer.redeployments);
    assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(downHost2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());
    // downHost1 fails again
    tester.serviceMonitor.setHostDown(downHost1);
    tester.failer.run();
    tester.clock.advance(Duration.ofMinutes(5));
    tester.allNodesMakeAConfigRequestExcept();
    // the system goes down
    tester.clock.advance(Duration.ofMinutes(120));
    tester.failer = tester.createFailer();
    tester.failer.run();
    // the host is still down and fails
    tester.clock.advance(Duration.ofMinutes(5));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(2, tester.deployer.redeployments);
    assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    // the last host goes down
    Node lastNode = tester.highestIndex(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active));
    tester.serviceMonitor.setHostDown(lastNode.hostname());
    // it is not failed because there are no ready nodes to replace it
    for (int minutes = 0; minutes < 75; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(2, tester.deployer.redeployments);
        assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
        assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    }
    // A new node is available
    tester.createReadyNodes(1, 16);
    tester.clock.advance(Duration.ofDays(1));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    // The node is now failed
    assertEquals(3, tester.deployer.redeployments);
    assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertTrue("The index of the last failed node is not reused", tester.highestIndex(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active)).allocation().get().membership().index() > lastNode.allocation().get().membership().index());
}
Also used : Node(com.yahoo.vespa.hosted.provision.Node) Test(org.junit.Test)
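
node_failing uses tester.highestIndex to find the active node with the largest cluster membership index, so the final assertion can check that the failed node's index is not reused. A hedged sketch of such a helper, assuming all nodes passed in are allocated (the real NodeFailTester may implement it differently):

Node highestIndex(List<Node> nodes) {
    // Pick the allocated node whose cluster membership index is largest.
    return nodes.stream()
            .max(Comparator.comparingInt(node -> node.allocation().get().membership().index()))
            .orElseThrow(() -> new IllegalArgumentException("Expected at least one node"));
}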

Example 18 with Node

use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.

the class NodeRetirer method retireUnallocated.

/**
 * Retires unallocated nodes by moving them directly to parked.
 * Returns true iff there are no remaining unallocated nodes that match the retirement policy
 */
boolean retireUnallocated() {
    try (Mutex lock = nodeRepository().lockUnallocated()) {
        List<Node> allNodes = nodeRepository().getNodes(NodeType.tenant);
        Map<Flavor, Map<Node.State, Long>> numSpareNodesByFlavorByState = getNumberOfNodesByFlavorByNodeState(allNodes);
        flavorSpareChecker.updateReadyAndActiveCountsByFlavor(numSpareNodesByFlavorByState);
        long numFlavorsWithUnsuccessfullyRetiredNodes = allNodes.stream()
                .filter(node -> node.state() == Node.State.ready)
                .filter(node -> retirementPolicy.shouldRetire(node).isPresent())
                .collect(Collectors.groupingBy(Node::flavor, Collectors.toSet()))
                .entrySet().stream()
                .filter(entry -> {
                    Set<Node> nodesThatShouldBeRetiredForFlavor = entry.getValue();
                    for (Iterator<Node> iter = nodesThatShouldBeRetiredForFlavor.iterator(); iter.hasNext(); ) {
                        Node nodeToRetire = iter.next();
                        if (!flavorSpareChecker.canRetireUnallocatedNodeWithFlavor(nodeToRetire.flavor()))
                            break;
                        retirementPolicy.shouldRetire(nodeToRetire).ifPresent(reason -> {
                            nodeRepository().write(nodeToRetire.with(nodeToRetire.status().withWantToDeprovision(true)));
                            nodeRepository().park(nodeToRetire.hostname(), Agent.NodeRetirer, reason);
                            iter.remove();
                        });
                    }
                    if (!nodesThatShouldBeRetiredForFlavor.isEmpty()) {
                        String commaSeparatedHostnames = nodesThatShouldBeRetiredForFlavor.stream()
                                .map(Node::hostname)
                                .collect(Collectors.joining(", "));
                        log.info(String.format("Failed to retire %s, wanted to retire %d nodes (%s), but there are no spare nodes left.",
                                entry.getKey(), nodesThatShouldBeRetiredForFlavor.size(), commaSeparatedHostnames));
                    }
                    return !nodesThatShouldBeRetiredForFlavor.isEmpty();
                })
                .count();
        return numFlavorsWithUnsuccessfullyRetiredNodes == 0;
    }
}
Also used : Deployer(com.yahoo.config.provision.Deployer) FlavorSpareChecker(com.yahoo.vespa.hosted.provision.provisioning.FlavorSpareChecker) RetirementPolicy(com.yahoo.vespa.hosted.provision.maintenance.retire.RetirementPolicy) Iterator(java.util.Iterator) ApplicationId(com.yahoo.config.provision.ApplicationId) Deployment(com.yahoo.config.provision.Deployment) NodeType(com.yahoo.config.provision.NodeType) Collection(java.util.Collection) ClusterSpec(com.yahoo.config.provision.ClusterSpec) Set(java.util.Set) HashMap(java.util.HashMap) Node(com.yahoo.vespa.hosted.provision.Node) Logger(java.util.logging.Logger) Collectors(java.util.stream.Collectors) NodeRepository(com.yahoo.vespa.hosted.provision.NodeRepository) Mutex(com.yahoo.transaction.Mutex) List(java.util.List) Stream(java.util.stream.Stream) Agent(com.yahoo.vespa.hosted.provision.node.Agent) Flavor(com.yahoo.config.provision.Flavor) Duration(java.time.Duration) Map(java.util.Map) LogLevel(com.yahoo.log.LogLevel) Optional(java.util.Optional)
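
retireUnallocated builds its spare-capacity view from getNumberOfNodesByFlavorByNodeState, which is not shown here. A plausible implementation, assuming it simply counts nodes per flavor and per state (the actual NodeRetirer code may differ), is a two-level grouping collector:

private Map<Flavor, Map<Node.State, Long>> getNumberOfNodesByFlavorByNodeState(Collection<Node> nodes) {
    // Outer key: the node's flavor; inner key: its state; value: how many nodes fall in that bucket.
    return nodes.stream()
            .collect(Collectors.groupingBy(Node::flavor,
                    Collectors.groupingBy(Node::state, Collectors.counting())));
}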

Example 19 with Node

use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.

the class NodesApiHandler method handlePATCH.

private HttpResponse handlePATCH(HttpRequest request) {
    String path = request.getUri().getPath();
    if (!path.startsWith("/nodes/v2/node/"))
        throw new NotFoundException("Nothing at '" + path + "'");
    Node node = nodeFromRequest(request);
    nodeRepository.write(new NodePatcher(nodeFlavors, request.getData(), node, nodeRepository).apply());
    return new MessageResponse("Updated " + node.hostname());
}
Also used : Node(com.yahoo.vespa.hosted.provision.Node) SlimeUtils.optionalString(com.yahoo.vespa.config.SlimeUtils.optionalString)
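
handlePATCH delegates the lookup to nodeFromRequest. A minimal sketch of such a lookup, assuming the hostname is the last path element and that a missing node should surface as a 404 (the real NodesApiHandler may resolve the path differently):

private Node nodeFromRequest(HttpRequest request) {
    // Assumes paths of the form /nodes/v2/node/<hostname>; take the last path element as the hostname.
    String path = request.getUri().getPath();
    String hostname = path.substring(path.lastIndexOf('/') + 1);
    return nodeRepository.getNode(hostname)
            .orElseThrow(() -> new NotFoundException("No node found with hostname '" + hostname + "'"));
}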

Example 20 with Node

use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.

the class RetiredExpirer method maintain.

@Override
protected void maintain() {
    List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
    Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream()
            .filter(node -> node.allocation().isPresent())
            .filter(node -> node.allocation().get().membership().retired())
            .collect(Collectors.groupingBy(node -> node.allocation().get().owner()));
    for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) {
        ApplicationId application = entry.getKey();
        List<Node> retiredNodes = entry.getValue();
        try {
            Optional<Deployment> deployment = deployer.deployFromLocalActive(application);
            // no local active deployment of this application; the retirement will be handled by another config server
            if (!deployment.isPresent())
                continue;
            List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList());
            if (nodesToRemove.isEmpty()) {
                continue;
            }
            nodeRepository().setRemovable(application, nodesToRemove);
            deployment.get().activate();
            String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
        } catch (RuntimeException e) {
            String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application + ": " + nodeList, e);
        }
    }
}
Also used : OrchestrationException(com.yahoo.vespa.orchestrator.OrchestrationException) Deployer(com.yahoo.config.provision.Deployer) ApplicationId(com.yahoo.config.provision.ApplicationId) Deployment(com.yahoo.config.provision.Deployment) NodeType(com.yahoo.config.provision.NodeType) Orchestrator(com.yahoo.vespa.orchestrator.Orchestrator) Node(com.yahoo.vespa.hosted.provision.Node) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Level(java.util.logging.Level) NodeRepository(com.yahoo.vespa.hosted.provision.NodeRepository) List(java.util.List) History(com.yahoo.vespa.hosted.provision.node.History) HostName(com.yahoo.vespa.applicationmodel.HostName) Duration(java.time.Duration) Map(java.util.Map) Clock(java.time.Clock) Optional(java.util.Optional)
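
maintain only deactivates retired nodes that canRemove accepts; the policy itself is not shown. One hedged sketch, assuming a simple grace-period check against the node's history (retirementPeriod and clock are hypothetical fields here; the production RetiredExpirer may instead ask the Orchestrator whether the node's services are down):

private boolean canRemove(Node node) {
    // Hypothetical policy: allow removal once the node has been retired longer than retirementPeriod.
    return node.history().event(History.Event.Type.retired)
            .map(History.Event::at)
            .map(retiredAt -> retiredAt.plus(retirementPeriod).isBefore(clock.instant()))
            .orElse(false);
}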

Aggregations (types most frequently used together with Node across these examples, with usage counts)

Node (com.yahoo.vespa.hosted.provision.Node): 121
Test (org.junit.Test): 67
ApplicationId (com.yahoo.config.provision.ApplicationId): 40
ClusterSpec (com.yahoo.config.provision.ClusterSpec): 33
List (java.util.List): 26
ArrayList (java.util.ArrayList): 23
Zone (com.yahoo.config.provision.Zone): 22
Flavor (com.yahoo.config.provision.Flavor): 21
HashSet (java.util.HashSet): 19
Collectors (java.util.stream.Collectors): 19
Optional (java.util.Optional): 18
NodeRepository (com.yahoo.vespa.hosted.provision.NodeRepository): 16
Duration (java.time.Duration): 16
HostSpec (com.yahoo.config.provision.HostSpec): 15
NodeType (com.yahoo.config.provision.NodeType): 15
Agent (com.yahoo.vespa.hosted.provision.node.Agent): 13
Map (java.util.Map): 13
HashMap (java.util.HashMap): 12
Collections (java.util.Collections): 11
Set (java.util.Set): 11