Use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
In the class NodeFailerTest, method failing_docker_hosts.
@Test
public void failing_docker_hosts() {
    NodeFailTester tester = NodeFailTester.withTwoApplicationsOnDocker(7);

    // For a day all nodes work so nothing happens
    for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
        tester.clock.advance(Duration.ofMinutes(interval));
        tester.allNodesMakeAConfigRequestExcept();
        tester.failer.run();
        assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
        assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
    }

    // Select the first host that has two active nodes
    String downHost1 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2);
    tester.serviceMonitor.setHostDown(downHost1);

    // Nothing happens during the first 45 minutes
    for (int minutes = 0; minutes < 45; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(0, tester.deployer.redeployments);
        assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(13, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
        assertEquals(7, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
    }

    tester.clock.advance(Duration.ofMinutes(30));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(2 + 1, tester.deployer.redeployments);
    assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(10, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());

    // Now let's fail an active tenant node
    Node downTenant1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).get(0);
    tester.serviceMonitor.setHostDown(downTenant1.hostname());

    // Nothing happens during the entire day because of the failure throttling
    for (int minutes = 0, interval = 30; minutes < 24 * 60; minutes += interval) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(interval));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(3 + 1, tester.nodeRepository.getNodes(Node.State.failed).size());
    }

    tester.clock.advance(Duration.ofMinutes(30));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(3 + 1, tester.deployer.redeployments);
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(6, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());

    // Let's fail another host; make sure it is not the host that downTenant1 is a child of
    String downHost2 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
    tester.serviceMonitor.setHostDown(downHost2);
    tester.failer.run();
    tester.clock.advance(Duration.ofMinutes(90));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(5 + 2, tester.deployer.redeployments);
    assertEquals(7, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(6, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());

    // We have only 5 hosts remaining, so if we fail another host, we should only be able to redeploy app1's
    // node, while app2's should remain
    String downHost3 = selectFirstParentHostWithNActiveNodesExcept(tester.nodeRepository, 2, downTenant1.parentHostname().get());
    tester.serviceMonitor.setHostDown(downHost3);
    tester.failer.run();
    tester.clock.advance(Duration.ofDays(1));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(6 + 2, tester.deployer.redeployments);
    assertEquals(9, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(8, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(5, tester.nodeRepository.getNodes(NodeType.host, Node.State.active).size());
}
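The helper selectFirstParentHostWithNActiveNodesExcept is referenced but not shown above. Below is a minimal sketch of what such a helper could look like, assuming only the NodeRepository query methods and the Node.parentHostname() accessor already used in this test; the real test utility may be implemented differently.

    // Hypothetical sketch of the helper used above: picks the first parent host (by name) that
    // currently has exactly n active tenant children and is not in the excluded set.
    private static String selectFirstParentHostWithNActiveNodesExcept(NodeRepository nodeRepository, int n, String... except) {
        Set<String> excluded = new HashSet<>(Arrays.asList(except));
        return nodeRepository.getNodes(NodeType.tenant, Node.State.active).stream()
                .filter(node -> node.parentHostname().isPresent())
                .collect(Collectors.groupingBy(node -> node.parentHostname().get(), Collectors.counting()))
                .entrySet().stream()
                .filter(entry -> entry.getValue() == n)
                .map(Map.Entry::getKey)
                .filter(host -> !excluded.contains(host))
                .sorted()
                .findFirst()
                .orElseThrow(() -> new IllegalStateException("No parent host with " + n + " active children"));
    }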
Use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
In the class NodeFailerTest, method node_failing.
@Test
public void node_failing() throws InterruptedException {
    NodeFailTester tester = NodeFailTester.withTwoApplications();

    // For a day all nodes work so nothing happens
    for (int minutes = 0; minutes < 24 * 60; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(0, tester.deployer.redeployments);
        assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
        assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    }

    // Hardware failures are detected on two ready nodes, which are then failed
    Node readyFail1 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(2);
    Node readyFail2 = tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).get(3);
    tester.nodeRepository.write(readyFail1.with(readyFail1.status().withHardwareFailureDescription(Optional.of("memory_mcelog"))));
    tester.nodeRepository.write(readyFail2.with(readyFail2.status().withHardwareFailureDescription(Optional.of("disk_smart"))));
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    tester.failer.run();
    assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail1.hostname()).get().state());
    assertEquals(Node.State.failed, tester.nodeRepository.getNode(readyFail2.hostname()).get().state());

    String downHost1 = tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active).get(1).hostname();
    String downHost2 = tester.nodeRepository.getNodes(NodeFailTester.app2, Node.State.active).get(3).hostname();
    tester.serviceMonitor.setHostDown(downHost1);
    tester.serviceMonitor.setHostDown(downHost2);

    // Nothing happens during the first 45 minutes
    for (int minutes = 0; minutes < 45; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(0, tester.deployer.redeployments);
        assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
        assertEquals(2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    }

    tester.serviceMonitor.setHostUp(downHost1);

    // downHost2 should now be failed and replaced, but not downHost1
    tester.clock.advance(Duration.ofDays(1));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(1, tester.deployer.redeployments);
    assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(3, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(1, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertEquals(downHost2, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).get(0).hostname());

    // downHost1 fails again
    tester.serviceMonitor.setHostDown(downHost1);
    tester.failer.run();
    tester.clock.advance(Duration.ofMinutes(5));
    tester.allNodesMakeAConfigRequestExcept();
    // The system goes down
    tester.clock.advance(Duration.ofMinutes(120));
    tester.failer = tester.createFailer();
    tester.failer.run();
    // The host is still down and fails
    tester.clock.advance(Duration.ofMinutes(5));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();
    assertEquals(2, tester.deployer.redeployments);
    assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());

    // The last host goes down
    Node lastNode = tester.highestIndex(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active));
    tester.serviceMonitor.setHostDown(lastNode.hostname());
    // It is not failed because there are no ready nodes to replace it
    for (int minutes = 0; minutes < 75; minutes += 5) {
        tester.failer.run();
        tester.clock.advance(Duration.ofMinutes(5));
        tester.allNodesMakeAConfigRequestExcept();
        assertEquals(2, tester.deployer.redeployments);
        assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
        assertEquals(4, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
        assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    }

    // A new node is available
    tester.createReadyNodes(1, 16);
    tester.clock.advance(Duration.ofDays(1));
    tester.allNodesMakeAConfigRequestExcept();
    tester.failer.run();

    // The node is now failed
    assertEquals(3, tester.deployer.redeployments);
    assertEquals(12, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.active).size());
    assertEquals(5, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.failed).size());
    assertEquals(0, tester.nodeRepository.getNodes(NodeType.tenant, Node.State.ready).size());
    assertTrue("The index of the last failed node is not reused",
               tester.highestIndex(tester.nodeRepository.getNodes(NodeFailTester.app1, Node.State.active)).allocation().get().membership().index()
               > lastNode.allocation().get().membership().index());
}
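Both tests above repeat the same advance-the-clock / make-config-requests / run-the-failer loop. Purely as an illustrative refactoring, not part of the Vespa test code, that pattern could be captured in a helper like the following, which uses only the tester calls already shown.

    // Illustrative only: drives the failer for the given period at the given interval,
    // using the same tester calls as the loops in the tests above.
    private static void runFailerFor(NodeFailTester tester, Duration period, Duration interval) {
        for (Duration elapsed = Duration.ZERO; elapsed.compareTo(period) < 0; elapsed = elapsed.plus(interval)) {
            tester.failer.run();
            tester.clock.advance(interval);
            tester.allNodesMakeAConfigRequestExcept();
        }
    }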
Use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
In the class NodeRetirer, method retireUnallocated.
/**
 * Retires unallocated nodes by moving them directly to parked.
 * Returns true iff there are no unallocated nodes left that match the retirement policy.
 */
boolean retireUnallocated() {
    try (Mutex lock = nodeRepository().lockUnallocated()) {
        List<Node> allNodes = nodeRepository().getNodes(NodeType.tenant);
        Map<Flavor, Map<Node.State, Long>> numSpareNodesByFlavorByState = getNumberOfNodesByFlavorByNodeState(allNodes);
        flavorSpareChecker.updateReadyAndActiveCountsByFlavor(numSpareNodesByFlavorByState);

        long numFlavorsWithUnsuccessfullyRetiredNodes = allNodes.stream()
                .filter(node -> node.state() == Node.State.ready)
                .filter(node -> retirementPolicy.shouldRetire(node).isPresent())
                .collect(Collectors.groupingBy(Node::flavor, Collectors.toSet()))
                .entrySet().stream()
                .filter(entry -> {
                    Set<Node> nodesThatShouldBeRetiredForFlavor = entry.getValue();
                    for (Iterator<Node> iter = nodesThatShouldBeRetiredForFlavor.iterator(); iter.hasNext(); ) {
                        Node nodeToRetire = iter.next();
                        if (!flavorSpareChecker.canRetireUnallocatedNodeWithFlavor(nodeToRetire.flavor()))
                            break;
                        retirementPolicy.shouldRetire(nodeToRetire).ifPresent(reason -> {
                            nodeRepository().write(nodeToRetire.with(nodeToRetire.status().withWantToDeprovision(true)));
                            nodeRepository().park(nodeToRetire.hostname(), Agent.NodeRetirer, reason);
                            iter.remove();
                        });
                    }
                    if (!nodesThatShouldBeRetiredForFlavor.isEmpty()) {
                        String commaSeparatedHostnames = nodesThatShouldBeRetiredForFlavor.stream()
                                .map(Node::hostname)
                                .collect(Collectors.joining(", "));
                        log.info(String.format("Failed to retire %s, wanted to retire %d nodes (%s), but there are no spare nodes left.",
                                               entry.getKey(), nodesThatShouldBeRetiredForFlavor.size(), commaSeparatedHostnames));
                    }
                    return !nodesThatShouldBeRetiredForFlavor.isEmpty();
                })
                .count();
        return numFlavorsWithUnsuccessfullyRetiredNodes == 0;
    }
}
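The method above relies only on retirementPolicy.shouldRetire(node) returning an optional, human-readable reason. Below is a sketch of that contract as it is used here; the actual Vespa RetirementPolicy interface may declare additional methods.

    // Sketch of the policy contract assumed by retireUnallocated(); not the authoritative interface.
    public interface RetirementPolicy {
        /** Returns the reason the node should be retired, or empty if it should not be retired. */
        Optional<String> shouldRetire(Node node);
    }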
Use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
In the class NodesApiHandler, method handlePATCH.
private HttpResponse handlePATCH(HttpRequest request) {
    String path = request.getUri().getPath();
    if (!path.startsWith("/nodes/v2/node/"))
        throw new NotFoundException("Nothing at '" + path + "'");
    Node node = nodeFromRequest(request);
    nodeRepository.write(new NodePatcher(nodeFlavors, request.getData(), node, nodeRepository).apply());
    return new MessageResponse("Updated " + node.hostname());
}
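For context, a client exercises this handler by sending a PATCH with a JSON body to /nodes/v2/node/<hostname> on the config server. The sketch below uses the JDK 11+ java.net.http client with fully qualified names to avoid clashing with the jdisc HttpRequest/HttpResponse types above; the base URL and the JSON body are placeholders, and the set of accepted fields is defined by NodePatcher and the nodes/v2 API documentation.

    // Hypothetical client-side call; the config server URL and JSON body are placeholders.
    private static int patchNode(String configServerUrl, String hostname, String jsonBody) throws java.io.IOException, InterruptedException {
        java.net.http.HttpRequest patch = java.net.http.HttpRequest.newBuilder(java.net.URI.create(configServerUrl + "/nodes/v2/node/" + hostname))
                .method("PATCH", java.net.http.HttpRequest.BodyPublishers.ofString(jsonBody))
                .header("Content-Type", "application/json")
                .build();
        return java.net.http.HttpClient.newHttpClient()
                .send(patch, java.net.http.HttpResponse.BodyHandlers.ofString())
                .statusCode();
    }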
Use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
In the class RetiredExpirer, method maintain.
@Override
protected void maintain() {
    List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
    Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream()
            .filter(node -> node.allocation().isPresent())
            .filter(node -> node.allocation().get().membership().retired())
            .collect(Collectors.groupingBy(node -> node.allocation().get().owner()));

    for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) {
        ApplicationId application = entry.getKey();
        List<Node> retiredNodes = entry.getValue();
        try {
            Optional<Deployment> deployment = deployer.deployFromLocalActive(application);
            if (!deployment.isPresent()) continue; // this will be done at another config server

            List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList());
            if (nodesToRemove.isEmpty()) continue;

            nodeRepository().setRemovable(application, nodesToRemove);
            deployment.get().activate();

            String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
        } catch (RuntimeException e) {
            String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application + ": " + nodeList, e);
        }
    }
}
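maintain() is invoked periodically by Vespa's maintainer framework rather than called directly. As a rough illustration only, and assuming the maintainer exposes a Runnable entry point (as the NodeFailer driven by tester.failer.run() in the tests above does), the scheduling amounts to something like the sketch below; the real framework handles this internally.

    // Illustration of periodic scheduling only; not the actual Vespa maintainer framework.
    private static java.util.concurrent.ScheduledExecutorService scheduleMaintainer(Runnable maintainer, Duration interval) {
        java.util.concurrent.ScheduledExecutorService scheduler =
                java.util.concurrent.Executors.newSingleThreadScheduledExecutor();
        scheduler.scheduleAtFixedRate(maintainer, 0, interval.toMinutes(), java.util.concurrent.TimeUnit.MINUTES);
        return scheduler;
    }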