use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
the class NodeRepositoryProvisioner method asSortedHosts.
private List<HostSpec> asSortedHosts(List<Node> nodes) {
    nodes.sort(Comparator.comparingInt(node -> node.allocation().get().membership().index()));
    List<HostSpec> hosts = new ArrayList<>(nodes.size());
    for (Node node : nodes) {
        log.log(LogLevel.DEBUG, () -> "Prepared node " + node.hostname() + " - " + node.flavor());
        hosts.add(new HostSpec(node.hostname(),
                               node.allocation().orElseThrow(IllegalStateException::new).membership(),
                               node.flavor()));
    }
    return hosts;
}
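The sort above relies on Optional.get(), so it assumes every node in the list is already allocated. A minimal, hypothetical variant that skips unallocated nodes instead of throwing could look like the following (the name asSortedHostsSafe and the stream-based structure are illustrative, not taken from the Vespa source):

private List<HostSpec> asSortedHostsSafe(List<Node> nodes) {
    // Drop nodes without an allocation instead of failing on Optional.get()
    return nodes.stream()
                .filter(node -> node.allocation().isPresent())
                .sorted(Comparator.comparingInt(node -> node.allocation().get().membership().index()))
                .map(node -> new HostSpec(node.hostname(),
                                          node.allocation().get().membership(),
                                          node.flavor()))
                .collect(Collectors.toList());
}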
use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
the class Expirer method maintain.
@Override
protected void maintain() {
    List<Node> expired = new ArrayList<>();
    for (Node node : nodeRepository().getNodes(fromState)) {
        Optional<History.Event> event = node.history().event(eventType);
        if (event.isPresent() && event.get().at().plus(expiryTime).isBefore(clock.instant()))
            expired.add(node);
    }
    if (!expired.isEmpty())
        log.info(fromState + " expirer found " + expired.size() + " expired nodes: " + expired);
    expire(expired);
}
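maintain() is generic over fromState, eventType, and expiryTime, so a concrete expirer only needs to pick those values and implement expire(). A rough sketch of such a subclass follows; the super-constructor shape and the setDirty call are assumptions for illustration, and the real InactiveExpirer in Vespa may differ:

public class InactiveExpirerSketch extends Expirer {

    public InactiveExpirerSketch(NodeRepository nodeRepository, Clock clock,
                                 Duration inactiveTimeout, JobControl jobControl) {
        // Assumed constructor order: from-state, triggering history event, then the rest
        super(Node.State.inactive, History.Event.Type.deactivated,
              nodeRepository, clock, inactiveTimeout, jobControl);
    }

    @Override
    protected void expire(List<Node> expired) {
        // Timed-out inactive nodes are moved to dirty, matching the test further down;
        // setDirty is an assumed NodeRepository call, the real expirer may use another API
        expired.forEach(node -> nodeRepository().setDirty(node.hostname()));
    }
}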
use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
the class NodeFailer method maintain.
@Override
protected void maintain() {
    // Ready nodes
    try (Mutex lock = nodeRepository().lockUnallocated()) {
        updateNodeLivenessEventsForReadyNodes();
        getReadyNodesByFailureReason().forEach((node, reason) -> {
            if (!throttle(node)) {
                nodeRepository().fail(node.hostname(), Agent.system, reason);
            }
        });
    }
    // Active nodes
    for (Node node : determineActiveNodeDownStatus()) {
        Instant graceTimeEnd = node.history().event(History.Event.Type.down).get().at().plus(downTimeLimit);
        if (graceTimeEnd.isBefore(clock.instant()) && !applicationSuspended(node) && failAllowedFor(node.type()))
            if (!throttle(node))
                failActive(node, "Node has been down longer than " + downTimeLimit);
    }
}
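Both branches pass through throttle(node) before failing anything, and active nodes are additionally gated on failAllowedFor(node.type()). As a rough illustration, such a type gate might look like the sketch below; the exact set of failable node types is an assumption, not taken from NodeFailer:

private boolean failAllowedFor(NodeType nodeType) {
    // Assumed policy: only tenant nodes and docker hosts are failed automatically
    return nodeType == NodeType.tenant || nodeType == NodeType.host;
}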
use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
the class NodeFailer method throttle.
/**
 * Returns true if node failing should be throttled.
 */
private boolean throttle(Node node) {
    if (throttlePolicy == ThrottlePolicy.disabled)
        return false;
    Instant startOfThrottleWindow = clock.instant().minus(throttlePolicy.throttleWindow);
    List<Node> nodes = nodeRepository().getNodes();
    long recentlyFailedNodes = nodes.stream()
                                    .map(n -> n.history().event(History.Event.Type.failed))
                                    .filter(Optional::isPresent)
                                    .map(Optional::get)
                                    .filter(failedEvent -> failedEvent.at().isAfter(startOfThrottleWindow))
                                    .count();
    int allowedFailedNodes = (int) Math.max(nodes.size() * throttlePolicy.fractionAllowedToFail,
                                            throttlePolicy.minimumAllowedToFail);
    boolean throttle = allowedFailedNodes < recentlyFailedNodes ||
                       (allowedFailedNodes == recentlyFailedNodes && !node.type().isDockerHost());
    if (throttle) {
        log.info(String.format("Want to fail node %s, but throttling is in effect: %s",
                               node.hostname(), throttlePolicy.toHumanReadableString()));
    }
    metric.set("nodeFailThrottling", throttle ? 1 : 0, null);
    return throttle;
}
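The policy object supplies the window, the fraction, and the minimum count used above. A sketch of what such an enum could look like, keeping the field names the method reads (the constant names and values are assumptions; the actual NodeFailer.ThrottlePolicy may differ):

public enum ThrottlePolicy {

    hosted(Duration.ofDays(1), 0.02, 2),
    disabled(Duration.ZERO, 0, 0);

    public final Duration throttleWindow;
    public final double fractionAllowedToFail;
    public final int minimumAllowedToFail;

    ThrottlePolicy(Duration throttleWindow, double fractionAllowedToFail, int minimumAllowedToFail) {
        this.throttleWindow = throttleWindow;
        this.fractionAllowedToFail = fractionAllowedToFail;
        this.minimumAllowedToFail = minimumAllowedToFail;
    }

    public String toHumanReadableString() {
        // Illustrative phrasing; the real message format may differ
        return String.format("Max %.0f%% of nodes, but at least %d, may fail within %s",
                             fractionAllowedToFail * 100, minimumAllowedToFail, throttleWindow);
    }
}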
use of com.yahoo.vespa.hosted.provision.Node in project vespa by vespa-engine.
the class InactiveAndFailedExpirerTest method inactive_and_failed_times_out.
@Test
public void inactive_and_failed_times_out() {
    ProvisioningTester tester = new ProvisioningTester(new Zone(Environment.prod, RegionName.from("us-east")));
    List<Node> nodes = tester.makeReadyNodes(2, "default");
    // Allocate then deallocate 2 nodes
    ClusterSpec cluster = ClusterSpec.request(ClusterSpec.Type.content, ClusterSpec.Id.from("test"),
                                              Version.fromString("6.42"), false);
    List<HostSpec> preparedNodes = tester.prepare(applicationId, cluster, Capacity.fromNodeCount(2), 1);
    tester.activate(applicationId, new HashSet<>(preparedNodes));
    assertEquals(2, tester.getNodes(applicationId, Node.State.active).size());
    tester.deactivate(applicationId);
    List<Node> inactiveNodes = tester.getNodes(applicationId, Node.State.inactive).asList();
    assertEquals(2, inactiveNodes.size());
    // Inactive times out
    tester.advanceTime(Duration.ofMinutes(14));
    new InactiveExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10),
                        new JobControl(tester.nodeRepository().database())).run();
    assertEquals(0, tester.nodeRepository().getNodes(Node.State.inactive).size());
    List<Node> dirty = tester.nodeRepository().getNodes(Node.State.dirty);
    assertEquals(2, dirty.size());
    assertFalse(dirty.get(0).allocation().isPresent());
    assertFalse(dirty.get(1).allocation().isPresent());
    // One node is set back to ready
    Node ready = tester.nodeRepository()
                       .setReady(Collections.singletonList(dirty.get(0)), Agent.system, getClass().getSimpleName())
                       .get(0);
    assertEquals("Allocated history is removed on readying",
                 Arrays.asList(History.Event.Type.provisioned, History.Event.Type.readied),
                 ready.history().events().stream().map(History.Event::type).collect(Collectors.toList()));
    // Dirty times out for the other one
    tester.advanceTime(Duration.ofMinutes(14));
    new DirtyExpirer(tester.nodeRepository(), tester.clock(), Duration.ofMinutes(10),
                     new JobControl(tester.nodeRepository().database())).run();
    assertEquals(0, tester.nodeRepository().getNodes(NodeType.tenant, Node.State.dirty).size());
    List<Node> failed = tester.nodeRepository().getNodes(NodeType.tenant, Node.State.failed);
    assertEquals(1, failed.size());
    assertEquals(1, failed.get(0).status().failCount());
}