Use of com.yahoo.config.provision.Deployment in project vespa by vespa-engine.
Class ApplicationMaintainer, method deployWithLock.
/**
 * Redeploy this application. A lock will be taken for the duration of the deployment activation.
 */
final void deployWithLock(ApplicationId application) {
    // Lock is acquired with a low timeout to reduce the chance of colliding with an external deployment.
    try (Mutex lock = nodeRepository().lock(application, Duration.ofSeconds(1))) {
        if (!isActive(application))
            return; // became inactive since deployment was requested
        Optional<Deployment> deployment = deployer.deployFromLocalActive(application);
        if (!deployment.isPresent())
            return; // this will be done at another config server
        deployment.get().activate();
    } catch (RuntimeException e) {
        log.log(Level.WARNING, "Exception on maintenance redeploy", e);
    }
}
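Deployer.deployFromLocalActive returns a Deployment only on the config server that holds the locally active session for the application, which is why the maintainers in these snippets treat an empty Optional as work that another config server will do. A minimal sketch of driving the same contract outside a maintainer, assuming only the Deployer and Deployment types used above (the class name and redeploy helper are hypothetical):

import java.util.Optional;

import com.yahoo.config.provision.ApplicationId;
import com.yahoo.config.provision.Deployer;
import com.yahoo.config.provision.Deployment;

// Hypothetical helper, not part of the Vespa sources quoted above.
public class ManualRedeployer {

    private final Deployer deployer;

    public ManualRedeployer(Deployer deployer) {
        this.deployer = deployer;
    }

    /** Returns true if this config server performed the redeployment. */
    public boolean redeploy(ApplicationId application) {
        Optional<Deployment> deployment = deployer.deployFromLocalActive(application);
        if (!deployment.isPresent())
            return false; // the locally active session lives on another config server
        deployment.get().activate(); // activate the deployment, as the maintainers above do
        return true;
    }
}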
Use of com.yahoo.config.provision.Deployment in project vespa by vespa-engine.
Class RetiredExpirer, method maintain.
@Override
protected void maintain() {
    List<Node> activeNodes = nodeRepository().getNodes(Node.State.active);
    Map<ApplicationId, List<Node>> retiredNodesByApplication = activeNodes.stream()
            .filter(node -> node.allocation().isPresent())
            .filter(node -> node.allocation().get().membership().retired())
            .collect(Collectors.groupingBy(node -> node.allocation().get().owner()));
    for (Map.Entry<ApplicationId, List<Node>> entry : retiredNodesByApplication.entrySet()) {
        ApplicationId application = entry.getKey();
        List<Node> retiredNodes = entry.getValue();
        try {
            Optional<Deployment> deployment = deployer.deployFromLocalActive(application);
            if (!deployment.isPresent())
                continue; // this will be done at another config server
            List<Node> nodesToRemove = retiredNodes.stream().filter(this::canRemove).collect(Collectors.toList());
            if (nodesToRemove.isEmpty()) {
                continue;
            }
            nodeRepository().setRemovable(application, nodesToRemove);
            deployment.get().activate();
            String nodeList = nodesToRemove.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.info("Redeployed " + application + " to deactivate retired nodes: " + nodeList);
        } catch (RuntimeException e) {
            String nodeList = retiredNodes.stream().map(Node::hostname).collect(Collectors.joining(", "));
            log.log(Level.WARNING, "Exception trying to deactivate retired nodes from " + application + ": " + nodeList, e);
        }
    }
}
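A retired node is only marked removable when canRemove accepts it; that check is not part of the snippet above. A minimal sketch of one possible policy, assuming removal is considered safe once a grace period has elapsed since retirement (the class, method, and retiredAt timestamp are hypothetical, not the actual RetiredExpirer logic):

import java.time.Clock;
import java.time.Duration;
import java.time.Instant;

// Hypothetical policy object; the real canRemove logic in RetiredExpirer is not shown above.
final class RetirementGracePolicy {

    private final Duration gracePeriod;
    private final Clock clock;

    RetirementGracePolicy(Duration gracePeriod, Clock clock) {
        this.gracePeriod = gracePeriod;
        this.clock = clock;
    }

    /** Returns true if the node was retired long enough ago that removing it is considered safe. */
    boolean canRemove(Instant retiredAt) {
        return retiredAt.plus(gracePeriod).isBefore(clock.instant());
    }
}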
Use of com.yahoo.config.provision.Deployment in project vespa by vespa-engine.
Class NodeFailer, method failActive.
/**
 * Called when a node should be moved to the failed state: Do that if it seems safe,
 * which is when the node repo has available capacity to replace the node (and all its tenant nodes if it is a host).
 * Otherwise, not replacing the node ensures (through the Orchestrator check) that no further action will be taken.
 *
 * @return whether the node was successfully failed
 */
private boolean failActive(Node node, String reason) {
    Optional<Deployment> deployment =
            deployer.deployFromLocalActive(node.allocation().get().owner(), Duration.ofMinutes(30));
    if (!deployment.isPresent())
        return false; // this will be done at another config server
    try (Mutex lock = nodeRepository().lock(node.allocation().get().owner())) {
        // If the active node that we are trying to fail is of type host, we need to successfully fail all
        // the child nodes running on it before we fail the host itself.
        boolean allTenantNodesFailedOutSuccessfully = true;
        String reasonForChildFailure = "Failing due to parent host " + node.hostname() + " failure: " + reason;
        for (Node failingTenantNode : nodeRepository().getChildNodes(node.hostname())) {
            if (failingTenantNode.state() == Node.State.active) {
                allTenantNodesFailedOutSuccessfully &= failActive(failingTenantNode, reasonForChildFailure);
            } else {
                nodeRepository().fail(failingTenantNode.hostname(), Agent.system, reasonForChildFailure);
            }
        }
        if (!allTenantNodesFailedOutSuccessfully)
            return false;
        node = nodeRepository().fail(node.hostname(), Agent.system, reason);
        try {
            deployment.get().activate();
            return true;
        } catch (RuntimeException e) {
            // The expected reason for deployment to fail here is that there is no capacity available to redeploy.
            // In that case we should leave the node in the active state to avoid failing additional nodes.
            nodeRepository().reactivate(node.hostname(), Agent.system,
                                        "Failed to redeploy after being failed by NodeFailer");
            log.log(Level.WARNING, "Attempted to fail " + node + " for " + node.allocation().get().owner() +
                                   ", but redeploying without the node failed", e);
            return false;
        }
    }
}
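failActive is the safe-failing primitive; the surrounding maintenance loop that decides which nodes are unhealthy is not shown. A hedged sketch of how a caller inside NodeFailer might use it, assuming a hasBeenDownLongerThan helper (hypothetical, not part of the snippet above):

// Hypothetical caller sketch: fail active nodes that have been down for longer than a threshold.
// hasBeenDownLongerThan is an assumed helper; the real down-detection and throttling in
// NodeFailer.maintain() are not part of the snippet above.
private void failDownNodes(Duration downTimeLimit) {
    for (Node node : nodeRepository().getNodes(Node.State.active)) {
        if (!node.allocation().isPresent())
            continue; // failActive requires an allocated node (it reads node.allocation().get())
        if (hasBeenDownLongerThan(node, downTimeLimit)) {
            boolean failed = failActive(node, "Node has been down longer than " + downTimeLimit);
            if (!failed)
                log.info("Could not fail " + node.hostname() + " now, will retry in a later maintenance run");
        }
    }
}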
Use of com.yahoo.config.provision.Deployment in project vespa by vespa-engine.
Class NodeRetirer, method retireAllocated.
void retireAllocated() {
    List<Node> allNodes = nodeRepository().getNodes(NodeType.tenant);
    List<ApplicationId> activeApplications = getActiveApplicationIds(allNodes);
    Map<Flavor, Map<Node.State, Long>> numSpareNodesByFlavorByState = getNumberOfNodesByFlavorByNodeState(allNodes);
    flavorSpareChecker.updateReadyAndActiveCountsByFlavor(numSpareNodesByFlavorByState);

    // Get all the nodes that we could retire along with their deployments
    Map<Deployment, Set<Node>> nodesToRetireByDeployment = new HashMap<>();
    for (ApplicationId applicationId : activeApplications) {
        Map<ClusterSpec.Id, Set<Node>> nodesByCluster = getNodesBelongingToApplication(allNodes, applicationId).stream()
                .collect(Collectors.groupingBy(node -> node.allocation().get().membership().cluster().id(),
                                               Collectors.toSet()));
        Map<ClusterSpec.Id, Set<Node>> retireableNodesByCluster = nodesByCluster.entrySet().stream()
                .collect(Collectors.toMap(Map.Entry::getKey, entry -> filterRetireableNodes(entry.getValue())));
        if (retireableNodesByCluster.values().stream().mapToInt(Set::size).sum() == 0)
            continue;
        Optional<Deployment> deployment = deployer.deployFromLocalActive(applicationId);
        if (!deployment.isPresent())
            continue; // this will be done at another config server
        Set<Node> replaceableNodes = retireableNodesByCluster.entrySet().stream()
                .flatMap(entry -> entry.getValue().stream()
                        .filter(node -> flavorSpareChecker.canRetireAllocatedNodeWithFlavor(node.flavor()))
                        .limit(getNumberNodesAllowToRetireForCluster(nodesByCluster.get(entry.getKey()),
                                                                     MAX_SIMULTANEOUS_RETIRES_PER_CLUSTER)))
                .collect(Collectors.toSet());
        if (!replaceableNodes.isEmpty())
            nodesToRetireByDeployment.put(deployment.get(), replaceableNodes);
    }
    nodesToRetireByDeployment.forEach(((deployment, nodes) -> {
        ApplicationId app = nodes.iterator().next().allocation().get().owner();
        Set<Node> nodesToRetire;
        // Under the application lock, re-read each node and update the up-to-date copy (so as not to
        // overwrite other fields that may have changed) with wantToRetire and wantToDeprovision.
        try (Mutex lock = nodeRepository().lock(app)) {
            nodesToRetire = nodes.stream()
                    .map(node -> nodeRepository().getNode(node.hostname())
                            .filter(upToDateNode -> node.state() == Node.State.active)
                            .filter(upToDateNode -> node.allocation().get().owner().equals(upToDateNode.allocation().get().owner())))
                    .flatMap(node -> node.map(Stream::of).orElseGet(Stream::empty))
                    .collect(Collectors.toSet());
            nodesToRetire.forEach(node -> retirementPolicy.shouldRetire(node).ifPresent(reason -> {
                log.info("Setting wantToRetire and wantToDeprovision for host " + node.hostname() +
                         " with flavor " + node.flavor().name() +
                         " allocated to " + node.allocation().get().owner() + ". Reason: " + reason);
                Node updatedNode = node.with(node.status()
                                                 .withWantToRetire(true)
                                                 .withWantToDeprovision(true));
                nodeRepository().write(updatedNode);
            }));
        }
        // This takes a while, so do it outside of the application lock
        if (!nodesToRetire.isEmpty()) {
            try {
                deployment.activate();
            } catch (Exception e) {
                log.log(LogLevel.INFO, "Failed to redeploy " + app.serializedForm() +
                                       ", will be redeployed later by application maintainer", e);
            }
        }
    }));
}
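The number of simultaneous retirements per cluster is capped by getNumberNodesAllowToRetireForCluster with MAX_SIMULTANEOUS_RETIRES_PER_CLUSTER; that helper is not included in the snippet. A minimal sketch of such a cap, assuming a wantToRetire() accessor on the node status (the implementation shown is illustrative, not the one in NodeRetirer):

// Illustrative sketch only: allow at most `limit` simultaneous retirements per cluster, counting
// nodes already marked as wanting to retire against that budget. Assumes Node.status().wantToRetire().
private long getNumberNodesAllowToRetireForCluster(Set<Node> clusterNodes, long limit) {
    long alreadyRetiring = clusterNodes.stream()
            .filter(node -> node.status().wantToRetire())
            .count();
    return Math.max(0, limit - alreadyRetiring);
}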