Use of com.yahoo.vespa.hosted.node.admin.nodeagent.NodeAgentImpl.ContainerState.STARTING in project vespa by vespa-engine.
The example below is the converge method of the NodeAgentImpl class, which reconciles the container's actual state with the node spec fetched from the node repository.
// Public for testing
void converge() {
    final Optional<ContainerNodeSpec> nodeSpecOptional = nodeRepository.getContainerNodeSpec(hostname);

    // We just removed the node from the node repo, so this is expected until NodeAdmin stops this NodeAgent
    if (!nodeSpecOptional.isPresent() && expectNodeNotInNodeRepo)
        return;

    final ContainerNodeSpec nodeSpec = nodeSpecOptional.orElseThrow(() ->
            new IllegalStateException(String.format("Node '%s' missing from node repository.", hostname)));
    expectNodeNotInNodeRepo = false;
    Optional<Container> container = getContainer();
    if (!nodeSpec.equals(lastNodeSpec)) {
        // When the node spec changes, the metrics dimensions derived from it
        // will change and we will be reporting duplicate metrics unless the
        // metrics config for the running container is rewritten.
        if (container.map(c -> c.state.isRunning()).orElse(false)) {
            storageMaintainer.writeMetricsConfig(containerName, nodeSpec);
        }
        addDebugMessage("Loading new node spec: " + nodeSpec.toString());
        lastNodeSpec = nodeSpec;
    }
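
    // Act according to the node's state in the node repository.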
    switch (nodeSpec.nodeState) {
        case ready:
        case reserved:
        case parked:
        case failed:
            removeContainerIfNeededUpdateContainerState(nodeSpec, container);
            updateNodeRepoWithCurrentAttributes(nodeSpec);
            break;
        case active:
            storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false);
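            // Disk utilization is measured against the node's minimum disk allocation;
            // at 80% or more, old files are pruned from the node.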
            storageMaintainer.getDiskUsageFor(containerName)
                    .map(diskUsage -> (double) diskUsage / BYTES_IN_GB / nodeSpec.minDiskAvailableGb)
                    .filter(diskUtil -> diskUtil >= 0.8)
                    .ifPresent(diskUtil -> storageMaintainer.removeOldFilesFromNode(containerName));

            scheduleDownLoadIfNeeded(nodeSpec);
            if (isDownloadingImage()) {
                addDebugMessage("Waiting for image to download " + imageBeingDownloaded.asString());
                return;
            }
            container = removeContainerIfNeededUpdateContainerState(nodeSpec, container);
            if (!container.isPresent()) {
                storageMaintainer.handleCoreDumpsForContainer(containerName, nodeSpec, false);
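                // Set STARTING before the start call and reset to UNKNOWN only after it
                // returns: if the agent dies or startContainer throws in between, the
                // lingering STARTING value tells the next converge that the previous
                // start may not have completed.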
                containerState = STARTING;
                startContainer(nodeSpec);
                containerState = UNKNOWN;
            }
            runLocalResumeScriptIfNeeded(nodeSpec);

            // Because it's more important to stop a bad release from rolling out in prod,
            // we put the resume call last. So if we fail after updating the node repo attributes
            // but before resume, the app may go through the tenant pipeline but will halt in prod.
            //
            // Note that this problem exists only because there are two different mechanisms
            // that should really be parts of a single mechanism:
            //  - The content of the node repo is used to determine whether a new Vespa+application
            //    has been successfully rolled out.
            //  - Slobrok and internal orchestrator state are used to determine whether
            //    to allow an upgrade (suspend).
            updateNodeRepoWithCurrentAttributes(nodeSpec);
            logger.info("Call resume against Orchestrator");
            orchestrator.resume(hostname);
            break;
        case inactive:
            removeContainerIfNeededUpdateContainerState(nodeSpec, container);
            updateNodeRepoWithCurrentAttributes(nodeSpec);
            break;
        case provisioned:
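            // A provisioned node is handed over to the dirty state, where its storage is
            // cleaned before it is marked ready (see the dirty case below).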
            nodeRepository.markAsDirty(hostname);
            break;
        case dirty:
            removeContainerIfNeededUpdateContainerState(nodeSpec, container);
            logger.info("State is " + nodeSpec.nodeState + ", will delete application storage and mark node as ready");
            storageMaintainer.cleanupNodeStorage(containerName, nodeSpec);
            updateNodeRepoWithCurrentAttributes(nodeSpec);
            nodeRepository.markNodeAvailableForNewAllocation(hostname);
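            // The node has been handed back for reallocation, so it may legitimately be
            // missing from the node repo on the next converge (see the early return above).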
            expectNodeNotInNodeRepo = true;
            break;
        default:
            throw new RuntimeException("UNKNOWN STATE " + nodeSpec.nodeState.name());
    }
}
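
The STARTING assignment is the point of this example: containerState is only moved on to UNKNOWN after startContainer returns, so a value of STARTING observed later is evidence of an interrupted start. Below is a minimal, self-contained sketch of that marker pattern; every name in it is a hypothetical stand-in, not a Vespa API.

    // Hypothetical sketch of the STARTING marker pattern, not Vespa code.
    public class StartMarkerSketch {

        enum ContainerState { ABSENT, STARTING, UNKNOWN }

        private ContainerState containerState = ContainerState.ABSENT;

        void convergeOnce() {
            if (containerState == ContainerState.STARTING) {
                // A previous pass set STARTING but never reached UNKNOWN: the start was
                // interrupted, so discard the possibly half-started container first.
                removeContainer();
                containerState = ContainerState.ABSENT;
            }
            containerState = ContainerState.STARTING; // marker set before the risky call
            startContainer();                         // may throw, or the process may die here
            containerState = ContainerState.UNKNOWN;  // marker cleared only after the call returns
        }

        private void startContainer() { /* hypothetical: start the container */ }

        private void removeContainer() { /* hypothetical: remove the suspect container */ }
    }

In converge() itself, the reaction to a stale containerState value is presumably handled inside removeContainerIfNeededUpdateContainerState; the sketch inlines such a check only to keep it self-contained.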