Use of org.bf2.srs.fleetmanager.execution.manager.Worker in project kas-fleetshard by bf2fc6cc711aee1a0c2a.
The class InstanceProfiler, method setup:
private void setup() throws Exception {
    readResults();
    if (profilingResult.name == null) {
        profilingResult.name = "profile-" + Environment.DATE_FORMAT.format(LocalDateTime.now());
    }
    logDir = new File("target", profilingResult.name);
    Files.createDirectories(logDir.toPath());
    kafkaCluster = KubeClusterResource.connectToKubeCluster(PerformanceEnvironment.KAFKA_KUBECONFIG);
    profilingResult.kafkaNodeType = kafkaCluster.getWorkerNodes().get(0).getMetadata().getLabels()
            .get("node.kubernetes.io/instance-type");
    kafkaProvisioner = ManagedKafkaProvisioner.create(kafkaCluster);
    kafkaProvisioner.setup();
    omb = new OMB(KubeClusterResource.connectToKubeCluster(PerformanceEnvironment.OMB_KUBECONFIG));
    omb.install(kafkaProvisioner.getTlsConfig());
    // TODO: if there is an existing result, make sure it's the same test setup
    profilingResult.ombNodeType = omb.getOmbCluster().getWorkerNodes().get(0).getMetadata().getLabels()
            .get("node.kubernetes.io/instance-type");
    profilingResult.ombWorkerNodes = omb.getOmbCluster().getWorkerNodes().size();
    AvailableResources resources = getMinAvailableResources(omb.getOmbCluster().getWorkerNodes().stream());
    // use all available resources on the worker nodes, with 2 workers per node
    // if (resources.memoryBytes > 16*ONE_GB || resources.memoryBytes < 8*ONE_GB) {
    //     throw new IllegalStateException("Client instance types are expected to have 16 GB");
    // }
    // assume instead resources that will fit on a 2xlarge or xlarge
    resources.cpuMillis = Math.min(6400, resources.cpuMillis);
    resources.memoryBytes = Math.min(12 * ONE_GB, resources.memoryBytes);
    omb.setWorkerCpu(Quantity.parse(resources.cpuMillis / 2 + "m"));
    omb.setWorkerContainerMemory(Quantity.parse(String.valueOf(resources.memoryBytes / 2)));
    profilingResult.ombWorkerCpu = omb.getWorkerCpu();
    profilingResult.ombWorkerMemory = omb.getWorkerContainerMemory();
    LOGGER.info("OMB Workers will use {} cpu and {} memory requests", omb.getWorkerCpu(), omb.getWorkerContainerMemory());
    if (profilingResult.completedStep == null) {
        installedProvisioner = true;
        kafkaProvisioner.install();
        writeResults(Step.SETUP);
    }
}
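The clamp above caps each node's OMB budget at 6400 millicores and 12 GiB, then halves it because two workers are scheduled per node. A worked sketch of that arithmetic, assuming an illustrative node reporting 8000m CPU and 32 GiB allocatable (the node figures and class name are hypothetical, not taken from the profiler):

public class WorkerSizingSketch {
    private static final long ONE_GB = 1L << 30;

    public static void main(String[] args) {
        long cpuMillis = Math.min(6400, 8000);                  // clamp -> 6400m per node
        long memoryBytes = Math.min(12 * ONE_GB, 32 * ONE_GB);  // clamp -> 12 GiB per node
        // Two workers per node, so each worker requests half the clamped budget.
        System.out.println(cpuMillis / 2 + "m");                // 3200m
        System.out.println(memoryBytes / 2);                    // 6442450944 bytes (6 GiB)
    }
}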
Use of org.bf2.srs.fleetmanager.execution.manager.Worker in project srs-fleet-manager by bf2fc6cc711aee1a0c2a.
The class JobWrapper, method execute:
@Override
@SneakyThrows
@ActivateRequestContext
public void execute(JobExecutionContext quartzJobContext) {
    Task task = loadTask(quartzJobContext);
    List<Worker> selectedWorkers = workers.stream()
            .filter(w -> w.supports(task) && !workerExclusions.contains(w.getClass()))
            .collect(toList());
    for (Worker worker : selectedWorkers) {
        WorkerContextImpl wCtx = loadWorkerContext(quartzJobContext, worker, task);
        Instant next = null;
        Exception lastException = null;
        try {
            log.debug("Task Manager (task = {}, worker = {}, workerContext = {}): Executing task.", task, worker, wCtx);
            worker.execute(task, wCtx);
            wCtx.getDelayedActions().forEach(Runnable::run);
            // OK vvv
            // Reset retry counter
            wCtx.setRetryAttempts(0);
            // Reset min retry counter
            wCtx.setMinRetries(task.getSchedule().getMinRetries());
            // Normal rescheduling
            next = nextExecution(task);
        } catch (Exception anEx) {
            // TODO Throwable?
            lastException = anEx;
            if (anEx instanceof RetryExecutionControlException) {
                log.debug("Task Manager (task = {}, worker = {}, workerContext = {}): Task requested a retry.", task, worker, wCtx, anEx);
                RetryExecutionControlException ex = (RetryExecutionControlException) anEx;
                if (ex.isForce() && wCtx.getMinRetries() < Integer.MAX_VALUE) {
                    // Make space for the forced retry, up to Integer.MAX_VALUE
                    wCtx.setMinRetries(wCtx.getMinRetries() + 1);
                    next = Instant.now().plus(Duration.ofSeconds(1));
                }
                if (ex.getMinRetries() > wCtx.getMinRetries()) {
                    wCtx.setMinRetries(ex.getMinRetries());
                }
                lastException = null;
            }
            if (wCtx.getRetryAttempts() < wCtx.getMinRetries() && (next == null)) {
                // Reschedule with backoff if minRetries has not been reached yet
                next = Instant.now().plus(backoff(wCtx.getRetryAttempts()));
            }
            if (anEx instanceof StopExecutionControlException) {
                log.debug("Task Manager (task = {}, worker = {}, workerContext = {}): Task requested a stop.", task, worker, wCtx, anEx);
                // Unschedule
                next = null;
                lastException = null;
            }
            if (lastException != null) {
                log.warn("Task Manager (task = {}, worker = {}, workerContext = {}, nextExecution = {}): Task threw an exception during execution: {}", task, worker, wCtx, next, anEx);
            }
            wCtx.setRetryAttempts(wCtx.getRetryAttempts() + 1);
        } finally {
            // Unlikely used
            wCtx.setDelayedActions(new ArrayList<>(0));
            saveWorkerContext(quartzJobContext, wCtx, worker);
            saveTask(quartzJobContext, task);
            // Scheduling
            if (next != null) {
                if (wCtx.getRetryAttempts() == wCtx.getMinRetries()) {
                    log.info("Task Manager (task = {}, worker = {}, workerContext = {}): Last rescheduling at {}.", task, worker, wCtx, next);
                } else {
                    log.debug("Task Manager (task = {}, worker = {}, workerContext = {}): Rescheduling task at {}.", task, worker, wCtx, next);
                }
                taskManager.retrigger(task, next);
            } else {
                try {
                    log.debug("Task Manager (task = {}, worker = {}, workerContext = {}): Executing finallyExecute. Last exception = {}", task, worker, wCtx, lastException);
                    worker.finallyExecute(task, wCtx, ofNullable(lastException));
                    wCtx.getDelayedActions().forEach(Runnable::run);
                } catch (Exception ex) {
                    log.warn("Task Manager (task = {}, worker = {}, workerContext = {}): Ignoring an exception thrown in finallyExecute: {}", task, worker, wCtx, ex);
                } finally {
                    log.debug("Task Manager (task = {}, worker = {}, workerContext = {}): Removing task.", task, worker, wCtx);
                    taskManager.remove(task);
                }
            }
        }
    }
}
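JobWrapper.execute drives every matching Worker through the same contract: supports selects workers for a task, execute performs the work (a thrown RetryExecutionControlException or StopExecutionControlException steers rescheduling), and finallyExecute runs once the task will no longer be rescheduled. A minimal sketch of a conforming worker, with method signatures inferred from the calls above and a hypothetical task type:

import java.util.Optional;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical worker; SendNotificationTask is an illustrative task type,
// and the signatures are inferred from JobWrapper.execute, not copied
// from the Worker interface.
public class SendNotificationWorker implements Worker {

    private static final Logger log = LoggerFactory.getLogger(SendNotificationWorker.class);

    @Override
    public boolean supports(Task task) {
        return task instanceof SendNotificationTask;
    }

    @Override
    public void execute(Task task, WorkerContext ctx) throws Exception {
        // Perform the work; any exception thrown here feeds the
        // retry/backoff handling in JobWrapper.execute.
        log.info("Sending notification for {}", task);
    }

    @Override
    public void finallyExecute(Task task, WorkerContext ctx, Optional<Exception> lastException) {
        // Called when the task is being removed rather than rescheduled,
        // e.g. to record a terminal failure.
        lastException.ifPresent(ex -> log.warn("Task {} ended with an error", task, ex));
    }
}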
Use of org.bf2.srs.fleetmanager.execution.manager.Worker in project kas-fleetshard by bf2fc6cc711aee1a0c2a.
The class OMB, method createWorker:
private void createWorker(String jvmOpts, String name, Node node) throws IOException {
    KubeClient kubeClient = ombCluster.kubeClient();
    // Deployment: a single OMB benchmark-worker pod, labelled so the Service below can select it
    DeploymentBuilder deploymentBuilder = new DeploymentBuilder()
            .editOrNewMetadata()
                .withName(name)
                .withNamespace(Constants.OMB_NAMESPACE)
                .addToLabels("app", "worker")
            .endMetadata()
            .editOrNewSpec()
                .withReplicas(1)
                .editOrNewSelector()
                    .addToMatchLabels("worker", name)
                .endSelector()
                .editOrNewTemplate()
                    .editOrNewMetadata()
                        .addToLabels("worker", name)
                        .addToLabels("app", "worker")
                    .endMetadata()
                    .editOrNewSpec()
                        .addNewContainer()
                            .withName("worker")
                            .withImage(Constants.OMB_WORKER_IMAGE)
                            .withResources(new ResourceRequirementsBuilder()
                                    .withLimits(getResourceLimits())
                                    .withRequests(getResourceLimits())
                                    .build())
                            .addToCommand("sh", "-c")
                            .addToEnv(new EnvVar("_JAVA_OPTIONS", jvmOpts, null))
                            .addToEnv(envVars.toArray(new EnvVar[0]))
                            .addToArgs("cd /tmp/src; ./bin/benchmark-worker")
                            .addToPorts(new ContainerPortBuilder().withContainerPort(8080).build(),
                                    new ContainerPortBuilder().withContainerPort(8081).build())
                            .withLivenessProbe(new ProbeBuilder()
                                    .withInitialDelaySeconds(10)
                                    .withHttpGet(new HTTPGetActionBuilder()
                                            .withPort(new IntOrString(8080))
                                            .withPath("counters-stats")
                                            .build())
                                    .build())
                            .addNewVolumeMount()
                                .withName("ca")
                                .withMountPath("/cert")
                                .withReadOnly(true)
                            .endVolumeMount()
                        .endContainer()
                        .withTerminationGracePeriodSeconds(15L)
                        .addNewVolume()
                            .withName("ca")
                            .editOrNewSecret()
                                .withSecretName("ext-listener-crt")
                            .endSecret()
                        .endVolume()
                    .endSpec()
                .endTemplate()
            .endSpec();
    // Pin the worker to the requested node, if one was given
    if (node != null) {
        deploymentBuilder.editSpec().editTemplate().editSpec()
                .withNodeSelector(Collections.singletonMap("kubernetes.io/hostname",
                        node.getMetadata().getLabels().get("kubernetes.io/hostname")))
                .endSpec().endTemplate().endSpec();
    }
    kubeClient.client().apps().deployments().inNamespace(Constants.OMB_NAMESPACE)
            .createOrReplace(deploymentBuilder.build());
    // Service exposing the worker's control port (8080) inside the cluster
    kubeClient.client().services().inNamespace(Constants.OMB_NAMESPACE)
            .createOrReplace(new ServiceBuilder()
                    .editOrNewMetadata()
                        .withName(name)
                        .withNamespace(Constants.OMB_NAMESPACE)
                        .addToLabels("app", "worker")
                    .endMetadata()
                    .editOrNewSpec()
                        .addToSelector("worker", name)
                        .addNewPort()
                            .withPort(80)
                            .withTargetPort(new IntOrString(8080))
                            .withProtocol("TCP")
                        .endPort()
                    .endSpec()
                    .build());
    // Route making the Service reachable from outside the cluster; the long
    // HAProxy timeout suits long-running benchmark requests
    kubeClient.client().adapt(OpenShiftClient.class).routes().inNamespace(Constants.OMB_NAMESPACE)
            .createOrReplace(new RouteBuilder()
                    .editOrNewMetadata()
                        .withName(name)
                        .withNamespace(Constants.OMB_NAMESPACE)
                        .withAnnotations(Map.of("haproxy.router.openshift.io/timeout", "360s"))
                        .addToLabels("app", "worker")
                        .addToLabels("app.kubernetes.io/name", name)
                    .endMetadata()
                    .editOrNewSpec()
                        .editOrNewTo()
                            .withKind("Service")
                            .withName(name)
                        .endTo()
                    .endSpec()
                    .build());
}
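Each call thus produces a node-pinned Deployment plus a Service and Route fronting its port 8080. A hypothetical caller matching the two-workers-per-node sizing from InstanceProfiler.setup above (the loop and names are illustrative, not OMB code):

// Illustrative only: place two benchmark workers on every OMB worker node.
int index = 0;
for (Node node : ombCluster.getWorkerNodes()) {
    createWorker(jvmOpts, "worker-" + index++, node);
    createWorker(jvmOpts, "worker-" + index++, node);
}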