Use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.
The class ExtendedJobSanitizerTest, method testLegacyBatchJobDisruptionBudgetRewrite:
@Test
public void testLegacyBatchJobDisruptionBudgetRewrite() {
    JobDescriptor<BatchJobExt> jobDescriptor = newBatchJob().getValue().toBuilder()
            .withDisruptionBudget(DisruptionBudget.none())
            .build();
    ExtendedJobSanitizer sanitizer = new ExtendedJobSanitizer(configuration, jobAssertions, entitySanitizer,
            disruptionBudgetSanitizer, jd -> false, jd -> false, titusRuntime);

    Optional<JobDescriptor<BatchJobExt>> sanitizedOpt = sanitizer.sanitize(jobDescriptor);
    assertThat(sanitizedOpt).isNotEmpty();

    JobDescriptor<BatchJobExt> sanitized = sanitizedOpt.get();
    String nonCompliant = sanitized.getAttributes().get(TITUS_NON_COMPLIANT_FEATURES);
    assertThat(nonCompliant).contains(JobFeatureComplianceChecks.DISRUPTION_BUDGET_FEATURE);

    SelfManagedDisruptionBudgetPolicy policy =
            (SelfManagedDisruptionBudgetPolicy) sanitized.getDisruptionBudget().getDisruptionBudgetPolicy();
    assertThat(policy.getRelocationTimeMs())
            .isEqualTo((long) (jobDescriptor.getExtensions().getRuntimeLimitMs() * BATCH_RUNTIME_LIMIT_FACTOR));
}
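The final assertion simply scales the job's runtime limit by BATCH_RUNTIME_LIMIT_FACTOR to obtain the expected relocation time. A minimal, self-contained sketch of that arithmetic; the factor value 1.1 below is only an illustrative assumption, not the project's actual constant:

// Sketch of the relocation-time derivation checked by the test above.
// BATCH_RUNTIME_LIMIT_FACTOR = 1.1 is an assumed value for illustration only.
public final class RelocationTimeSketch {

    private static final double BATCH_RUNTIME_LIMIT_FACTOR = 1.1;

    static long relocationTimeMs(long runtimeLimitMs) {
        return (long) (runtimeLimitMs * BATCH_RUNTIME_LIMIT_FACTOR);
    }

    public static void main(String[] args) {
        // A 1-hour runtime limit yields a proportionally longer relocation window.
        System.out.println(relocationTimeMs(3_600_000L)); // prints 3960000
    }
}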
Use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.
The class BatchDifferenceResolver, method applyStore:
private List<ChangeAction> applyStore(ReconciliationEngine<JobManagerReconcilerEvent> engine,
                                      BatchJobView refJobView,
                                      EntityHolder storeJob,
                                      AtomicInteger allowedNewTasks) {
    if (!storeWriteRetryInterceptor.executionLimits(storeJob)) {
        return Collections.emptyList();
    }

    List<ChangeAction> actions = new ArrayList<>();

    EntityHolder refJobHolder = refJobView.getJobHolder();
    Job<BatchJobExt> refJob = refJobHolder.getEntity();

    if (!refJobHolder.getEntity().equals(storeJob.getEntity())) {
        actions.add(storeWriteRetryInterceptor.apply(BasicJobActions.updateJobInStore(engine, jobStore)));
    }

    boolean isJobTerminating = refJob.getStatus().getState() == JobState.KillInitiated;

    for (EntityHolder referenceTask : refJobHolder.getChildren()) {
        Optional<EntityHolder> storeHolder = storeJob.findById(referenceTask.getId());
        boolean refAndStoreInSync = storeHolder.isPresent()
                && DifferenceResolverUtils.areEquivalent(storeHolder.get(), referenceTask);
        boolean shouldRetry = !isJobTerminating
                && DifferenceResolverUtils.shouldRetry(refJob, referenceTask.getEntity())
                && allowedNewTasks.get() > 0;

        if (refAndStoreInSync) {
            BatchJobTask storeTask = storeHolder.get().getEntity();
            if (shouldRetry && TaskRetryers.shouldRetryNow(referenceTask, clock)) {
                logger.info("Retrying task: oldTaskId={}, index={}", referenceTask.getId(), storeTask.getIndex());
                createNewTaskAction(refJobView, storeTask.getIndex(), Optional.of(referenceTask),
                        Collections.emptyList(), Collections.emptyList()).ifPresent(actions::add);
            }
        } else {
            Task task = referenceTask.getEntity();
            CallMetadata callMetadata = RECONCILER_CALLMETADATA.toBuilder()
                    .withCallReason("Writing runtime state changes to store")
                    .build();
            actions.add(storeWriteRetryInterceptor.apply(
                    BasicTaskActions.writeReferenceTaskToStore(jobStore, engine, task.getId(), callMetadata, titusRuntime)));
        }

        // Both current and delayed retries are counted
        if (shouldRetry) {
            allowedNewTasks.decrementAndGet();
        }
    }
    return actions;
}
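The allowedNewTasks counter above acts as a shared retry budget for a single reconciliation pass: each retry decision consumes one unit, whether the retry runs now or is delayed. A self-contained sketch of that pattern, with plain strings standing in for Titus tasks and actions (all names below are illustrative assumptions, not Titus types):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

// Sketch: cap how many retry actions a single reconciliation pass may emit.
final class RetryBudgetSketch {

    static List<String> planRetries(List<String> failedTaskIds, AtomicInteger allowedNewTasks) {
        List<String> actions = new ArrayList<>();
        for (String taskId : failedTaskIds) {
            boolean shouldRetry = allowedNewTasks.get() > 0;
            if (!shouldRetry) {
                continue; // budget exhausted; remaining retries wait for a later pass
            }
            actions.add("retry:" + taskId);
            // Each retry decision consumes one unit of the shared budget.
            allowedNewTasks.decrementAndGet();
        }
        return actions;
    }
}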
Use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.
The class DifferenceResolverUtils, method findTaskStateTimeouts:
/**
 * Find all tasks that are stuck in a specific state. The number of {@link ChangeAction changes} will be limited
 * by the {@link TokenBucket stuckInStateRateLimiter}.
 */
public static List<ChangeAction> findTaskStateTimeouts(ReconciliationEngine<JobManagerReconcilerEvent> engine,
                                                       JobView runningJobView,
                                                       JobManagerConfiguration configuration,
                                                       JobServiceRuntime runtime,
                                                       JobStore jobStore,
                                                       VersionSupplier versionSupplier,
                                                       TokenBucket stuckInStateRateLimiter,
                                                       TitusRuntime titusRuntime) {
    Clock clock = titusRuntime.getClock();
    List<ChangeAction> actions = new ArrayList<>();

    runningJobView.getJobHolder().getChildren().forEach(taskHolder -> {
        Task task = taskHolder.getEntity();
        TaskState taskState = task.getStatus().getState();

        if (JobFunctions.isBatchJob(runningJobView.getJob()) && taskState == TaskState.Started) {
            Job<BatchJobExt> batchJob = runningJobView.getJob();

            // We expect the runtime limit to always be set, so this is just an extra safety measure.
            long runtimeLimitMs = Math.max(BatchJobExt.RUNTIME_LIMIT_MIN, batchJob.getJobDescriptor().getExtensions().getRuntimeLimitMs());

            long deadline = task.getStatus().getTimestamp() + runtimeLimitMs;
            if (deadline < clock.wallTime()) {
                actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(
                        engine, task, runtime, jobStore, versionSupplier,
                        TaskStatus.REASON_RUNTIME_LIMIT_EXCEEDED,
                        "Task running too long (runtimeLimit=" + runtimeLimitMs + "ms)",
                        titusRuntime
                ));
            }
            return;
        }

        TaskTimeoutChangeActions.TimeoutStatus timeoutStatus = TaskTimeoutChangeActions.getTimeoutStatus(taskHolder, clock);
        switch (timeoutStatus) {
            case Ignore:
            case Pending:
                break;
            case NotSet:
                long timeoutMs = -1;
                switch (taskState) {
                    case Launched:
                        timeoutMs = configuration.getTaskInLaunchedStateTimeoutMs();
                        break;
                    case StartInitiated:
                        timeoutMs = isBatch(runningJobView.getJob())
                                ? configuration.getBatchTaskInStartInitiatedStateTimeoutMs()
                                : configuration.getServiceTaskInStartInitiatedStateTimeoutMs();
                        break;
                    case KillInitiated:
                        timeoutMs = configuration.getTaskInKillInitiatedStateTimeoutMs();
                        break;
                }
                if (timeoutMs > 0) {
                    actions.add(TaskTimeoutChangeActions.setTimeout(taskHolder.getId(), task.getStatus().getState(), timeoutMs, clock));
                }
                break;
            case TimedOut:
                if (!stuckInStateRateLimiter.tryTake()) {
                    break;
                }
                if (task.getStatus().getState() == TaskState.KillInitiated) {
                    int attempts = TaskTimeoutChangeActions.getKillInitiatedAttempts(taskHolder) + 1;
                    if (attempts >= configuration.getTaskKillAttempts()) {
                        actions.add(BasicTaskActions.updateTaskInRunningModel(
                                task.getId(), V3JobOperations.Trigger.Reconciler, configuration, engine,
                                taskParam -> Optional.of(taskParam.toBuilder()
                                        .withStatus(taskParam.getStatus().toBuilder()
                                                .withState(TaskState.Finished)
                                                .withReasonCode(TaskStatus.REASON_STUCK_IN_KILLING_STATE)
                                                .withReasonMessage("stuck in " + taskState + " state")
                                                .build())
                                        .build()),
                                "TimedOut in KillInitiated state",
                                versionSupplier, titusRuntime,
                                JobManagerConstants.RECONCILER_CALLMETADATA.toBuilder().withCallReason("Kill initiated").build()
                        ));
                    } else {
                        actions.add(TaskTimeoutChangeActions.incrementTaskKillAttempt(task.getId(), configuration.getTaskInKillInitiatedStateTimeoutMs(), clock));
                        actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(
                                engine, task, runtime, jobStore, versionSupplier,
                                TaskStatus.REASON_STUCK_IN_KILLING_STATE,
                                "Another kill attempt (" + (attempts + 1) + ')',
                                titusRuntime
                        ));
                    }
                } else {
                    actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(
                            engine, task, runtime, jobStore, versionSupplier,
                            TaskStatus.REASON_STUCK_IN_STATE,
                            "Task stuck in " + taskState + " state",
                            titusRuntime
                    ));
                }
                break;
        }
    });
    return actions;
}
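The TimedOut branch is gated by stuckInStateRateLimiter.tryTake(), so only a bounded number of stuck tasks are acted on per reconciliation pass. A toy sketch of that gating pattern; the class below is an assumption for illustration only, not Titus's TokenBucket implementation:

// Toy token bucket illustrating the tryTake() gating pattern used above.
final class SimpleTokenBucket {

    private long tokens;

    SimpleTokenBucket(long initialTokens) {
        this.tokens = initialTokens;
    }

    synchronized boolean tryTake() {
        if (tokens <= 0) {
            return false; // no capacity left; the caller skips the action this pass
        }
        tokens--;
        return true;
    }
}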
Use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.
The class JobRuntimePredictionSanitizer, method capPredictionToRuntimeLimit:
/**
 * Use the prediction when available and shorter than the runtime limit; otherwise the runtime limit becomes
 * the prediction, provided it is within {@link JobRuntimePredictionConfiguration#getMaxOpportunisticRuntimeLimitMs()}.
 */
@SuppressWarnings("unchecked")
private JobDescriptor capPredictionToRuntimeLimit(JobDescriptor jobDescriptor) {
    // Non-batch jobs have been filtered out before this point, so the cast is safe.
    BatchJobExt extensions = ((JobDescriptor<BatchJobExt>) jobDescriptor).getExtensions();
    long runtimeLimitMs = extensions.getRuntimeLimitMs();
    if (runtimeLimitMs <= 0 || runtimeLimitMs > configuration.getMaxOpportunisticRuntimeLimitMs()) {
        // No runtime limit, or too high to be used: noop
        return jobDescriptor;
    }
    return JobFunctions.getJobRuntimePrediction(jobDescriptor)
            .filter(prediction -> runtimeLimitMs > prediction.toMillis())
            .map(ignored -> jobDescriptor)
            .orElseGet(() -> JobFunctions.appendJobDescriptorAttributes(jobDescriptor,
                    ImmutableMap.<String, String>builder()
                            .put(JOB_ATTRIBUTES_RUNTIME_PREDICTION_SEC, Double.toString(runtimeLimitMs / 1000.0))
                            .put(JOB_ATTRIBUTES_RUNTIME_PREDICTION_CONFIDENCE, Double.toString(1.0))
                            .build()));
}
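When the prediction is missing or not shorter than the runtime limit, the method above advertises the runtime limit itself as the prediction, converted to seconds, with confidence 1.0. A hedged sketch of that fallback arithmetic; the attribute keys below are shortened placeholders, not the JOB_ATTRIBUTES_* constants used above:

import java.util.HashMap;
import java.util.Map;

// Sketch of the fallback: treat the runtime limit as the prediction, confidence 1.0.
final class PredictionFallbackSketch {

    static Map<String, String> fallbackAttributes(long runtimeLimitMs) {
        Map<String, String> attrs = new HashMap<>();
        attrs.put("runtimePredictionSec", Double.toString(runtimeLimitMs / 1000.0));
        attrs.put("runtimePredictionConfidence", Double.toString(1.0));
        return attrs;
    }
}
// Example: fallbackAttributes(90_000) -> {runtimePredictionSec=90.0, runtimePredictionConfidence=1.0}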
Use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.
The class JobModelSanitizationTest, method testBatchJobWithIncompleteEfsDefinition:
@Test
public void testBatchJobWithIncompleteEfsDefinition() {
    JobDescriptor<BatchJobExt> jobDescriptor = oneTaskBatchJobDescriptor();
    JobDescriptor<BatchJobExt> incompleteEfsDefinition = JobModel.newJobDescriptor(jobDescriptor)
            .withContainer(JobModel.newContainer(jobDescriptor.getContainer())
                    .withContainerResources(JobModel.newContainerResources(jobDescriptor.getContainer().getContainerResources())
                            .withEfsMounts(Collections.singletonList(new EfsMount("efsId#1", "/data", null, null)))
                            .build())
                    .build())
            .build();
    Job<BatchJobExt> job = JobGenerator.batchJobs(incompleteEfsDefinition).getValue();

    // EFS violation expected
    assertThat(entitySanitizer.validate(job)).hasSize(1);

    // Now do cleanup
    Job<BatchJobExt> sanitized = entitySanitizer.sanitize(job).get();
    assertThat(entitySanitizer.validate(sanitized)).isEmpty();
}
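The test exercises the common validate, sanitize, re-validate flow. A minimal sketch of that flow against a hypothetical sanitizer contract; the interface below is an illustrative assumption, not the Titus EntitySanitizer API:

import java.util.Optional;
import java.util.Set;

// Hypothetical minimal contract mirroring the validate/sanitize flow in the test above.
interface Sanitizer<T> {
    Set<String> validate(T entity);   // violation messages; empty when the entity is compliant
    Optional<T> sanitize(T entity);   // a fixed-up copy when changes were needed
}

final class SanitizeFlowSketch {

    static <T> T validateAndFix(Sanitizer<T> sanitizer, T entity) {
        if (sanitizer.validate(entity).isEmpty()) {
            return entity; // already compliant, nothing to do
        }
        T fixed = sanitizer.sanitize(entity).orElse(entity);
        if (!sanitizer.validate(fixed).isEmpty()) {
            throw new IllegalArgumentException("entity could not be sanitized");
        }
        return fixed;
    }
}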