use of com.netflix.titus.common.util.time.Clock in project titus-control-plane by Netflix.
the class BatchDifferenceResolver method createNewTaskAction.
/**
 * Builds the change action that creates (or replaces) the task at the given index of a batch job.
 *
 * <p>Before building anything, the number of child tasks whose state satisfies
 * {@code TaskState.isRunning} is compared with the job's required size. If that count already
 * meets or exceeds the required size, a code-invariant violation is reported and no action is returned.
 *
 * <p>Otherwise the task context obtained from {@code getTaskContext} is extended with the resource pool
 * and tier of the job's capacity group, and the resulting create/replace action is passed through
 * {@code storeWriteRetryInterceptor}.
 *
 * @param refJobView             reference view of the batch job
 * @param taskIndex              index of the task to create
 * @param previousTask           holder of the task being replaced, if any (forwarded to {@code getTaskContext})
 * @param unassignedIpAllocations forwarded to {@code getTaskContext}
 * @param ebsVolumeIds           forwarded to {@code getTaskContext}
 * @return the wrapped store action, or {@link Optional#empty()} when the safety check fails
 */
private Optional<TitusChangeAction> createNewTaskAction(BatchJobView refJobView, int taskIndex, Optional<EntityHolder> previousTask, List<String> unassignedIpAllocations, List<String> ebsVolumeIds) {
    // Safety check: refuse to create a task when enough running tasks already exist.
    long runningTaskCount = refJobView.getJobHolder().getChildren().stream()
            .map(child -> ((Task) child.getEntity()).getStatus().getState())
            .filter(state -> TaskState.isRunning(state))
            .count();
    if (runningTaskCount >= refJobView.getRequiredSize()) {
        titusRuntime.getCodeInvariants().inconsistent(
                "Batch job reconciler attempts to create too many tasks: jobId=%s, requiredSize=%s, current=%s",
                refJobView.getJob().getId(),
                refJobView.getRequiredSize(),
                runningTaskCount
        );
        return Optional.empty();
    }

    Map<String, String> baseContext = getTaskContext(previousTask, unassignedIpAllocations, ebsVolumeIds);

    // Resolve the capacity group so that its resource pool and tier can be recorded in the task context.
    JobDescriptor jobDescriptor = refJobView.getJob().getJobDescriptor();
    ApplicationSLA capacityGroup = JobManagerUtil.getCapacityGroupDescriptor(jobDescriptor, capacityGroupService);
    String resourcePool = capacityGroup.getResourcePool();
    Map<String, String> enrichedContext = CollectionsExt.copyAndAdd(
            baseContext,
            ImmutableMap.of(
                    TaskAttributes.TASK_ATTRIBUTES_RESOURCE_POOL, resourcePool,
                    TaskAttributes.TASK_ATTRIBUTES_TIER, capacityGroup.getTier().name()
            )
    );

    TitusChangeAction retryingStoreAction = storeWriteRetryInterceptor.apply(
            createOrReplaceTaskAction(runtime, jobStore, refJobView.getJobHolder(), taskIndex, versionSupplier, clock, enrichedContext)
    );
    return Optional.of(retryingStoreAction);
}
use of com.netflix.titus.common.util.time.Clock in project titus-control-plane by Netflix.
the class TaskTimeoutChangeActions method setTimeout.
/**
 * Creates a reconciler-triggered change action that tags a task's entity holder with a timeout deadline.
 *
 * <p>The tag name is looked up in {@code STATE_TAGS} by task state, and the tag value is
 * {@code clock.wallTime() + timeoutMs}, computed when the model update runs. For
 * {@code TaskState.KillInitiated} the {@code KILL_INITIATED_ATTEMPT_TAG} tag is additionally set to 0.
 * If the task cannot be found in the job holder, the model update produces no change.
 *
 * @param taskId    id of the task to tag
 * @param taskState state for which the timeout is being tracked
 * @param timeoutMs timeout in milliseconds, added to the clock's wall time
 * @param clock     time source used to compute the deadline
 * @return the change action applying the tag(s) to the running model
 * @throws IllegalArgumentException if {@code STATE_TAGS} has no entry for {@code taskState}
 */
public static TitusChangeAction setTimeout(String taskId, TaskState taskState, long timeoutMs, Clock clock) {
    String timeoutTag = STATE_TAGS.get(taskState);
    Preconditions.checkArgument(timeoutTag != null, "Timeout not tracked for state %s", taskState);

    return TitusChangeAction.newAction("setTimeout")
            .id(taskId)
            .trigger(Trigger.Reconciler)
            .summary("Setting timeout for task in state %s: %s", taskState, DateTimeExt.toTimeUnitString(timeoutMs))
            .callMetadata(JobManagerConstants.RECONCILER_CALLMETADATA.toBuilder().withCallReason("configure timeout").build())
            .applyModelUpdate(self -> {
                TitusModelAction tagUpdate = TitusModelAction.newModelUpdate(self).taskMaybeUpdate(jobHolder ->
                        jobHolder.findById(taskId).map(taskHolder -> {
                            EntityHolder withDeadline = taskHolder.addTag(timeoutTag, clock.wallTime() + timeoutMs);
                            // Kill attempts are counted from zero once the kill timeout is first configured.
                            EntityHolder taggedHolder = taskState == TaskState.KillInitiated
                                    ? withDeadline.addTag(KILL_INITIATED_ATTEMPT_TAG, 0)
                                    : withDeadline;
                            return Pair.of(jobHolder.addChild(taggedHolder), taggedHolder);
                        })
                );
                return ModelActionHolder.running(tagUpdate);
            });
}
use of com.netflix.titus.common.util.time.Clock in project titus-control-plane by Netflix.
the class DefaultV3JobOperations method enterActiveMode.
/**
 * Activation hook: creates the reconciliation framework, wires event logging, a metric and a
 * job-completion handler onto its event stream, and finally starts the framework.
 */
@Activator
public void enterActiveMode() {
    this.reconciliationFramework = jobReconciliationFrameworkFactory.newInstance();

    // BUG: event stream breaks permanently, and cannot be retried.
    // As we cannot fix the underlying issue yet, we have to be able to discover when it happens.
    AtomicLong lastErrorTimestamp = new AtomicLong();
    Clock wallClock = titusRuntime.getClock();
    this.transactionLoggerSubscription = JobTransactionLogger.logEvents(reconciliationFramework, lastErrorTimestamp, wallClock);

    // Gauge reports 0 while no error was recorded, otherwise the time elapsed since the recorded value.
    PolledMeter.using(titusRuntime.getRegistry())
            .withName(METRIC_EVENT_STREAM_LAST_ERROR)
            .monitorValue(lastErrorTimestamp, lastError -> lastError.get() > 0 ? wallClock.wallTime() - lastError.get() : 0);

    // Remove finished jobs from the reconciliation framework.
    Observable<JobManagerReconcilerEvent> bufferedEvents = reconciliationFramework.events()
            .onBackpressureBuffer(
                    OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE,
                    () -> logger.warn("Overflowed the buffer size: " + OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE),
                    BackpressureOverflow.ON_OVERFLOW_ERROR
            )
            .doOnSubscribe(() ->
                    // On subscription, run the completion handler over every job currently in the framework.
                    reconciliationFramework.orderedView(IndexKind.StatusCreationTime)
                            .forEach(jobHolder -> handleJobCompletedEvent(jobHolder))
            );

    this.reconcilerEventSubscription = titusRuntime.persistentStream(bufferedEvents).subscribe(
            event -> {
                // Only model updates carry a changed entity holder to inspect.
                if (!(event instanceof JobModelUpdateReconcilerEvent)) {
                    return;
                }
                JobModelUpdateReconcilerEvent modelUpdate = (JobModelUpdateReconcilerEvent) event;
                handleJobCompletedEvent(modelUpdate.getChangedEntityHolder());
            },
            e -> logger.error("Event stream terminated with an error", e),
            () -> logger.info("Event stream completed")
    );

    reconciliationFramework.start();
}
use of com.netflix.titus.common.util.time.Clock in project titus-control-plane by Netflix.
the class DifferenceResolverUtils method findTaskStateTimeouts.
/**
 * Find all tasks that are stuck in a specific state and produce the {@link ChangeAction changes} needed
 * to move them forward.
 *
 * <p>For each child task of the job:
 * <ul>
 *   <li>A {@code Started} task of a batch job is checked only against its runtime limit; when the limit
 *       is exceeded a kill action is added. This path does not consult the rate limiter.</li>
 *   <li>Otherwise the task's timeout status is evaluated:
 *     <ul>
 *       <li>{@code Ignore} / {@code Pending}: nothing to do.</li>
 *       <li>{@code NotSet}: a timeout is configured for the {@code Launched}, {@code StartInitiated}
 *           and {@code KillInitiated} states, provided the configured value is positive.</li>
 *       <li>{@code TimedOut}: limited by the {@link TokenBucket stuckInStateRateLimiter}; if a token is
 *           available, the task is killed, a further kill attempt is made, or (after the configured
 *           number of kill attempts) the task is moved to {@code Finished} in the running model.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param engine                  reconciliation engine the actions are bound to
 * @param runningJobView          view of the job whose child tasks are inspected
 * @param configuration           source of per-state timeouts and the kill attempt limit
 * @param runtime                 forwarded to the kill actions
 * @param jobStore                forwarded to the kill actions
 * @param versionSupplier         forwarded to the created actions
 * @param stuckInStateRateLimiter token bucket limiting how many timed-out tasks are handled per call
 * @param titusRuntime            provides the clock and is forwarded to the created actions
 * @return the change actions to apply; empty if nothing needs to be done
 */
public static List<ChangeAction> findTaskStateTimeouts(ReconciliationEngine<JobManagerReconcilerEvent> engine, JobView runningJobView, JobManagerConfiguration configuration, JobServiceRuntime runtime, JobStore jobStore, VersionSupplier versionSupplier, TokenBucket stuckInStateRateLimiter, TitusRuntime titusRuntime) {
    Clock clock = titusRuntime.getClock();
    List<ChangeAction> actions = new ArrayList<>();
    runningJobView.getJobHolder().getChildren().forEach(taskHolder -> {
        Task task = taskHolder.getEntity();
        TaskState taskState = task.getStatus().getState();

        // Started batch tasks are governed by the job's runtime limit, not by the state timeout tags.
        if (JobFunctions.isBatchJob(runningJobView.getJob()) && taskState == TaskState.Started) {
            Job<BatchJobExt> batchJob = runningJobView.getJob();
            // We expect runtime limit to be always set, so this is just extra safety measure.
            long runtimeLimitMs = Math.max(BatchJobExt.RUNTIME_LIMIT_MIN, batchJob.getJobDescriptor().getExtensions().getRuntimeLimitMs());
            long deadline = task.getStatus().getTimestamp() + runtimeLimitMs;
            if (deadline < clock.wallTime()) {
                actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(engine, task, runtime, jobStore, versionSupplier, TaskStatus.REASON_RUNTIME_LIMIT_EXCEEDED, "Task running too long (runtimeLimit=" + runtimeLimitMs + "ms)", titusRuntime));
            }
            return;
        }

        TaskTimeoutChangeActions.TimeoutStatus timeoutStatus = TaskTimeoutChangeActions.getTimeoutStatus(taskHolder, clock);
        switch (timeoutStatus) {
            case Ignore:
            case Pending:
                break;
            case NotSet:
                // Stays negative for states that have no timeout, so no timeout action is created for them.
                long timeoutMs = -1;
                switch (taskState) {
                    case Launched:
                        timeoutMs = configuration.getTaskInLaunchedStateTimeoutMs();
                        break;
                    case StartInitiated:
                        timeoutMs = isBatch(runningJobView.getJob()) ? configuration.getBatchTaskInStartInitiatedStateTimeoutMs() : configuration.getServiceTaskInStartInitiatedStateTimeoutMs();
                        break;
                    case KillInitiated:
                        timeoutMs = configuration.getTaskInKillInitiatedStateTimeoutMs();
                        break;
                }
                if (timeoutMs > 0) {
                    actions.add(TaskTimeoutChangeActions.setTimeout(taskHolder.getId(), taskState, timeoutMs, clock));
                }
                break;
            case TimedOut:
                // Rate limit the handling of stuck tasks; a task skipped here is still timed out on the next pass.
                if (!stuckInStateRateLimiter.tryTake()) {
                    break;
                }
                if (taskState == TaskState.KillInitiated) {
                    // Number of kill attempts made so far, including the one that has just timed out.
                    int attempts = TaskTimeoutChangeActions.getKillInitiatedAttempts(taskHolder) + 1;
                    if (attempts >= configuration.getTaskKillAttempts()) {
                        // Give up on killing: mark the task as finished in the running model.
                        actions.add(BasicTaskActions.updateTaskInRunningModel(
                                task.getId(),
                                V3JobOperations.Trigger.Reconciler,
                                configuration,
                                engine,
                                taskParam -> Optional.of(taskParam.toBuilder()
                                        .withStatus(taskParam.getStatus().toBuilder()
                                                .withState(TaskState.Finished)
                                                .withReasonCode(TaskStatus.REASON_STUCK_IN_KILLING_STATE)
                                                // Fixed: the original concatenated "state" without a leading space.
                                                .withReasonMessage("stuck in " + taskState + " state")
                                                .build())
                                        .build()),
                                "TimedOut in KillInitiated state",
                                versionSupplier,
                                titusRuntime,
                                JobManagerConstants.RECONCILER_CALLMETADATA.toBuilder().withCallReason("Kill initiated").build()
                        ));
                    } else {
                        actions.add(TaskTimeoutChangeActions.incrementTaskKillAttempt(task.getId(), configuration.getTaskInKillInitiatedStateTimeoutMs(), clock));
                        actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(engine, task, runtime, jobStore, versionSupplier, TaskStatus.REASON_STUCK_IN_KILLING_STATE, "Another kill attempt (" + (attempts + 1) + ')', titusRuntime));
                    }
                } else {
                    actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(engine, task, runtime, jobStore, versionSupplier, TaskStatus.REASON_STUCK_IN_STATE, "Task stuck in " + taskState + " state", titusRuntime));
                }
                break;
        }
    });
    return actions;
}
Aggregations