Search in sources :

Example 11 with ChangeAction

use of com.netflix.titus.common.framework.reconciler.ChangeAction in project titus-control-plane by Netflix.

the class KillInitiatedActions method userInitiateTaskKillAction.

/**
 * Builds the change action behind a user initiated task kill: the task is moved to
 * {@link TaskState#KillInitiated}, the new task version is written to the store, and the kill command is sent to
 * the compute provider. Model updates are produced only after both of those operations complete, so the store
 * write happens before the response is sent back to the user.
 * <p>
 * A task that is already in {@link TaskState#KillInitiated} or {@link TaskState#Finished} yields an empty list of
 * model updates. When {@code shrink} and {@code preventMinSizeUpdate} are both set and the job's desired capacity
 * is not above its minimum, the action fails with {@code JobManagerException.terminateAndShrinkNotAllowed}.
 *
 * @param shrink               when true, a shrink action is added to the reference model alongside the task update
 * @param preventMinSizeUpdate when true (and {@code shrink} is set), refuse to go below the job's minimum size
 * @param reasonCode           reason code recorded in the new task status
 * @param reason               reason message recorded in the new task status; also used as the action summary
 */
public static ChangeAction userInitiateTaskKillAction(ReconciliationEngine<JobManagerReconcilerEvent> engine, JobServiceRuntime executionContext, JobStore jobStore, VersionSupplier versionSupplier, String taskId, boolean shrink, boolean preventMinSizeUpdate, String reasonCode, String reason, TitusRuntime titusRuntime, CallMetadata callMetadata) {
    return TitusChangeAction.newAction("userInitiateTaskKill")
            .id(taskId)
            .trigger(V3JobOperations.Trigger.API)
            .summary(reason)
            .callMetadata(callMetadata)
            .changeWithModelUpdates(changeAction -> JobEntityHolders.toTaskObservable(engine, taskId, titusRuntime).flatMap(targetTask -> {
                // Nothing to do for a task that is already being killed or is done.
                TaskState currentState = targetTask.getStatus().getState();
                boolean alreadyTerminating = currentState == TaskState.KillInitiated || currentState == TaskState.Finished;
                if (alreadyTerminating) {
                    return Observable.just(Collections.<ModelActionHolder>emptyList());
                }

                // Terminate-and-shrink must not push the job below its minimum size when the caller forbids it.
                if (shrink) {
                    Job<ServiceJobExt> serviceJob = engine.getReferenceView().getEntity();
                    Capacity jobCapacity = serviceJob.getJobDescriptor().getExtensions().getCapacity();
                    if (preventMinSizeUpdate && capacityAtOrBelowMin(jobCapacity)) {
                        return Observable.<List<ModelActionHolder>>error(JobManagerException.terminateAndShrinkNotAllowed(serviceJob, targetTask));
                    }
                }

                Task killInitiatedTask = VersionSuppliers.nextVersion(
                        JobFunctions.changeTaskStatus(targetTask, TaskState.KillInitiated, reasonCode, reason, titusRuntime.getClock()),
                        versionSupplier
                );

                // Evaluated lazily, after the store write and kill command: the task may be gone by then,
                // in which case there is nothing to update.
                Callable<List<ModelActionHolder>> modelUpdates = () -> JobEntityHolders.expectTask(engine, targetTask.getId(), titusRuntime).map(ignored -> {
                    TitusModelAction killInitiatedUpdate = TitusModelAction.newModelUpdate(changeAction).taskUpdate(killInitiatedTask);
                    List<ModelActionHolder> holders = new ArrayList<>(ModelActionHolder.allModels(killInitiatedUpdate));
                    if (shrink) {
                        TitusModelAction shrinkJob = createShrinkAction(changeAction, versionSupplier);
                        holders.add(ModelActionHolder.reference(shrinkJob));
                    }
                    return holders;
                }).orElse(Collections.emptyList());

                return jobStore.updateTask(killInitiatedTask)
                        .andThen(createKillAction(executionContext, targetTask))
                        .andThen(Observable.fromCallable(modelUpdates));
            }));
}

/** Tells whether the desired size is already at (or below) the minimum size of the given capacity. */
private static boolean capacityAtOrBelowMin(Capacity capacity) {
    return capacity.getDesired() <= capacity.getMin();
}
Also used : Completable(rx.Completable) JobManagerConstants(com.netflix.titus.api.jobmanager.service.JobManagerConstants) JobServiceRuntime(com.netflix.titus.master.jobmanager.service.JobServiceRuntime) TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) Task(com.netflix.titus.api.jobmanager.model.job.Task) Callable(java.util.concurrent.Callable) ReactorExt(com.netflix.titus.common.util.rx.ReactorExt) ArrayList(java.util.ArrayList) Observable(rx.Observable) HashSet(java.util.HashSet) JobStatus(com.netflix.titus.api.jobmanager.model.job.JobStatus) JobState(com.netflix.titus.api.jobmanager.model.job.JobState) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) JobManagerException(com.netflix.titus.api.jobmanager.service.JobManagerException) JobEntityHolders(com.netflix.titus.master.jobmanager.service.common.action.JobEntityHolders) JobStore(com.netflix.titus.api.jobmanager.store.JobStore) CallMetadata(com.netflix.titus.api.model.callmetadata.CallMetadata) Job(com.netflix.titus.api.jobmanager.model.job.Job) ServiceJobExt(com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus) Set(java.util.Set) JobFunctions(com.netflix.titus.api.jobmanager.model.job.JobFunctions) TitusModelAction(com.netflix.titus.master.jobmanager.service.common.action.TitusModelAction) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) Capacity(com.netflix.titus.api.jobmanager.model.job.Capacity) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) ModelActionHolder(com.netflix.titus.common.framework.reconciler.ModelActionHolder) List(java.util.List) V3JobOperations(com.netflix.titus.api.jobmanager.service.V3JobOperations) VersionSupplier(com.netflix.titus.master.jobmanager.service.VersionSupplier) ReconciliationEngine(com.netflix.titus.common.framework.reconciler.ReconciliationEngine) 
VersionSuppliers(com.netflix.titus.master.jobmanager.service.VersionSuppliers) Optional(java.util.Optional) JobManagerReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobManagerReconcilerEvent) TitusRuntime(com.netflix.titus.common.runtime.TitusRuntime) Collections(java.util.Collections) TitusModelAction(com.netflix.titus.master.jobmanager.service.common.action.TitusModelAction) Task(com.netflix.titus.api.jobmanager.model.job.Task) Capacity(com.netflix.titus.api.jobmanager.model.job.Capacity) ArrayList(java.util.ArrayList) Job(com.netflix.titus.api.jobmanager.model.job.Job) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) Callable(java.util.concurrent.Callable) ModelActionHolder(com.netflix.titus.common.framework.reconciler.ModelActionHolder)

Example 12 with ChangeAction

use of com.netflix.titus.common.framework.reconciler.ChangeAction in project titus-control-plane by Netflix.

the class DifferenceResolverUtils method findTaskStateTimeouts.

/**
 * Find all tasks that are stuck in a specific state. The number of {@link ChangeAction changes} will be limited
 * by the {@link TokenBucket stuckInStateRateLimiter}.
 * <p>
 * Every task held by {@code runningJobView} is examined:
 * <ul>
 * <li>A {@link TaskState#Started} task of a batch job is killed once its runtime limit has elapsed. No state
 * timeout is evaluated for such a task, and this kill does not consume a rate limiter token.</li>
 * <li>A task with no timeout set gets one taken from {@code configuration} if it is in the Launched,
 * StartInitiated or KillInitiated state and the configured value is positive.</li>
 * <li>A task whose timeout has expired is handled only if a token can be taken from
 * {@code stuckInStateRateLimiter}. A task in KillInitiated is either killed again, or moved to
 * {@link TaskState#Finished} once the configured number of kill attempts is reached. A task in any other
 * state is killed.</li>
 * </ul>
 *
 * @param engine                  reconciliation engine the created actions are bound to
 * @param runningJobView          view of the job whose tasks are checked
 * @param configuration           source of the per-state timeouts and the kill attempt limit
 * @param stuckInStateRateLimiter limits how many timed out tasks are acted upon
 * @return the actions to execute, in task iteration order; empty if nothing needs to be done
 */
public static List<ChangeAction> findTaskStateTimeouts(ReconciliationEngine<JobManagerReconcilerEvent> engine, JobView runningJobView, JobManagerConfiguration configuration, JobServiceRuntime runtime, JobStore jobStore, VersionSupplier versionSupplier, TokenBucket stuckInStateRateLimiter, TitusRuntime titusRuntime) {
    Clock clock = titusRuntime.getClock();
    List<ChangeAction> actions = new ArrayList<>();
    runningJobView.getJobHolder().getChildren().forEach(taskHolder -> {
        Task task = taskHolder.getEntity();
        TaskState taskState = task.getStatus().getState();
        if (JobFunctions.isBatchJob(runningJobView.getJob()) && taskState == TaskState.Started) {
            Job<BatchJobExt> batchJob = runningJobView.getJob();
            // We expect runtime limit to be always set, so this is just extra safety measure.
            long runtimeLimitMs = Math.max(BatchJobExt.RUNTIME_LIMIT_MIN, batchJob.getJobDescriptor().getExtensions().getRuntimeLimitMs());
            long deadline = task.getStatus().getTimestamp() + runtimeLimitMs;
            if (deadline < clock.wallTime()) {
                actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(engine, task, runtime, jobStore, versionSupplier, TaskStatus.REASON_RUNTIME_LIMIT_EXCEEDED, "Task running too long (runtimeLimit=" + runtimeLimitMs + "ms)", titusRuntime));
            }
            // A started batch task is governed by its runtime limit only.
            return;
        }
        TaskTimeoutChangeActions.TimeoutStatus timeoutStatus = TaskTimeoutChangeActions.getTimeoutStatus(taskHolder, clock);
        switch(timeoutStatus) {
            case Ignore:
            case Pending:
                break;
            case NotSet:
                long timeoutMs = -1;
                switch(taskState) {
                    case Launched:
                        timeoutMs = configuration.getTaskInLaunchedStateTimeoutMs();
                        break;
                    case StartInitiated:
                        timeoutMs = isBatch(runningJobView.getJob()) ? configuration.getBatchTaskInStartInitiatedStateTimeoutMs() : configuration.getServiceTaskInStartInitiatedStateTimeoutMs();
                        break;
                    case KillInitiated:
                        timeoutMs = configuration.getTaskInKillInitiatedStateTimeoutMs();
                        break;
                    default:
                        // No timeout is tracked for the remaining states.
                        break;
                }
                if (timeoutMs > 0) {
                    actions.add(TaskTimeoutChangeActions.setTimeout(taskHolder.getId(), taskState, timeoutMs, clock));
                }
                break;
            case TimedOut:
                if (!stuckInStateRateLimiter.tryTake()) {
                    break;
                }
                if (taskState == TaskState.KillInitiated) {
                    int attempts = TaskTimeoutChangeActions.getKillInitiatedAttempts(taskHolder) + 1;
                    if (attempts >= configuration.getTaskKillAttempts()) {
                        // Give up on killing: mark the task as finished in the running model.
                        actions.add(BasicTaskActions.updateTaskInRunningModel(
                                task.getId(),
                                V3JobOperations.Trigger.Reconciler,
                                configuration,
                                engine,
                                taskParam -> Optional.of(taskParam.toBuilder()
                                        .withStatus(taskParam.getStatus().toBuilder()
                                                .withState(TaskState.Finished)
                                                .withReasonCode(TaskStatus.REASON_STUCK_IN_KILLING_STATE)
                                                // Space before "state" so the message reads "stuck in KillInitiated state".
                                                .withReasonMessage("stuck in " + taskState + " state")
                                                .build())
                                        .build()),
                                "TimedOut in KillInitiated state",
                                versionSupplier,
                                titusRuntime,
                                JobManagerConstants.RECONCILER_CALLMETADATA.toBuilder().withCallReason("Kill initiated").build()
                        ));
                    } else {
                        actions.add(TaskTimeoutChangeActions.incrementTaskKillAttempt(task.getId(), configuration.getTaskInKillInitiatedStateTimeoutMs(), clock));
                        actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(engine, task, runtime, jobStore, versionSupplier, TaskStatus.REASON_STUCK_IN_KILLING_STATE, "Another kill attempt (" + (attempts + 1) + ')', titusRuntime));
                    }
                } else {
                    actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(engine, task, runtime, jobStore, versionSupplier, TaskStatus.REASON_STUCK_IN_STATE, "Task stuck in " + taskState + " state", titusRuntime));
                }
                break;
        }
    });
    return actions;
}
Also used : JobManagerConstants(com.netflix.titus.api.jobmanager.service.JobManagerConstants) JobServiceRuntime(com.netflix.titus.master.jobmanager.service.JobServiceRuntime) Task(com.netflix.titus.api.jobmanager.model.job.Task) HashMap(java.util.HashMap) Function(java.util.function.Function) TaskTimeoutChangeActions(com.netflix.titus.master.jobmanager.service.common.action.task.TaskTimeoutChangeActions) ArrayList(java.util.ArrayList) EbsVolume(com.netflix.titus.api.jobmanager.model.job.ebs.EbsVolume) TASK_ATTRIBUTES_EBS_VOLUME_ID(com.netflix.titus.api.jobmanager.TaskAttributes.TASK_ATTRIBUTES_EBS_VOLUME_ID) HashSet(java.util.HashSet) Map(java.util.Map) JobState(com.netflix.titus.api.jobmanager.model.job.JobState) BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) JobManagerConfiguration(com.netflix.titus.master.jobmanager.service.JobManagerConfiguration) JobStore(com.netflix.titus.api.jobmanager.store.JobStore) JobDescriptor(com.netflix.titus.api.jobmanager.model.job.JobDescriptor) Job(com.netflix.titus.api.jobmanager.model.job.Job) ServiceJobExt(com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus) Set(java.util.Set) JobFunctions(com.netflix.titus.api.jobmanager.model.job.JobFunctions) Collectors(java.util.stream.Collectors) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) Consumer(java.util.function.Consumer) List(java.util.List) ExecutableStatus(com.netflix.titus.api.jobmanager.model.job.ExecutableStatus) V3JobOperations(com.netflix.titus.api.jobmanager.service.V3JobOperations) VersionSupplier(com.netflix.titus.master.jobmanager.service.VersionSupplier) ReconciliationEngine(com.netflix.titus.common.framework.reconciler.ReconciliationEngine) Optional(java.util.Optional) 
BasicTaskActions(com.netflix.titus.master.jobmanager.service.common.action.task.BasicTaskActions) JobManagerReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobManagerReconcilerEvent) TitusRuntime(com.netflix.titus.common.runtime.TitusRuntime) TokenBucket(com.netflix.titus.common.util.limiter.tokenbucket.TokenBucket) Clock(com.netflix.titus.common.util.time.Clock) KillInitiatedActions(com.netflix.titus.master.jobmanager.service.common.action.task.KillInitiatedActions) TASK_ATTRIBUTES_IP_ALLOCATION_ID(com.netflix.titus.api.jobmanager.TaskAttributes.TASK_ATTRIBUTES_IP_ALLOCATION_ID) Task(com.netflix.titus.api.jobmanager.model.job.Task) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) ArrayList(java.util.ArrayList) TaskTimeoutChangeActions(com.netflix.titus.master.jobmanager.service.common.action.task.TaskTimeoutChangeActions) Clock(com.netflix.titus.common.util.time.Clock) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState)

Example 13 with ChangeAction

use of com.netflix.titus.common.framework.reconciler.ChangeAction in project titus-control-plane by Netflix.

the class ServiceDifferenceResolver method applyStore.

/**
 * Compares the reference model of a service job with its store model and returns the actions needed to bring
 * the store in line with the reference.
 * <p>
 * Returns an empty list when {@code storeWriteRetryInterceptor} reports that execution limits do not allow
 * another attempt for {@code storeJob}. Otherwise:
 * <ul>
 * <li>if the reference job entity differs from the stored one, a job store update is scheduled;</li>
 * <li>for every reference task that is equivalent to its store counterpart and is
 * {@link TaskState#Finished}, the task is either removed from the store or replaced with a new task;</li>
 * <li>for every reference task that differs from, or is missing in, the store model, a write of the
 * reference task to the store is scheduled.</li>
 * </ul>
 *
 * @param engine          reconciliation engine the created actions are bound to
 * @param refJobView      view of the reference model
 * @param storeJob        root holder of the store model
 * @param allowedNewTasks remaining budget of new tasks; decremented for every finished task that qualifies
 *                        for a retry, whether the retry happens now or is delayed
 * @return the actions to execute; possibly empty
 */
private List<ChangeAction> applyStore(ReconciliationEngine<JobManagerReconcilerEvent> engine, ServiceJobView refJobView, EntityHolder storeJob, AtomicInteger allowedNewTasks) {
    if (!storeWriteRetryInterceptor.executionLimits(storeJob)) {
        return Collections.emptyList();
    }
    List<ChangeAction> actions = new ArrayList<>();
    EntityHolder refJobHolder = refJobView.getJobHolder();
    Job<ServiceJobExt> refJob = refJobHolder.getEntity();
    if (!refJob.equals(storeJob.getEntity())) {
        actions.add(storeWriteRetryInterceptor.apply(BasicJobActions.updateJobInStore(engine, jobStore)));
    }
    boolean isJobTerminating = refJob.getStatus().getState() == JobState.KillInitiated;
    for (EntityHolder referenceTaskHolder : refJobHolder.getChildren()) {
        ServiceJobTask refTask = referenceTaskHolder.getEntity();
        Optional<EntityHolder> storeHolder = storeJob.findById(referenceTaskHolder.getId());
        // A task that is absent from the store model cannot be in sync; it is handled by the write-to-store
        // branch below instead of failing on an empty Optional.
        boolean refAndStoreInSync = storeHolder.isPresent() && areEquivalent(storeHolder.get(), referenceTaskHolder);
        // Constant-first comparison, so a task status without a reason code does not throw.
        boolean shouldRetry = !isJobTerminating && refTask.getStatus().getState() == TaskState.Finished && !TaskStatus.REASON_SCALED_DOWN.equals(refTask.getStatus().getReasonCode()) && allowedNewTasks.get() > 0;
        if (refAndStoreInSync) {
            TaskState currentTaskState = refTask.getStatus().getState();
            if (currentTaskState == TaskState.Finished) {
                ServiceJobTask storeTask = storeHolder.get().getEntity();
                if (isJobTerminating || isScaledDown(storeTask) || hasEnoughTasksRunning(refJobView)) {
                    actions.add(removeFinishedServiceTaskAction(jobStore, storeTask));
                } else if (shouldRetry && TaskRetryers.shouldRetryNow(referenceTaskHolder, clock)) {
                    createNewTaskAction(refJobView, Optional.of(referenceTaskHolder), Collections.emptyList(), Collections.emptyList()).ifPresent(actions::add);
                }
            }
        } else {
            Task task = referenceTaskHolder.getEntity();
            CallMetadata callMetadata = RECONCILER_CALLMETADATA.toBuilder().withCallReason("Writing runtime state changes to store").build();
            actions.add(storeWriteRetryInterceptor.apply(BasicTaskActions.writeReferenceTaskToStore(jobStore, engine, task.getId(), callMetadata, titusRuntime)));
        }
        // Both current and delayed retries are counted
        if (shouldRetry) {
            allowedNewTasks.decrementAndGet();
        }
    }
    return actions;
}
Also used : Task(com.netflix.titus.api.jobmanager.model.job.Task) ServiceJobTask(com.netflix.titus.api.jobmanager.model.job.ServiceJobTask) TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) CallMetadata(com.netflix.titus.api.model.callmetadata.CallMetadata) ServiceJobExt(com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt) ArrayList(java.util.ArrayList) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) ServiceJobTask(com.netflix.titus.api.jobmanager.model.job.ServiceJobTask) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState)

Example 14 with ChangeAction

use of com.netflix.titus.common.framework.reconciler.ChangeAction in project titus-control-plane by Netflix.

the class ServiceDifferenceResolver method apply.

/**
 * Computes the change actions for one reconciliation pass of a service job.
 * <p>
 * Store differences are resolved first, then runtime differences; both share the same budget of new tasks,
 * derived from the configured limit of active-not-started tasks minus the number currently counted. Removal of
 * a completed job is considered only when neither step produced any action.
 *
 * @param engine reconciliation engine providing the reference, running and store views
 * @return the actions to execute; possibly empty
 */
@Override
public List<ChangeAction> apply(ReconciliationEngine<JobManagerReconcilerEvent> engine) {
    ServiceJobView referenceJob = new ServiceJobView(engine.getReferenceView());

    int notYetStarted = DifferenceResolverUtils.countActiveNotStartedTasks(referenceJob.getJobHolder(), engine.getRunningView());
    int newTaskBudget = Math.max(0, configuration.getActiveNotStartedTasksLimit() - notYetStarted);
    AtomicInteger newTasksLeft = new AtomicInteger(newTaskBudget);
    AtomicInteger taskKillsLeft = new AtomicInteger(configuration.getConcurrentReconcilerStoreUpdateLimit());

    List<ChangeAction> result = new ArrayList<>(
            applyStore(engine, referenceJob, engine.getStoreView(), newTasksLeft)
    );
    result.addAll(
            applyRuntime(engine, referenceJob, engine.getRunningView(), engine.getStoreView(), newTasksLeft, taskKillsLeft)
    );

    // Only when there is nothing else to do, check if the whole job can be removed.
    if (result.isEmpty()) {
        result.addAll(removeCompletedJob(engine.getReferenceView(), engine.getStoreView(), jobStore, versionSupplier));
    }
    return result;
}
Also used : TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ArrayList(java.util.ArrayList)

Example 15 with ChangeAction

use of com.netflix.titus.common.framework.reconciler.ChangeAction in project titus-control-plane by Netflix.

the class DefaultReconciliationFrameworkTest method setUp.

/**
 * Starts the framework and stubs the two mock engines: the engine factory hands out {@code engine1} and then
 * {@code engine2}, and each engine reports triggered actions, exposes its own root holder and event stream,
 * and executes a reference model change by applying the given {@link ChangeAction} and discarding its elements.
 */
@Before
public void setUp() {
    framework.start();
    when(engineFactory.apply(any())).thenReturn(engine1, engine2);

    // First engine
    when(engine1.triggerActions()).thenReturn(true);
    when(engine1.getReferenceView()).thenReturn(EntityHolder.newRoot("myRoot1", "myEntity1"));
    when(engine1.events()).thenReturn(engine1Events.asObservable());
    when(engine1.changeReferenceModel(any())).thenAnswer(call ->
            call.<ChangeAction>getArgument(0).apply().ignoreElements().cast(Void.class)
    );

    // Second engine
    when(engine2.triggerActions()).thenReturn(true);
    when(engine2.getReferenceView()).thenReturn(EntityHolder.newRoot("myRoot2", "myEntity2"));
    when(engine2.events()).thenReturn(engine2Events.asObservable());
    when(engine2.changeReferenceModel(any())).thenAnswer(call ->
            call.<ChangeAction>getArgument(0).apply().ignoreElements().cast(Void.class)
    );
}
Also used : ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) MultiEngineChangeAction(com.netflix.titus.common.framework.reconciler.MultiEngineChangeAction) Before(org.junit.Before)

Aggregations

ChangeAction (com.netflix.titus.common.framework.reconciler.ChangeAction)17 ArrayList (java.util.ArrayList)13 EntityHolder (com.netflix.titus.common.framework.reconciler.EntityHolder)12 TitusChangeAction (com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction)10 List (java.util.List)7 Task (com.netflix.titus.api.jobmanager.model.job.Task)5 ModelActionHolder (com.netflix.titus.common.framework.reconciler.ModelActionHolder)5 MultiEngineChangeAction (com.netflix.titus.common.framework.reconciler.MultiEngineChangeAction)5 Observable (rx.Observable)5 TaskState (com.netflix.titus.api.jobmanager.model.job.TaskState)4 ServiceJobExt (com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt)4 CallMetadata (com.netflix.titus.api.model.callmetadata.CallMetadata)4 ReconciliationEngine (com.netflix.titus.common.framework.reconciler.ReconciliationEngine)4 HashSet (java.util.HashSet)4 Optional (java.util.Optional)4 Set (java.util.Set)4 Job (com.netflix.titus.api.jobmanager.model.job.Job)3 JobFunctions (com.netflix.titus.api.jobmanager.model.job.JobFunctions)3 JobState (com.netflix.titus.api.jobmanager.model.job.JobState)3 TaskStatus (com.netflix.titus.api.jobmanager.model.job.TaskStatus)3