Search in sources :

Example 1 with TaskState

use of com.netflix.titus.api.jobmanager.model.job.TaskState in project titus-control-plane by Netflix.

In class KubeNotificationProcessor, method fillInMissingStates.

/**
 * Adds {@code Launched}, {@code StartInitiated} and {@code Started} entries to the task's
 * status history when they are absent, stamping them with the container start time taken
 * from the pod.
 *
 * <p>The task is returned unchanged when its current state is neither {@code Started} nor
 * {@code Finished}, when the pod exposes no container state, when a {@code Started} task has
 * no running-container start time, or when nothing is missing. A {@code Finished} task whose
 * terminated container has no start time is delegated to
 * {@code fillInMissingStatesForContainerSetupFailure}.
 *
 * @param podWrapper wrapper around the pod the container state is read from
 * @param task       task whose status history may need back-filling
 * @return the original task, or a copy with the extended status history ordered by state
 */
private static Task fillInMissingStates(PodWrapper podWrapper, Task task) {
    TaskState currentState = task.getStatus().getState();
    boolean isStarted = currentState == TaskState.Started;
    boolean isFinished = currentState == TaskState.Finished;
    if (!(isStarted || isFinished)) {
        return task;
    }

    V1ContainerState containerState = podWrapper.findContainerState().orElse(null);
    if (containerState == null) {
        return task;
    }

    long startAtTimestamp;
    if (isStarted) {
        boolean runningStartKnown = containerState.getRunning() != null
                && containerState.getRunning().getStartedAt() != null;
        if (!runningStartKnown) {
            return task;
        }
        startAtTimestamp = containerState.getRunning().getStartedAt().toInstant().toEpochMilli();
    } else {
        // Only TaskState.Finished can reach this branch.
        boolean terminatedStartKnown = containerState.getTerminated() != null
                && containerState.getTerminated().getStartedAt() != null;
        if (!terminatedStartKnown) {
            // No recorded start time: treated as a container setup failure.
            return fillInMissingStatesForContainerSetupFailure(podWrapper, task);
        }
        startAtTimestamp = containerState.getTerminated().getStartedAt().toInstant().toEpochMilli();
    }

    // Every back-filled status shares the same reason code, message and timestamp.
    TaskStatus.Builder statusTemplate = TaskStatus.newBuilder()
            .withReasonCode(TaskStatus.REASON_STATE_MISSING)
            .withReasonMessage("Filled in missing state update that was missed previously")
            .withTimestamp(startAtTimestamp);

    List<TaskStatus> missingStatuses = new ArrayList<>();
    TaskState[] expectedStates = {TaskState.Launched, TaskState.StartInitiated, TaskState.Started};
    for (TaskState expected : expectedStates) {
        addIfMissing(task, expected, statusTemplate).ifPresent(missingStatuses::add);
    }
    if (missingStatuses.isEmpty()) {
        return task;
    }

    // Merge the back-filled entries into a copy of the history and order the result by state.
    List<TaskStatus> newStatusHistory = new ArrayList<>(task.getStatusHistory());
    newStatusHistory.addAll(missingStatuses);
    newStatusHistory.sort(Comparator.comparing(ExecutableStatus::getState));
    return task.toBuilder().withStatusHistory(newStatusHistory).build();
}
Also used : V1ContainerState(io.kubernetes.client.openapi.models.V1ContainerState) ArrayList(java.util.ArrayList) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState)

Example 2 with TaskState

use of com.netflix.titus.api.jobmanager.model.job.TaskState in project titus-control-plane by Netflix.

In class KubeNotificationProcessor, method handlePodUpdatedEvent.

/**
 * Maps a pod event to a new task status and, if the update function yields a change,
 * submits that change through {@code v3JobOperations.updateTask}.
 *
 * <p>Returns an empty {@link Mono} without touching the task when the pod carries no
 * status/phase, when {@code PodToTaskMapper} reports an error, or when the preliminary
 * {@code updateTaskStatus} call produces no result.
 *
 * @param event pod event; the node is taken from it for updated and deleted events
 * @param job   job owning the task (not read by this method)
 * @param task  task snapshot the event refers to
 * @return completion signal of the task update, or an empty {@link Mono} if nothing was applied
 */
private Mono<Void> handlePodUpdatedEvent(PodEvent event, Job job, Task task) {
    // Basic sanity check. If it fails, we have a major problem with pod state.
    boolean phaseKnown = event.getPod() != null
            && event.getPod().getStatus() != null
            && event.getPod().getStatus().getPhase() != null;
    if (!phaseKnown) {
        logger.warn("Pod notification with pod without status or phase set: taskId={}, pod={}", task.getId(), event.getPod());
        metricsNoChangesApplied.increment();
        return Mono.empty();
    }

    PodWrapper podWrapper = new PodWrapper(event.getPod());
    boolean podDeleted = event instanceof PodDeletedEvent;

    // Only updated/deleted events carry node information.
    final Optional<V1Node> node;
    if (event instanceof PodUpdatedEvent) {
        node = ((PodUpdatedEvent) event).getNode();
    } else if (podDeleted) {
        node = ((PodDeletedEvent) event).getNode();
    } else {
        node = Optional.empty();
    }

    PodToTaskMapper mapper = new PodToTaskMapper(podWrapper, node, task, podDeleted, containerResultCodeResolver, titusRuntime);
    Either<TaskStatus, String> newTaskStatusOrError = mapper.getNewTaskStatus();
    if (newTaskStatusOrError.hasError()) {
        logger.info(newTaskStatusOrError.getError());
        metricsNoChangesApplied.increment();
        return Mono.empty();
    }

    TaskStatus newTaskStatus = newTaskStatusOrError.getValue();
    if (!TaskStatus.areEquivalent(task.getStatus(), newTaskStatus)) {
        logger.info("Pod notification changes task status: taskId={}, fromStatus={}, toStatus={}, eventSequenceNumber={}", task.getId(), task.getStatus(), newTaskStatus, event.getSequenceNumber());
    } else {
        logger.info("Pod change notification does not change task status: taskId={}, status={}, eventSequenceNumber={}", task.getId(), newTaskStatus, event.getSequenceNumber());
    }

    // Evaluate the update against the task snapshot we already hold; if it yields nothing,
    // skip the job-service call. The same function is applied again inside updateTask to the
    // task instance supplied there ("current").
    // NOTE(review): the trailing boolean differs between the two calls (true here, false
    // below); its meaning is not visible from this method - confirm in updateTaskStatus.
    boolean producesUpdate = updateTaskStatus(podWrapper, newTaskStatus, node, task, true).isPresent();
    if (!producesUpdate) {
        return Mono.empty();
    }

    String summary = "Pod status updated from kubernetes node (k8phase='" + event.getPod().getStatus().getPhase() + "', taskState=" + task.getStatus().getState() + ")";
    return ReactorExt.toMono(v3JobOperations.updateTask(
            task.getId(),
            current -> updateTaskStatus(podWrapper, newTaskStatus, node, current, false),
            V3JobOperations.Trigger.Kube,
            summary,
            KUBE_CALL_METADATA
    ));
}
Also used : Retry(reactor.util.retry.Retry) Task(com.netflix.titus.api.jobmanager.model.job.Task) CollectionsExt(com.netflix.titus.common.util.CollectionsExt) LoggerFactory(org.slf4j.LoggerFactory) V1PodStatus(io.kubernetes.client.openapi.models.V1PodStatus) ReactorExt(com.netflix.titus.common.util.rx.ReactorExt) KubeUtil(com.netflix.titus.master.kubernetes.KubeUtil) TITUS_NODE_DOMAIN(com.netflix.titus.runtime.kubernetes.KubeConstants.TITUS_NODE_DOMAIN) Duration(java.time.Duration) Map(java.util.Map) DirectKubeApiServerIntegrator(com.netflix.titus.master.kubernetes.client.DirectKubeApiServerIntegrator) Either(com.netflix.titus.common.util.tuple.Either) CallMetadata(com.netflix.titus.api.model.callmetadata.CallMetadata) PodEvent(com.netflix.titus.master.kubernetes.client.model.PodEvent) Job(com.netflix.titus.api.jobmanager.model.job.Job) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus) JobFunctions(com.netflix.titus.api.jobmanager.model.job.JobFunctions) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) PodNotFoundEvent(com.netflix.titus.master.kubernetes.client.model.PodNotFoundEvent) Timer(com.netflix.spectator.api.Timer) List(java.util.List) Optional(java.util.Optional) PodWrapper(com.netflix.titus.master.kubernetes.client.model.PodWrapper) Gauge(com.netflix.spectator.api.Gauge) Disposable(reactor.core.Disposable) Stopwatch(com.google.common.base.Stopwatch) PodDeletedEvent(com.netflix.titus.master.kubernetes.client.model.PodDeletedEvent) Counter(com.netflix.spectator.api.Counter) HashMap(java.util.HashMap) MetricConstants(com.netflix.titus.master.MetricConstants) V1Node(io.kubernetes.client.openapi.models.V1Node) Singleton(javax.inject.Singleton) Scheduler(reactor.core.scheduler.Scheduler) ArrayList(java.util.ArrayList) Inject(javax.inject.Inject) Pair(com.netflix.titus.common.util.tuple.Pair) ContainerResultCodeResolver(com.netflix.titus.master.kubernetes.ContainerResultCodeResolver) Schedulers(reactor.core.scheduler.Schedulers) 
Evaluators.acceptNotNull(com.netflix.titus.common.util.Evaluators.acceptNotNull) KubeJobManagementReconciler(com.netflix.titus.master.kubernetes.controller.KubeJobManagementReconciler) ExecutorService(java.util.concurrent.ExecutorService) ExecutorsExt(com.netflix.titus.common.util.ExecutorsExt) Logger(org.slf4j.Logger) PodUpdatedEvent(com.netflix.titus.master.kubernetes.client.model.PodUpdatedEvent) Mono(reactor.core.publisher.Mono) Activator(com.netflix.titus.common.util.guice.annotation.Activator) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) ExecutableStatus(com.netflix.titus.api.jobmanager.model.job.ExecutableStatus) V3JobOperations(com.netflix.titus.api.jobmanager.service.V3JobOperations) TaskAttributes(com.netflix.titus.api.jobmanager.TaskAttributes) PodToTaskMapper(com.netflix.titus.master.kubernetes.PodToTaskMapper) V1ContainerState(io.kubernetes.client.openapi.models.V1ContainerState) VisibleForTesting(com.google.common.annotations.VisibleForTesting) TitusRuntime(com.netflix.titus.common.runtime.TitusRuntime) Comparator(java.util.Comparator) Evaluators(com.netflix.titus.common.util.Evaluators) PodToTaskMapper(com.netflix.titus.master.kubernetes.PodToTaskMapper) PodDeletedEvent(com.netflix.titus.master.kubernetes.client.model.PodDeletedEvent) V1Node(io.kubernetes.client.openapi.models.V1Node) PodWrapper(com.netflix.titus.master.kubernetes.client.model.PodWrapper) PodUpdatedEvent(com.netflix.titus.master.kubernetes.client.model.PodUpdatedEvent) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus)

Example 3 with TaskState

use of com.netflix.titus.api.jobmanager.model.job.TaskState in project titus-control-plane by Netflix.

In class DifferenceResolverUtils, method countActiveNotStartedTasks.

/**
 * Counts the distinct task ids, across both job holders, whose task state is neither
 * {@code Started} nor {@code Finished}.
 *
 * <p>A task id present in both holders is counted once, because the ids are collected
 * into a set before counting.
 *
 * @param refJobHolder     first job holder whose children are inspected
 * @param runningJobHolder second job holder whose children are inspected
 * @return number of unique ids of tasks that are not in the Started or Finished state
 */
public static int countActiveNotStartedTasks(EntityHolder refJobHolder, EntityHolder runningJobHolder) {
    Set<String> notStartedTaskIds = new HashSet<>();

    // Records the holder's id when its task is in any state other than Started/Finished.
    Consumer<EntityHolder> collectIfNotStarted = taskHolder -> {
        Task task = (Task) taskHolder.getEntity();
        TaskState state = task.getStatus().getState();
        boolean startedOrFinished = state == TaskState.Started || state == TaskState.Finished;
        if (!startedOrFinished) {
            notStartedTaskIds.add(taskHolder.getId());
        }
    };

    refJobHolder.getChildren().forEach(collectIfNotStarted);
    runningJobHolder.getChildren().forEach(collectIfNotStarted);
    return notStartedTaskIds.size();
}
Also used : JobManagerConstants(com.netflix.titus.api.jobmanager.service.JobManagerConstants) JobServiceRuntime(com.netflix.titus.master.jobmanager.service.JobServiceRuntime) Task(com.netflix.titus.api.jobmanager.model.job.Task) HashMap(java.util.HashMap) Function(java.util.function.Function) TaskTimeoutChangeActions(com.netflix.titus.master.jobmanager.service.common.action.task.TaskTimeoutChangeActions) ArrayList(java.util.ArrayList) EbsVolume(com.netflix.titus.api.jobmanager.model.job.ebs.EbsVolume) TASK_ATTRIBUTES_EBS_VOLUME_ID(com.netflix.titus.api.jobmanager.TaskAttributes.TASK_ATTRIBUTES_EBS_VOLUME_ID) HashSet(java.util.HashSet) Map(java.util.Map) JobState(com.netflix.titus.api.jobmanager.model.job.JobState) BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) JobManagerConfiguration(com.netflix.titus.master.jobmanager.service.JobManagerConfiguration) JobStore(com.netflix.titus.api.jobmanager.store.JobStore) JobDescriptor(com.netflix.titus.api.jobmanager.model.job.JobDescriptor) Job(com.netflix.titus.api.jobmanager.model.job.Job) ServiceJobExt(com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus) Set(java.util.Set) JobFunctions(com.netflix.titus.api.jobmanager.model.job.JobFunctions) Collectors(java.util.stream.Collectors) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) Consumer(java.util.function.Consumer) List(java.util.List) ExecutableStatus(com.netflix.titus.api.jobmanager.model.job.ExecutableStatus) V3JobOperations(com.netflix.titus.api.jobmanager.service.V3JobOperations) VersionSupplier(com.netflix.titus.master.jobmanager.service.VersionSupplier) ReconciliationEngine(com.netflix.titus.common.framework.reconciler.ReconciliationEngine) Optional(java.util.Optional) 
BasicTaskActions(com.netflix.titus.master.jobmanager.service.common.action.task.BasicTaskActions) JobManagerReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobManagerReconcilerEvent) TitusRuntime(com.netflix.titus.common.runtime.TitusRuntime) TokenBucket(com.netflix.titus.common.util.limiter.tokenbucket.TokenBucket) Clock(com.netflix.titus.common.util.time.Clock) KillInitiatedActions(com.netflix.titus.master.jobmanager.service.common.action.task.KillInitiatedActions) TASK_ATTRIBUTES_IP_ALLOCATION_ID(com.netflix.titus.api.jobmanager.TaskAttributes.TASK_ATTRIBUTES_IP_ALLOCATION_ID) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) HashSet(java.util.HashSet)

Example 4 with TaskState

use of com.netflix.titus.api.jobmanager.model.job.TaskState in project titus-control-plane by Netflix.

In class TaskTimeoutChangeActions, method getTimeoutStatus.

/**
 * Reports where a task stands relative to the timeout deadline stored on its holder.
 *
 * <ul>
 *   <li>{@code Ignore} - the task state is not Launched, StartInitiated or KillInitiated;</li>
 *   <li>{@code NotSet} - no deadline attribute is stored under the tag mapped to the state;</li>
 *   <li>{@code Pending} - {@code clock.wallTime()} is still below the stored deadline;</li>
 *   <li>{@code TimedOut} - otherwise.</li>
 * </ul>
 *
 * @param taskHolder holder of the task and of its timeout attributes
 * @param clock      source of the current wall time
 * @return the timeout status as described above
 */
public static TimeoutStatus getTimeoutStatus(EntityHolder taskHolder, Clock clock) {
    Task task = taskHolder.getEntity();
    TaskState state = task.getStatus().getState();

    boolean timeoutTracked = state == TaskState.Launched
            || state == TaskState.StartInitiated
            || state == TaskState.KillInitiated;
    if (!timeoutTracked) {
        return TimeoutStatus.Ignore;
    }

    // The deadline is stored as a holder attribute keyed by the tag registered for the state.
    Long deadline = (Long) taskHolder.getAttributes().get(STATE_TAGS.get(state));
    if (deadline == null) {
        return TimeoutStatus.NotSet;
    }

    if (clock.wallTime() < deadline) {
        return TimeoutStatus.Pending;
    }
    return TimeoutStatus.TimedOut;
}
Also used : Task(com.netflix.titus.api.jobmanager.model.job.Task) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState)

Example 5 with TaskState

use of com.netflix.titus.api.jobmanager.model.job.TaskState in project titus-control-plane by Netflix.

In class TaskTimeoutChangeActions, method setTimeout.

/**
 * Builds a change action that tags a task holder with a timeout deadline for the given state.
 *
 * <p>The deadline is {@code clock.wallTime() + timeoutMs}, computed when the model update
 * runs, and is stored under the tag that {@code STATE_TAGS} maps to {@code taskState}.
 * For {@code KillInitiated} the {@code KILL_INITIATED_ATTEMPT_TAG} is additionally set to 0.
 * The update only takes effect if {@code findById(taskId)} locates the task in the job holder.
 *
 * @param taskId    id of the task to tag
 * @param taskState state whose timeout is being configured; must have an entry in {@code STATE_TAGS}
 * @param timeoutMs timeout in milliseconds, added to the clock's wall time
 * @param clock     source of the current wall time
 * @return the change action performing the model update
 * @throws IllegalArgumentException if no tag is registered for {@code taskState}
 */
public static TitusChangeAction setTimeout(String taskId, TaskState taskState, long timeoutMs, Clock clock) {
    String tagName = STATE_TAGS.get(taskState);
    Preconditions.checkArgument(tagName != null, "Timeout not tracked for state %s", taskState);

    return TitusChangeAction.newAction("setTimeout")
            .id(taskId)
            .trigger(Trigger.Reconciler)
            .summary("Setting timeout for task in state %s: %s", taskState, DateTimeExt.toTimeUnitString(timeoutMs))
            .callMetadata(JobManagerConstants.RECONCILER_CALLMETADATA.toBuilder().withCallReason("configure timeout").build())
            .applyModelUpdate(self -> {
                TitusModelAction timeoutUpdate = TitusModelAction.newModelUpdate(self).taskMaybeUpdate(jobHolder -> jobHolder.findById(taskId).map(taskHolder -> {
                    EntityHolder withDeadline = taskHolder.addTag(tagName, clock.wallTime() + timeoutMs);
                    // Kill-initiated tasks also get their attempt tag set to zero.
                    EntityHolder updatedTaskHolder = taskState == TaskState.KillInitiated
                            ? withDeadline.addTag(KILL_INITIATED_ATTEMPT_TAG, 0)
                            : withDeadline;
                    return Pair.of(jobHolder.addChild(updatedTaskHolder), updatedTaskHolder);
                }));
                return ModelActionHolder.running(timeoutUpdate);
            });
}
Also used : Trigger(com.netflix.titus.api.jobmanager.service.V3JobOperations.Trigger) DateTimeExt(com.netflix.titus.common.util.DateTimeExt) JobManagerConstants(com.netflix.titus.api.jobmanager.service.JobManagerConstants) ImmutableMap(com.google.common.collect.ImmutableMap) TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) Task(com.netflix.titus.api.jobmanager.model.job.Task) JobFunctions(com.netflix.titus.api.jobmanager.model.job.JobFunctions) TitusModelAction(com.netflix.titus.master.jobmanager.service.common.action.TitusModelAction) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) ModelActionHolder(com.netflix.titus.common.framework.reconciler.ModelActionHolder) Pair(com.netflix.titus.common.util.tuple.Pair) Map(java.util.Map) JobManagerConfiguration(com.netflix.titus.master.jobmanager.service.JobManagerConfiguration) Preconditions(com.google.common.base.Preconditions) Clock(com.netflix.titus.common.util.time.Clock) TitusModelAction(com.netflix.titus.master.jobmanager.service.common.action.TitusModelAction) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder)

Aggregations

TaskState (com.netflix.titus.api.jobmanager.model.job.TaskState)22 Task (com.netflix.titus.api.jobmanager.model.job.Task)12 ArrayList (java.util.ArrayList)9 JobFunctions (com.netflix.titus.api.jobmanager.model.job.JobFunctions)7 TaskStatus (com.netflix.titus.api.jobmanager.model.job.TaskStatus)7 ServiceJobExt (com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt)7 Job (com.netflix.titus.api.jobmanager.model.job.Job)6 V3JobOperations (com.netflix.titus.api.jobmanager.service.V3JobOperations)6 EntityHolder (com.netflix.titus.common.framework.reconciler.EntityHolder)6 TitusRuntime (com.netflix.titus.common.runtime.TitusRuntime)6 HashMap (java.util.HashMap)6 List (java.util.List)6 Optional (java.util.Optional)6 Stopwatch (com.google.common.base.Stopwatch)5 JobManagerConstants (com.netflix.titus.api.jobmanager.service.JobManagerConstants)5 JobState (com.netflix.titus.api.jobmanager.model.job.JobState)4 CallMetadata (com.netflix.titus.api.model.callmetadata.CallMetadata)4 ChangeAction (com.netflix.titus.common.framework.reconciler.ChangeAction)4 Pair (com.netflix.titus.common.util.tuple.Pair)4 VersionSupplier (com.netflix.titus.master.jobmanager.service.VersionSupplier)4