use of com.netflix.titus.api.jobmanager.model.job.Task in project titus-control-plane by Netflix.
the class JobReconciliationFrameworkFactory method newRestoredEngine.
/**
 * Builds a reconciliation engine for a job (and its tasks) restored from the store.
 * <p>
 * The job becomes the root {@link EntityHolder}; every task is wrapped in its own holder, passed through
 * {@code TaskTimeoutChangeActions.setTimeoutOnRestoreFromStore} and attached to the job holder as a child.
 *
 * @param job   the restored job, used as the root entity
 * @param tasks the restored tasks of that job, attached as children in iteration order
 * @return an engine created with {@code newEngine(rootHolder, false)}
 */
private InternalReconciliationEngine<JobManagerReconcilerEvent> newRestoredEngine(Job job, List<Task> tasks) {
    EntityHolder rootHolder = EntityHolder.newRoot(job.getId(), job);
    for (Task restoredTask : tasks) {
        // Presumably re-arms task timeouts that were pending before the restart -- verify in TaskTimeoutChangeActions.
        EntityHolder withTimeout = TaskTimeoutChangeActions.setTimeoutOnRestoreFromStore(
                jobManagerConfiguration,
                EntityHolder.newRoot(restoredTask.getId(), restoredTask),
                clock
        );
        // addChild returns a new holder, so the root reference has to be re-assigned.
        rootHolder = rootHolder.addChild(withTimeout);
    }
    return newEngine(rootHolder, false);
}
use of com.netflix.titus.api.jobmanager.model.job.Task in project titus-control-plane by Netflix.
the class JobReconciliationFrameworkFactory method validateTask.
/**
 * Validates a task record loaded from the store.
 * <p>
 * Two validation passes are made:
 * <ul>
 *     <li>strict - violations are logged and reported to the error collector, but never reject the task;</li>
 *     <li>permissive - violations are logged, and reject the task only when
 *     {@code jobManagerConfiguration.isFailOnDataValidation()} is set.</li>
 * </ul>
 * A task with no version, or with a negative version timestamp, gets a version derived from the timestamp
 * of its current status.
 *
 * @param task the task record to check
 * @return the (possibly version-amended) task, or {@link Optional#empty()} if the record was rejected
 */
private Optional<Task> validateTask(Task task) {
    // Strict pass: reporting only, the outcome does not affect the result.
    Set<ValidationError> strictFindings = strictEntitySanitizer.validate(task);
    if (!strictFindings.isEmpty()) {
        logger.error("No strictly consistent task record found: taskId={}, violations={}", task.getId(), EntitySanitizerUtil.toStringMap(strictFindings));
        errorCollector.strictlyInvalidTask(task.getId());
    }

    // Permissive pass: the required checks.
    Set<ValidationError> requiredFindings = permissiveEntitySanitizer.validate(task);
    if (!requiredFindings.isEmpty()) {
        logger.error("Bad task record found: taskId={}, violations={}", task.getId(), EntitySanitizerUtil.toStringMap(requiredFindings));
        if (jobManagerConfiguration.isFailOnDataValidation()) {
            return Optional.empty();
        }
    }

    // Tasks that already carry a usable version are returned untouched.
    Version existingVersion = task.getVersion();
    if (existingVersion != null && existingVersion.getTimestamp() >= 0) {
        return Optional.of(task);
    }

    // Version is missing (old task objects): create one based on the current task state.
    Version derivedVersion = Version.newBuilder()
            .withTimestamp(task.getStatus().getTimestamp())
            .build();
    Task versionedTask = task.toBuilder().withVersion(derivedVersion).build();
    return Optional.of(versionedTask);
}
use of com.netflix.titus.api.jobmanager.model.job.Task in project titus-control-plane by Netflix.
the class JobReconciliationFrameworkFactory method newInstance.
/**
 * Creates the reconciliation framework, pre-populated with one engine per job loaded from the store.
 * <p>
 * For every loaded job an engine is restored from the job and its tasks, and then each task is run through
 * {@code validateTask}; tasks that fail validation are reported to the error collector. Once all jobs are
 * processed, {@code errorCollector.failIfTooManyBadRecords()} is invoked before the framework is built.
 *
 * @return a {@link DefaultReconciliationFramework} holding the restored engines
 */
ReconciliationFramework<JobManagerReconcilerEvent> newInstance() {
    List<Pair<Job, List<Task>>> storedJobsWithTasks = loadJobsAndTasksFromStore(errorCollector);

    List<InternalReconciliationEngine<JobManagerReconcilerEvent>> restoredEngines = new ArrayList<>();
    for (Pair<Job, List<Task>> jobWithTasks : storedJobsWithTasks) {
        List<Task> jobTasks = jobWithTasks.getRight();
        restoredEngines.add(newRestoredEngine(jobWithTasks.getLeft(), jobTasks));

        // NOTE(review): validation runs after the engine is built and only feeds the error collector;
        // the task returned by validateTask is not used here -- confirm this is intended.
        jobTasks.stream()
                .filter(candidate -> !validateTask(candidate).isPresent())
                .forEach(rejected -> errorCollector.invalidTaskRecord(rejected.getId()));
    }

    errorCollector.failIfTooManyBadRecords();

    return new DefaultReconciliationFramework<>(
            restoredEngines,
            bootstrapModel -> newEngine(bootstrapModel, true),
            jobManagerConfiguration.getReconcilerIdleTimeoutMs(),
            jobManagerConfiguration.getReconcilerActiveTimeoutMs(),
            jobManagerConfiguration.getCheckpointIntervalMs(),
            INDEX_COMPARATORS,
            JOB_EVENT_FACTORY,
            registry,
            optionalScheduler
    );
}
use of com.netflix.titus.api.jobmanager.model.job.Task in project titus-control-plane by Netflix.
the class JobReconciliationFrameworkFactory method loadJobsAndTasksFromStore.
/**
 * Loads all non-finished jobs, together with their tasks, from the store. The call blocks until loading completes.
 * <p>
 * Corrupted job records, jobs failing {@code validateJob}, and the total number of corrupted task records are
 * reported to the given error collector. The {@code loadedJobs}, {@code loadedTasks} and {@code storeLoadTimeMs}
 * values are updated as a side effect (the load time is recorded on failure as well).
 *
 * @param errorCollector receives the corrupted/invalid record reports gathered during loading
 * @return one pair per loaded job: the validated job and the list of its tasks
 * @throws IllegalStateException if anything fails while loading; the original exception is attached as the cause
 */
private List<Pair<Job, List<Task>>> loadJobsAndTasksFromStore(InitializationErrorCollector errorCollector) {
    final long loadStartedAt = clock.wallTime();

    // Each element pairs a job with (its tasks, an Integer that is summed below as the corrupted task record count).
    List<Pair<Job, Pair<List<Task>, Integer>>> loadedPairs;
    try {
        loadedPairs = store.init()
                .andThen(store.retrieveJobs().flatMap(jobsAndErrors -> {
                    errorCollector.corruptedJobRecords(jobsAndErrors.getRight());

                    List<Job<?>> storedJobs = jobsAndErrors.getLeft();
                    List<Observable<Pair<Job, Pair<List<Task>, Integer>>>> taskLoaders = new ArrayList<>();
                    for (Job storedJob : storedJobs) {
                        // TODO Finished jobs that were not archived immediately should be archived by background archive process
                        if (storedJob.getStatus().getState() == JobState.Finished) {
                            logger.info("Not loading finished job: {}", storedJob.getId());
                            continue;
                        }

                        Optional<Job> validJob = validateJob(storedJob);
                        if (!validJob.isPresent()) {
                            errorCollector.invalidJob(storedJob.getId());
                            continue;
                        }

                        Observable<Pair<Job, Pair<List<Task>, Integer>>> taskLoader = store.retrieveTasksForJob(storedJob.getId())
                                .map(tasksAndErrorCount -> new Pair<>(validJob.get(), tasksAndErrorCount));
                        taskLoaders.add(taskLoader);
                    }
                    // Task retrieval for different jobs is merged with a bounded concurrency.
                    return Observable.merge(taskLoaders, MAX_RETRIEVE_TASK_CONCURRENCY);
                }))
                .toList()
                .toBlocking()
                .singleOrDefault(Collections.emptyList());

        int corruptedTaskTotal = 0;
        for (Pair<Job, Pair<List<Task>, Integer>> loaded : loadedPairs) {
            corruptedTaskTotal += loaded.getRight().getRight();
        }
        errorCollector.corruptedTaskRecords(corruptedTaskTotal);

        int totalTasks = loadedPairs.stream()
                .mapToInt(loaded -> loaded.getRight().getLeft().size())
                .sum();
        loadedJobs.set(loadedPairs.size());
        loadedTasks.set(totalTasks);

        for (Pair<Job, Pair<List<Task>, Integer>> loaded : loadedPairs) {
            List<String> taskSummaries = new ArrayList<>();
            for (Task loadedTask : loaded.getRight().getLeft()) {
                taskSummaries.add(String.format("<%s,ks:%s>", loadedTask.getId(), loadedTask.getStatus().getState()));
            }
            logger.info("Loaded job: {} with tasks: {}", loaded.getLeft().getId(), taskSummaries);
        }
        logger.info("{} jobs and {} tasks loaded from store in {}ms", loadedPairs.size(), totalTasks, clock.wallTime() - loadStartedAt);
    } catch (Exception e) {
        logger.error("Failed to load jobs from the store during initialization:", e);
        throw new IllegalStateException("Failed to load jobs from the store during initialization", e);
    } finally {
        // Recorded for successful and failed loads alike.
        storeLoadTimeMs.set(clock.wallTime() - loadStartedAt);
    }

    // Drop the per-job Integer and keep only (job, tasks).
    List<Pair<Job, List<Task>>> jobsWithTasks = new ArrayList<>();
    for (Pair<Job, Pair<List<Task>, Integer>> loaded : loadedPairs) {
        jobsWithTasks.add(Pair.of(loaded.getLeft(), loaded.getRight().getLeft()));
    }
    return jobsWithTasks;
}
use of com.netflix.titus.api.jobmanager.model.job.Task in project titus-control-plane by Netflix.
the class KubeNotificationProcessor method handlePodUpdatedEvent.
/**
 * Translates a pod notification into a task status update.
 * <p>
 * The method returns an empty {@link Mono} without touching the task when:
 * <ul>
 *     <li>the event carries no pod, no pod status, or no pod phase;</li>
 *     <li>{@code PodToTaskMapper} reports an error instead of a new task status;</li>
 *     <li>the preliminary {@code updateTaskStatus} call against the given task yields no result.</li>
 * </ul>
 * Otherwise a task update is submitted through {@code v3JobOperations.updateTask}.
 *
 * @param event the pod notification being processed
 * @param job   the job owning the task (not referenced in this method)
 * @param task  the task associated with the pod, as known when the event is handled
 * @return the completion of the task update, or an empty {@link Mono} if nothing was submitted
 */
private Mono<Void> handlePodUpdatedEvent(PodEvent event, Job job, Task task) {
    // Basic sanity check. If it fails, we have a major problem with pod state.
    boolean podPhaseKnown = event.getPod() != null
            && event.getPod().getStatus() != null
            && event.getPod().getStatus().getPhase() != null;
    if (!podPhaseKnown) {
        logger.warn("Pod notification with pod without status or phase set: taskId={}, pod={}", task.getId(), event.getPod());
        metricsNoChangesApplied.increment();
        return Mono.empty();
    }

    PodWrapper podWrapper = new PodWrapper(event.getPod());

    // Only update and delete events carry node information.
    Optional<V1Node> node = event instanceof PodUpdatedEvent
            ? ((PodUpdatedEvent) event).getNode()
            : event instanceof PodDeletedEvent
            ? ((PodDeletedEvent) event).getNode()
            : Optional.empty();

    boolean podDeleted = event instanceof PodDeletedEvent;
    PodToTaskMapper statusMapper = new PodToTaskMapper(podWrapper, node, task, podDeleted, containerResultCodeResolver, titusRuntime);
    Either<TaskStatus, String> mappingResult = statusMapper.getNewTaskStatus();
    if (mappingResult.hasError()) {
        logger.info(mappingResult.getError());
        metricsNoChangesApplied.increment();
        return Mono.empty();
    }

    TaskStatus mappedStatus = mappingResult.getValue();
    boolean statusUnchanged = TaskStatus.areEquivalent(task.getStatus(), mappedStatus);
    if (!statusUnchanged) {
        logger.info("Pod notification changes task status: taskId={}, fromStatus={}, toStatus={}, eventSequenceNumber={}", task.getId(), task.getStatus(), mappedStatus, event.getSequenceNumber());
    } else {
        logger.info("Pod change notification does not change task status: taskId={}, status={}, eventSequenceNumber={}", task.getId(), mappedStatus, event.getSequenceNumber());
    }

    // Early check against the task version we were given: if it produces no update, skip the job service
    // call altogether. When an update is produced, the same computation is repeated inside the change
    // function below, against the most up to date task version.
    if (!updateTaskStatus(podWrapper, mappedStatus, node, task, true).isPresent()) {
        return Mono.empty();
    }

    String updateReason = "Pod status updated from kubernetes node (k8phase='" + event.getPod().getStatus().getPhase()
            + "', taskState=" + task.getStatus().getState() + ")";
    return ReactorExt.toMono(v3JobOperations.updateTask(
            task.getId(),
            current -> updateTaskStatus(podWrapper, mappedStatus, node, current, false),
            V3JobOperations.Trigger.Kube,
            updateReason,
            KUBE_CALL_METADATA
    ));
}
Aggregations