use of com.netflix.titus.api.jobmanager.model.job.event.JobUpdateEvent in project titus-control-plane by Netflix.
the class LocalCacheQueryProcessor method toObserveJobsEvent.
/**
 * Maps a job manager event to a {@code JobChangeNotification}, if the event passes the matching predicate.
 * <p>
 * A {@code JobUpdateEvent} is tested with {@code jobsPredicate} against the job and a copy of its tasks
 * read from {@code snapshot}; a {@code TaskUpdateEvent} is tested with {@code tasksPredicate} against the
 * job/task pair. Any other event type, or an event rejected by its predicate, yields an empty result.
 *
 * @param snapshot       snapshot used to look up the tasks of an updated job
 * @param event          event to convert
 * @param now            timestamp passed through to the notification converters
 * @param jobsPredicate  filter applied to job updates
 * @param tasksPredicate filter applied to task updates
 * @param jobFields      field set passed to {@code toGrpcJobEvent}
 * @param taskFields     field set passed to {@code toGrpcTaskEvent}
 * @return the converted notification, or {@link Optional#empty()} if the event is filtered out or unsupported
 */
private Optional<JobChangeNotification> toObserveJobsEvent(JobSnapshot snapshot, JobManagerEvent<?> event, long now, V3JobQueryCriteriaEvaluator jobsPredicate, V3TaskQueryCriteriaEvaluator tasksPredicate, Set<String> jobFields, Set<String> taskFields) {
    if (event instanceof JobUpdateEvent) {
        Job<?> currentJob = ((JobUpdateEvent) event).getCurrent();
        List<com.netflix.titus.api.jobmanager.model.job.Task> jobTasks = new ArrayList<>(snapshot.getTasks(currentJob.getId()).values());
        if (!jobsPredicate.test(Pair.of(currentJob, jobTasks))) {
            return Optional.empty();
        }
        return Optional.of(toGrpcJobEvent(currentJob, now, jobFields));
    }

    if (event instanceof TaskUpdateEvent) {
        TaskUpdateEvent taskUpdate = (TaskUpdateEvent) event;
        Job<?> owningJob = taskUpdate.getCurrentJob();
        com.netflix.titus.api.jobmanager.model.job.Task currentTask = taskUpdate.getCurrentTask();
        if (!tasksPredicate.test(Pair.of(owningJob, currentTask))) {
            return Optional.empty();
        }
        return Optional.of(toGrpcTaskEvent(currentTask, taskUpdate.isMovedFromAnotherJob(), now, taskFields));
    }

    // Neither a job nor a task update: nothing to emit.
    return Optional.empty();
}
use of com.netflix.titus.api.jobmanager.model.job.event.JobUpdateEvent in project titus-control-plane by Netflix.
the class ObserveJobsCommand method executeOnce.
/**
 * Subscribes to the event stream once, logs/records the received events, and blocks the caller until
 * the stream terminates (with an error or completion) or, when {@code snapshotOnly} is set, until the
 * snapshot marker is received. The subscription is always disposed before returning, including when
 * the waiting thread is interrupted.
 *
 * @param events       event stream to subscribe to
 * @param metrics      collector that records each job/task event; events seen before the snapshot marker
 *                     are flagged via its second argument
 * @param printLatency if true, log the propagation trace returned by {@code metrics}, when present
 * @param printEvents  if true, log every job, task and keep-alive event
 * @param snapshotOnly if true, stop waiting as soon as the snapshot marker arrives
 * @throws InterruptedException if the calling thread is interrupted while waiting for the stream
 */
private void executeOnce(Flux<JobManagerEvent<?>> events, JobEventPropagationMetrics metrics, boolean printLatency, boolean printEvents, boolean snapshotOnly) throws InterruptedException {
    CountDownLatch latch = new CountDownLatch(1);
    AtomicBoolean snapshotRead = new AtomicBoolean();
    Stopwatch stopwatch = Stopwatch.createStarted();
    Disposable disposable = events.subscribe(next -> {
        if (next == JobManagerEvent.snapshotMarker()) {
            logger.info("Emitted: snapshot marker in {}ms", stopwatch.elapsed(TimeUnit.MILLISECONDS));
            snapshotRead.set(true);
            if (snapshotOnly) {
                latch.countDown();
            }
        } else if (next instanceof JobUpdateEvent) {
            Job<?> job = ((JobUpdateEvent) next).getCurrent();
            if (printEvents) {
                logger.info("Emitted job update: jobId={}({}), jobState={}, version={}", job.getId(), next.isArchived() ? "archived" : job.getStatus().getState(), job.getStatus(), job.getVersion());
            }
            // Second argument is true while the snapshot marker has not been seen yet.
            Optional<EventPropagationTrace> trace = metrics.recordJob(job, !snapshotRead.get());
            if (printLatency) {
                trace.ifPresent(t -> logger.info("Event propagation data: stages={}", t));
            }
        } else if (next instanceof TaskUpdateEvent) {
            Task task = ((TaskUpdateEvent) next).getCurrent();
            if (printEvents) {
                logger.info("Emitted task update: jobId={}({}), taskId={}, taskState={}, version={}", task.getJobId(), next.isArchived() ? "archived" : task.getStatus().getState(), task.getId(), task.getStatus(), task.getVersion());
            }
            Optional<EventPropagationTrace> trace = metrics.recordTask(task, !snapshotRead.get());
            if (printLatency) {
                trace.ifPresent(t -> logger.info("Event propagation data: {}", t));
            }
        } else if (next instanceof JobKeepAliveEvent) {
            if (printEvents) {
                logger.info("Keep alive response: {}", next);
            }
        } else {
            logger.info("Unrecognized event type: {}", next);
        }
    }, e -> {
        ErrorReports.handleReplyError("Error in the event stream", e);
        latch.countDown();
    }, () -> {
        logger.info("Event stream closed");
        latch.countDown();
    });
    try {
        latch.await();
    } finally {
        // Dispose even if await() is interrupted, so the subscription is never left open.
        disposable.dispose();
    }
}
use of com.netflix.titus.api.jobmanager.model.job.event.JobUpdateEvent in project titus-control-plane by Netflix.
the class JobUtil method loadActiveJobsAndTasks.
/**
 * Reads the job event stream up to the snapshot marker and collects the jobs in the
 * {@code JobState.Accepted} state together with their tasks.
 * <p>
 * The stream is bounded with {@code takeUntil}, so the upstream subscription is cancelled once the
 * snapshot marker has been delivered instead of being abandoned while still open.
 * A task is kept only if its job was already recorded as active when the task event arrived.
 *
 * @param context command context providing the job management client
 * @return pair of (active jobs by job id, tasks by job id and then task id)
 */
public static Pair<Map<String, Job>, Map<String, Map<String, Task>>> loadActiveJobsAndTasks(CommandContext context) {
    Map<String, Job> activeJobs = new HashMap<>();
    Map<String, Map<String, Task>> activeTasks = new HashMap<>();
    // takeUntil relays the matching element (the marker) and then completes, cancelling the upstream.
    Iterator<JobManagerEvent<?>> it = context.getJobManagementClient()
            .observeJobs(Collections.emptyMap())
            .takeUntil(candidate -> candidate.equals(JobManagerEvent.snapshotMarker()))
            .toIterable()
            .iterator();
    while (it.hasNext()) {
        JobManagerEvent<?> event = it.next();
        if (event instanceof JobUpdateEvent) {
            JobUpdateEvent je = (JobUpdateEvent) event;
            Job job = je.getCurrent();
            if (job.getStatus().getState() == JobState.Accepted) {
                activeJobs.put(job.getId(), job);
            }
        } else if (event instanceof TaskUpdateEvent) {
            TaskUpdateEvent te = (TaskUpdateEvent) event;
            Task task = te.getCurrent();
            if (activeJobs.containsKey(task.getJobId())) {
                activeTasks.computeIfAbsent(task.getJobId(), j -> new HashMap<>()).put(task.getId(), task);
            }
        } else if (event.equals(JobManagerEvent.snapshotMarker())) {
            // Last element relayed by takeUntil; the iterator is exhausted after this.
            break;
        }
    }
    return Pair.of(activeJobs, activeTasks);
}
use of com.netflix.titus.api.jobmanager.model.job.event.JobUpdateEvent in project titus-control-plane by Netflix.
the class StreamDataReplicatorPerf method main.
/**
 * Manual harness: wires a job data replicator to a mocked client whose event stream emits a job
 * update and the snapshot marker, then a task update after one second, then fails with a simulated
 * error one second later. Replicator events are printed to standard output, and the main thread
 * sleeps so the process stays up.
 *
 * @param args unused
 * @throws InterruptedException if the main thread is interrupted while sleeping
 */
public static void main(String[] args) throws InterruptedException {
    TitusRuntime titusRuntime = TitusRuntimes.internal();
    JobConnectorConfiguration configuration = Mockito.mock(JobConnectorConfiguration.class);
    JobManagementClient client = Mockito.mock(JobManagementClient.class);

    // Each subscription gets a freshly built stream: snapshot, delayed task update, delayed failure.
    Mockito.when(client.observeJobs(ArgumentMatchers.any())).thenAnswer(invocation -> Flux.defer(() -> {
        JobManagerEvent newJobEvent = JobUpdateEvent.newJob(JOB, JobManagerConstants.GRPC_REPLICATOR_CALL_METADATA);
        JobManagerEvent newTaskEvent = TaskUpdateEvent.newTask(JOB, TASK, JobManagerConstants.GRPC_REPLICATOR_CALL_METADATA);
        return Flux.just(newJobEvent, JobManagerEvent.snapshotMarker())
                .concatWith(Flux.interval(Duration.ofSeconds(1)).take(1).map(ignored -> newTaskEvent))
                .concatWith(Flux.interval(Duration.ofSeconds(1)).take(1).flatMap(ignored -> Flux.error(new RuntimeException("Simulated error"))));
    }));

    JobDataReplicator replicator = new JobDataReplicatorProvider(
            configuration,
            client,
            JobSnapshotFactories.newDefault(titusRuntime),
            titusRuntime
    ).get();
    replicator.events().subscribe(System.out::println);

    Thread.sleep(3600_000);
}
use of com.netflix.titus.api.jobmanager.model.job.event.JobUpdateEvent in project titus-control-plane by Netflix.
the class DefaultV3JobOperations method enterActiveMode.
/**
 * Activation hook: creates the reconciliation framework, attaches the transaction logger and the
 * event-stream error gauge, subscribes a handler that passes job model updates to
 * {@code handleJobCompletedEvent}, and finally starts the framework.
 */
@Activator
public void enterActiveMode() {
    this.reconciliationFramework = jobReconciliationFrameworkFactory.newInstance();

    // Known issue: the event stream can break permanently and cannot be retried. Until the root
    // cause is fixed, the time of the last error is tracked so the condition can be detected.
    AtomicLong lastStreamErrorTime = new AtomicLong();
    Clock clock = titusRuntime.getClock();
    this.transactionLoggerSubscription = JobTransactionLogger.logEvents(reconciliationFramework, lastStreamErrorTime, clock);
    // Gauge reports 0 while no error is recorded, otherwise the time elapsed since the recorded value.
    PolledMeter.using(titusRuntime.getRegistry())
            .withName(METRIC_EVENT_STREAM_LAST_ERROR)
            .monitorValue(lastStreamErrorTime, errorTime -> errorTime.get() <= 0 ? 0 : clock.wallTime() - errorTime.get());

    // Remove finished jobs from the reconciliation framework: on every (re)subscription each entity
    // currently held is passed through the completion handler before events are processed.
    Observable<JobManagerReconcilerEvent> bufferedReconcilerEvents = reconciliationFramework.events()
            .onBackpressureBuffer(
                    OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE,
                    () -> logger.warn("Overflowed the buffer size: " + OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE),
                    BackpressureOverflow.ON_OVERFLOW_ERROR
            )
            .doOnSubscribe(() -> {
                List<EntityHolder> currentHolders = reconciliationFramework.orderedView(IndexKind.StatusCreationTime);
                currentHolders.forEach(holder -> handleJobCompletedEvent(holder));
            });

    this.reconcilerEventSubscription = titusRuntime.persistentStream(bufferedReconcilerEvents).subscribe(
            event -> {
                // Only job model updates are relevant here; all other events are ignored.
                if (event instanceof JobModelUpdateReconcilerEvent) {
                    handleJobCompletedEvent(((JobModelUpdateReconcilerEvent) event).getChangedEntityHolder());
                }
            },
            e -> logger.error("Event stream terminated with an error", e),
            () -> logger.info("Event stream completed")
    );

    reconciliationFramework.start();
}
Aggregations