use of com.netflix.titus.runtime.connector.jobmanager.snapshot.JobSnapshot in project titus-control-plane by Netflix.
the class GrpcJobReplicatorEventStreamTest method testCacheBootstrap.
/**
 * Verifies that the first replicator event received after connecting carries a snapshot
 * containing the jobs and tasks that were created in the stub before the connection was made.
 */
@Test
public void testCacheBootstrap() {
    jobServiceStub.creteMultipleJobsAndTasks(SERVICE_JOB, BATCH_JOB);

    newConnectVerifier()
            .assertNext(firstEvent -> {
                assertThat(firstEvent).isNotNull();

                JobSnapshot snapshot = firstEvent.getSnapshot();
                // Two job descriptors were submitted above, so two jobs are expected.
                assertThat(snapshot.getJobMap()).hasSize(2);
                assertThat(snapshot.getTaskMap()).hasSize(SERVICE_DESIRED + BATCH_DESIRED);
            })
            .thenCancel()
            .verify();
}
use of com.netflix.titus.runtime.connector.jobmanager.snapshot.JobSnapshot in project titus-control-plane by Netflix.
the class GrpcJobReplicatorEventStream method newConnection.
/**
 * Opens a new subscription to the job event stream and adapts it into replicator events.
 * <p>
 * Every incoming {@link JobManagerEvent} is handed to a fresh {@code CacheUpdater}; whenever the updater
 * produces a result, that result is emitted on the returned flux. When the connection timeout is enabled
 * in the configuration, the flux terminates with a {@link TimeoutException} if the stream does not deliver
 * its first event within the configured number of milliseconds.
 *
 * @return flux of replicator events; {@code subscriptionCounter} is incremented on subscribe and
 * decremented when the flux terminates for any reason
 */
@Override
protected Flux<ReplicatorEvent<JobSnapshot, JobManagerEvent<?>>> newConnection() {
    return Flux.<ReplicatorEvent<JobSnapshot, JobManagerEvent<?>>>create(sink -> {
                CacheUpdater updater = new CacheUpdater(jobSnapshotFactory, keepAliveEnabled, titusRuntime);
                logger.info("Connecting to the job event stream (filteringCriteria={})...", filteringCriteria);

                // Published so that the timeout guard and the main consumer share a single upstream subscription.
                ConnectableFlux<JobManagerEvent<?>> sharedStream = client.observeJobs(filteringCriteria).publish();

                Flux<JobManagerEvent<?>> eventSource = sharedStream;
                if (configuration.isConnectionTimeoutEnabled()) {
                    // Emits nothing itself; it only fails if the first event does not arrive in time.
                    Flux<JobManagerEvent<?>> firstEventInTime = sharedStream
                            .take(1)
                            .timeout(Duration.ofMillis(configuration.getConnectionTimeoutMs()));
                    eventSource = Flux.merge(
                            firstEventInTime
                                    .ignoreElements()
                                    .onErrorMap(TimeoutException.class, error -> new TimeoutException(String.format("No event received from stream in %sms", configuration.getConnectionTimeoutMs()))),
                            sharedStream
                    );
                }

                Disposable eventSubscription = eventSource.subscribe(
                        jobEvent -> {
                            long startedAt = titusRuntime.getClock().wallTime();
                            try {
                                updater.onEvent(jobEvent).ifPresent(sink::next);
                                // Latency is recorded only for events that were processed without an exception.
                                eventProcessingLatencies.recordLevel(titusRuntime.getClock().wallTime() - startedAt);
                            } catch (Exception e) {
                                // Throw error to force the cache reconnect.
                                logger.warn("Unexpected error when handling the job change notification: {}", jobEvent, e);
                                ExceptionExt.silent(() -> sink.error(e));
                            }
                        },
                        e -> ExceptionExt.silent(() -> sink.error(e)),
                        () -> ExceptionExt.silent(sink::complete)
                );
                sink.onDispose(eventSubscription);

                // Connect last, after the subscriber above has been attached to the shared stream.
                // NOTE(review): the Disposable returned by connect() is not retained - confirm the upstream
                // is released when the sink is disposed.
                sharedStream.connect();
            })
            .doOnSubscribe(subscription -> subscriptionCounter.incrementAndGet())
            .doFinally(signal -> subscriptionCounter.decrementAndGet());
}
use of com.netflix.titus.runtime.connector.jobmanager.snapshot.JobSnapshot in project titus-control-plane by Netflix.
the class GrpcJobReplicatorEventStreamTest method testCacheSnapshotFiltersCompletedJobs.
/**
 * Feeds a {@code CacheUpdater} a job and task that both reach the Finished state before the snapshot
 * marker arrives, and verifies that the snapshot emitted on the marker contains neither of them.
 */
@Test
public void testCacheSnapshotFiltersCompletedJobs() {
    // Fixtures: one batch job with one task, each in an accepted and a finished variant.
    Job<?> acceptedJob = JobGenerator.oneBatchJob();
    TaskStatus acceptedTaskStatus = TaskStatus.newBuilder().withState(TaskState.Accepted).build();
    BatchJobTask acceptedTask = JobGenerator.oneBatchTask().toBuilder()
            .withJobId(acceptedJob.getId())
            .withStatus(acceptedTaskStatus)
            .build();

    JobStatus finishedJobStatus = JobStatus.newBuilder().withState(JobState.Finished).build();
    Job<?> finishedJob = JobFunctions.changeJobStatus(acceptedJob, finishedJobStatus);
    TaskStatus finishedTaskStatus = TaskStatus.newBuilder().withState(TaskState.Finished).build();
    Task finishedTask = JobFunctions.changeTaskStatus(acceptedTask, finishedTaskStatus);

    CacheUpdater updater = new CacheUpdater(JobSnapshotFactories.newDefault(titusRuntime), false, titusRuntime);

    // Nothing is emitted by the updater until the snapshot marker is seen.
    assertThat(updater.onEvent(
            JobUpdateEvent.newJob(acceptedJob, CallMetadataConstants.UNDEFINED_CALL_METADATA)
    )).isEmpty();
    assertThat(updater.onEvent(
            TaskUpdateEvent.newTask(acceptedJob, acceptedTask, CallMetadataConstants.UNDEFINED_CALL_METADATA)
    )).isEmpty();
    assertThat(updater.onEvent(
            TaskUpdateEvent.taskChange(acceptedJob, finishedTask, acceptedTask, CallMetadataConstants.UNDEFINED_CALL_METADATA)
    )).isEmpty();
    assertThat(updater.onEvent(
            JobUpdateEvent.jobChange(finishedJob, acceptedJob, CallMetadataConstants.UNDEFINED_CALL_METADATA)
    )).isEmpty();

    ReplicatorEvent<JobSnapshot, JobManagerEvent<?>> markerResult =
            updater.onEvent(JobManagerEvent.snapshotMarker()).orElse(null);
    assertThat(markerResult).isNotNull();

    // The finished job and its finished task must not be part of the snapshot.
    JobSnapshot snapshot = markerResult.getSnapshot();
    assertThat(snapshot.getJobMap()).isEmpty();
    assertThat(snapshot.getTaskMap()).isEmpty();
}
use of com.netflix.titus.runtime.connector.jobmanager.snapshot.JobSnapshot in project titus-control-plane by Netflix.
the class GrpcJobReplicatorEventStreamTest method testCacheTaskMove.
/**
 * Moves a started task from one service job to another and verifies the replicator events recorded
 * for the move: job updates for the source/target jobs and a task update flagged as moved from
 * another job, carrying the source job id in its task context.
 */
@Test
public void testCacheTaskMove() {
    Pair<Job, List<Task>> sourceJobAndTasks = jobServiceStub.createJobAndTasks(SERVICE_JOB);
    Job targetJob = jobServiceStub.createJob(SERVICE_JOB);
    Task movedTask = sourceJobAndTasks.getRight().get(0);
    String sourceJobId = sourceJobAndTasks.getLeft().getId();
    String targetJobId = targetJob.getId();
    CallMetadata moveCallMetadata = CallMetadata.newBuilder()
            .withCallerId("Test")
            .withCallReason("testing")
            .build();

    List<ReplicatorEvent<JobSnapshot, JobManagerEvent<?>>> events = new ArrayList<>();

    newConnectVerifier()
            // Initial snapshot: every task is still in the Accepted state.
            .assertNext(next -> assertThat(next.getSnapshot().getTaskMap().values())
                    .allSatisfy(t -> assertThat(t.getStatus().getState()).isEqualTo(TaskState.Accepted)))
            .then(() -> jobServiceStub.moveTaskToState(movedTask, TaskState.Started))
            // After the state change the task is Started and still belongs to the source job.
            .assertNext(next -> {
                JobSnapshot current = next.getSnapshot();
                Optional<Pair<Job<?>, Task>> found = current.findTaskById(movedTask.getId());
                assertThat(found).isPresent();
                assertThat(found.get().getRight().getStatus().getState()).isEqualTo(TaskState.Started);
                assertThat(current.getTasks(sourceJobId)).containsKey(movedTask.getId());
            })
            .then(() -> jobServiceStub.getJobOperations()
                    .moveServiceTask(sourceJobId, targetJobId, movedTask.getId(), moveCallMetadata)
                    .test()
                    .awaitTerminalEvent()
                    .assertNoErrors())
            .recordWith(() -> events)
            // Keep consuming until the task update flagged as a move shows up.
            .thenConsumeWhile(next -> {
                JobManagerEvent<?> trigger = next.getTrigger();
                return !(trigger instanceof TaskUpdateEvent)
                        || !((TaskUpdateEvent) trigger).isMovedFromAnotherJob();
            })
            .thenCancel()
            .verify();

    assertThat(events).hasSize(3);
    for (ReplicatorEvent<JobSnapshot, JobManagerEvent<?>> recorded : events) {
        JobManagerEvent<?> trigger = recorded.getTrigger();
        if (trigger instanceof JobUpdateEvent) {
            String eventJobId = ((JobUpdateEvent) trigger).getCurrent().getId();
            assertThat(eventJobId).isIn(sourceJobId, targetJobId);
        } else if (trigger instanceof TaskUpdateEvent) {
            TaskUpdateEvent taskUpdate = (TaskUpdateEvent) trigger;
            assertThat(taskUpdate.isMovedFromAnotherJob()).isTrue();
            assertThat(taskUpdate.getCurrentJob().getId()).isEqualTo(targetJobId);
            assertThat(taskUpdate.getCurrent().getJobId()).isEqualTo(targetJobId);
            assertThat(taskUpdate.getCurrent().getTaskContext().get(TaskAttributes.TASK_ATTRIBUTES_MOVED_FROM_JOB)).isEqualTo(sourceJobId);
        } else {
            fail("Unexpected event type: %s", trigger);
        }
    }
}
use of com.netflix.titus.runtime.connector.jobmanager.snapshot.JobSnapshot in project titus-control-plane by Netflix.
the class LocalCacheQueryProcessor method observeJobs.
/**
 * Streams job change notifications built from the job data replicator's event stream.
 * <p>
 * For each subscriber, the first replicator event produces the notifications built from the current
 * snapshot, followed by the notification for that event itself (if there is one). Later events are
 * translated one by one. The stream is aborted with {@code Status.ABORTED} when a snapshot marker
 * is seen after the first event, or when a keep-alive event arrives while the replicator staleness
 * exceeds the configured limit.
 *
 * @param query query providing the job/task filtering criteria and the field sets to include
 * @return observable of job change notifications matching the query
 */
public Observable<JobChangeNotification> observeJobs(ObserveJobsQuery query) {
    JobQueryCriteria<TaskStatus.TaskState, JobDescriptor.JobSpecCase> criteria = toJobQueryCriteria(query);
    V3JobQueryCriteriaEvaluator jobsPredicate = new V3JobQueryCriteriaEvaluator(criteria, titusRuntime);
    V3TaskQueryCriteriaEvaluator tasksPredicate = new V3TaskQueryCriteriaEvaluator(criteria, titusRuntime);
    Set<String> jobFields = newFieldsFilter(query.getJobFieldsList(), JOB_MINIMUM_FIELD_SET);
    Set<String> taskFields = newFieldsFilter(query.getTaskFieldsList(), TASK_MINIMUM_FIELD_SET);

    Flux<JobChangeNotification> eventStream = Flux.defer(() -> {
        // Per-subscription flag: true until the first replicator event has been handled.
        AtomicBoolean firstEventPending = new AtomicBoolean(true);

        return jobDataReplicator.events()
                .subscribeOn(scheduler)
                .publishOn(scheduler)
                .flatMap(replicated -> {
                    JobManagerEvent<?> jobManagerEvent = replicated.getRight();
                    long now = titusRuntime.getClock().wallTime();
                    JobSnapshot snapshot = replicated.getLeft();
                    Optional<JobChangeNotification> grpcEvent = toObserveJobsEvent(snapshot, jobManagerEvent, now, jobsPredicate, tasksPredicate, jobFields, taskFields);

                    // The very first event is preceded by the full snapshot.
                    if (firstEventPending.getAndSet(false)) {
                        List<JobChangeNotification> initialEvents = buildSnapshot(snapshot, now, jobsPredicate, tasksPredicate, jobFields, taskFields);
                        grpcEvent.ifPresent(initialEvents::add);
                        return Flux.fromIterable(initialEvents);
                    }

                    // A snapshot marker after the first event indicates that the underlying GRPC stream
                    // was disconnected; abort so the client subscribes again.
                    if (jobManagerEvent == JobManagerEvent.snapshotMarker()) {
                        return Mono.error(new StatusRuntimeException(
                                Status.ABORTED.augmentDescription("Downstream event stream reconnected.")
                        ));
                    }

                    // Keep-alive events are not forwarded to the client; they only trigger the staleness check.
                    if (jobManagerEvent instanceof JobKeepAliveEvent) {
                        boolean tooStale = jobDataReplicator.getStalenessMs() > configuration.getObserveJobsStalenessDisconnectMs();
                        if (!tooStale) {
                            return Mono.empty();
                        }
                        rejectedByStalenessTooHighMetric.increment();
                        return Mono.error(new StatusRuntimeException(
                                Status.ABORTED.augmentDescription("Data staleness in the event stream is too high. Most likely caused by connectivity issue to the downstream server.")
                        ));
                    }

                    return grpcEvent.map(Flux::just).orElseGet(Flux::empty);
                });
    });

    return ReactorExt.toObservable(eventStream);
}
Aggregations