use of com.netflix.titus.common.runtime.TitusRuntime in project titus-control-plane by Netflix.
the class GrpcJobReplicatorEventStream method newConnection.
/**
 * Creates a new subscription to the job event stream and converts incoming job events into
 * replicator events by passing each of them through a fresh {@code CacheUpdater}.
 * <p>
 * When the connection timeout is enabled in the configuration, the stream is additionally merged with
 * a guard that fails with a {@link TimeoutException} if the first event does not arrive within the configured
 * number of milliseconds. The {@code subscriptionCounter} is incremented on subscribe and decremented when the
 * returned flux terminates for any reason.
 *
 * @return a flux of replicator events built from the job event stream
 */
@Override
protected Flux<ReplicatorEvent<JobSnapshot, JobManagerEvent<?>>> newConnection() {
    return Flux
            .<ReplicatorEvent<JobSnapshot, JobManagerEvent<?>>>create(sink -> {
                CacheUpdater updater = new CacheUpdater(jobSnapshotFactory, keepAliveEnabled, titusRuntime);
                logger.info("Connecting to the job event stream (filteringCriteria={})...", filteringCriteria);

                // Published but not yet connected: the upstream is only subscribed by connect() at the very end,
                // after all downstream subscribers are attached.
                ConnectableFlux<JobManagerEvent<?>> sharedStream = client.observeJobs(filteringCriteria).publish();

                Flux<JobManagerEvent<?>> eventStream;
                if (!configuration.isConnectionTimeoutEnabled()) {
                    eventStream = sharedStream;
                } else {
                    // The first branch emits nothing; it only errors if no event shows up in time.
                    eventStream = Flux.merge(
                            sharedStream
                                    .take(1)
                                    .timeout(Duration.ofMillis(configuration.getConnectionTimeoutMs()))
                                    .ignoreElements()
                                    .onErrorMap(TimeoutException.class, timeout -> new TimeoutException(
                                            String.format("No event received from stream in %sms", configuration.getConnectionTimeoutMs())
                                    )),
                            sharedStream
                    );
                }

                Disposable upstream = eventStream.subscribe(
                        jobEvent -> {
                            long startTime = titusRuntime.getClock().wallTime();
                            try {
                                updater.onEvent(jobEvent).ifPresent(sink::next);
                                eventProcessingLatencies.recordLevel(titusRuntime.getClock().wallTime() - startTime);
                            } catch (Exception e) {
                                // Propagate the failure downstream to force the cache reconnect.
                                logger.warn("Unexpected error when handling the job change notification: {}", jobEvent, e);
                                ExceptionExt.silent(() -> sink.error(e));
                            }
                        },
                        streamError -> ExceptionExt.silent(() -> sink.error(streamError)),
                        () -> ExceptionExt.silent(sink::complete)
                );
                sink.onDispose(upstream);

                sharedStream.connect();
            })
            .doOnSubscribe(subscription -> subscriptionCounter.incrementAndGet())
            .doFinally(signal -> subscriptionCounter.decrementAndGet());
}
use of com.netflix.titus.common.runtime.TitusRuntime in project titus-control-plane by Netflix.
the class TaskEventsGeneratorTest method checkPublisherState.
/**
 * Activates an {@code EsPublisher} backed by mocked clients and, one second later, checks that it reported
 * no publishing errors and published at least {@code numTasks} tasks.
 * <p>
 * The assertions run on the test thread. If they ran inside the reactive callback, a failed assertion would only
 * error the flux (which has no error consumer), the latch would never be released, and the test would still pass
 * after the wait expired.
 */
@Test
public void checkPublisherState() {
    int numTasks = 5;
    final TaskEventsGenerator taskEventsGenerator = new TaskEventsGenerator(mockTitusClient(numTasks), Collections.emptyMap(), titusRuntime);
    try {
        EsPublisher esPublisher = new EsPublisher(taskEventsGenerator, mockElasticSearchClient(), mockEsPublisherConfiguration(), new DefaultRegistry());
        esPublisher.activate();

        // The timer only signals that the publisher had one second to do its work.
        final CountDownLatch latch = new CountDownLatch(1);
        Flux.interval(Duration.ofSeconds(1), Schedulers.elastic())
                .take(1)
                .doOnNext(i -> latch.countDown())
                .subscribe();

        boolean timerFired = false;
        try {
            timerFired = latch.await(2, TimeUnit.MINUTES);
        } catch (InterruptedException e) {
            // Restore the interrupt flag before failing so the test runner can observe it.
            Thread.currentThread().interrupt();
            fail("Interrupted in checkPublisherState ", e);
        }
        // await() returns false on timeout; that must fail the test rather than be ignored.
        if (!timerFired) {
            fail("Timeout in checkPublisherState ");
        }

        final int numTasksUpdated = esPublisher.getNumTasksPublished();
        final int numErrors = esPublisher.getNumErrorsInPublishing();
        assertThat(numErrors).isEqualTo(0);
        assertThat(numTasksUpdated).isGreaterThanOrEqualTo(numTasks);
    } finally {
        taskEventsGenerator.shutdown();
    }
}
use of com.netflix.titus.common.runtime.TitusRuntime in project titus-control-plane by Netflix.
the class KubeNotificationProcessor method handlePodUpdatedEvent.
/**
 * Maps a pod notification to a new task status and, if the mapping produces an update, submits it to the
 * job service as a task change.
 * <p>
 * An empty {@code Mono} is returned without touching the job service when the pod carries no status/phase,
 * when the pod-to-task mapping reports an error, or when the update check against the provided task yields
 * nothing.
 *
 * @param event the pod notification to process
 * @param job   the job owning the task (not used by this method)
 * @param task  the task version the notification is evaluated against
 * @return completion of the task update, or an empty {@code Mono} if nothing is applied
 */
private Mono<Void> handlePodUpdatedEvent(PodEvent event, Job job, Task task) {
    // Basic sanity check. If it fails, we have a major problem with the pod state.
    boolean phaseAvailable = event.getPod() != null
            && event.getPod().getStatus() != null
            && event.getPod().getStatus().getPhase() != null;
    if (!phaseAvailable) {
        logger.warn("Pod notification with pod without status or phase set: taskId={}, pod={}", task.getId(), event.getPod());
        metricsNoChangesApplied.increment();
        return Mono.empty();
    }

    PodWrapper wrappedPod = new PodWrapper(event.getPod());
    boolean podDeleted = event instanceof PodDeletedEvent;

    // Only update and delete events carry node information.
    Optional<V1Node> nodeOpt;
    if (event instanceof PodUpdatedEvent) {
        nodeOpt = ((PodUpdatedEvent) event).getNode();
    } else if (podDeleted) {
        nodeOpt = ((PodDeletedEvent) event).getNode();
    } else {
        nodeOpt = Optional.empty();
    }

    PodToTaskMapper mapper = new PodToTaskMapper(wrappedPod, nodeOpt, task, podDeleted, containerResultCodeResolver, titusRuntime);
    Either<TaskStatus, String> statusOrError = mapper.getNewTaskStatus();
    if (statusOrError.hasError()) {
        logger.info(statusOrError.getError());
        metricsNoChangesApplied.increment();
        return Mono.empty();
    }

    TaskStatus updatedStatus = statusOrError.getValue();
    boolean statusChanged = !TaskStatus.areEquivalent(task.getStatus(), updatedStatus);
    if (statusChanged) {
        logger.info("Pod notification changes task status: taskId={}, fromStatus={}, toStatus={}, eventSequenceNumber={}", task.getId(), task.getStatus(), updatedStatus, event.getSequenceNumber());
    } else {
        logger.info("Pod change notification does not change task status: taskId={}, status={}, eventSequenceNumber={}", task.getId(), updatedStatus, event.getSequenceNumber());
    }

    // Evaluate the update against the task version we hold before creating a change action in the job
    // service. If that yields nothing, return immediately; otherwise the same function runs again inside
    // the change action, against the task version supplied by the job service.
    boolean updateNeeded = updateTaskStatus(wrappedPod, updatedStatus, nodeOpt, task, true).isPresent();
    if (!updateNeeded) {
        return Mono.empty();
    }

    String reason = "Pod status updated from kubernetes node (k8phase='" + event.getPod().getStatus().getPhase() + "', taskState=" + task.getStatus().getState() + ")";
    return ReactorExt.toMono(v3JobOperations.updateTask(
            task.getId(),
            current -> updateTaskStatus(wrappedPod, updatedStatus, nodeOpt, current, false),
            V3JobOperations.Trigger.Kube,
            reason,
            KUBE_CALL_METADATA
    ));
}
use of com.netflix.titus.common.runtime.TitusRuntime in project titus-control-plane by Netflix.
the class ObserveJobsSubscription method tryInitialize.
/**
 * Subscribes to the job service event stream for the most recently received {@code ObserveJobsQuery}.
 * <p>
 * If no query is available yet, nothing is set up and {@code false} is returned. Otherwise the event stream is
 * built from the query criteria, subscribed to (storing the subscription in {@code jobServiceSubscription}),
 * {@code grpcStreamInitiated} is set, and {@code true} is returned.
 *
 * @return {@code true} if the subscription was created, {@code false} if there is no query to act on yet
 */
private boolean tryInitialize() {
ObserveJobsQuery query = getLastObserveJobsQueryEvent();
if (query == null) {
return false;
}
// The stopwatch and the random transaction id are passed to every metrics call made for this subscription.
Stopwatch start = Stopwatch.createStarted();
String trxId = UUID.randomUUID().toString();
// Falls back to UNDEFINED_CALL_METADATA when the resolver returns no call metadata.
CallMetadata callMetadata = context.getCallMetadataResolver().resolve().orElse(CallMetadataConstants.UNDEFINED_CALL_METADATA);
metrics.observeJobsStarted(trxId, callMetadata);
// Job and task predicates are both derived from the same query criteria.
JobQueryCriteria<TaskStatus.TaskState, JobDescriptor.JobSpecCase> criteria = toJobQueryCriteria(query);
V3JobQueryCriteriaEvaluator jobsPredicate = new V3JobQueryCriteriaEvaluator(criteria, titusRuntime);
V3TaskQueryCriteriaEvaluator tasksPredicate = new V3TaskQueryCriteriaEvaluator(criteria, titusRuntime);
// Pipeline: drop archived events unless withArchived is set, move processing to the observe-jobs scheduler,
// convert each event to its gRPC form, and log any stream error.
// NOTE(review): ObservableExt.head presumably emits the supplied snapshot (terminated by SNAPSHOT_END_MARKER)
// ahead of the live events - confirm against ObservableExt.
Observable<JobChangeNotification> eventStream = context.getJobOperations().observeJobs(jobsPredicate, tasksPredicate, true).filter(event -> withArchived || !event.isArchived()).observeOn(context.getObserveJobsScheduler()).subscribeOn(context.getObserveJobsScheduler(), false).map(event -> GrpcJobManagementModelConverters.toGrpcJobChangeNotification(event, context.getGrpcObjectsCache(), titusRuntime.getClock().wallTime())).compose(ObservableExt.head(() -> {
List<JobChangeNotification> snapshot = createJobsSnapshot(jobsPredicate, tasksPredicate);
snapshot.add(SNAPSHOT_END_MARKER);
return snapshot;
})).doOnError(e -> logger.error("Unexpected error in jobs event stream", e));
// Guard shared by the unsubscribe, error and completion callbacks: only the first one to run records its
// closing metric.
AtomicBoolean closingProcessed = new AtomicBoolean();
this.jobServiceSubscription = eventStream.doOnUnsubscribe(() -> {
if (!closingProcessed.getAndSet(true)) {
metrics.observeJobsUnsubscribed(trxId, start.elapsed(TimeUnit.MILLISECONDS));
}
}).subscribe(event -> {
// Each event is queued first and drain() is invoked afterwards.
metrics.observeJobsEventEmitted(trxId);
jobServiceEvents.add(event);
drain();
}, e -> {
if (!closingProcessed.getAndSet(true)) {
metrics.observeJobsError(trxId, start.elapsed(TimeUnit.MILLISECONDS), e);
}
// The completion flag and the wrapped INTERNAL error are stored before drain() is called.
jobServiceCompleted = true;
jobServiceError = new StatusRuntimeException(Status.INTERNAL.withDescription("All jobs monitoring stream terminated with an error").withCause(e));
drain();
}, () -> {
if (!closingProcessed.getAndSet(true)) {
metrics.observeJobsCompleted(trxId, start.elapsed(TimeUnit.MILLISECONDS));
}
// The completion flag is set before drain() is called.
jobServiceCompleted = true;
drain();
});
this.grpcStreamInitiated = true;
return true;
}
use of com.netflix.titus.common.runtime.TitusRuntime in project titus-control-plane by Netflix.
the class SupervisorServiceModule method getLocalMasterInstanceResolver.
/**
 * Provides a {@link LocalMasterInstanceResolver} seeded with a bootstrap {@link MasterInstance} description.
 * <p>
 * MasterInstance data contain a lot of deployment specific details, so this binding is provided for
 * completeness/as an example only. It should be overridden by deployment specific configuration.
 * <p>
 * The bootstrap instance uses the first local IPv4 address (or {@code 127.0.0.1} if none is found), a single
 * non-secure GRPC server port, an empty status history and a {@code Starting} status stamped with the current
 * wall clock time.
 */
@Provides
@Singleton
public LocalMasterInstanceResolver getLocalMasterInstanceResolver(SupervisorConfiguration configuration, GrpcMasterEndpointConfiguration grpcServerConfiguration, LocalMasterReadinessResolver localMasterReadinessResolver, TitusRuntime titusRuntime) {
    ServerPort grpcEndpoint = ServerPort.newBuilder()
            .withPortNumber(grpcServerConfiguration.getPort())
            .withSecure(false)
            .withProtocol("grpc")
            .withDescription("TitusMaster GRPC endpoint")
            .build();

    // First IPv4 address of this host; loopback if none could be resolved.
    String localAddress = NetworkExt.getLocalIPs()
            .flatMap(addresses -> addresses.stream().filter(NetworkExt::isIpV4).findFirst())
            .orElse("127.0.0.1");

    MasterStatus bootstrapStatus = MasterStatus.newBuilder()
            .withState(MasterState.Starting)
            .withMessage("Bootstrapping")
            .withTimestamp(titusRuntime.getClock().wallTime())
            .build();

    MasterInstance bootstrapInstance = MasterInstance.newBuilder()
            .withInstanceId(configuration.getTitusMasterInstanceId())
            .withInstanceGroupId(configuration.getTitusMasterInstanceId() + "Group")
            .withIpAddress(localAddress)
            .withStatusHistory(Collections.emptyList())
            .withStatus(bootstrapStatus)
            .withServerPorts(Collections.singletonList(grpcEndpoint))
            .build();

    return new DefaultLocalMasterInstanceResolver(localMasterReadinessResolver, bootstrapInstance);
}
Aggregations