Use of com.netflix.titus.common.framework.scheduler.ExecutionContext in the titus-control-plane project by Netflix.
From the class DefaultClusterMembershipService, method clusterStateEvaluator.
/**
 * Evaluates the local cluster member and adjusts its leadership-group participation and
 * active/inactive registration.
 *
 * <p>The decision is made lazily, at subscription time, from three inputs: the current local member
 * revision, the current local leadership state (both read from {@code connector}) and the result of
 * {@code healthIndicator.health()}:
 * <ul>
 *   <li>Leader election disabled in the configuration, or the local member not enabled: a
 *       {@code NonLeader} leaves the leadership group and, if the connector reports {@code true},
 *       is registered as inactive; a {@code Disabled} member that is still active is registered as
 *       inactive; otherwise nothing is done.</li>
 *   <li>Healthy: a {@code Disabled} member joins the leadership group and is registered as active;
 *       a member that is not active is registered as active; otherwise nothing is done.</li>
 *   <li>Not healthy: a member that is neither {@code Disabled} nor {@code Leader} leaves the
 *       leadership group and, if the connector reports {@code true}, is registered as inactive;
 *       a {@code Disabled} member that is still active is registered as inactive; otherwise
 *       nothing is done.</li>
 * </ul>
 *
 * <p>Errors are logged (message at info level, stack trace at debug level), and the local and
 * sibling metrics are refreshed when the evaluation terminates.
 *
 * @param context scheduler execution context; not read by this method
 * @return a {@code Mono} that performs the evaluation when subscribed to
 */
private Mono<Void> clusterStateEvaluator(ExecutionContext context) {
    Mono<Void> evaluation = Mono.defer(() -> {
        ClusterMember self = connector.getLocalClusterMemberRevision().getCurrent();
        ClusterMemberLeadershipState state = connector.getLocalLeadershipRevision().getCurrent().getLeadershipState();
        HealthStatus healthStatus = healthIndicator.health();

        boolean stateDisabled = state == ClusterMemberLeadershipState.Disabled;
        boolean electionAllowed = configuration.isLeaderElectionEnabled() && self.isEnabled();

        // Case 1: the member is explicitly kept out of the leader election.
        if (!electionAllowed) {
            if (state == ClusterMemberLeadershipState.NonLeader) {
                logger.info("Local member excluded from the leader election. Leaving the leader election process");
                return connector.leaveLeadershipGroup(true).flatMap(left -> {
                    if (!left) {
                        return Mono.<Void>empty();
                    }
                    return connector.register(member -> toInactive(member, "Marked by a user as disabled"))
                            .ignoreElement()
                            .cast(Void.class);
                });
            }
            if (stateDisabled && self.isActive()) {
                return connector.register(member -> toInactive(member, "Marked by a user as disabled"))
                        .ignoreElement()
                        .cast(Void.class);
            }
            return Mono.<Void>empty();
        }

        // Case 2: healthy member, make sure it participates and is marked active.
        if (healthStatus.getHealthState() == HealthState.Healthy) {
            if (stateDisabled) {
                logger.info("Re-enabling local member which is in the disabled state");
                return connector.joinLeadershipGroup()
                        .then(connector.register(this::toActive).ignoreElement().cast(Void.class));
            }
            if (!self.isActive()) {
                return connector.register(this::toActive).ignoreElement().cast(Void.class);
            }
            return Mono.<Void>empty();
        }

        // Case 3: unhealthy member; a current leader is deliberately left untouched.
        if (!stateDisabled && state != ClusterMemberLeadershipState.Leader) {
            logger.info("Disabling local member as it is unhealthy: {}", healthStatus);
            return connector.leaveLeadershipGroup(true).flatMap(left -> {
                if (!left) {
                    return Mono.<Void>empty();
                }
                return connector.register(member -> toInactive(member, "Unhealthy: " + healthStatus))
                        .ignoreElement()
                        .cast(Void.class);
            });
        }
        if (stateDisabled && self.isActive()) {
            return connector.register(member -> toInactive(member, "Unhealthy: " + healthStatus))
                    .ignoreElement()
                    .cast(Void.class);
        }
        return Mono.<Void>empty();
    });

    Mono<Void> withErrorLogging = evaluation.doOnError(failure -> {
        logger.info("Cluster membership health evaluation error: {}", failure.getMessage());
        logger.debug("Stack trace", failure);
    });

    return withErrorLogging.doOnTerminate(() -> {
        // Re-read the state here: the evaluation above may have changed it.
        ClusterMemberLeadershipState latestState = connector.getLocalLeadershipRevision().getCurrent().getLeadershipState();
        metrics.updateLocal(latestState, healthIndicator.health());
        metrics.updateSiblings(connector.getClusterMemberSiblings());
    });
}
Use of com.netflix.titus.common.framework.scheduler.ExecutionContext in the titus-control-plane project by Netflix.
From the class DefaultNodeConditionControllerTest, method checkTasksTerminatedDueToBadNodeConditions.
/**
 * Runs one iteration of the node condition controller against mocked nodes, jobs and tasks, with
 * task termination enabled and the bad-condition pattern {@code ".*Failure"}, and expects exactly
 * two tasks to be killed. The killed task ids are then cross-checked by
 * {@code verifyTerminatedTasksOnBadNodes}.
 */
@Test
public void checkTasksTerminatedDueToBadNodeConditions() {
    // Fixture data: nodes, jobs created with getJobs(true), and tasks placed on the nodes.
    // NOTE(review): getJobs(true) presumably creates jobs that opt in to termination - confirm in the helper.
    Map<String, TitusNode> nodes = buildNodes();
    List<Job<BatchJobExt>> batchJobs = getJobs(true);
    Map<String, List<Task>> tasksByJobId = buildTasksForJobAndNodeAssignment(new ArrayList<>(nodes.values()), batchJobs);

    // Data sources reporting zero job staleness and the fixture nodes.
    NodeDataResolver nodeResolver = mock(NodeDataResolver.class);
    when(nodeResolver.resolve()).thenReturn(nodes);
    JobDataReplicator jobReplicator = mock(JobDataReplicator.class);
    when(jobReplicator.getStalenessMs()).thenReturn(0L);

    // Job operations backed by the fixture jobs and tasks.
    ReadOnlyJobOperations jobOperations = mock(ReadOnlyJobOperations.class);
    when(jobOperations.getJobs()).thenReturn(new ArrayList<>(batchJobs));
    for (Map.Entry<String, List<Task>> entry : tasksByJobId.entrySet()) {
        when(jobOperations.getTasks(entry.getKey())).thenReturn(entry.getValue());
    }

    // Controller configuration and runtime.
    RelocationConfiguration relocationConfig = mock(RelocationConfiguration.class);
    when(relocationConfig.getBadNodeConditionPattern()).thenReturn(".*Failure");
    when(relocationConfig.isTaskTerminationOnBadNodeConditionEnabled()).thenReturn(true);
    TitusRuntime runtime = mock(TitusRuntime.class);
    when(runtime.getRegistry()).thenReturn(new DefaultRegistry());

    // Record every task id passed to killTask instead of killing anything.
    Set<String> killedTaskIds = new HashSet<>();
    JobManagementClient managementClient = mock(JobManagementClient.class);
    when(managementClient.killTask(anyString(), anyBoolean(), any())).thenAnswer(invocation -> {
        killedTaskIds.add(invocation.getArgument(0));
        return Mono.empty();
    });

    DefaultNodeConditionController controller = new DefaultNodeConditionController(
            relocationConfig, nodeResolver, jobReplicator, jobOperations, managementClient, runtime);
    ExecutionContext firstIteration = ExecutionContext.newBuilder().withIteration(ExecutionId.initial()).build();

    StepVerifier.create(controller.handleNodesWithBadCondition(firstIteration)).verifyComplete();

    assertThat(killedTaskIds).isNotEmpty();
    assertThat(killedTaskIds.size()).isEqualTo(2);
    verifyTerminatedTasksOnBadNodes(killedTaskIds, tasksByJobId, nodes);
}
Use of com.netflix.titus.common.framework.scheduler.ExecutionContext in the titus-control-plane project by Netflix.
From the class DefaultNodeConditionControllerTest, method noTerminationsOnDataStaleness.
/**
 * Expects the node condition controller to kill no tasks when its input data is stale.
 *
 * <p>The staleness threshold is stubbed at 8 ms, the node data staleness at 5 ms and the job data
 * staleness at 10 ms, so the job data is older than the threshold.
 * NOTE(review): this assumes the controller treats staleness above
 * {@code getDataStalenessThresholdMs()} as stale - confirm against the controller. With a
 * threshold above both staleness values the test would pass only because no nodes or jobs are
 * stubbed, without exercising any staleness handling.
 */
@Test
public void noTerminationsOnDataStaleness() {
    TitusRuntime titusRuntime = mock(TitusRuntime.class);
    when(titusRuntime.getRegistry()).thenReturn(new DefaultRegistry());
    RelocationConfiguration configuration = mock(RelocationConfiguration.class);
    when(configuration.getBadNodeConditionPattern()).thenReturn(".*Problem");
    when(configuration.isTaskTerminationOnBadNodeConditionEnabled()).thenReturn(true);
    // Threshold sits between the two stubbed staleness values (5 ms and 10 ms).
    when(configuration.getDataStalenessThresholdMs()).thenReturn(8L);
    NodeDataResolver nodeDataResolver = mock(NodeDataResolver.class);
    when(nodeDataResolver.getStalenessMs()).thenReturn(5L);
    JobDataReplicator jobDataReplicator = mock(JobDataReplicator.class);
    // Exceeds the 8 ms threshold.
    when(jobDataReplicator.getStalenessMs()).thenReturn(10L);
    ReadOnlyJobOperations readOnlyJobOperations = mock(ReadOnlyJobOperations.class);
    JobManagementClient jobManagementClient = mock(JobManagementClient.class);
    // Record every task id passed to killTask so the test can assert none were requested.
    Set<String> terminatedTaskIds = new HashSet<>();
    when(jobManagementClient.killTask(anyString(), anyBoolean(), any())).thenAnswer(invocation -> {
        String taskIdToBeTerminated = invocation.getArgument(0);
        terminatedTaskIds.add(taskIdToBeTerminated);
        return Mono.empty();
    });
    DefaultNodeConditionController nodeConditionCtrl = new DefaultNodeConditionController(configuration, nodeDataResolver, jobDataReplicator, readOnlyJobOperations, jobManagementClient, titusRuntime);
    ExecutionContext executionContext = ExecutionContext.newBuilder().withIteration(ExecutionId.initial()).build();
    StepVerifier.create(nodeConditionCtrl.handleNodesWithBadCondition(executionContext)).verifyComplete();
    // No tasks terminated
    assertThat(terminatedTaskIds).isEmpty();
}
Use of com.netflix.titus.common.framework.scheduler.ExecutionContext in the titus-control-plane project by Netflix.
From the class DefaultNodeConditionControllerTest, method badNodeConditionsIgnoredForJobsNotOptingIn.
/**
 * Runs one iteration of the node condition controller against mocked nodes and jobs created with
 * {@code getJobs(false)}, with task termination enabled and the bad-condition pattern
 * {@code ".*Failure"}, and expects that no task is killed.
 */
@Test
public void badNodeConditionsIgnoredForJobsNotOptingIn() {
    // Fixture data; per the original note, these jobs carry the
    // job attribute "terminateContainerOnBadAgent" = False.
    Map<String, TitusNode> nodes = buildNodes();
    List<Job<BatchJobExt>> batchJobs = getJobs(false);
    Map<String, List<Task>> tasksByJobId = buildTasksForJobAndNodeAssignment(new ArrayList<>(nodes.values()), batchJobs);

    // Data sources reporting zero job staleness and the fixture nodes.
    NodeDataResolver nodeResolver = mock(NodeDataResolver.class);
    when(nodeResolver.resolve()).thenReturn(nodes);
    JobDataReplicator jobReplicator = mock(JobDataReplicator.class);
    when(jobReplicator.getStalenessMs()).thenReturn(0L);

    // Job operations backed by the fixture jobs and tasks.
    ReadOnlyJobOperations jobOperations = mock(ReadOnlyJobOperations.class);
    when(jobOperations.getJobs()).thenReturn(new ArrayList<>(batchJobs));
    for (Map.Entry<String, List<Task>> entry : tasksByJobId.entrySet()) {
        when(jobOperations.getTasks(entry.getKey())).thenReturn(entry.getValue());
    }

    // Controller configuration and runtime.
    RelocationConfiguration relocationConfig = mock(RelocationConfiguration.class);
    when(relocationConfig.getBadNodeConditionPattern()).thenReturn(".*Failure");
    when(relocationConfig.isTaskTerminationOnBadNodeConditionEnabled()).thenReturn(true);
    TitusRuntime runtime = mock(TitusRuntime.class);
    when(runtime.getRegistry()).thenReturn(new DefaultRegistry());

    // Record every task id passed to killTask instead of killing anything.
    Set<String> killedTaskIds = new HashSet<>();
    JobManagementClient managementClient = mock(JobManagementClient.class);
    when(managementClient.killTask(anyString(), anyBoolean(), any())).thenAnswer(invocation -> {
        killedTaskIds.add(invocation.getArgument(0));
        return Mono.empty();
    });

    DefaultNodeConditionController controller = new DefaultNodeConditionController(
            relocationConfig, nodeResolver, jobReplicator, jobOperations, managementClient, runtime);
    ExecutionContext firstIteration = ExecutionContext.newBuilder().withIteration(ExecutionId.initial()).build();

    StepVerifier.create(controller.handleNodesWithBadCondition(firstIteration)).verifyComplete();

    // no tasks should be terminated for jobs
    assertThat(killedTaskIds).isEmpty();
}
Aggregations