Search in sources:

Example 16 with BatchJobExt

use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.

the class DifferenceResolverUtils method shouldRetry.

/**
 * Decides whether a task of the given job should be retried.
 *
 * <p>A retry is rejected outright unless the task is in the {@code Finished} state and the job is
 * in the {@code Accepted} state, and also when {@code hasReachedRetryLimit(job, task)} reports
 * that the limit was hit. Non-batch jobs that pass those checks are always retried. For batch
 * jobs, a task whose {@code KillInitiated} status carries the runtime-limit-exceeded reason is
 * retried only if the batch extension has {@code isRetryOnRuntimeLimit()} set; otherwise the task
 * is retried unless its final reason code is {@code TaskStatus.REASON_NORMAL}.
 *
 * @param job  the job owning the task
 * @param task the task to evaluate
 * @return {@code true} if the task should be retried, {@code false} otherwise
 */
public static boolean shouldRetry(Job<?> job, Task task) {
    TaskStatus finalStatus = task.getStatus();

    // Only finished tasks of a still-accepted job are candidates for a retry.
    if (finalStatus.getState() != TaskState.Finished) {
        return false;
    }
    if (job.getStatus().getState() != JobState.Accepted) {
        return false;
    }
    if (hasReachedRetryLimit(job, task)) {
        return false;
    }

    boolean batchJob = isBatch(job);
    if (!batchJob) {
        return true;
    }

    // Batch job: look at why (if at all) the task was moved to KillInitiated.
    boolean killedOnRuntimeLimit = TaskStatus.REASON_RUNTIME_LIMIT_EXCEEDED.equals(
            JobFunctions.findTaskStatus(task, TaskState.KillInitiated)
                    .map(ExecutableStatus::getReasonCode)
                    .orElse("N/A"));
    if (killedOnRuntimeLimit) {
        // The cast relies on the isBatch(job) check above.
        BatchJobExt batchExtensions = (BatchJobExt) job.getJobDescriptor().getExtensions();
        if (!batchExtensions.isRetryOnRuntimeLimit()) {
            return false;
        }
    }

    // A task that finished with the normal reason code is not retried.
    boolean finishedNormally = TaskStatus.REASON_NORMAL.equals(finalStatus.getReasonCode());
    return !finishedNormally;
}
Also used : BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus)

Example 17 with BatchJobExt

use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.

the class JobCriteriaQueryTest method testSearchByJobState.

/**
 * Schedules two batch jobs sharing the application name "testSearchByJobState", drives one of
 * them until a job update with state {@code KillInitiated} is observed, and then checks that
 * {@code findJobs} / {@code findTasks} filtered by the "jobState" criterion each return exactly
 * the one matching job or task.
 */
@Test(timeout = 30_000)
public void testSearchByJobState() throws Exception {
    // Both jobs are created from the same descriptor, so they share the application name used for filtering below.
    JobDescriptor<BatchJobExt> jobDescriptor = batchJobDescriptors().getValue().toBuilder().withApplicationName("testSearchByJobState").build();
    // First job: launched only.
    String acceptedJobId = jobsScenarioBuilder.scheduleAndReturnJob(jobDescriptor, jobScenarioBuilder -> jobScenarioBuilder.template(ScenarioTemplates.launchJob())).getId();
    // Second job: launched, then killed; the scenario waits for a job update event reporting KillInitiated.
    String killInitiatedJobId = jobsScenarioBuilder.scheduleAndReturnJob(jobDescriptor, jobScenarioBuilder -> jobScenarioBuilder.template(ScenarioTemplates.launchJob()).killJob().expectJobUpdateEvent(job -> job.getStatus().getState() == JobState.KillInitiated, "Expected state: " + JobState.KillInitiated)).getId();
    // Ids of the task at index 0 of each job, used as the expected results of the task queries.
    String acceptedTaskId = jobsScenarioBuilder.takeJob(acceptedJobId).getTaskByIndex(0).getTask().getId();
    String killInitiatedTaskId = jobsScenarioBuilder.takeJob(killInitiatedJobId).getTaskByIndex(0).getTask().getId();
    // Indexes are recomputed after events are sent, so if we run findJobs/findTasks immediately, they may use stale index.
    // NOTE(review): a fixed 10 ms pause is a timing assumption, not a synchronization point - confirm it is sufficient.
    Thread.sleep(10);
    // The two builders below are reused for every query; each query re-puts the "jobState" criterion on the same builder.
    // NOTE(review): presumably a later put for the same key replaces the earlier value - verify against the builder API.
    JobQuery.Builder jobQueryBuilder = JobQuery.newBuilder().putFilteringCriteria("applicationName", "testSearchByJobState").setPage(PAGE);
    TaskQuery.Builder taskQueryBuilder = TaskQuery.newBuilder().putFilteringCriteria("applicationName", "testSearchByJobState").setPage(PAGE);
    // Jobs (Accepted)
    JobQueryResult acceptedJobQueryResult = client.findJobs(jobQueryBuilder.putFilteringCriteria("jobState", "Accepted").build());
    assertThat(acceptedJobQueryResult.getItemsList()).hasSize(1);
    Job acceptedJobQueryResultItem = acceptedJobQueryResult.getItems(0);
    assertThat(acceptedJobQueryResultItem.getId()).isEqualTo(acceptedJobId);
    // Jobs (KillInitiated)
    JobQueryResult killInitJobQueryResult = client.findJobs(jobQueryBuilder.putFilteringCriteria("jobState", "KillInitiated").setPage(PAGE).build());
    assertThat(killInitJobQueryResult.getItemsList()).hasSize(1);
    Job killInitJobQueryResultItem = killInitJobQueryResult.getItems(0);
    assertThat(killInitJobQueryResultItem.getId()).isEqualTo(killInitiatedJobId);
    // Tasks (Accepted) - the filter is still on the job state; the expected task is the one owned by the accepted job.
    TaskQueryResult acceptedTaskQueryResult = client.findTasks(taskQueryBuilder.putFilteringCriteria("jobState", "Accepted").setPage(PAGE).build());
    assertThat(acceptedTaskQueryResult.getItemsList()).hasSize(1);
    assertThat(acceptedTaskQueryResult.getItems(0).getId()).isEqualTo(acceptedTaskId);
    // Tasks (KillInitiated) - the expected task is the one owned by the job being killed.
    TaskQueryResult killInitTaskQueryResult = client.findTasks(taskQueryBuilder.putFilteringCriteria("jobState", "KillInitiated").setPage(PAGE).build());
    assertThat(killInitTaskQueryResult.getItemsList()).hasSize(1);
    assertThat(killInitTaskQueryResult.getItems(0).getId()).isEqualTo(killInitiatedTaskId);
}
Also used : TaskScenarioBuilder(com.netflix.titus.master.integration.v3.scenario.TaskScenarioBuilder) CollectionsExt(com.netflix.titus.common.util.CollectionsExt) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) ScenarioTemplates(com.netflix.titus.master.integration.v3.scenario.ScenarioTemplates) EmbeddedTitusMasters(com.netflix.titus.testkit.embedded.cell.master.EmbeddedTitusMasters) JobQueryResult(com.netflix.titus.grpc.protogen.JobQueryResult) JobGroupInfo(com.netflix.titus.api.jobmanager.model.job.JobGroupInfo) JobState(com.netflix.titus.api.jobmanager.model.job.JobState) EmbeddedKubeClusters(com.netflix.titus.testkit.embedded.kube.EmbeddedKubeClusters) ClassRule(org.junit.ClassRule) JobDescriptorGenerator.oneTaskServiceJobDescriptor(com.netflix.titus.testkit.model.job.JobDescriptorGenerator.oneTaskServiceJobDescriptor) BaseIntegrationTest(com.netflix.titus.master.integration.BaseIntegrationTest) ImmutableMap(com.google.common.collect.ImmutableMap) PlatformSidecar(com.netflix.titus.api.jobmanager.model.job.PlatformSidecar) TaskQueryResult(com.netflix.titus.grpc.protogen.TaskQueryResult) Task(com.netflix.titus.grpc.protogen.Task) JobDescriptorGenerator.oneTaskBatchJobDescriptor(com.netflix.titus.testkit.model.job.JobDescriptorGenerator.oneTaskBatchJobDescriptor) Set(java.util.Set) JobFunctions(com.netflix.titus.api.jobmanager.model.job.JobFunctions) IntegrationTest(com.netflix.titus.testkit.junit.category.IntegrationTest) Category(org.junit.experimental.categories.Category) Collectors(java.util.stream.Collectors) Page(com.netflix.titus.grpc.protogen.Page) List(java.util.List) JobsScenarioBuilder(com.netflix.titus.master.integration.v3.scenario.JobsScenarioBuilder) CellAssertions.assertCellInfo(com.netflix.titus.master.integration.v3.job.CellAssertions.assertCellInfo) BeforeClass(org.junit.BeforeClass) Job(com.netflix.titus.grpc.protogen.Job) Image(com.netflix.titus.api.jobmanager.model.job.Image) Function(java.util.function.Function) 
ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) JobQuery(com.netflix.titus.grpc.protogen.JobQuery) Owner(com.netflix.titus.api.jobmanager.model.job.Owner) TaskStatus(com.netflix.titus.grpc.protogen.TaskStatus) BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) TaskQuery(com.netflix.titus.grpc.protogen.TaskQuery) JobDescriptor(com.netflix.titus.api.jobmanager.model.job.JobDescriptor) ServiceJobExt(com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt) JobManagementServiceGrpc(com.netflix.titus.grpc.protogen.JobManagementServiceGrpc) Test(org.junit.Test) Pagination(com.netflix.titus.grpc.protogen.Pagination) JobScenarioBuilder(com.netflix.titus.master.integration.v3.scenario.JobScenarioBuilder) RuleChain(org.junit.rules.RuleChain) TitusStackResource(com.netflix.titus.testkit.junit.master.TitusStackResource) JobDescriptorGenerator.batchJobDescriptors(com.netflix.titus.testkit.model.job.JobDescriptorGenerator.batchJobDescriptors) EmbeddedTitusMaster(com.netflix.titus.testkit.embedded.cell.master.EmbeddedTitusMaster) EmbeddedTitusCell(com.netflix.titus.testkit.embedded.cell.EmbeddedTitusCell) Collections(java.util.Collections) TaskQuery(com.netflix.titus.grpc.protogen.TaskQuery) BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) JobQuery(com.netflix.titus.grpc.protogen.JobQuery) Job(com.netflix.titus.grpc.protogen.Job) TaskQueryResult(com.netflix.titus.grpc.protogen.TaskQueryResult) JobQueryResult(com.netflix.titus.grpc.protogen.JobQueryResult) BaseIntegrationTest(com.netflix.titus.master.integration.BaseIntegrationTest) IntegrationTest(com.netflix.titus.testkit.junit.category.IntegrationTest) Test(org.junit.Test)

Example 18 with BatchJobExt

use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.

the class TitusQuotasManagerTest method isJobExemptFromSystemDisruptionWindow.

/**
 * Checks {@code TitusQuotasManager#isJobExemptFromSystemDisruptionWindow} for a batch job with
 * application name "app1Test" against two configurations: one whose exemption setting is
 * "app1.*" (expected to exempt the job) and one whose setting is "app2.*" (expected not to).
 */
@Test
public void isJobExemptFromSystemDisruptionWindow() {
    Job<BatchJobExt> batchJob = JobGenerator.oneBatchJob().but(withApplicationName("app1Test"));

    // Exemption setting "app1.*": the job is expected to be reported as exempt.
    EvictionConfiguration exemptingConfig = mock(EvictionConfiguration.class);
    when(exemptingConfig.getAppsExemptFromSystemDisruptionWindow()).thenReturn("app1.*");
    TitusQuotasManager exemptingManager = new TitusQuotasManager(null, null, null, null, exemptingConfig, null);
    assertThat(exemptingManager.isJobExemptFromSystemDisruptionWindow(batchJob)).isTrue();

    // Exemption setting "app2.*": the same job is expected not to be exempt.
    EvictionConfiguration nonExemptingConfig = mock(EvictionConfiguration.class);
    when(nonExemptingConfig.getAppsExemptFromSystemDisruptionWindow()).thenReturn("app2.*");
    TitusQuotasManager nonExemptingManager = new TitusQuotasManager(null, null, null, null, nonExemptingConfig, null);
    assertThat(nonExemptingManager.isJobExemptFromSystemDisruptionWindow(batchJob)).isFalse();
}
Also used : BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) EvictionConfiguration(com.netflix.titus.runtime.connector.eviction.EvictionConfiguration) Test(org.junit.Test)

Example 19 with BatchJobExt

use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.

the class TitusQuotasManagerTest method tryConsumeSystemAndJobQuota.

/**
 * Exercises {@code TitusQuotasManager#tryConsumeSystemAndJobQuota} with mocked system and job
 * quota controllers, for a batch job with application name "app1Test" and an exemption setting
 * of "app1.*". Four combinations are checked in sequence:
 * <ol>
 *   <li>system approves, job rejects: result is rejected with the job's rejection reason;</li>
 *   <li>system approves, job approves: result is approved;</li>
 *   <li>system rejects with the quota-limit-exceeded reason: result is rejected with that reason;</li>
 *   <li>system rejects with the outside-system-time-window reason: result is approved.</li>
 * </ol>
 */
@Test
public void tryConsumeSystemAndJobQuota() {
    String taskId = "job1Task1";
    String jobRejection = "Job does not allow any more terminations";
    Job<BatchJobExt> batchJob = JobGenerator.oneBatchJob().but(withApplicationName("app1Test"));

    EvictionConfiguration evictionConfig = mock(EvictionConfiguration.class);
    when(evictionConfig.getAppsExemptFromSystemDisruptionWindow()).thenReturn("app1.*");

    SystemQuotaController systemQuota = mock(SystemQuotaController.class);
    when(systemQuota.consume(taskId)).thenReturn(ConsumptionResult.approved());

    JobQuotaController rejectingJobQuota = mock(JobQuotaController.class);
    when(rejectingJobQuota.consume(taskId)).thenReturn(ConsumptionResult.rejected(jobRejection));

    TitusQuotasManager quotasManager = new TitusQuotasManager(null, null, null, systemQuota, evictionConfig, null);

    // 1. System quota approves, job quota rejects.
    ConsumptionResult rejectedByJob = quotasManager.tryConsumeSystemAndJobQuota(rejectingJobQuota, batchJob, taskId);
    assertThat(rejectedByJob.isApproved()).isFalse();
    assertThat(rejectedByJob.getRejectionReason()).isPresent();
    assertThat(rejectedByJob.getRejectionReason().get()).isEqualTo(jobRejection);

    // 2. Both system and job quota approve.
    JobQuotaController approvingJobQuota = mock(JobQuotaController.class);
    when(approvingJobQuota.consume(taskId)).thenReturn(ConsumptionResult.approved());
    assertThat(quotasManager.tryConsumeSystemAndJobQuota(approvingJobQuota, batchJob, taskId).isApproved()).isTrue();

    // 3. System quota rejects because the quota limit is exceeded.
    String limitExceeded = SystemQuotaConsumptionResults.QUOTA_LIMIT_EXCEEDED.getRejectionReason().get();
    when(systemQuota.consume(taskId)).thenReturn(ConsumptionResult.rejected(limitExceeded));
    ConsumptionResult rejectedBySystem = quotasManager.tryConsumeSystemAndJobQuota(approvingJobQuota, batchJob, taskId);
    assertThat(rejectedBySystem.isApproved()).isFalse();
    assertThat(rejectedBySystem.getRejectionReason()).isPresent();
    assertThat(rejectedBySystem.getRejectionReason().get()).isEqualTo(limitExceeded);

    // 4. System quota rejects as outside the system time window, yet the overall result is approved.
    // NOTE(review): presumably because the job matches the "app1.*" exemption setting - confirm in TitusQuotasManager.
    String outsideWindow = SystemQuotaConsumptionResults.OUTSIDE_SYSTEM_TIME_WINDOW.getRejectionReason().get();
    when(systemQuota.consume(taskId)).thenReturn(ConsumptionResult.rejected(outsideWindow));
    assertThat(quotasManager.tryConsumeSystemAndJobQuota(approvingJobQuota, batchJob, taskId).isApproved()).isTrue();
}
Also used : BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) EvictionConfiguration(com.netflix.titus.runtime.connector.eviction.EvictionConfiguration) JobQuotaController(com.netflix.titus.master.eviction.service.quota.job.JobQuotaController) SystemQuotaController(com.netflix.titus.master.eviction.service.quota.system.SystemQuotaController) Test(org.junit.Test)

Example 20 with BatchJobExt

use of com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt in project titus-control-plane by Netflix.

the class JobQuotaControllerTest method testMergeTaskRelocationLimitController.

/**
 * Verifies how a {@code TaskRelocationLimitController} behaves after the quota controllers are
 * merged with an updated job whose per-task relocation limit policy changes from 1 to 3.
 *
 * <p>With the initial policy the first consume for a task is approved and a second consume for
 * the same task id (after the task was killed with the Eviction trigger) is rejected. After the
 * merge, consumes for the first and second replacement tasks are approved and the consume for
 * the third replacement is rejected.
 */
@Test
public void testMergeTaskRelocationLimitController() {
    // First version
    // Batch job created via newBatchJob(10, ...) with perTaskRelocationLimitPolicy(1) and an unlimited rate.
    Job<BatchJobExt> job = newBatchJob(10, budget(perTaskRelocationLimitPolicy(1), unlimitedRate(), Collections.emptyList()));
    scheduleJob(job, 10);
    List<QuotaController<Job<?>>> controllers = buildQuotaControllers(job, jobOperations, SelfJobDisruptionBudgetResolver.getInstance(), titusRuntime);
    // The test expects the first controller built for this budget to be the relocation limit controller.
    TaskRelocationLimitController controller = (TaskRelocationLimitController) controllers.get(0);
    Task task = jobOperations.getTasks(job.getId()).get(0);
    // First relocation of the task is approved; after the eviction kill a second consume for the same id is rejected.
    assertThat(controller.consume(task.getId()).isApproved()).isTrue();
    jobComponentStub.killTask(task, false, false, V3JobOperations.Trigger.Eviction);
    assertThat(controller.consume(task.getId()).isApproved()).isFalse();
    // Locate the replacement by matching its original id against the killed task's id.
    // NOTE(review): findFirst() assumes at most one live task matches this original id at a time - confirm with the stub.
    Task replacement1 = jobComponentStub.getJobOperations().getTasks().stream().filter(t -> t.getOriginalId().equals(task.getId())).findFirst().get();
    jobComponentStub.moveTaskToState(replacement1, TaskState.Started);
    // Change job descriptor and consume some quota
    Job<BatchJobExt> updatedJob = jobComponentStub.changeJob(exceptPolicy(job, perTaskRelocationLimitPolicy(3)));
    // Merge the previously built controllers with the updated job (limit policy is now 3).
    List<QuotaController<Job<?>>> merged = mergeQuotaControllers(controllers, updatedJob, jobOperations, SelfJobDisruptionBudgetResolver.getInstance(), titusRuntime);
    TaskRelocationLimitController updatedController = (TaskRelocationLimitController) merged.get(0);
    // Evict replacement 1
    assertThat(updatedController.consume(replacement1.getId()).isApproved()).isTrue();
    jobComponentStub.killTask(replacement1, false, false, V3JobOperations.Trigger.Eviction);
    Task replacement2 = jobComponentStub.getJobOperations().getTasks().stream().filter(t -> t.getOriginalId().equals(task.getId())).findFirst().get();
    jobComponentStub.moveTaskToState(replacement2, TaskState.Started);
    // Evict replacement 2
    assertThat(updatedController.consume(replacement2.getId()).isApproved()).isTrue();
    jobComponentStub.killTask(replacement2, false, false, V3JobOperations.Trigger.Eviction);
    Task replacement3 = jobComponentStub.getJobOperations().getTasks().stream().filter(t -> t.getOriginalId().equals(task.getId())).findFirst().get();
    jobComponentStub.moveTaskToState(replacement3, TaskState.Started);
    // A further relocation for the third replacement is rejected.
    // NOTE(review): presumably the relocation consumed before the merge still counts toward the new limit of 3 - confirm.
    assertThat(updatedController.consume(replacement3.getId()).isApproved()).isFalse();
}
Also used : Task(com.netflix.titus.api.jobmanager.model.job.Task) QuotaController(com.netflix.titus.master.eviction.service.quota.QuotaController) BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) Test(org.junit.Test)

Aggregations

BatchJobExt (com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt)73 Test (org.junit.Test)55 Task (com.netflix.titus.api.jobmanager.model.job.Task)30 BatchJobTask (com.netflix.titus.api.jobmanager.model.job.BatchJobTask)25 List (java.util.List)20 ArrayList (java.util.ArrayList)19 JobStore (com.netflix.titus.api.jobmanager.store.JobStore)17 HashMap (java.util.HashMap)16 V1Affinity (io.kubernetes.client.openapi.models.V1Affinity)14 IntegrationNotParallelizableTest (com.netflix.titus.testkit.junit.category.IntegrationNotParallelizableTest)13 ServiceJobTask (com.netflix.titus.api.jobmanager.model.job.ServiceJobTask)11 V1Pod (io.kubernetes.client.openapi.models.V1Pod)11 Job (com.netflix.titus.api.jobmanager.model.job.Job)10 JobDescriptor (com.netflix.titus.api.jobmanager.model.job.JobDescriptor)10 Container (com.netflix.titus.api.jobmanager.model.job.Container)6 Map (java.util.Map)6 Assertions.assertThat (org.assertj.core.api.Assertions.assertThat)6 V1Container (io.kubernetes.client.openapi.models.V1Container)5 BasicContainer (com.netflix.titus.api.jobmanager.model.job.BasicContainer)4 Image (com.netflix.titus.api.jobmanager.model.job.Image)4