use of com.netflix.titus.api.jobmanager.store.JobStore in project titus-control-plane by Netflix.
the class KillInitiatedActions method userInitiateTaskKillAction.
/**
 * Creates the change action for a user requested task kill. The task is moved to the
 * {@link TaskState#KillInitiated} state, written to the store, and the kill command is sent to the compute provider.
 * Model updates are produced only once both of these operations have completed.
 * As the kill is user initiated, the store write is performed before the response is sent back to the user.
 * <p>
 * If the task is already in {@link TaskState#KillInitiated} or {@link TaskState#Finished} state, the action
 * completes with an empty list of model updates. If {@code shrink} and {@code preventMinSizeUpdate} are both set,
 * and the desired job size is not above the minimum, the action fails with
 * {@link JobManagerException#terminateAndShrinkNotAllowed}.
 */
public static ChangeAction userInitiateTaskKillAction(ReconciliationEngine<JobManagerReconcilerEvent> engine, JobServiceRuntime executionContext, JobStore jobStore, VersionSupplier versionSupplier, String taskId, boolean shrink, boolean preventMinSizeUpdate, String reasonCode, String reason, TitusRuntime titusRuntime, CallMetadata callMetadata) {
    return TitusChangeAction.newAction("userInitiateTaskKill")
            .id(taskId)
            .trigger(V3JobOperations.Trigger.API)
            .summary(reason)
            .callMetadata(callMetadata)
            .changeWithModelUpdates(self -> JobEntityHolders.toTaskObservable(engine, taskId, titusRuntime).flatMap(task -> {
                // A task that is already being killed, or is finished, needs no further work.
                TaskState currentState = task.getStatus().getState();
                boolean killAlreadyRequested = currentState == TaskState.KillInitiated;
                boolean alreadyFinished = currentState == TaskState.Finished;
                if (killAlreadyRequested || alreadyFinished) {
                    return Observable.just(Collections.<ModelActionHolder>emptyList());
                }

                // Terminate-and-shrink must not push the job below its minimum size when the caller forbids it.
                if (shrink) {
                    Job<ServiceJobExt> serviceJob = engine.getReferenceView().getEntity();
                    Capacity jobCapacity = serviceJob.getJobDescriptor().getExtensions().getCapacity();
                    if (preventMinSizeUpdate) {
                        if (jobCapacity.getDesired() <= jobCapacity.getMin()) {
                            return Observable.<List<ModelActionHolder>>error(JobManagerException.terminateAndShrinkNotAllowed(serviceJob, task));
                        }
                    }
                }

                Task taskWithKillInitiated = VersionSuppliers.nextVersion(
                        JobFunctions.changeTaskStatus(task, TaskState.KillInitiated, reasonCode, reason, titusRuntime.getClock()),
                        versionSupplier
                );

                // Evaluated lazily, after the store update and the kill command; yields no updates if the task is gone by then.
                Callable<List<ModelActionHolder>> modelUpdateActions = () -> JobEntityHolders.expectTask(engine, task.getId(), titusRuntime)
                        .map(current -> {
                            TitusModelAction killInitiatedUpdate = TitusModelAction.newModelUpdate(self).taskUpdate(taskWithKillInitiated);
                            List<ModelActionHolder> holders = new ArrayList<>(ModelActionHolder.allModels(killInitiatedUpdate));
                            if (shrink) {
                                holders.add(ModelActionHolder.reference(createShrinkAction(self, versionSupplier)));
                            }
                            return holders;
                        })
                        .orElse(Collections.emptyList());

                return jobStore.updateTask(taskWithKillInitiated)
                        .andThen(createKillAction(executionContext, task))
                        .andThen(Observable.fromCallable(modelUpdateActions));
            }));
}
use of com.netflix.titus.api.jobmanager.store.JobStore in project titus-control-plane by Netflix.
the class DifferenceResolverUtils method findTaskStateTimeouts.
/**
 * Find all tasks that are stuck in a specific state. The number of {@link ChangeAction changes} created for
 * timed out tasks will be limited by the {@link TokenBucket stuckInStateRateLimiter}.
 * <p>
 * For each task of the running job view:
 * <ul>
 * <li>A batch job task in the {@link TaskState#Started} state is only checked against the job runtime limit. When
 * the limit is exceeded a kill action is added. This check does not consult the rate limiter.</li>
 * <li>For any other task the timeout status is evaluated. If no timeout is set, one is installed for the
 * {@code Launched}, {@code StartInitiated} and {@code KillInitiated} states, provided the configured value is positive.</li>
 * <li>A timed out task in the {@code KillInitiated} state is either killed again, or moved to the
 * {@code Finished} state in the running model once the configured number of kill attempts is reached.
 * A timed out task in any other state is killed.</li>
 * </ul>
 *
 * @return the change actions to apply; empty if no task requires attention
 */
public static List<ChangeAction> findTaskStateTimeouts(ReconciliationEngine<JobManagerReconcilerEvent> engine, JobView runningJobView, JobManagerConfiguration configuration, JobServiceRuntime runtime, JobStore jobStore, VersionSupplier versionSupplier, TokenBucket stuckInStateRateLimiter, TitusRuntime titusRuntime) {
    Clock clock = titusRuntime.getClock();
    List<ChangeAction> actions = new ArrayList<>();
    runningJobView.getJobHolder().getChildren().forEach(taskHolder -> {
        Task task = taskHolder.getEntity();
        TaskState taskState = task.getStatus().getState();

        // Running batch tasks are governed by the runtime limit only, not by the state timeouts below.
        if (JobFunctions.isBatchJob(runningJobView.getJob()) && taskState == TaskState.Started) {
            Job<BatchJobExt> batchJob = runningJobView.getJob();
            // We expect runtime limit to be always set, so this is just extra safety measure.
            long runtimeLimitMs = Math.max(BatchJobExt.RUNTIME_LIMIT_MIN, batchJob.getJobDescriptor().getExtensions().getRuntimeLimitMs());
            long deadline = task.getStatus().getTimestamp() + runtimeLimitMs;
            if (deadline < clock.wallTime()) {
                actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(
                        engine, task, runtime, jobStore, versionSupplier,
                        TaskStatus.REASON_RUNTIME_LIMIT_EXCEEDED,
                        "Task running too long (runtimeLimit=" + runtimeLimitMs + "ms)",
                        titusRuntime
                ));
            }
            return;
        }

        TaskTimeoutChangeActions.TimeoutStatus timeoutStatus = TaskTimeoutChangeActions.getTimeoutStatus(taskHolder, clock);
        switch (timeoutStatus) {
            case Ignore:
            case Pending:
                break;
            case NotSet:
                // -1 marks "no timeout applicable for this state".
                long timeoutMs = -1;
                switch (taskState) {
                    case Launched:
                        timeoutMs = configuration.getTaskInLaunchedStateTimeoutMs();
                        break;
                    case StartInitiated:
                        timeoutMs = isBatch(runningJobView.getJob())
                                ? configuration.getBatchTaskInStartInitiatedStateTimeoutMs()
                                : configuration.getServiceTaskInStartInitiatedStateTimeoutMs();
                        break;
                    case KillInitiated:
                        timeoutMs = configuration.getTaskInKillInitiatedStateTimeoutMs();
                        break;
                    default:
                        // No timeout is tracked for the remaining states.
                        break;
                }
                if (timeoutMs > 0) {
                    actions.add(TaskTimeoutChangeActions.setTimeout(taskHolder.getId(), taskState, timeoutMs, clock));
                }
                break;
            case TimedOut:
                // Each timed out task consumes one token; without a token the task is left for a later pass.
                if (!stuckInStateRateLimiter.tryTake()) {
                    break;
                }
                if (taskState == TaskState.KillInitiated) {
                    int attempts = TaskTimeoutChangeActions.getKillInitiatedAttempts(taskHolder) + 1;
                    if (attempts >= configuration.getTaskKillAttempts()) {
                        // Kill attempts exhausted: mark the task as finished in the running model.
                        actions.add(BasicTaskActions.updateTaskInRunningModel(
                                task.getId(),
                                V3JobOperations.Trigger.Reconciler,
                                configuration,
                                engine,
                                taskParam -> Optional.of(taskParam.toBuilder()
                                        .withStatus(taskParam.getStatus().toBuilder()
                                                .withState(TaskState.Finished)
                                                .withReasonCode(TaskStatus.REASON_STUCK_IN_KILLING_STATE)
                                                // Fixed: the separating space before "state" was missing.
                                                .withReasonMessage("stuck in " + taskState + " state")
                                                .build())
                                        .build()),
                                "TimedOut in KillInitiated state",
                                versionSupplier,
                                titusRuntime,
                                JobManagerConstants.RECONCILER_CALLMETADATA.toBuilder().withCallReason("Kill initiated").build()
                        ));
                    } else {
                        actions.add(TaskTimeoutChangeActions.incrementTaskKillAttempt(task.getId(), configuration.getTaskInKillInitiatedStateTimeoutMs(), clock));
                        actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(
                                engine, task, runtime, jobStore, versionSupplier,
                                TaskStatus.REASON_STUCK_IN_KILLING_STATE,
                                "Another kill attempt (" + (attempts + 1) + ')',
                                titusRuntime
                        ));
                    }
                } else {
                    actions.add(KillInitiatedActions.reconcilerInitiatedTaskKillInitiated(
                            engine, task, runtime, jobStore, versionSupplier,
                            TaskStatus.REASON_STUCK_IN_STATE,
                            "Task stuck in " + taskState + " state",
                            titusRuntime
                    ));
                }
                break;
        }
    });
    return actions;
}
use of com.netflix.titus.api.jobmanager.store.JobStore in project titus-control-plane by Netflix.
the class EmbeddedTitusMaster method boot.
/**
 * Boots the embedded Titus Master: creates the injector from the runtime, (optional) embedded Kube,
 * master, Jetty and Archaius modules, assigns a GRPC port if none is configured, emits the container started
 * event, activates leadership, and starts collecting audit log events. When REST is enabled, the call blocks
 * until the REST endpoint replies, the retries are exhausted, or the 30 second timeout elapses.
 *
 * @return this instance
 * @throws IllegalStateException (raised from the {@link JobStore} provider) if the embedded Cassandra job store
 *                               is requested but cannot be created
 */
public EmbeddedTitusMaster boot() {
    Stopwatch timer = Stopwatch.createStarted();
    logger.info("Starting Titus Master");

    Module embeddedKubeModule;
    if (embeddedKubeCluster == null) {
        // No embedded Kube cluster configured: contribute an empty module.
        embeddedKubeModule = new AbstractModule() {
            @Override
            protected void configure() {
            }
        };
    } else {
        embeddedKubeModule = new EmbeddedKubeModule(embeddedKubeCluster);
    }

    injector = InjectorBuilder.fromModules(
            Modules.override(new TitusRuntimeModule(false)).with(new AbstractModule() {
                @Override
                protected void configure() {
                    bind(Archaius2ConfigurationLogger.class).asEagerSingleton();
                    bind(Registry.class).toInstance(new DefaultRegistry());
                }
            }),
            embeddedKubeModule,
            Modules.override(new TitusMasterModule(enableREST, TitusMasterModule.Mode.EMBEDDED_KUBE)).with(new AbstractModule() {
                @Override
                protected void configure() {
                    bind(InstanceCloudConnector.class).toInstance(new NoOpInstanceCloudConnector());
                    bind(MasterDescription.class).toInstance(masterDescription);
                    bind(MasterMonitor.class).to(LocalMasterMonitor.class);
                    bind(AppScalePolicyStore.class).to(InMemoryPolicyStore.class);
                    bind(LoadBalancerStore.class).to(InMemoryLoadBalancerStore.class);
                    bind(LoadBalancerConnector.class).to(NoOpLoadBalancerConnector.class);
                    bind(LoadBalancerJobValidator.class).to(NoOpLoadBalancerJobValidator.class);
                }

                @Provides
                @Singleton
                public JobStore getJobStore(TitusRuntime titusRuntime) {
                    if (!cassandraJobStore) {
                        return jobStore;
                    }
                    try {
                        return EmbeddedCassandraStoreFactory.newBuilder().withTitusRuntime(titusRuntime).build().getJobStore();
                    } catch (Exception e) {
                        // Returning null from a provider only produces a generic provisioning failure later on;
                        // fail here instead, keeping the original cause attached.
                        throw new IllegalStateException("Cannot create the embedded Cassandra job store", e);
                    }
                }
            }),
            newJettyModule(),
            new ArchaiusModule() {
                @Override
                protected void configureArchaius() {
                    bindApplicationConfigurationOverride().toInstance(config);
                }
            }
    ).createInjector();

    if (grpcPort <= 0) {
        grpcPort = getGrpcPort();
        config.setProperty("titus.master.grpcServer.port", "" + grpcPort);
    }

    injector.getInstance(ContainerEventBus.class).submitInOrder(new ContainerEventBus.ContainerStartedEvent());
    injector.getInstance(LeaderActivator.class).becomeLeader();
    injector.getInstance(AuditLogService.class).auditLogEvents().subscribe(auditLogs::add);

    if (enableREST) {
        // Since jetty API server is run on a separate thread, it may not be ready yet
        // We do not have better way, but call it until it replies.
        // Up to 5 retries, waiting 1, 2, ... 5 seconds before each consecutive retry.
        getClient().findAllApplicationSLA()
                .retryWhen(attempts -> attempts
                        .zipWith(Observable.range(1, 5), (n, i) -> i)
                        .flatMap(i -> Observable.timer(i, TimeUnit.SECONDS)))
                .timeout(30, TimeUnit.SECONDS)
                .toBlocking()
                .firstOrDefault(null);
    }

    logger.info("Embedded TitusMaster started in {}ms", timer.elapsed(TimeUnit.MILLISECONDS));
    return this;
}
use of com.netflix.titus.api.jobmanager.store.JobStore in project titus-control-plane by Netflix.
the class CassandraJobStoreTest method testUpdateTask.
/**
 * Stores a batch job with one task, replaces the task with a copy in the {@code Finished} state,
 * and checks that the store returns the updated task.
 */
@Test
public void testUpdateTask() {
    JobStore jobStore = getJobStore();
    Job<BatchJobExt> batchJob = createBatchJobObject();
    jobStore.init().await();

    // Persist the job and make sure it can be read back.
    jobStore.storeJob(batchJob).await();
    Pair<List<Job<?>>, Integer> loadedJobs = jobStore.retrieveJobs().toBlocking().first();
    Job<?> firstLoadedJob = loadedJobs.getLeft().get(0);
    checkRetrievedJob(batchJob, firstLoadedJob);

    // Persist the task and make sure it can be read back.
    Task originalTask = createTaskObject(batchJob);
    jobStore.storeTask(originalTask).await();
    checkRetrievedTask(originalTask, jobStore.retrieveTask(originalTask.getId()).toBlocking().first());

    // Update the task and verify that the new version is returned.
    BatchJobTask finishedTask = BatchJobTask.newBuilder((BatchJobTask) originalTask)
            .withStatus(TaskStatus.newBuilder().withState(TaskState.Finished).build())
            .build();
    jobStore.updateTask(finishedTask).await();
    Task reloadedTask = jobStore.retrieveTask(finishedTask.getId()).toBlocking().first();
    checkRetrievedTask(finishedTask, reloadedTask);
}
use of com.netflix.titus.api.jobmanager.store.JobStore in project titus-control-plane by Netflix.
the class CassandraJobStoreTest method testUpdateJob.
/**
 * Stores a batch job, replaces it with a copy in the {@code Finished} state,
 * and checks that the store returns the updated job.
 */
@Test
public void testUpdateJob() {
    JobStore jobStore = getJobStore();
    Job<BatchJobExt> originalJob = createBatchJobObject();
    jobStore.init().await();

    // Persist the job and make sure it can be read back.
    jobStore.storeJob(originalJob).await();
    Pair<List<Job<?>>, Integer> loadedBeforeUpdate = jobStore.retrieveJobs().toBlocking().first();
    Job<?> firstBeforeUpdate = loadedBeforeUpdate.getLeft().get(0);
    checkRetrievedJob(originalJob, firstBeforeUpdate);

    // Update the job and verify that the new version is returned.
    Job<BatchJobExt> finishedJob = originalJob.toBuilder()
            .withStatus(JobStatus.newBuilder().withState(JobState.Finished).build())
            .build();
    jobStore.updateJob(finishedJob).await();
    Pair<List<Job<?>>, Integer> loadedAfterUpdate = jobStore.retrieveJobs().toBlocking().first();
    Job<?> firstAfterUpdate = loadedAfterUpdate.getLeft().get(0);
    checkRetrievedJob(finishedJob, firstAfterUpdate);
}
Aggregations