Search in sources:

Example 16 with EntityHolder

use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.

the class RetryActionInterceptorTest method testRetry.

/**
 * Walks a change action built with {@code failingJob(2)} through the retry interceptor:
 * the first two attempts are expected to yield {@code RetryModelUpdateAction} updates,
 * and the third attempt is expected to yield a {@code RemoveRetryRecord} update.
 */
@Test
public void testRetry() throws Exception {
    TitusChangeAction failingAction = SampleTitusChangeActions.failingJob(2);

    // Attempt #1 fails; the check helper is handed INITIAL_DELAY_MS as the time to advance.
    ModelAction firstRetryUpdate = expectUpdateActionOfType(failingAction, RetryActionInterceptor.RetryModelUpdateAction.class);
    EntityHolder initialRoot = EntityHolder.newRoot("rootId", "data");
    EntityHolder rootAfterFirstFailure = expectAboveExecutionLimits(firstRetryUpdate, initialRoot);
    expectBelowExecutionLimitsWhenTimeAdvanced(rootAfterFirstFailure, INITIAL_DELAY_MS);

    // Attempt #2 fails as well; this time the helper is handed twice the initial delay.
    ModelAction secondRetryUpdate = expectUpdateActionOfType(failingAction, RetryActionInterceptor.RetryModelUpdateAction.class);
    EntityHolder rootAfterSecondFailure = expectAboveExecutionLimits(secondRetryUpdate, rootAfterFirstFailure);
    expectBelowExecutionLimitsWhenTimeAdvanced(rootAfterSecondFailure, INITIAL_DELAY_MS * 2);

    // Attempt #3 succeeds, so the resulting model update should carry no retry tag.
    ModelAction cleanupUpdate = expectUpdateActionOfType(failingAction, RetryActionInterceptor.RemoveRetryRecord.class);
    expectNoRetryTag(cleanupUpdate, rootAfterSecondFailure);
}
Also used : ModelAction(com.netflix.titus.common.framework.reconciler.ModelAction) TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) Test(org.junit.Test)

Example 17 with EntityHolder

use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.

the class DefaultV3JobOperations method enterActiveMode.

/**
 * Activation hook: creates the reconciliation framework, wires up event logging and
 * metrics, subscribes to the framework's event stream, and finally starts the framework.
 * <p>
 * Both subscriptions are stored in fields ({@code transactionLoggerSubscription},
 * {@code reconcilerEventSubscription}) and are created before
 * {@code reconciliationFramework.start()} is called.
 */
@Activator
public void enterActiveMode() {
    this.reconciliationFramework = jobReconciliationFrameworkFactory.newInstance();
    // BUG: event stream breaks permanently, and cannot be retried.
    // As we cannot fix the underlying issue yet, we have to be able to discover when it happens.
    // NOTE(review): eventStreamLastError is handed to JobTransactionLogger below; presumably it is
    // set to the wall-clock time of the last stream error - confirm against JobTransactionLogger.
    AtomicLong eventStreamLastError = new AtomicLong();
    Clock clock = titusRuntime.getClock();
    this.transactionLoggerSubscription = JobTransactionLogger.logEvents(reconciliationFramework, eventStreamLastError, clock);
    // Gauge reports 0 while the holder is <= 0, otherwise the difference between the current wall time and the stored value.
    PolledMeter.using(titusRuntime.getRegistry()).withName(METRIC_EVENT_STREAM_LAST_ERROR).monitorValue(eventStreamLastError, value -> value.get() <= 0 ? 0 : clock.wallTime() - value.get());
    // Remove finished jobs from the reconciliation framework.
    // The event stream is buffered up to OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE; on overflow a warning is logged
    // and the ON_OVERFLOW_ERROR strategy applies. On subscription, every holder currently in the
    // StatusCreationTime-ordered view is passed to handleJobCompletedEvent.
    Observable<JobManagerReconcilerEvent> reconciliationEventsObservable = reconciliationFramework.events().onBackpressureBuffer(OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE, () -> logger.warn("Overflowed the buffer size: " + OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE), BackpressureOverflow.ON_OVERFLOW_ERROR).doOnSubscribe(() -> {
        List<EntityHolder> entityHolders = reconciliationFramework.orderedView(IndexKind.StatusCreationTime);
        for (EntityHolder entityHolder : entityHolders) {
            handleJobCompletedEvent(entityHolder);
        }
    });
    // Subscribe through titusRuntime.persistentStream; only model update events are acted upon here,
    // all other event types are ignored by this subscriber.
    this.reconcilerEventSubscription = titusRuntime.persistentStream(reconciliationEventsObservable).subscribe(event -> {
        if (event instanceof JobModelUpdateReconcilerEvent) {
            JobModelUpdateReconcilerEvent jobUpdateEvent = (JobModelUpdateReconcilerEvent) event;
            handleJobCompletedEvent(jobUpdateEvent.getChangedEntityHolder());
        }
    }, e -> logger.error("Event stream terminated with an error", e), () -> logger.info("Event stream completed"));
    // Started last, after both subscriptions above are in place.
    reconciliationFramework.start();
}
Also used : Arrays(java.util.Arrays) JobCompatibility(com.netflix.titus.api.jobmanager.model.job.JobCompatibility) TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) Task(com.netflix.titus.api.jobmanager.model.job.Task) LoggerFactory(org.slf4j.LoggerFactory) BasicServiceJobActions(com.netflix.titus.master.jobmanager.service.service.action.BasicServiceJobActions) StringExt(com.netflix.titus.common.util.StringExt) ReactorExt(com.netflix.titus.common.util.rx.ReactorExt) JobStatus(com.netflix.titus.api.jobmanager.model.job.JobStatus) PreDestroy(javax.annotation.PreDestroy) FeatureActivationConfiguration(com.netflix.titus.api.FeatureActivationConfiguration) Map(java.util.Map) JobState(com.netflix.titus.api.jobmanager.model.job.JobState) BasicJobActions(com.netflix.titus.master.jobmanager.service.common.action.task.BasicJobActions) JobEntityHolders(com.netflix.titus.master.jobmanager.service.common.action.JobEntityHolders) JobStore(com.netflix.titus.api.jobmanager.store.JobStore) CallMetadata(com.netflix.titus.api.model.callmetadata.CallMetadata) FunctionExt.alwaysTrue(com.netflix.titus.common.util.FunctionExt.alwaysTrue) JobNewModelReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobModelReconcilerEvent.JobNewModelReconcilerEvent) ImmutableSet(com.google.common.collect.ImmutableSet) Job(com.netflix.titus.api.jobmanager.model.job.Job) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) TaskStatus(com.netflix.titus.api.jobmanager.model.job.TaskStatus) Set(java.util.Set) JobFunctions(com.netflix.titus.api.jobmanager.model.job.JobFunctions) UUID(java.util.UUID) JobManagerEvent(com.netflix.titus.api.jobmanager.model.job.event.JobManagerEvent) Collectors(java.util.stream.Collectors) TaskState(com.netflix.titus.api.jobmanager.model.job.TaskState) ProtobufExt(com.netflix.titus.common.util.ProtobufExt) List(java.util.List) 
JobModelUpdateReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobModelReconcilerEvent.JobModelUpdateReconcilerEvent) Stream(java.util.stream.Stream) TaskUpdateEvent(com.netflix.titus.api.jobmanager.model.job.event.TaskUpdateEvent) ReconciliationEngine(com.netflix.titus.common.framework.reconciler.ReconciliationEngine) DisruptionBudget(com.netflix.titus.api.jobmanager.model.job.disruptionbudget.DisruptionBudget) ProxyConfiguration(com.netflix.titus.common.util.guice.annotation.ProxyConfiguration) Optional(java.util.Optional) JobManagerReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobManagerReconcilerEvent) JobAttributes(com.netflix.titus.api.jobmanager.JobAttributes) ObservableExt(com.netflix.titus.common.util.rx.ObservableExt) Clock(com.netflix.titus.common.util.time.Clock) Subscription(rx.Subscription) KillInitiatedActions(com.netflix.titus.master.jobmanager.service.common.action.task.KillInitiatedActions) Completable(rx.Completable) JobManagerConstants(com.netflix.titus.api.jobmanager.service.JobManagerConstants) EntitySanitizer(com.netflix.titus.common.model.sanitizer.EntitySanitizer) ServiceJobProcesses(com.netflix.titus.api.jobmanager.model.job.ServiceJobProcesses) MoveTaskBetweenJobsAction(com.netflix.titus.master.jobmanager.service.service.action.MoveTaskBetweenJobsAction) ProxyType(com.netflix.titus.common.util.guice.ProxyType) MetricConstants(com.netflix.titus.master.MetricConstants) Singleton(javax.inject.Singleton) Function(java.util.function.Function) ArrayList(java.util.ArrayList) Observable(rx.Observable) Inject(javax.inject.Inject) CallMetadataUtils(com.netflix.titus.runtime.endpoint.metadata.CallMetadataUtils) Pair(com.netflix.titus.common.util.tuple.Pair) Model(com.netflix.titus.common.framework.reconciler.ModelActionHolder.Model) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) JobManagerException(com.netflix.titus.api.jobmanager.service.JobManagerException) Named(javax.inject.Named) 
BackpressureOverflow(rx.BackpressureOverflow) JobDescriptor(com.netflix.titus.api.jobmanager.model.job.JobDescriptor) JobCheckpointReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobCheckpointReconcilerEvent) Logger(org.slf4j.Logger) JobUpdateEvent(com.netflix.titus.api.jobmanager.model.job.event.JobUpdateEvent) ServiceJobExt(com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt) Mono(reactor.core.publisher.Mono) GrpcJobManagementModelConverters(com.netflix.titus.runtime.endpoint.v3.grpc.GrpcJobManagementModelConverters) ManagementSubsystemInitializer(com.netflix.titus.master.service.management.ManagementSubsystemInitializer) JOB_STRICT_SANITIZER(com.netflix.titus.api.jobmanager.model.job.sanitizer.JobSanitizerBuilder.JOB_STRICT_SANITIZER) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) Activator(com.netflix.titus.common.util.guice.annotation.Activator) AtomicLong(java.util.concurrent.atomic.AtomicLong) ModelActionHolder(com.netflix.titus.common.framework.reconciler.ModelActionHolder) V3JobOperations(com.netflix.titus.api.jobmanager.service.V3JobOperations) TaskAttributes(com.netflix.titus.api.jobmanager.TaskAttributes) CapacityAttributes(com.netflix.titus.api.jobmanager.model.job.CapacityAttributes) ReconciliationFramework(com.netflix.titus.common.framework.reconciler.ReconciliationFramework) BasicTaskActions(com.netflix.titus.master.jobmanager.service.common.action.task.BasicTaskActions) JobSubmitLimiter(com.netflix.titus.master.jobmanager.service.limiter.JobSubmitLimiter) PolledMeter(com.netflix.spectator.api.patterns.PolledMeter) TitusRuntime(com.netflix.titus.common.runtime.TitusRuntime) Evaluators(com.netflix.titus.common.util.Evaluators) Collections(java.util.Collections) AtomicLong(java.util.concurrent.atomic.AtomicLong) JobModelUpdateReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobModelReconcilerEvent.JobModelUpdateReconcilerEvent) 
EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) Clock(com.netflix.titus.common.util.time.Clock) JobManagerReconcilerEvent(com.netflix.titus.master.jobmanager.service.event.JobManagerReconcilerEvent) Activator(com.netflix.titus.common.util.guice.annotation.Activator)

Example 18 with EntityHolder

use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.

the class DefaultV3JobOperations method findJobs.

/**
 * Returns one page of jobs accepted by {@code queryPredicate}, scanning the
 * {@code IndexKind.StatusCreationTime} ordered view of the reconciliation framework.
 *
 * @param queryPredicate filter applied to each (job, tasks) pair
 * @param offset         number of matching jobs to skip before collecting; values {@code <= 0} skip nothing
 * @param limit          maximum number of jobs to return; values {@code <= 0} yield an empty list
 * @return the matching jobs in view order, at most {@code limit} of them
 */
@Override
public List<Job<?>> findJobs(Predicate<Pair<Job<?>, List<Task>>> queryPredicate, int offset, int limit) {
    if (limit <= 0) {
        return Collections.emptyList();
    }

    List<EntityHolder> orderedJobs = reconciliationFramework.orderedView(IndexKind.StatusCreationTime);
    List<Job<?>> page = new ArrayList<>();
    int skipped = 0;

    for (EntityHolder candidate : orderedJobs) {
        Pair<Job<?>, List<Task>> jobAndTasks = toJobTasksPair(candidate);
        if (!queryPredicate.test(jobAndTasks)) {
            continue;
        }
        // Matching entries before the requested offset are counted but not returned.
        if (skipped < offset) {
            skipped++;
            continue;
        }
        page.add(jobAndTasks.getLeft());
        // Stop scanning as soon as the page is full.
        if (page.size() >= limit) {
            break;
        }
    }
    return page;
}
Also used : ArrayList(java.util.ArrayList) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder) List(java.util.List) ArrayList(java.util.ArrayList) Job(com.netflix.titus.api.jobmanager.model.job.Job)

Example 19 with EntityHolder

use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.

the class BatchDifferenceResolver method apply.

/**
 * Computes the change actions for one reconciliation pass of a batch job.
 * <p>
 * Store-related actions are collected first, then runtime-related actions; both share a
 * single new-task budget. Only when neither produced any action is
 * {@code removeCompletedJob} consulted.
 *
 * @param engine reconciliation engine providing the reference, store and running views
 * @return the collected change actions, possibly empty
 */
@Override
public List<ChangeAction> apply(ReconciliationEngine<JobManagerReconcilerEvent> engine) {
    BatchJobView referenceJobView = new BatchJobView(engine.getReferenceView());
    EntityHolder storeRoot = engine.getStoreView();

    // Budget = configured limit minus tasks counted as active-but-not-started, floored at zero.
    int pendingStartCount = DifferenceResolverUtils.countActiveNotStartedTasks(referenceJobView.getJobHolder(), engine.getRunningView());
    int newTaskBudget = Math.max(0, configuration.getActiveNotStartedTasksLimit() - pendingStartCount);
    AtomicInteger remainingNewTasks = new AtomicInteger(newTaskBudget);

    List<ChangeAction> collected = new ArrayList<>();
    collected.addAll(applyStore(engine, referenceJobView, storeRoot, remainingNewTasks));
    collected.addAll(applyRuntime(engine, referenceJobView, engine.getRunningView(), storeRoot, remainingNewTasks));

    if (!collected.isEmpty()) {
        return collected;
    }
    // Nothing else to do in this pass: let removeCompletedJob contribute its actions, if any.
    collected.addAll(removeCompletedJob(referenceJobView, engine.getReferenceView(), storeRoot, jobStore, versionSupplier));
    return collected;
}
Also used : TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ArrayList(java.util.ArrayList) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder)

Example 20 with EntityHolder

use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.

the class BatchDifferenceResolver method applyStore.

/**
 * Builds the actions needed to bring the store model in line with the reference model,
 * and to retry tasks whose reference and store state already agree.
 * <p>
 * Returns an empty list immediately when {@code storeWriteRetryInterceptor.executionLimits(storeJob)}
 * is false (presumably a store-write retry is still backing off; confirm against the interceptor).
 *
 * @param engine          reconciliation engine passed through to the store actions
 * @param refJobView      reference view of the job; its holder and children drive the comparison
 * @param storeJob        store-side job holder to compare against
 * @param allowedNewTasks shared budget, decremented once for every task that qualifies for retry
 * @return the change actions to execute, possibly empty
 */
private List<ChangeAction> applyStore(ReconciliationEngine<JobManagerReconcilerEvent> engine, BatchJobView refJobView, EntityHolder storeJob, AtomicInteger allowedNewTasks) {
    if (!storeWriteRetryInterceptor.executionLimits(storeJob)) {
        return Collections.emptyList();
    }

    List<ChangeAction> pendingActions = new ArrayList<>();
    EntityHolder referenceJobHolder = refJobView.getJobHolder();
    Job<BatchJobExt> referenceJob = referenceJobHolder.getEntity();

    // Job-level entity differs between reference and store: schedule a job write.
    boolean jobOutOfSync = !referenceJobHolder.getEntity().equals(storeJob.getEntity());
    if (jobOutOfSync) {
        pendingActions.add(storeWriteRetryInterceptor.apply(BasicJobActions.updateJobInStore(engine, jobStore)));
    }

    boolean jobBeingKilled = referenceJob.getStatus().getState() == JobState.KillInitiated;
    for (EntityHolder refTaskHolder : referenceJobHolder.getChildren()) {
        Optional<EntityHolder> storeTaskHolder = storeJob.findById(refTaskHolder.getId());
        boolean storeUpToDate = storeTaskHolder.isPresent() && DifferenceResolverUtils.areEquivalent(storeTaskHolder.get(), refTaskHolder);
        boolean retryCandidate = !jobBeingKilled && DifferenceResolverUtils.shouldRetry(referenceJob, refTaskHolder.getEntity()) && allowedNewTasks.get() > 0;

        if (!storeUpToDate) {
            // Store is missing the task or holds a different version: write the reference task first.
            Task refTask = refTaskHolder.getEntity();
            CallMetadata writeMetadata = RECONCILER_CALLMETADATA.toBuilder().withCallReason("Writing runtime state changes to store").build();
            pendingActions.add(storeWriteRetryInterceptor.apply(BasicTaskActions.writeReferenceTaskToStore(jobStore, engine, refTask.getId(), writeMetadata, titusRuntime)));
        } else {
            BatchJobTask storeTask = storeTaskHolder.get().getEntity();
            if (retryCandidate && TaskRetryers.shouldRetryNow(refTaskHolder, clock)) {
                logger.info("Retrying task: oldTaskId={}, index={}", refTaskHolder.getId(), storeTask.getIndex());
                createNewTaskAction(refJobView, storeTask.getIndex(), Optional.of(refTaskHolder), Collections.emptyList(), Collections.emptyList()).ifPresent(pendingActions::add);
            }
        }

        // Both current and delayed retries are counted
        if (retryCandidate) {
            allowedNewTasks.decrementAndGet();
        }
    }
    return pendingActions;
}
Also used : Task(com.netflix.titus.api.jobmanager.model.job.Task) BatchJobTask(com.netflix.titus.api.jobmanager.model.job.BatchJobTask) TitusChangeAction(com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction) ChangeAction(com.netflix.titus.common.framework.reconciler.ChangeAction) CallMetadata(com.netflix.titus.api.model.callmetadata.CallMetadata) BatchJobExt(com.netflix.titus.api.jobmanager.model.job.ext.BatchJobExt) ArrayList(java.util.ArrayList) BatchJobTask(com.netflix.titus.api.jobmanager.model.job.BatchJobTask) EntityHolder(com.netflix.titus.common.framework.reconciler.EntityHolder)

Aggregations

EntityHolder (com.netflix.titus.common.framework.reconciler.EntityHolder)31 ArrayList (java.util.ArrayList)17 Task (com.netflix.titus.api.jobmanager.model.job.Task)12 ChangeAction (com.netflix.titus.common.framework.reconciler.ChangeAction)12 ModelActionHolder (com.netflix.titus.common.framework.reconciler.ModelActionHolder)12 TitusChangeAction (com.netflix.titus.master.jobmanager.service.common.action.TitusChangeAction)12 List (java.util.List)10 TaskState (com.netflix.titus.api.jobmanager.model.job.TaskState)7 TitusModelAction (com.netflix.titus.master.jobmanager.service.common.action.TitusModelAction)7 Job (com.netflix.titus.api.jobmanager.model.job.Job)6 TaskStatus (com.netflix.titus.api.jobmanager.model.job.TaskStatus)6 ServiceJobExt (com.netflix.titus.api.jobmanager.model.job.ext.ServiceJobExt)6 ReconciliationEngine (com.netflix.titus.common.framework.reconciler.ReconciliationEngine)6 JobManagerReconcilerEvent (com.netflix.titus.master.jobmanager.service.event.JobManagerReconcilerEvent)6 Test (org.junit.Test)6 JobFunctions (com.netflix.titus.api.jobmanager.model.job.JobFunctions)5 JobStore (com.netflix.titus.api.jobmanager.store.JobStore)5 CallMetadata (com.netflix.titus.api.model.callmetadata.CallMetadata)5 TitusRuntime (com.netflix.titus.common.runtime.TitusRuntime)5 Optional (java.util.Optional)5