use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.
the class RetryActionInterceptorTest method testRetry.
@Test
public void testRetry() throws Exception {
TitusChangeAction changeAction = SampleTitusChangeActions.failingJob(2);
// First two calls should fail
ModelAction updateAction1 = expectUpdateActionOfType(changeAction, RetryActionInterceptor.RetryModelUpdateAction.class);
EntityHolder modelWithTag1 = expectAboveExecutionLimits(updateAction1, EntityHolder.newRoot("rootId", "data"));
expectBelowExecutionLimitsWhenTimeAdvanced(modelWithTag1, INITIAL_DELAY_MS);
ModelAction updateAction2 = expectUpdateActionOfType(changeAction, RetryActionInterceptor.RetryModelUpdateAction.class);
EntityHolder modelWithTag2 = expectAboveExecutionLimits(updateAction2, modelWithTag1);
expectBelowExecutionLimitsWhenTimeAdvanced(modelWithTag2, INITIAL_DELAY_MS * 2);
// Third call should succeed
ModelAction updateAction3 = expectUpdateActionOfType(changeAction, RetryActionInterceptor.RemoveRetryRecord.class);
expectNoRetryTag(updateAction3, modelWithTag2);
}
use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.
the class DefaultV3JobOperations method enterActiveMode.
@Activator
public void enterActiveMode() {
this.reconciliationFramework = jobReconciliationFrameworkFactory.newInstance();
// BUG: event stream breaks permanently, and cannot be retried.
// As we cannot fix the underlying issue yet, we have to be able to discover when it happens.
AtomicLong eventStreamLastError = new AtomicLong();
Clock clock = titusRuntime.getClock();
this.transactionLoggerSubscription = JobTransactionLogger.logEvents(reconciliationFramework, eventStreamLastError, clock);
PolledMeter.using(titusRuntime.getRegistry()).withName(METRIC_EVENT_STREAM_LAST_ERROR).monitorValue(eventStreamLastError, value -> value.get() <= 0 ? 0 : clock.wallTime() - value.get());
// Remove finished jobs from the reconciliation framework.
Observable<JobManagerReconcilerEvent> reconciliationEventsObservable = reconciliationFramework.events().onBackpressureBuffer(OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE, () -> logger.warn("Overflowed the buffer size: " + OBSERVE_JOBS_BACKPRESSURE_BUFFER_SIZE), BackpressureOverflow.ON_OVERFLOW_ERROR).doOnSubscribe(() -> {
List<EntityHolder> entityHolders = reconciliationFramework.orderedView(IndexKind.StatusCreationTime);
for (EntityHolder entityHolder : entityHolders) {
handleJobCompletedEvent(entityHolder);
}
});
this.reconcilerEventSubscription = titusRuntime.persistentStream(reconciliationEventsObservable).subscribe(event -> {
if (event instanceof JobModelUpdateReconcilerEvent) {
JobModelUpdateReconcilerEvent jobUpdateEvent = (JobModelUpdateReconcilerEvent) event;
handleJobCompletedEvent(jobUpdateEvent.getChangedEntityHolder());
}
}, e -> logger.error("Event stream terminated with an error", e), () -> logger.info("Event stream completed"));
reconciliationFramework.start();
}
use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.
the class DefaultV3JobOperations method findJobs.
@Override
public List<Job<?>> findJobs(Predicate<Pair<Job<?>, List<Task>>> queryPredicate, int offset, int limit) {
if (limit <= 0) {
return Collections.emptyList();
}
List<EntityHolder> jobHolders = reconciliationFramework.orderedView(IndexKind.StatusCreationTime);
List<Job<?>> result = new ArrayList<>();
int toDrop = offset;
int toTake = limit;
for (EntityHolder holder : jobHolders) {
Pair<Job<?>, List<Task>> jobTasksPair = toJobTasksPair(holder);
if (queryPredicate.test(jobTasksPair)) {
if (toDrop > 0) {
toDrop--;
} else {
result.add(jobTasksPair.getLeft());
toTake--;
if (toTake <= 0) {
break;
}
}
}
}
return result;
}
use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.
the class BatchDifferenceResolver method apply.
@Override
public List<ChangeAction> apply(ReconciliationEngine<JobManagerReconcilerEvent> engine) {
List<ChangeAction> actions = new ArrayList<>();
BatchJobView refJobView = new BatchJobView(engine.getReferenceView());
EntityHolder storeModel = engine.getStoreView();
int activeNotStartedTasks = DifferenceResolverUtils.countActiveNotStartedTasks(refJobView.getJobHolder(), engine.getRunningView());
AtomicInteger allowedNewTasks = new AtomicInteger(Math.max(0, configuration.getActiveNotStartedTasksLimit() - activeNotStartedTasks));
actions.addAll(applyStore(engine, refJobView, storeModel, allowedNewTasks));
actions.addAll(applyRuntime(engine, refJobView, engine.getRunningView(), storeModel, allowedNewTasks));
if (actions.isEmpty()) {
actions.addAll(removeCompletedJob(refJobView, engine.getReferenceView(), storeModel, jobStore, versionSupplier));
}
return actions;
}
use of com.netflix.titus.common.framework.reconciler.EntityHolder in project titus-control-plane by Netflix.
the class BatchDifferenceResolver method applyStore.
private List<ChangeAction> applyStore(ReconciliationEngine<JobManagerReconcilerEvent> engine, BatchJobView refJobView, EntityHolder storeJob, AtomicInteger allowedNewTasks) {
if (!storeWriteRetryInterceptor.executionLimits(storeJob)) {
return Collections.emptyList();
}
List<ChangeAction> actions = new ArrayList<>();
EntityHolder refJobHolder = refJobView.getJobHolder();
Job<BatchJobExt> refJob = refJobHolder.getEntity();
if (!refJobHolder.getEntity().equals(storeJob.getEntity())) {
actions.add(storeWriteRetryInterceptor.apply(BasicJobActions.updateJobInStore(engine, jobStore)));
}
boolean isJobTerminating = refJob.getStatus().getState() == JobState.KillInitiated;
for (EntityHolder referenceTask : refJobHolder.getChildren()) {
Optional<EntityHolder> storeHolder = storeJob.findById(referenceTask.getId());
boolean refAndStoreInSync = storeHolder.isPresent() && DifferenceResolverUtils.areEquivalent(storeHolder.get(), referenceTask);
boolean shouldRetry = !isJobTerminating && DifferenceResolverUtils.shouldRetry(refJob, referenceTask.getEntity()) && allowedNewTasks.get() > 0;
if (refAndStoreInSync) {
BatchJobTask storeTask = storeHolder.get().getEntity();
if (shouldRetry && TaskRetryers.shouldRetryNow(referenceTask, clock)) {
logger.info("Retrying task: oldTaskId={}, index={}", referenceTask.getId(), storeTask.getIndex());
createNewTaskAction(refJobView, storeTask.getIndex(), Optional.of(referenceTask), Collections.emptyList(), Collections.emptyList()).ifPresent(actions::add);
}
} else {
Task task = referenceTask.getEntity();
CallMetadata callMetadata = RECONCILER_CALLMETADATA.toBuilder().withCallReason("Writing runtime state changes to store").build();
actions.add(storeWriteRetryInterceptor.apply(BasicTaskActions.writeReferenceTaskToStore(jobStore, engine, task.getId(), callMetadata, titusRuntime)));
}
// Both current and delayed retries are counted
if (shouldRetry) {
allowedNewTasks.decrementAndGet();
}
}
return actions;
}
Aggregations