Example usage of com.hubspot.singularity.SingularityPendingRequest in the Singularity project by HubSpot.
From class SingularityMesosStatusUpdateHandler, method unsafeProcessStatusUpdate.
/**
 * Processes a single Mesos task status update for the given task.
 *
 * <p>In order, this method:
 * <ol>
 *   <li>logs the update and records the delta between now and the update's timestamp;</li>
 *   <li>handles "recovery" updates (as decided by {@code isRecoveryStatusUpdate}) by trying to
 *       reactivate the task, or by scheduling a cleanup / requesting a kill;</li>
 *   <li>short-circuits duplicate or ignorable updates;</li>
 *   <li>for {@code TASK_LOST}, optionally relaunches the task and records metrics;</li>
 *   <li>for states that are not done, enqueues health checks / new task checks;</li>
 *   <li>saves a task history update and, for done states, runs completion handling;</li>
 *   <li>saves the new task status holder.</li>
 * </ol>
 *
 * @param status the raw status update received from Mesos
 * @param taskIdObj the parsed task id the update refers to
 * @return {@code KILL_TASK} when a recovered task cannot be found or reactivated,
 *     {@code IGNORED} for duplicate/ignorable updates, otherwise {@code DONE}
 */
private StatusUpdateResult unsafeProcessStatusUpdate(Protos.TaskStatus status, SingularityTaskId taskIdObj) {
final String taskId = status.getTaskId().getValue();
// Default to the local clock; prefer the timestamp carried by the update when present.
long timestamp = System.currentTimeMillis();
if (status.hasTimestamp()) {
// Scaled by 1000 and truncated to long — presumably converting seconds to millis; confirm against the Mesos proto.
timestamp = (long) (status.getTimestamp() * 1000);
}
long now = System.currentTimeMillis();
// How far behind "now" the update's timestamp is; reported to the statusUpdateDeltas metric below.
long delta = now - timestamp;
LOG.debug("Update: task {} is now {} ({}) at {} (delta: {})", taskId, status.getState(), status.getMessage(), timestamp, JavaUtils.durationFromMillis(delta));
statusUpdateDeltas.update(delta);
final SingularityTaskStatusHolder newTaskStatusHolder = new SingularityTaskStatusHolder(taskIdObj, Optional.of(mesosProtosUtils.taskStatusFromProtos(status)), System.currentTimeMillis(), serverId, Optional.<String>empty());
final Optional<SingularityTaskStatusHolder> previousTaskStatusHolder = taskManager.getLastActiveTaskStatus(taskIdObj);
final ExtendedTaskState taskState = MesosUtils.fromTaskState(status.getState());
// A TASK_ERROR whose message contains RESOURCE_MISMATCH_ERR is logged with a synthetic exception so a stack trace is attached.
if (taskState == ExtendedTaskState.TASK_ERROR && status.getMessage() != null && status.getMessage().contains(RESOURCE_MISMATCH_ERR)) {
LOG.error("Possible duplicate resource allocation", new IllegalStateException(String.format("Duplicate resource allocation for %s: %s", taskId, status.getMessage())));
}
if (isRecoveryStatusUpdate(previousTaskStatusHolder, status.getReason(), taskState, newTaskStatusHolder)) {
LOG.info("Found recovery status update with reason {} for task {}", status.getReason(), taskId);
final Optional<SingularityTaskHistory> maybeTaskHistory = taskManager.getTaskHistory(taskIdObj);
// Without task history (or without any recorded update) there is nothing to recover: ask the caller to kill the task.
if (!maybeTaskHistory.isPresent() || !maybeTaskHistory.get().getLastTaskUpdate().isPresent()) {
LOG.warn("Task {} not found to recover, it may have already been persisted. Triggering a kill via mesos", taskIdObj);
return StatusUpdateResult.KILL_TASK;
} else if (status.getReason() == Reason.REASON_AGENT_REREGISTERED) {
// Agent re-registered: check whether a load balancer REMOVE was already recorded for this task.
Optional<SingularityLoadBalancerUpdate> maybeLbUpdate = taskManager.getLoadBalancerState(taskIdObj, LoadBalancerRequestType.REMOVE);
if (maybeLbUpdate.isPresent()) {
LOG.info("LB removal for recovered task {} was already started. Attempting to clear and start as new task", taskId);
boolean canRecoverLbState = true;
// The stored state says the removal is in progress; re-check the current state through lbClient.
if (maybeLbUpdate.get().getLoadBalancerState().isInProgress()) {
try {
if (lbClient.getState(maybeLbUpdate.get().getLoadBalancerRequestId()).getLoadBalancerState().isInProgress()) {
// We don't want to block here and wait for LB removal to finish in case it is stuck. Mark this task for cleaning
canRecoverLbState = false;
}
} catch (Exception e) {
// If the LB state cannot be verified, treat the task as not recoverable.
LOG.warn("Could not verify LB state for {}", taskId, e);
canRecoverLbState = false;
}
}
// Recover only when the LB state allows it, the task's deploy is still the active deploy, and reactivateTask succeeds.
if (canRecoverLbState && deployManager.getActiveDeployId(taskIdObj.getRequestId()).map(d -> d.equals(taskIdObj.getDeployId())).orElse(false) && taskManager.reactivateTask(taskIdObj, taskState, newTaskStatusHolder, Optional.ofNullable(status.getMessage()), status.hasReason() ? Optional.of(status.getReason().name()) : Optional.empty())) {
Optional<SingularityTask> maybeTask = taskManager.getTask(taskIdObj);
Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(taskIdObj.getRequestId());
// NOTE(review): if this check fails after reactivateTask already succeeded, control falls through to the
// generic recovery path below, which calls reactivateTask a second time — confirm this is intended.
if (maybeTask.isPresent() && maybeRequest.isPresent() && maybeRequest.get().getState().isRunnable()) {
LOG.info("Task {} can be recovered. Clearing LB state and enqueuing check as new task", taskId);
taskManager.clearLoadBalancerHistory(taskIdObj);
newTaskChecker.enqueueCheckWithDelay(maybeTask.get(), 0, healthchecker);
requestManager.addToPendingQueue(new SingularityPendingRequest(taskIdObj.getRequestId(), taskIdObj.getDeployId(), now, Optional.empty(), PendingType.TASK_RECOVERED, Optional.empty(), Optional.of(String.format("Agent %s recovered", status.getAgentId().getValue()))));
return StatusUpdateResult.DONE;
}
} else {
// Not recoverable: create a cleanup for the task and enqueue a TASK_RECOVERED pending request for its request.
LOG.info("Could not recover task {}, will clean up", taskId);
taskManager.createTaskCleanup(new SingularityTaskCleanup(Optional.empty(), TaskCleanupType.DECOMISSIONING, System.currentTimeMillis(), taskIdObj, Optional.of("Agent re-registered after load balancer removal started. Task cannot be reactivated."), Optional.empty(), Optional.empty()));
requestManager.addToPendingQueue(new SingularityPendingRequest(taskIdObj.getRequestId(), taskIdObj.getDeployId(), now, Optional.empty(), PendingType.TASK_RECOVERED, Optional.empty(), Optional.of(String.format("Agent %s recovered", status.getAgentId().getValue()))));
return StatusUpdateResult.DONE;
}
}
}
// Check tasks with no lb component or not yet removed from LB
boolean reactivated = deployManager.getActiveDeployId(taskIdObj.getRequestId()).map(d -> d.equals(taskIdObj.getDeployId())).orElse(false) && taskManager.reactivateTask(taskIdObj, taskState, newTaskStatusHolder, Optional.ofNullable(status.getMessage()), status.hasReason() ? Optional.of(status.getReason().name()) : Optional.empty());
// The pending request is enqueued whether or not reactivation succeeded.
requestManager.addToPendingQueue(new SingularityPendingRequest(taskIdObj.getRequestId(), taskIdObj.getDeployId(), now, Optional.empty(), PendingType.TASK_RECOVERED, Optional.empty(), Optional.of(String.format("Agent %s recovered", status.getAgentId().getValue()))));
if (reactivated) {
return StatusUpdateResult.DONE;
} else {
return StatusUpdateResult.KILL_TASK;
}
} else if (isDuplicateOrIgnorableStatusUpdate(previousTaskStatusHolder, newTaskStatusHolder)) {
// Duplicate/ignorable updates still persist the new status holder, but skip all further processing.
LOG.trace("Ignoring status update {} to {}", taskState, taskIdObj);
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
return StatusUpdateResult.IGNORED;
}
final Optional<SingularityTask> task = taskManager.getTask(taskIdObj);
if (status.getState() == TaskState.TASK_LOST) {
// Reasons this method treats as a Mesos-side failure rather than a failure of the task itself.
boolean isMesosFailure = status.getReason() == Reason.REASON_INVALID_OFFERS || status.getReason() == Reason.REASON_AGENT_REMOVED || status.getReason() == Reason.REASON_AGENT_RESTARTED || status.getReason() == Reason.REASON_AGENT_UNKNOWN || status.getReason() == Reason.REASON_MASTER_DISCONNECTED || status.getReason() == Reason.REASON_AGENT_DISCONNECTED;
RequestType requestType = task.isPresent() ? task.get().getTaskRequest().getRequest().getRequestType() : null;
// Only request types that are not long-running are relaunched here; a null requestType means task data is missing.
boolean isRelaunchable = requestType != null && !requestType.isLongRunning();
if (isMesosFailure && isRelaunchable) {
// NOTE(review): this logs the Optional wrapper `task`, not the unwrapped task.
LOG.info("Relaunching lost task {}", task);
// task.get() is safe: isRelaunchable is only true when task.isPresent() was true above.
relaunchTask(task.get());
}
lostTasksMeter.mark();
if (configuration.getDisasterDetection().isEnabled()) {
taskLostReasons.add(status.getReason());
}
}
if (!taskState.isDone()) {
if (task.isPresent()) {
final Optional<SingularityPendingDeploy> pendingDeploy = deployManager.getPendingDeploy(taskIdObj.getRequestId());
Optional<SingularityRequestWithState> requestWithState = Optional.empty();
// Health checks are enqueued only once the task reports TASK_RUNNING.
if (taskState == ExtendedTaskState.TASK_RUNNING) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
healthchecker.enqueueHealthcheck(task.get(), pendingDeploy, requestWithState);
}
// The new task check is skipped when the task belongs to the request's pending deploy.
if (!pendingDeploy.isPresent() || !pendingDeploy.get().getDeployMarker().getDeployId().equals(taskIdObj.getDeployId())) {
// Reuse the request fetched above when available; otherwise look it up now.
if (!requestWithState.isPresent()) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
}
newTaskChecker.enqueueNewTaskCheck(task.get(), requestWithState, healthchecker);
}
} else {
// An active task without stored task data is reported via the exception notifier and the error log.
final String message = String.format("Task %s is active but is missing task data", taskId);
exceptionNotifier.notify(message);
LOG.error(message);
}
}
final Optional<String> statusMessage = getStatusMessage(status, task);
// The history update is stamped with `timestamp` (the update's own time when present), not with `now`.
final SingularityTaskHistoryUpdate taskUpdate = new SingularityTaskHistoryUpdate(taskIdObj, timestamp, taskState, statusMessage, status.hasReason() ? Optional.of(status.getReason().name()) : Optional.<String>empty());
final SingularityCreateResult taskHistoryUpdateCreateResult = taskManager.saveTaskHistoryUpdate(taskUpdate);
logSupport.checkDirectoryAndContainerId(taskIdObj);
if (taskState.isDone()) {
// Done state: cancel outstanding checks, drop the killed record, then run completion handling.
healthchecker.cancelHealthcheck(taskId);
newTaskChecker.cancelNewTaskCheck(taskId);
taskManager.deleteKilledRecord(taskIdObj);
handleCompletedTaskState(status, taskIdObj, taskState, taskHistoryUpdateCreateResult, task, timestamp);
}
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
return StatusUpdateResult.DONE;
}
Example usage of com.hubspot.singularity.SingularityPendingRequest in the Singularity project by HubSpot.
From class SingularityMesosStatusUpdateHandler, method relaunchTask.
/**
 * Re-queues the given task as a {@code RETRY} pending request.
 *
 * <p>The new pending request copies the request id and deploy id from the task's
 * task request, and copies user, run id, command line args, healthcheck skipping,
 * message, resources, S3 uploader files, run-as-user override, env overrides,
 * extra artifacts, action id and run-at time from the task's original pending
 * task. Its timestamp is the current time.
 *
 * @param task the task to relaunch
 */
private void relaunchTask(SingularityTask task) {
  final SingularityPendingTask original = task.getTaskRequest().getPendingTask();
  requestManager.addToPendingQueue(
    new SingularityPendingRequestBuilder()
      .setRequestId(task.getTaskRequest().getRequest().getId())
      .setDeployId(task.getTaskRequest().getDeploy().getId())
      .setPendingType(PendingType.RETRY)
      .setUser(original.getUser())
      .setRunId(original.getRunId())
      .setCmdLineArgsList(original.getCmdLineArgsList())
      .setSkipHealthchecks(original.getSkipHealthchecks())
      .setMessage(original.getMessage())
      .setResources(original.getResources())
      .setS3UploaderAdditionalFiles(original.getS3UploaderAdditionalFiles())
      .setRunAsUserOverride(original.getRunAsUserOverride())
      .setEnvOverrides(original.getEnvOverrides())
      .setExtraArtifacts(original.getExtraArtifacts())
      .setActionId(original.getActionId())
      .setRunAt(original.getPendingTaskId().getNextRunAt())
      .setTimestamp(System.currentTimeMillis())
      .build()
  );
}
Example usage of com.hubspot.singularity.SingularityPendingRequest in the Singularity project by HubSpot.
From class DeployResource, method deploy.
/**
 * Validates a deploy request and registers it as the pending deploy of its request.
 *
 * <p>The method checks that deploys are enabled and that the user may write to the
 * request, validates the (optionally updated) request and the deploy, creates the
 * deploy and pending-deploy records under the request lock, and finally enqueues a
 * {@code NEW_DEPLOY} pending request when the request is deployable.
 *
 * @param deployRequest the deploy to submit, optionally carrying an updated request
 * @param user the user submitting the deploy
 * @return the request with its state, filled with the (possibly updated) request
 */
public SingularityRequestParent deploy(SingularityDeployRequest deployRequest, SingularityUser user) {
  validator.checkActionEnabled(SingularityAction.DEPLOY);

  SingularityDeploy deploy = deployRequest.getDeploy();
  checkNotNullBadRequest(deploy, "DeployRequest must have a deploy object");
  final Optional<String> deployUser = user.getEmail();
  final String requestId = checkNotNullBadRequest(deploy.getRequestId(), "DeployRequest must have a non-null requestId");

  final SingularityRequestWithState requestWithState = fetchRequestWithState(requestId, user);
  authorizationHelper.checkForAuthorization(requestWithState.getRequest(), user, SingularityAuthorizationScope.WRITE);
  SingularityRequest request = requestWithState.getRequest();
  authorizationHelper.checkForAuthorization(request, deploy, user, SingularityAuthorizationScope.WRITE);

  // When the deploy carries an updated request, authorize and validate it and use it from here on.
  Optional<SingularityRequest> updatedValidatedRequest = Optional.empty();
  if (deployRequest.getUpdatedRequest().isPresent()) {
    authorizationHelper.checkForAuthorizedChanges(deployRequest.getUpdatedRequest().get(), requestWithState.getRequest(), user);
    request = validator.checkSingularityRequest(deployRequest.getUpdatedRequest().get(), Optional.of(requestWithState.getRequest()), Optional.<SingularityDeploy>empty(), Optional.of(deploy));
    updatedValidatedRequest = Optional.of(request);
  }

  validator.checkScale(request, Optional.of(taskManager.getActiveTaskIdsForRequest(request.getId()).size()));

  // A paused request may only be deployed if the deploy unpauses it or configuration allows it.
  if (!(deployRequest.isUnpauseOnSuccessfulDeploy() || configuration.isAllowDeployOfPausedRequests())) {
    checkConflict(requestWithState.getState() != RequestState.PAUSED, "Request %s is paused. Unable to deploy (it must be manually unpaused first)", requestWithState.getRequest().getId());
  }

  deploy = validator.checkDeploy(request, deploy);

  final long now = System.currentTimeMillis();
  final SingularityDeployMarker deployMarker = new SingularityDeployMarker(requestId, deploy.getId(), now, deployUser, deployRequest.getMessage());

  final SingularityDeployProgress deployProgress;
  if (!request.isLongRunning()) {
    deployProgress = SingularityDeployProgress.forNonLongRunning();
  } else {
    // With canary deploys enabled, the first target is capped at the canary instance group size.
    final boolean canaryEnabled = deploy.getCanaryDeploySettings().isEnableCanaryDeploy();
    final int allInstances = request.getInstancesSafe();
    final int firstTargetInstances = canaryEnabled
      ? Math.min(deploy.getCanaryDeploySettings().getInstanceGroupSize(), allInstances)
      : allInstances;
    deployProgress = SingularityDeployProgress.forNewDeploy(firstTargetInstances, canaryEnabled);
  }

  final SingularityPendingDeploy pendingDeployObj = new SingularityPendingDeploy(deployMarker, DeployState.WAITING, deployProgress, updatedValidatedRequest);

  final boolean deployToUnpause = requestWithState.getState() == RequestState.PAUSED && deployRequest.isUnpauseOnSuccessfulDeploy();
  if (deployToUnpause) {
    requestManager.deployToUnpause(request, now, deployUser, deployRequest.getMessage());
  }

  final AtomicBoolean deployAlreadyInProgress = new AtomicBoolean(deployManager.pendingDeployInProgress(requestId));
  // Short circuit outside lock so we don't wait too long
  if (!deployAlreadyInProgress.get()) {
    // Effectively-final copies for use inside the lambda.
    final SingularityRequest requestToSave = request;
    final SingularityDeploy deployToSave = deploy;
    // This can cause a conflict if run outside the lock, causing the pending deploy to be checked before deploy data is saved
    schedulerLock.runWithRequestLock(() -> {
      deployManager.createDeployIfNotExists(requestToSave, deployMarker, deployToSave);
      final boolean pendingDeployExisted = deployManager.createPendingDeploy(pendingDeployObj) == SingularityCreateResult.EXISTED;
      deployAlreadyInProgress.set(pendingDeployExisted);
      if (!pendingDeployExisted) {
        deployManager.saveDeploy(requestToSave, deployMarker, deployToSave);
      }
    }, requestId, "submitNewDeploy");
  }

  // Undo the unpause performed above when another deploy turned out to be in progress.
  if (deployAlreadyInProgress.get() && deployToUnpause) {
    requestManager.pause(request, now, deployUser, Optional.empty());
  }

  checkConflict(!deployAlreadyInProgress.get(), "Pending deploy already in progress for %s - cancel it or wait for it to complete (%s)", requestId, deployManager.getPendingDeploy(requestId).orElse(null));

  deployManager.saveDeploy(request, deployMarker, deploy);

  // Skip enqueueing when the request is paused and deploys of paused requests are allowed.
  if (request.isDeployable() && (requestWithState.getState() != RequestState.PAUSED || !configuration.isAllowDeployOfPausedRequests())) {
    requestManager.addToPendingQueue(new SingularityPendingRequest(requestId, deployMarker.getDeployId(), now, deployUser, PendingType.NEW_DEPLOY, deployRequest.getDeploy().getSkipHealthchecksOnDeploy(), deployRequest.getMessage()));
  }

  return fillEntireRequest(requestWithState, Optional.of(request));
}
Example usage of com.hubspot.singularity.SingularityPendingRequest in the Singularity project by HubSpot.
From class RequestResource, method scheduleImmediately.
/**
 * Queues an immediate run of the given request.
 *
 * <p>The user must be authorized to write/exec on the request and the request must
 * not be paused. Active and pending task counts are only looked up when the
 * run-now validation needs them. A conflict is raised when the request is already
 * in the pending queue.
 *
 * @param user the user asking for the run
 * @param requestId id of the request to run
 * @param runNowRequest optional run-now settings; may be {@code null}
 * @param minimalReturn whether to build the minimal response instead of the full one
 * @return the request together with the pending request that was queued
 */
public SingularityPendingRequestParent scheduleImmediately(SingularityUser user, String requestId, SingularityRunNowRequest runNowRequest, boolean minimalReturn) {
  final SingularityRequestWithState requestWithState = fetchRequestWithState(requestId, user);
  authorizationHelper.checkForAuthorization(requestWithState.getRequest(), user, SingularityAuthorizationScope.WRITE, SingularityUserFacingAction.EXEC);
  checkConflict(requestWithState.getState() != RequestState.PAUSED, "Request %s is paused. Unable to run now (it must be manually unpaused first)", requestWithState.getRequest().getId());

  // Check these to avoid unnecessary calls to taskManager
  final boolean oneOffWithInstanceLimit = requestWithState.getRequest().isOneOff() && requestWithState.getRequest().getInstances().isPresent();
  final boolean needsActiveCount = requestWithState.getRequest().isScheduled() || oneOffWithInstanceLimit;
  final int activeTasks = needsActiveCount ? taskManager.getActiveTaskIdsForRequest(requestId).size() : 0;
  final int pendingTasks = oneOffWithInstanceLimit ? taskManager.getPendingTaskIdsForRequest(requestId).size() : 0;

  final SingularityPendingRequest pendingRequest = validator.checkRunNowRequest(
    getAndCheckDeployId(requestId),
    user.getEmail(),
    requestWithState.getRequest(),
    Optional.ofNullable(runNowRequest),
    activeTasks,
    pendingTasks
  );

  final SingularityCreateResult queueResult = requestManager.addToPendingQueue(pendingRequest);
  checkConflict(queueResult != SingularityCreateResult.EXISTED, "%s is already pending, please try again soon", requestId);

  return minimalReturn
    ? SingularityPendingRequestParent.minimalFromRequestWithState(requestWithState, pendingRequest)
    : SingularityPendingRequestParent.fromSingularityRequestParent(fillEntireRequest(requestWithState), pendingRequest);
}
Example usage of com.hubspot.singularity.SingularityPendingRequest in the Singularity project by HubSpot.
From class SingularityValidator, method checkRunNowRequest.
/**
 * Validates a run-now request and converts it into a pending request.
 *
 * <p>Scheduled requests yield an {@code IMMEDIATE} pending request and must have no
 * active tasks. One-off requests yield a {@code ONEOFF} pending request and, when
 * the request declares an instance count, active plus pending tasks must stay below
 * it. Any other request type is rejected as a bad request, as is a run-at time more
 * than {@code maxRunNowTaskLaunchDelay} days in the future.
 *
 * @param deployId deploy the run belongs to
 * @param userEmail email of the requesting user, if known
 * @param request the request to run
 * @param maybeRunNowRequest optional run-now settings supplied by the caller
 * @param activeTasks number of currently active tasks for the request
 * @param pendingTasks number of currently pending tasks for the request
 * @return the pending request to add to the pending queue
 */
public SingularityPendingRequest checkRunNowRequest(String deployId, Optional<String> userEmail, SingularityRequest request, Optional<SingularityRunNowRequest> maybeRunNowRequest, Integer activeTasks, Integer pendingTasks) {
  final SingularityRunNowRequest runNowRequest = fillRunNowRequest(maybeRunNowRequest);

  // Only scheduled and one-off requests can be run on demand.
  final boolean scheduled = request.isScheduled();
  if (!scheduled && !request.isOneOff()) {
    throw badRequest("Can not request an immediate run of a non-scheduled / always running request (%s)", request);
  }

  final PendingType pendingType;
  if (scheduled) {
    pendingType = PendingType.IMMEDIATE;
    checkConflict(activeTasks == 0, "Cannot request immediate run of a scheduled job which is currently running (%s)", activeTasks);
  } else {
    pendingType = PendingType.ONEOFF;
    if (request.getInstances().isPresent()) {
      checkRateLimited(activeTasks + pendingTasks < request.getInstances().get(), "No more than %s tasks allowed to run concurrently for request %s (%s active, %s pending)", request.getInstances().get(), request, activeTasks, pendingTasks);
    }
  }

  // Reject launches delayed further into the future than the configured maximum.
  if (runNowRequest.getRunAt().isPresent()) {
    final long latestAllowedRunAt = System.currentTimeMillis() + TimeUnit.DAYS.toMillis(maxRunNowTaskLaunchDelay);
    if (runNowRequest.getRunAt().get() > latestAllowedRunAt) {
      throw badRequest("Task launch delay can be at most %d days from now.", maxRunNowTaskLaunchDelay);
    }
  }

  return new SingularityPendingRequest(
    request.getId(),
    deployId,
    System.currentTimeMillis(),
    userEmail,
    pendingType,
    runNowRequest.getCommandLineArgs(),
    Optional.of(getRunId(runNowRequest.getRunId())),
    runNowRequest.getSkipHealthchecks(),
    runNowRequest.getMessage(),
    Optional.empty(),
    runNowRequest.getResources(),
    runNowRequest.getS3UploaderAdditionalFiles(),
    runNowRequest.getRunAsUserOverride(),
    runNowRequest.getEnvOverrides(),
    runNowRequest.getRequiredAgentAttributeOverrides(),
    runNowRequest.getAllowedAgentAttributeOverrides(),
    runNowRequest.getExtraArtifacts(),
    runNowRequest.getRunAt()
  );
}
Aggregations