Use of com.hubspot.singularity.SingularityTaskStatusHolder in project Singularity by HubSpot.
From the class LastTaskStatusMigration, method applyMigration.
/**
 * Saves a last-active-task-status record for every task id the task manager reports as active.
 *
 * <p>For each active task id the history updates are walked in the reverse of the order returned
 * by the task manager, and the first update whose state maps to a Mesos task state is converted
 * into a status object. When no update maps to a Mesos state, the holder is saved with an empty
 * status.
 *
 * <p>The agent id is attached to the generated status only when task data exists for the id. An
 * active id without task data therefore no longer aborts the migration with a
 * {@code NoSuchElementException}; its status is saved without an agent id.
 */
@Override
public void applyMigration() {
  final long start = System.currentTimeMillis();
  final List<SingularityTaskId> taskIds = taskManager.getActiveTaskIds();

  for (SingularityTaskId taskId : taskIds) {
    List<SingularityTaskHistoryUpdate> updates = Lists.reverse(taskManager.getTaskHistoryUpdates(taskId));
    Optional<MesosTaskStatusObject> taskStatus = Optional.empty();

    for (SingularityTaskHistoryUpdate update : updates) {
      if (update.getTaskState().toTaskState().isPresent()) {
        Optional<SingularityTask> task = taskManager.getTask(taskId);
        TaskStatus.Builder statusBuilder = TaskStatus
          .newBuilder()
          .setTaskId(TaskID.newBuilder().setValue(taskId.getId()))
          .setState(MesosProtosUtils.toTaskState(update.getTaskState()));
        // Task data can be missing for an active id; skip the agent id rather than fail on get().
        if (task.isPresent()) {
          statusBuilder.setAgentId(MesosProtosUtils.toAgentId(task.get().getAgentId()));
        }
        taskStatus = Optional.of(mesosProtosUtils.taskStatusFromProtos(statusBuilder.build()));
        // Only the first matching update in the reversed list is used.
        break;
      }
    }

    SingularityTaskStatusHolder taskStatusHolder = new SingularityTaskStatusHolder(taskId, taskStatus, start, serverId, Optional.empty());
    taskManager.saveLastActiveTaskStatus(taskStatusHolder);
  }
}
Use of com.hubspot.singularity.SingularityTaskStatusHolder in project Singularity by HubSpot.
From the class SingularityMesosStatusUpdateHandler, method unsafeProcessStatusUpdate.
/**
 * Processes one Mesos task status update for the given task id.
 *
 * <p>The update goes through these stages, in order:
 * <ol>
 *   <li>Record the delay between the status timestamp and now in {@code statusUpdateDeltas}.</li>
 *   <li>If {@code isRecoveryStatusUpdate} accepts the update, try to reactivate the task and
 *       return without running the remaining stages.</li>
 *   <li>If {@code isDuplicateOrIgnorableStatusUpdate} accepts the update, save the new status
 *       holder and return {@code IGNORED}.</li>
 *   <li>For {@code TASK_LOST}, optionally relaunch the task and update lost-task metrics.</li>
 *   <li>For states that are not done, enqueue health checks / new task checks.</li>
 *   <li>Save a task history update; for done states cancel checks and run
 *       {@code handleCompletedTaskState}.</li>
 *   <li>Save the new status holder.</li>
 * </ol>
 *
 * @param status the raw Mesos status update
 * @param taskIdObj the parsed task id that {@code status} refers to
 * @return {@code KILL_TASK} when a recovery update cannot be reactivated or has no history,
 *         {@code IGNORED} for duplicate/ignorable updates, otherwise {@code DONE}
 */
private StatusUpdateResult unsafeProcessStatusUpdate(Protos.TaskStatus status, SingularityTaskId taskIdObj) {
final String taskId = status.getTaskId().getValue();
// Prefer the timestamp carried by the status, scaled by 1000 so it is comparable with
// System.currentTimeMillis(); fall back to the current time when the status has none.
long timestamp = System.currentTimeMillis();
if (status.hasTimestamp()) {
timestamp = (long) (status.getTimestamp() * 1000);
}
long now = System.currentTimeMillis();
long delta = now - timestamp;
LOG.debug("Update: task {} is now {} ({}) at {} (delta: {})", taskId, status.getState(), status.getMessage(), timestamp, JavaUtils.durationFromMillis(delta));
statusUpdateDeltas.update(delta);
// The new holder is built up front; the previously stored holder is read for the recovery and duplicate checks below.
final SingularityTaskStatusHolder newTaskStatusHolder = new SingularityTaskStatusHolder(taskIdObj, Optional.of(mesosProtosUtils.taskStatusFromProtos(status)), System.currentTimeMillis(), serverId, Optional.<String>empty());
final Optional<SingularityTaskStatusHolder> previousTaskStatusHolder = taskManager.getLastActiveTaskStatus(taskIdObj);
final ExtendedTaskState taskState = MesosUtils.fromTaskState(status.getState());
// A TASK_ERROR whose message contains RESOURCE_MISMATCH_ERR is only logged (with a stack trace); processing continues.
if (taskState == ExtendedTaskState.TASK_ERROR && status.getMessage() != null && status.getMessage().contains(RESOURCE_MISMATCH_ERR)) {
LOG.error("Possible duplicate resource allocation", new IllegalStateException(String.format("Duplicate resource allocation for %s: %s", taskId, status.getMessage())));
}
// Recovery path: every branch inside this block returns, so none of the later stages run for recovery updates.
if (isRecoveryStatusUpdate(previousTaskStatusHolder, status.getReason(), taskState, newTaskStatusHolder)) {
LOG.info("Found recovery status update with reason {} for task {}", status.getReason(), taskId);
final Optional<SingularityTaskHistory> maybeTaskHistory = taskManager.getTaskHistory(taskIdObj);
// Without task history (or without a last update in it) there is nothing to reactivate, so a kill is requested.
if (!maybeTaskHistory.isPresent() || !maybeTaskHistory.get().getLastTaskUpdate().isPresent()) {
LOG.warn("Task {} not found to recover, it may have already been persisted. Triggering a kill via mesos", taskIdObj);
return StatusUpdateResult.KILL_TASK;
} else if (status.getReason() == Reason.REASON_AGENT_REREGISTERED) {
// Special handling applies only when a load balancer REMOVE update is already recorded for this task.
Optional<SingularityLoadBalancerUpdate> maybeLbUpdate = taskManager.getLoadBalancerState(taskIdObj, LoadBalancerRequestType.REMOVE);
if (maybeLbUpdate.isPresent()) {
LOG.info("LB removal for recovered task {} was already started. Attempting to clear and start as new task", taskId);
boolean canRecoverLbState = true;
// When the stored removal is marked in progress, ask the LB client for its current state; an in-progress
// answer or any exception from the client disables recovery.
if (maybeLbUpdate.get().getLoadBalancerState().isInProgress()) {
try {
if (lbClient.getState(maybeLbUpdate.get().getLoadBalancerRequestId()).getLoadBalancerState().isInProgress()) {
// We don't want to block here and wait for LB removal to finish in case it is stuck. Mark this task for cleaning
canRecoverLbState = false;
}
} catch (Exception e) {
LOG.warn("Could not verify LB state for {}", taskId, e);
canRecoverLbState = false;
}
}
// Reactivate only when the LB state is recoverable and the task's deploy id equals the request's active deploy id.
// The && chain short-circuits, so reactivateTask is not called unless both earlier conditions hold.
if (canRecoverLbState && deployManager.getActiveDeployId(taskIdObj.getRequestId()).map(d -> d.equals(taskIdObj.getDeployId())).orElse(false) && taskManager.reactivateTask(taskIdObj, taskState, newTaskStatusHolder, Optional.ofNullable(status.getMessage()), status.hasReason() ? Optional.of(status.getReason().name()) : Optional.empty())) {
Optional<SingularityTask> maybeTask = taskManager.getTask(taskIdObj);
Optional<SingularityRequestWithState> maybeRequest = requestManager.getRequest(taskIdObj.getRequestId());
// NOTE(review): when reactivateTask succeeded but the condition below is false, nothing is returned here and
// control falls through to the generic reactivation further down, which calls reactivateTask a second time.
// Confirm this is intended.
if (maybeTask.isPresent() && maybeRequest.isPresent() && maybeRequest.get().getState().isRunnable()) {
LOG.info("Task {} can be recovered. Clearing LB state and enqueuing check as new task", taskId);
taskManager.clearLoadBalancerHistory(taskIdObj);
newTaskChecker.enqueueCheckWithDelay(maybeTask.get(), 0, healthchecker);
requestManager.addToPendingQueue(new SingularityPendingRequest(taskIdObj.getRequestId(), taskIdObj.getDeployId(), now, Optional.empty(), PendingType.TASK_RECOVERED, Optional.empty(), Optional.of(String.format("Agent %s recovered", status.getAgentId().getValue()))));
return StatusUpdateResult.DONE;
}
} else {
// Not recoverable: create a cleanup for the task and enqueue a pending request for its request/deploy.
LOG.info("Could not recover task {}, will clean up", taskId);
taskManager.createTaskCleanup(new SingularityTaskCleanup(Optional.empty(), TaskCleanupType.DECOMISSIONING, System.currentTimeMillis(), taskIdObj, Optional.of("Agent re-registered after load balancer removal started. Task cannot be reactivated."), Optional.empty(), Optional.empty()));
requestManager.addToPendingQueue(new SingularityPendingRequest(taskIdObj.getRequestId(), taskIdObj.getDeployId(), now, Optional.empty(), PendingType.TASK_RECOVERED, Optional.empty(), Optional.of(String.format("Agent %s recovered", status.getAgentId().getValue()))));
return StatusUpdateResult.DONE;
}
}
}
// Check tasks with no lb component or not yet removed from LB
boolean reactivated = deployManager.getActiveDeployId(taskIdObj.getRequestId()).map(d -> d.equals(taskIdObj.getDeployId())).orElse(false) && taskManager.reactivateTask(taskIdObj, taskState, newTaskStatusHolder, Optional.ofNullable(status.getMessage()), status.hasReason() ? Optional.of(status.getReason().name()) : Optional.empty());
// The pending request is enqueued whether or not reactivation succeeded.
requestManager.addToPendingQueue(new SingularityPendingRequest(taskIdObj.getRequestId(), taskIdObj.getDeployId(), now, Optional.empty(), PendingType.TASK_RECOVERED, Optional.empty(), Optional.of(String.format("Agent %s recovered", status.getAgentId().getValue()))));
if (reactivated) {
return StatusUpdateResult.DONE;
} else {
return StatusUpdateResult.KILL_TASK;
}
} else if (isDuplicateOrIgnorableStatusUpdate(previousTaskStatusHolder, newTaskStatusHolder)) {
// Duplicate/ignorable updates still overwrite the stored status holder, but write no history update.
LOG.trace("Ignoring status update {} to {}", taskState, taskIdObj);
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
return StatusUpdateResult.IGNORED;
}
final Optional<SingularityTask> task = taskManager.getTask(taskIdObj);
// Lost tasks: relaunch when the reason is one of the listed Mesos-side reasons and the request type is not long running.
if (status.getState() == TaskState.TASK_LOST) {
boolean isMesosFailure = status.getReason() == Reason.REASON_INVALID_OFFERS || status.getReason() == Reason.REASON_AGENT_REMOVED || status.getReason() == Reason.REASON_AGENT_RESTARTED || status.getReason() == Reason.REASON_AGENT_UNKNOWN || status.getReason() == Reason.REASON_MASTER_DISCONNECTED || status.getReason() == Reason.REASON_AGENT_DISCONNECTED;
// requestType stays null when task data is missing, which makes isRelaunchable false and guards task.get() below.
RequestType requestType = task.isPresent() ? task.get().getTaskRequest().getRequest().getRequestType() : null;
boolean isRelaunchable = requestType != null && !requestType.isLongRunning();
if (isMesosFailure && isRelaunchable) {
LOG.info("Relaunching lost task {}", task);
relaunchTask(task.get());
}
lostTasksMeter.mark();
if (configuration.getDisasterDetection().isEnabled()) {
taskLostReasons.add(status.getReason());
}
}
// Task is not in a done state: schedule health / new task checks, or report missing task data.
if (!taskState.isDone()) {
if (task.isPresent()) {
final Optional<SingularityPendingDeploy> pendingDeploy = deployManager.getPendingDeploy(taskIdObj.getRequestId());
Optional<SingularityRequestWithState> requestWithState = Optional.empty();
if (taskState == ExtendedTaskState.TASK_RUNNING) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
healthchecker.enqueueHealthcheck(task.get(), pendingDeploy, requestWithState);
}
// The new task check is skipped only when a pending deploy exists whose deploy id equals this task's deploy id.
if (!pendingDeploy.isPresent() || !pendingDeploy.get().getDeployMarker().getDeployId().equals(taskIdObj.getDeployId())) {
// Load the request only if the TASK_RUNNING branch above did not already produce one.
if (!requestWithState.isPresent()) {
requestWithState = requestManager.getRequest(taskIdObj.getRequestId());
}
newTaskChecker.enqueueNewTaskCheck(task.get(), requestWithState, healthchecker);
}
} else {
final String message = String.format("Task %s is active but is missing task data", taskId);
exceptionNotifier.notify(message);
LOG.error(message);
}
}
// Persist the history update using the timestamp chosen at the top of the method.
final Optional<String> statusMessage = getStatusMessage(status, task);
final SingularityTaskHistoryUpdate taskUpdate = new SingularityTaskHistoryUpdate(taskIdObj, timestamp, taskState, statusMessage, status.hasReason() ? Optional.of(status.getReason().name()) : Optional.<String>empty());
final SingularityCreateResult taskHistoryUpdateCreateResult = taskManager.saveTaskHistoryUpdate(taskUpdate);
logSupport.checkDirectoryAndContainerId(taskIdObj);
// Done state: cancel outstanding checks, delete the killed record, then hand off to handleCompletedTaskState.
if (taskState.isDone()) {
healthchecker.cancelHealthcheck(taskId);
newTaskChecker.cancelNewTaskCheck(taskId);
taskManager.deleteKilledRecord(taskIdObj);
handleCompletedTaskState(status, taskIdObj, taskState, taskHistoryUpdateCreateResult, task, timestamp);
}
// The status holder is saved last, after history and completion handling.
saveNewTaskStatusHolder(taskIdObj, newTaskStatusHolder, taskState);
return StatusUpdateResult.DONE;
}
Use of com.hubspot.singularity.SingularityTaskStatusHolder in project Singularity by HubSpot.
From the class ZkMigrationTest, method testNamespaceTasksMigration.
/**
 * Sets the stored data version to "11", writes one scheduled task, one active task id and one
 * task status holder directly through curator, runs the migrations, and then verifies that the
 * task manager reports the pending task (id and content) and the active task id.
 */
@Test
public void testNamespaceTasksMigration() throws Exception {
  metadataManager.setZkDataVersion("11");
  final long createdAt = System.currentTimeMillis();

  // Scheduled (pending) task node.
  final SingularityPendingTaskId pendingId = new SingularityPendingTaskId("test", "deploy", createdAt, 1, PendingType.IMMEDIATE, createdAt);
  final SingularityPendingTask scheduledTask = new SingularityPendingTaskBuilder().setPendingTaskId(pendingId).build();
  final String scheduledPath = "/tasks/scheduled/" + pendingId.getId();
  curator.create().creatingParentsIfNeeded().forPath(scheduledPath, objectMapper.writeValueAsBytes(scheduledTask));

  // Active task node (no payload) and its status holder node.
  final SingularityTaskId activeId = new SingularityTaskId("test", "deploy", createdAt, 1, "host", "rack");
  final String activePath = "/tasks/active/" + activeId.getId();
  curator.create().creatingParentsIfNeeded().forPath(activePath);

  final SingularityTaskStatusHolder holder = new SingularityTaskStatusHolder(activeId, Optional.empty(), createdAt, "1234", Optional.empty());
  final String statusPath = "/tasks/statuses/" + activeId.getId();
  curator.create().creatingParentsIfNeeded().forPath(statusPath, objectMapper.writeValueAsBytes(holder));

  migrationRunner.checkMigrations();

  final List<SingularityPendingTaskId> pendingAfterMigration = taskManager.getPendingTaskIds();
  Assertions.assertThat(pendingAfterMigration).contains(pendingId);
  Assertions.assertThat(scheduledTask).isEqualTo(taskManager.getPendingTask(pendingId).get());

  final List<SingularityTaskId> activeAfterMigration = taskManager.getActiveTaskIds();
  Assertions.assertThat(activeAfterMigration).contains(activeId);
}
Use of com.hubspot.singularity.SingularityTaskStatusHolder in project Singularity by HubSpot.
From the class TaskManager, method createTaskAndDeletePendingTaskPrivate.
/**
 * Replaces a pending task with its launched task record.
 *
 * <p>Steps, in order: delete the pending task, save a {@code TASK_LAUNCHED} history update whose
 * message names the pending type (plus the user and message of the pending task when present),
 * make sure the parent node for last-active-task statuses exists, then create the task node and
 * its initial status holder in a single curator transaction. On success the task is added to the
 * leader cache and the task cache. If either node already exists, the error is logged and the
 * caches are left untouched.
 *
 * @param task the launched task to persist
 * @throws Exception propagated from the underlying calls
 */
private void createTaskAndDeletePendingTaskPrivate(SingularityTask task) throws Exception {
  // TODO: Should more of the below be done within a transaction?
  deletePendingTask(task.getTaskRequest().getPendingTask().getPendingTaskId());

  final long launchedAt = System.currentTimeMillis();

  // Build the history message: base text, then optional " by <user>", then optional " (<message>)".
  String launchReason = String.format("Task launched because of %s", task.getTaskRequest().getPendingTask().getPendingTaskId().getPendingType().name());
  if (task.getTaskRequest().getPendingTask().getUser().isPresent()) {
    launchReason = String.format("%s by %s", launchReason, task.getTaskRequest().getPendingTask().getUser().get());
  }
  if (task.getTaskRequest().getPendingTask().getMessage().isPresent()) {
    launchReason = String.format("%s (%s)", launchReason, task.getTaskRequest().getPendingTask().getMessage().get());
  }

  saveTaskHistoryUpdate(
    new SingularityTaskHistoryUpdate(task.getTaskId(), launchedAt, ExtendedTaskState.TASK_LAUNCHED, Optional.of(launchReason), Optional.empty())
  );

  // The initial holder carries no task status yet, only the agent id.
  final SingularityTaskStatusHolder initialStatus = new SingularityTaskStatusHolder(
    task.getTaskId(),
    Optional.empty(),
    launchedAt,
    serverId,
    Optional.of(task.getAgentId().getValue())
  );

  final String statusParentPath = getLastActiveTaskParent(task.getTaskId().getRequestId());
  if (!exists(statusParentPath)) {
    try {
      curator.create().forPath(statusParentPath);
    } catch (NodeExistsException alreadyThere) {
      // Another writer created the parent between the check and the create.
      LOG.debug("Node {} already existed", statusParentPath);
    }
  }

  try {
    final String taskPath = getTaskPath(task.getTaskId());
    // Task node and status node are created together in one transaction.
    curator
      .inTransaction()
      .create()
      .forPath(taskPath, taskTranscoder.toBytes(task))
      .and()
      .create()
      .forPath(getLastActiveTaskStatusPath(task.getTaskId()), taskStatusTranscoder.toBytes(initialStatus))
      .and()
      .commit();
    // Not checking isActive here, already called within offer check flow
    leaderCache.putActiveTask(task.getTaskId());
    taskCache.set(taskPath, task);
  } catch (KeeperException.NodeExistsException alreadyThere) {
    LOG.error("Task or active path already existed for {}", task.getTaskId());
  }
}
Use of com.hubspot.singularity.SingularityTaskStatusHolder in project Singularity by HubSpot.
From the class SingularityTaskReconciliation, method checkReconciliation.
/**
 * Runs one round of task reconciliation.
 *
 * <p>Loads the last active status holder for each remaining task id. A holder written by this
 * server after {@code reconciliationStart} is counted as reconciled and its lag is recorded in
 * {@code histogram}. Every other holder contributes a status to re-request: its stored status
 * when present, otherwise a generated {@code TASK_STARTING} status (with the agent id when the
 * holder has one).
 *
 * <p>When nothing is left to re-request, reconciliation statistics are saved and the running
 * flag is cleared. Otherwise the statuses are sent to the scheduler client and another check is
 * scheduled.
 *
 * @param reconciliationStart timestamp at which this reconciliation started
 * @param remainingTaskIds task ids still to be checked
 * @param numTimes check counter reported in the log message and the saved statistics
 * @param histogram collects, per reconciled task, server timestamp minus reconciliation start
 */
private void checkReconciliation(final long reconciliationStart, final Collection<SingularityTaskId> remainingTaskIds, final int numTimes, final Histogram histogram) {
  final List<SingularityTaskStatusHolder> holders = taskManager.getLastActiveTaskStatusesFor(remainingTaskIds);
  final List<MesosTaskStatusObject> toReRequest = Lists.newArrayListWithCapacity(holders.size());

  for (SingularityTaskStatusHolder holder : holders) {
    if (holder.getServerId().equals(serverId) && holder.getServerTimestamp() > reconciliationStart) {
      // Updated by this server since reconciliation began: record the lag, nothing to re-request.
      histogram.update(holder.getServerTimestamp() - reconciliationStart);
    } else if (holder.getTaskStatus().isPresent()) {
      LOG.debug("Re-requesting task status for {}", holder.getTaskId());
      toReRequest.add(holder.getTaskStatus().get());
    } else {
      // No stored status yet: submit a placeholder status in TASK_STARTING.
      TaskStatus.Builder placeholder = TaskStatus
        .newBuilder()
        .setTaskId(TaskID.newBuilder().setValue(holder.getTaskId().getId()))
        .setState(TaskState.TASK_STARTING);
      if (holder.getAgentId().isPresent()) {
        placeholder.setAgentId(AgentID.newBuilder().setValue(holder.getAgentId().get()));
      }
      LOG.info("Task {} didn't have a TaskStatus yet, submitting fake status", holder.getTaskId());
      toReRequest.add(mesosProtosUtils.taskStatusFromProtos(placeholder.build()));
    }
  }

  if (!toReRequest.isEmpty()) {
    LOG.info("Requesting reconciliation of {} taskStatuses, task reconciliation has been running for {}", toReRequest.size(), JavaUtils.duration(reconciliationStart));
    schedulerClient.reconcile(
      toReRequest
        .stream()
        .map(
          statusObject ->
            Task
              .newBuilder()
              .setTaskId(MesosProtosUtils.toTaskId(statusObject.getTaskId()))
              .setAgentId(MesosProtosUtils.toAgentId(statusObject.getAgentId()))
              .build()
        )
        .collect(Collectors.toList())
    );
    scheduleReconciliationCheck(reconciliationStart, remainingTaskIds, numTimes, histogram);
    return;
  }

  // Nothing left to re-request: save statistics and clear the running flag.
  LOG.info("Task reconciliation ended after {} checks and {}", numTimes, JavaUtils.duration(reconciliationStart));
  final Snapshot stats = histogram.getSnapshot();
  stateManager.saveTaskReconciliationStatistics(
    new SingularityTaskReconciliationStatistics(
      reconciliationStart,
      System.currentTimeMillis() - reconciliationStart,
      numTimes,
      histogram.getCount(),
      stats.getMax(),
      stats.getMean(),
      stats.getMin(),
      stats.getMedian(),
      stats.get75thPercentile(),
      stats.get95thPercentile(),
      stats.get98thPercentile(),
      stats.get99thPercentile(),
      stats.get999thPercentile(),
      stats.getStdDev()
    )
  );
  isRunningReconciliation.set(false);
}
Aggregations