use of com.hubspot.singularity.SingularityTask in project Singularity by HubSpot.
the class SingularityScheduler method checkForDecomissions.
/**
 * Processes slaves and racks that are in {@code STARTING_DECOMMISSION}.
 *
 * <p>For every such slave, each active task on it is handed to
 * {@code cleanupTaskDueToDecomission}. For every such rack, each active task id whose sanitized
 * rack id matches is handled the same way, unless the id is already in {@code matchingTaskIds}.
 * A slave or rack with no active task is flagged {@code DECOMMISSIONED} in its local map, and
 * both maps are then passed to {@code changeState}. Every request id collected in
 * {@code requestIdsToUserToReschedule} is added to the pending queue when it has an in-use
 * deploy.
 *
 * <p>An active task id on a decommissioning rack whose task data cannot be loaded is logged and
 * skipped instead of aborting the whole check; the rack stays un-decommissioned because the id
 * is still active, so it is looked at again on the next run.
 */
@Timed
public void checkForDecomissions() {
  final long start = System.currentTimeMillis();

  final Map<String, Optional<String>> requestIdsToUserToReschedule = Maps.newHashMap();
  final Set<SingularityTaskId> matchingTaskIds = Sets.newHashSet();
  final Collection<SingularityTaskId> activeTaskIds = leaderCache.getActiveTaskIds();

  final Map<SingularitySlave, MachineState> slaves = getDefaultMap(slaveManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));
  for (SingularitySlave slave : slaves.keySet()) {
    boolean foundTask = false;
    for (SingularityTask activeTask : taskManager.getTasksOnSlave(activeTaskIds, slave)) {
      cleanupTaskDueToDecomission(requestIdsToUserToReschedule, matchingTaskIds, activeTask, slave);
      foundTask = true;
    }
    if (!foundTask) {
      // Overwrites the value of an existing key only, so iterating keySet() stays valid.
      slaves.put(slave, MachineState.DECOMMISSIONED);
    }
  }

  final Map<SingularityRack, MachineState> racks = getDefaultMap(rackManager.getObjectsFiltered(MachineState.STARTING_DECOMMISSION));
  for (SingularityRack rack : racks.keySet()) {
    final String sanitizedRackId = JavaUtils.getReplaceHyphensWithUnderscores(rack.getId());
    boolean foundTask = false;
    for (SingularityTaskId activeTaskId : activeTaskIds) {
      if (!sanitizedRackId.equals(activeTaskId.getSanitizedRackId())) {
        continue;
      }
      // Any active task id on this rack keeps the rack from being marked DECOMMISSIONED,
      // including ids already collected in matchingTaskIds.
      foundTask = true;
      if (matchingTaskIds.contains(activeTaskId)) {
        continue;
      }
      final Optional<SingularityTask> maybeTask = taskManager.getTask(activeTaskId);
      if (!maybeTask.isPresent()) {
        // The id is active but its task data could not be loaded. Calling get() here would
        // throw and abort the checks for every remaining rack and request.
        LOG.warn("Could not find task data for active task {} on decommissioning rack {}, skipping cleanup for now", activeTaskId, rack.getId());
        continue;
      }
      cleanupTaskDueToDecomission(requestIdsToUserToReschedule, matchingTaskIds, maybeTask.get(), rack);
    }
    if (!foundTask) {
      racks.put(rack, MachineState.DECOMMISSIONED);
    }
  }

  for (Entry<String, Optional<String>> requestIdAndUser : requestIdsToUserToReschedule.entrySet()) {
    final String requestId = requestIdAndUser.getKey();
    LOG.trace("Rescheduling request {} due to decomissions", requestId);

    Optional<String> maybeDeployId = deployManager.getInUseDeployId(requestId);
    if (maybeDeployId.isPresent()) {
      requestManager.addToPendingQueue(new SingularityPendingRequest(requestId, maybeDeployId.get(), start, requestIdAndUser.getValue(), PendingType.DECOMISSIONED_SLAVE_OR_RACK, Optional.<Boolean>absent(), Optional.<String>absent()));
    } else {
      LOG.warn("Not rescheduling a request ({}) because of no active deploy", requestId);
    }
  }

  changeState(slaves, slaveManager);
  changeState(racks, rackManager);

  if (slaves.isEmpty() && racks.isEmpty() && requestIdsToUserToReschedule.isEmpty() && matchingTaskIds.isEmpty()) {
    LOG.trace("Decomission check found nothing");
  } else {
    LOG.info("Found {} decomissioning slaves, {} decomissioning racks, rescheduling {} requests and scheduling {} tasks for cleanup in {}", slaves.size(), racks.size(), requestIdsToUserToReschedule.size(), matchingTaskIds.size(), JavaUtils.duration(start));
  }
}
use of com.hubspot.singularity.SingularityTask in project Singularity by HubSpot.
the class TaskResource method killTask.
/**
 * Creates a cleanup entry for an active task on behalf of {@code user}.
 *
 * <p>The cleanup type is {@code USER_REQUESTED} by default,
 * {@code USER_REQUESTED_TASK_BOUNCE} when the request asks to wait for a replacement task, and
 * {@code USER_REQUESTED_DESTROY} when the request sets the override flag (override wins over
 * bounce). A destroy cleanup is always saved. Any other cleanup is created, and if one already
 * exists it is either overwritten (when {@code userRequestedKillTakesPriority} says so) or
 * reported back as a {@code 409 CONFLICT} carrying the existing cleanup.
 *
 * @param taskId id of the task to clean up; checked via {@code checkActiveTask} with WRITE scope
 * @param killTaskRequest optional message, action id, flags and pre-kill shell command
 * @param user the requesting user; the user's email is recorded on the cleanup
 * @return the cleanup that was saved or created
 * @throws WebApplicationException with status CONFLICT when a cleanup already exists and the
 *     user request does not take priority
 */
public SingularityTaskCleanup killTask(String taskId, Optional<SingularityKillTaskRequest> killTaskRequest, SingularityUser user) {
  final SingularityTask task = checkActiveTask(taskId, SingularityAuthorizationScope.WRITE, user);

  Optional<String> killMessage = Optional.absent();
  Optional<Boolean> destroyFlag = Optional.absent();
  Optional<String> killActionId = Optional.absent();
  Optional<Boolean> bounceFlag = Optional.absent();
  Optional<SingularityTaskShellCommandRequestId> shellCommandId = Optional.absent();

  if (killTaskRequest.isPresent()) {
    final SingularityKillTaskRequest options = killTaskRequest.get();
    killActionId = options.getActionId();
    killMessage = options.getMessage();
    destroyFlag = options.getOverride();
    bounceFlag = options.getWaitForReplacementTask();

    if (options.getRunShellCommandBeforeKill().isPresent()) {
      // The shell command is started before the action-enabled check below, as before.
      final SingularityTaskShellCommandRequest started = startShellCommand(task.getTaskId(), options.getRunShellCommandBeforeKill().get(), user);
      shellCommandId = Optional.of(started.getId());
    }
  }

  final boolean bounceRequested = bounceFlag.or(Boolean.FALSE);
  if (bounceRequested) {
    validator.checkActionEnabled(SingularityAction.BOUNCE_TASK);
  } else {
    validator.checkActionEnabled(SingularityAction.KILL_TASK);
  }

  final long now = System.currentTimeMillis();
  final boolean destroyRequested = destroyFlag.or(Boolean.FALSE);

  final TaskCleanupType cleanupType;
  if (destroyRequested) {
    LOG.debug("Requested destroy of {}", taskId);
    cleanupType = TaskCleanupType.USER_REQUESTED_DESTROY;
  } else if (bounceRequested) {
    cleanupType = TaskCleanupType.USER_REQUESTED_TASK_BOUNCE;
  } else {
    cleanupType = TaskCleanupType.USER_REQUESTED;
  }

  final SingularityTaskCleanup taskCleanup = new SingularityTaskCleanup(user.getEmail(), cleanupType, now, task.getTaskId(), killMessage, killActionId, shellCommandId);

  if (destroyRequested) {
    taskManager.saveTaskCleanup(taskCleanup);
  } else {
    SingularityCreateResult outcome = taskManager.createTaskCleanup(taskCleanup);
    if (outcome == SingularityCreateResult.EXISTED) {
      if (userRequestedKillTakesPriority(taskId)) {
        taskManager.saveTaskCleanup(taskCleanup);
      } else {
        // Create reported an existing cleanup: surface it as a conflict, or retry the create
        // if it has disappeared in the meantime.
        do {
          final Optional<SingularityTaskCleanup> existing = taskManager.getTaskCleanup(taskId);
          if (existing.isPresent()) {
            throw new WebApplicationException(Response.status(Status.CONFLICT).entity(existing.get()).type(MediaType.APPLICATION_JSON).build());
          }
          outcome = taskManager.createTaskCleanup(taskCleanup);
        } while (outcome == SingularityCreateResult.EXISTED);
      }
    }
  }

  if (cleanupType == TaskCleanupType.USER_REQUESTED_TASK_BOUNCE) {
    requestManager.addToPendingQueue(new SingularityPendingRequest(task.getTaskId().getRequestId(), task.getTaskId().getDeployId(), now, user.getEmail(), PendingType.TASK_BOUNCE, Optional.<List<String>>absent(), Optional.<String>absent(), Optional.<Boolean>absent(), killMessage, killActionId));
  }

  return taskCleanup;
}
use of com.hubspot.singularity.SingularityTask in project Singularity by HubSpot.
the class SingularityUsagePoller method collectSlaveUage.
/**
 * Collects resource usage for one slave and its tasks, and persists it through
 * {@code usageManager}.
 *
 * <p>For each task reported by the Mesos client the latest usage sample is saved, reserved
 * resources are summed, request utilization is updated, and a current cpu/memory/disk usage
 * is computed either since task start (no previous samples) or since the last sample. The
 * per-slave totals are then saved as a {@code SingularitySlaveUsage}.
 *
 * <p>Any {@code Throwable} raised while processing the slave is logged and sent to the
 * exception notifier; it is not rethrown.
 *
 * @param slave the slave to poll (its host is used for the Mesos calls)
 * @param now timestamp stored on the saved current-usage and slave-usage objects
 * @param utilizationPerRequestId passed to {@code updateRequestUtilization} for every task that
 *     has resources
 * @param overLoadedHosts receives this slave's usage and shuffle candidates when the selected
 *     system load divided by the system cpu count exceeds 1.0
 * @param totalMemBytesUsed incremented with this slave's used memory
 * @param totalMemBytesAvailable incremented with this slave's total memory
 * @param totalCpuUsed incremented with this slave's used cpus
 * @param totalCpuAvailable incremented with this slave's total cpus
 * @param totalDiskBytesUsed incremented with this slave's used disk
 * @param totalDiskBytesAvailable incremented with this slave's total disk; all six totals are
 *     only touched when the slave's memory and cpu totals are present
 */
private void collectSlaveUage(SingularitySlave slave, long now, Map<String, RequestUtilization> utilizationPerRequestId, Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts, AtomicLong totalMemBytesUsed, AtomicLong totalMemBytesAvailable, AtomicDouble totalCpuUsed, AtomicDouble totalCpuAvailable, AtomicLong totalDiskBytesUsed, AtomicLong totalDiskBytesAvailable) {
  Map<ResourceUsageType, Number> longRunningTasksUsage = new HashMap<>();

  longRunningTasksUsage.put(ResourceUsageType.MEMORY_BYTES_USED, 0);
  longRunningTasksUsage.put(ResourceUsageType.CPU_USED, 0);
  longRunningTasksUsage.put(ResourceUsageType.DISK_BYTES_USED, 0);

  Optional<Long> memoryMbTotal = Optional.absent();
  Optional<Double> cpusTotal = Optional.absent();
  Optional<Long> diskMbTotal = Optional.absent();

  long memoryMbReservedOnSlave = 0;
  double cpuReservedOnSlave = 0;
  long diskMbReservedOnSlave = 0;

  long memoryBytesUsedOnSlave = 0;
  double cpusUsedOnSlave = 0;
  // NOTE(review): named "Mb" but accumulated from getDiskTotalBytes() below - confirm the unit.
  long diskMbUsedOnSlave = 0;

  try {
    List<MesosTaskMonitorObject> allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost());
    MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost());

    // System-level metrics default to 0 when no snapshot is available.
    double systemMemTotalBytes = 0;
    double systemMemFreeBytes = 0;
    double systemLoad1Min = 0;
    double systemLoad5Min = 0;
    double systemLoad15Min = 0;
    double slaveDiskUsed = 0;
    double slaveDiskTotal = 0;
    double systemCpusTotal = 0;
    if (slaveMetricsSnapshot != null) {
      systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes();
      systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes();
      systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min();
      systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min();
      systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min();
      slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed();
      slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal();
      systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal();
    }

    // Pick the load average configured for scoring; the 5 minute value is the fallback.
    double systemLoad;
    switch (configuration.getMesosConfiguration().getScoreUsingSystemLoad()) {
      case LOAD_1:
        systemLoad = systemLoad1Min;
        break;
      case LOAD_15:
        systemLoad = systemLoad15Min;
        break;
      case LOAD_5:
      default:
        systemLoad = systemLoad5Min;
        break;
    }

    boolean slaveOverloaded = systemCpusTotal > 0 && systemLoad / systemCpusTotal > 1.0;
    List<TaskIdWithUsage> possibleTasksToShuffle = new ArrayList<>();

    for (MesosTaskMonitorObject taskUsage : allTaskUsage) {
      String taskId = taskUsage.getSource();
      SingularityTaskId task;
      try {
        task = SingularityTaskId.valueOf(taskId);
      } catch (InvalidSingularityTaskIdException e) {
        LOG.error("Couldn't get SingularityTaskId for {}", taskUsage);
        continue;
      }

      SingularityTaskUsage latestUsage = getUsage(taskUsage);
      // The stored samples are read before clearOldUsage runs and before the new sample is saved.
      List<SingularityTaskUsage> pastTaskUsages = usageManager.getTaskUsage(taskId);

      clearOldUsage(taskId);
      usageManager.saveSpecificTaskUsage(taskId, latestUsage);

      Optional<SingularityTask> maybeTask = taskManager.getTask(task);
      Optional<Resources> maybeResources = Optional.absent();
      if (maybeTask.isPresent()) {
        // Resources on the pending task take precedence over the deploy's resources.
        maybeResources = maybeTask.get().getTaskRequest().getPendingTask().getResources().or(maybeTask.get().getTaskRequest().getDeploy().getResources());
        if (maybeResources.isPresent()) {
          Resources taskResources = maybeResources.get();
          double memoryMbReservedForTask = taskResources.getMemoryMb();
          double cpuReservedForTask = taskResources.getCpus();
          double diskMbReservedForTask = taskResources.getDiskMb();

          memoryMbReservedOnSlave += memoryMbReservedForTask;
          cpuReservedOnSlave += cpuReservedForTask;
          diskMbReservedOnSlave += diskMbReservedForTask;

          updateRequestUtilization(utilizationPerRequestId, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask);
        }
      }
      memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes();
      diskMbUsedOnSlave += latestUsage.getDiskTotalBytes();

      SingularityTaskCurrentUsage currentUsage = null;
      if (pastTaskUsages.isEmpty()) {
        // First sample for this task: average cpu over the time since TASK_STARTING.
        // Without a starting update no current usage is saved for the task.
        Optional<SingularityTaskHistoryUpdate> maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING);
        if (maybeStartingUpdate.isPresent()) {
          long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp());
          // NOTE(review): the divisor is zero when the sample timestamp equals the start - confirm this cannot happen.
          double usedCpusSinceStart = latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds);

          if (isLongRunning(task) || isConsideredLongRunning(task)) {
            updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), usedCpusSinceStart, latestUsage.getDiskTotalBytes());
          }

          currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, usedCpusSinceStart, latestUsage.getDiskTotalBytes());
          usageManager.saveCurrentTaskUsage(taskId, currentUsage);

          cpusUsedOnSlave += usedCpusSinceStart;
        }
      } else {
        // Later samples: cpu used is the cpu-seconds delta over the timestamp delta to the last stored sample.
        SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1);

        // NOTE(review): the divisor is zero when both samples carry the same timestamp - confirm this cannot happen.
        double taskCpusUsed = ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp()));

        if (isLongRunning(task) || isConsideredLongRunning(task)) {
          updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), taskCpusUsed, latestUsage.getDiskTotalBytes());
        }

        currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, taskCpusUsed, latestUsage.getDiskTotalBytes());
        usageManager.saveCurrentTaskUsage(taskId, currentUsage);

        cpusUsedOnSlave += taskCpusUsed;
      }

      // A task is a shuffle candidate when it uses cpu, is long running, is not exempted by
      // configuration, is not already flagged by isTaskAlreadyCleanedUpForShuffle, and has resources.
      if (configuration.isShuffleTasksForOverloadedSlaves() && currentUsage != null && currentUsage.getCpusUsed() > 0) {
        if (isLongRunning(task) && !configuration.getDoNotShuffleRequests().contains(task.getRequestId())) {
          Optional<SingularityTaskHistoryUpdate> maybeCleanupUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_CLEANING);
          if (maybeCleanupUpdate.isPresent() && isTaskAlreadyCleanedUpForShuffle(maybeCleanupUpdate.get())) {
            LOG.trace("Task {} already being cleaned up to spread cpu usage, skipping", taskId);
          } else {
            if (maybeResources.isPresent()) {
              possibleTasksToShuffle.add(new TaskIdWithUsage(task, maybeResources.get(), currentUsage));
            }
          }
        }
      }
    }

    // All three totals are read with get() below, so all three must be present. The disk check
    // prevents a slave without a disk resource from throwing here and losing its whole usage record.
    if (!slave.getResources().isPresent()
        || !slave.getResources().get().getMemoryMegaBytes().isPresent()
        || !slave.getResources().get().getNumCpus().isPresent()
        || !slave.getResources().get().getDiskSpace().isPresent()) {
      LOG.debug("Could not find slave or resources for slave {}", slave.getId());
    } else {
      memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue());
      cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue());
      diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get());
    }

    SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, longRunningTasksUsage, allTaskUsage.size(), now, systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal);

    if (slaveOverloaded) {
      overLoadedHosts.put(slaveUsage, possibleTasksToShuffle);
    }

    // Drop the first stored timestamp when adding one more would exceed the configured number to keep.
    List<Long> slaveTimestamps = usageManager.getSlaveUsageTimestamps(slave.getId());
    if (slaveTimestamps.size() + 1 > configuration.getNumUsageToKeep()) {
      usageManager.deleteSpecificSlaveUsage(slave.getId(), slaveTimestamps.get(0));
    }

    // The disk total is set together with memory and cpu above, so it is present whenever they are.
    if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) {
      totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed());
      totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed());
      totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed());

      totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get());
      totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get());
      totalDiskBytesAvailable.getAndAdd(slaveUsage.getDiskBytesTotal().get());
    }

    LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage);
    usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage);
  } catch (Throwable t) {
    // Best effort per slave: report the failure and return so the caller can continue.
    String message = String.format("Could not get slave usage for host %s", slave.getHost());
    LOG.error(message, t);
    exceptionNotifier.notify(message, t);
  }
}
use of com.hubspot.singularity.SingularityTask in project Singularity by HubSpot.
the class UsageResource method getSlaveCurrentTaskUsage.
/**
 * Returns the current usage of every active task on one slave.
 *
 * <p>Runs the admin authorization check for {@code user}, looks the slave up by id, and passes
 * whether it was found to {@code WebExceptions.checkNotFound} (presumably a not-found response
 * when absent - confirm against that helper). The ids of the active tasks on the slave are then
 * handed to {@code usageManager.getTaskCurrentUsages}.
 *
 * @param user the authenticated caller
 * @param slaveId id of the slave whose tasks are queried
 * @return the result of {@code usageManager.getTaskCurrentUsages} for the slave's active task ids
 */
@GET
@Path("/slaves/{slaveId}/tasks/current")
public List<SingularityTaskCurrentUsageWithId> getSlaveCurrentTaskUsage(@Auth SingularityUser user, @PathParam("slaveId") String slaveId) {
  authorizationHelper.checkAdminAuthorization(user);

  final Optional<SingularitySlave> maybeSlave = slaveManager.getObject(slaveId);
  WebExceptions.checkNotFound(maybeSlave.isPresent(), "No slave found with id %s", slaveId);

  final List<SingularityTask> activeOnSlave = taskManager.getTasksOnSlave(taskManager.getActiveTaskIds(), maybeSlave.get());

  // Project each task onto its id, keeping the order returned by the task manager.
  final List<SingularityTaskId> idsOnSlave = new ArrayList<>(activeOnSlave.size());
  for (int i = 0; i < activeOnSlave.size(); i++) {
    idsOnSlave.add(activeOnSlave.get(i).getTaskId());
  }

  return usageManager.getTaskCurrentUsages(idsOnSlave);
}
use of com.hubspot.singularity.SingularityTask in project Singularity by HubSpot.
the class SingularityCleaner method checkLbState.
/**
 * Drives the load balancer removal for a task and reports its progress.
 *
 * <p>Returns {@code NOT_LOAD_BALANCED} when no ADD update is stored for the task, and
 * {@code LOAD_BALANCE_FAILED} when {@code shouldRemoveLbState} rejects that ADD update.
 * Otherwise the REMOVE update is enqueued (when {@code shouldEnqueueLbRequest} says so),
 * refreshed from the load balancer client (when the stored one is WAITING or CANCELING), or
 * reused as stored, and its state is mapped to the result.
 *
 * @param taskId the task whose load balancer registration is being removed
 * @return {@code DONE} once removal succeeded and any configured grace period has elapsed;
 *     {@code WAITING} while removal is pending or within the grace period; {@code RETRY} on a
 *     failed or canceled removal; {@code LOAD_BALANCE_FAILED} on an invalid request;
 *     {@code MISSING_TASK} when the task cannot be loaded for enqueueing
 */
private CheckLBState checkLbState(SingularityTaskId taskId) {
  final Optional<SingularityLoadBalancerUpdate> maybeAddUpdate = taskManager.getLoadBalancerState(taskId, LoadBalancerRequestType.ADD);

  if (!maybeAddUpdate.isPresent()) {
    return CheckLBState.NOT_LOAD_BALANCED;
  }

  if (!shouldRemoveLbState(taskId, maybeAddUpdate.get())) {
    return CheckLBState.LOAD_BALANCE_FAILED;
  }

  final Optional<SingularityLoadBalancerUpdate> maybeStoredRemoval = taskManager.getLoadBalancerState(taskId, LoadBalancerRequestType.REMOVE);
  final LoadBalancerRequestId removalRequestId = getLoadBalancerRequestId(taskId, maybeStoredRemoval);

  final SingularityLoadBalancerUpdate removal;
  if (shouldEnqueueLbRequest(maybeStoredRemoval)) {
    final Optional<SingularityTask> maybeTask = taskManager.getTask(taskId);

    if (!maybeTask.isPresent()) {
      LOG.error("Missing task {}", taskId);
      return CheckLBState.MISSING_TASK;
    }

    // Enqueue a request with nothing to add and this single task to remove.
    removal = lbClient.enqueue(removalRequestId, maybeTask.get().getTaskRequest().getRequest(), maybeTask.get().getTaskRequest().getDeploy(), Collections.emptyList(), Collections.singletonList(maybeTask.get()));
    taskManager.saveLoadBalancerState(taskId, LoadBalancerRequestType.REMOVE, removal);
  } else {
    final BaragonRequestState storedState = maybeStoredRemoval.get().getLoadBalancerState();
    if (storedState == BaragonRequestState.WAITING || storedState == BaragonRequestState.CANCELING) {
      // The stored update is still in flight: ask the load balancer for the latest state.
      removal = lbClient.getState(removalRequestId);
      taskManager.saveLoadBalancerState(taskId, LoadBalancerRequestType.REMOVE, removal);
    } else {
      removal = maybeStoredRemoval.get();
    }
  }

  switch (removal.getLoadBalancerState()) {
    case SUCCESS: {
      if (configuration.getLoadBalancerRemovalGracePeriodMillis() <= 0) {
        return CheckLBState.DONE;
      }
      final long sinceRemoval = System.currentTimeMillis() - removal.getTimestamp();
      if (sinceRemoval >= configuration.getLoadBalancerRemovalGracePeriodMillis()) {
        return CheckLBState.DONE;
      }
      LOG.trace("LB removal for {} succeeded - waiting at least {} to kill task (current duration {})", taskId, JavaUtils.durationFromMillis(configuration.getLoadBalancerRemovalGracePeriodMillis()), JavaUtils.durationFromMillis(sinceRemoval));
      return CheckLBState.WAITING;
    }
    case FAILED:
    case CANCELED:
      LOG.error("LB removal request {} ({}) got unexpected response {}", removal, removalRequestId, removal.getLoadBalancerState());
      // NOTE(review): the "addUpdate" key carries the REMOVE update here - looks like a copy/paste label; confirm before renaming.
      exceptionNotifier.notify("LB removal failed", ImmutableMap.of("state", removal.getLoadBalancerState().name(), "loadBalancerRequestId", removalRequestId.toString(), "addUpdate", removal.toString()));
      return CheckLBState.RETRY;
    case INVALID_REQUEST_NOOP:
      exceptionNotifier.notify("LB removal failed", ImmutableMap.of("state", removal.getLoadBalancerState().name(), "loadBalancerRequestId", removalRequestId.toString(), "addUpdate", removal.toString()));
      return CheckLBState.LOAD_BALANCE_FAILED;
    case UNKNOWN:
    case CANCELING:
    case WAITING:
      LOG.trace("Waiting on LB cleanup request {} in state {}", removalRequestId, removal.getLoadBalancerState());
      return CheckLBState.WAITING;
  }

  // Any state not listed above is treated as still waiting.
  return CheckLBState.WAITING;
}
Aggregations