use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.
the class SingularityScheduler method handleCompletedTask.
@Timed
public void handleCompletedTask(Optional<SingularityTask> task, SingularityTaskId taskId, boolean wasActive, long timestamp, ExtendedTaskState state, SingularityCreateResult taskHistoryUpdateCreateResult, Protos.TaskStatus status) {
final SingularityDeployStatistics deployStatistics = getDeployStatistics(taskId.getRequestId(), taskId.getDeployId());
if (wasActive) {
taskManager.deleteActiveTask(taskId.getId());
}
if (!task.isPresent() || task.get().getTaskRequest().getRequest().isLoadBalanced()) {
taskManager.createLBCleanupTask(taskId);
}
if (requestManager.isBouncing(taskId.getRequestId())) {
List<SingularityTaskId> activeTaskIds = taskManager.getActiveTaskIdsForRequest(taskId.getRequestId());
boolean foundBouncingTask = false;
for (SingularityTaskId activeTaskId : activeTaskIds) {
Optional<SingularityTaskHistoryUpdate> maybeCleaningUpdate = taskManager.getTaskHistoryUpdate(activeTaskId, ExtendedTaskState.TASK_CLEANING);
if (maybeCleaningUpdate.isPresent()) {
if (maybeCleaningUpdate.get().getStatusReason().or("").contains("BOUNCE")) {
// TaskCleanupType enum is included in status message
LOG.debug("Found task {} still waiting for bounce to complete", activeTaskId);
foundBouncingTask = true;
break;
} else if (!maybeCleaningUpdate.get().getPrevious().isEmpty()) {
for (SingularityTaskHistoryUpdate previousUpdate : maybeCleaningUpdate.get().getPrevious()) {
if (previousUpdate.getStatusMessage().or("").contains("BOUNCE")) {
LOG.debug("Found task {} still waiting for bounce to complete", activeTaskId);
foundBouncingTask = true;
break;
}
}
}
}
}
if (!foundBouncingTask) {
LOG.info("Bounce completed for request {}, no cleaning tasks due to bounce found", taskId.getRequestId());
Optional<SingularityExpiringBounce> expiringBounce = requestManager.getExpiringBounce(taskId.getRequestId());
if (expiringBounce.isPresent() && expiringBounce.get().getDeployId().equals(taskId.getDeployId())) {
requestManager.deleteExpiringObject(SingularityExpiringBounce.class, taskId.getRequestId());
}
requestManager.markBounceComplete(taskId.getRequestId());
}
}
final Optional<PendingType> scheduleResult = handleCompletedTaskWithStatistics(task, taskId, timestamp, state, deployStatistics, taskHistoryUpdateCreateResult, status);
if (taskHistoryUpdateCreateResult == SingularityCreateResult.EXISTED) {
return;
}
updateDeployStatistics(deployStatistics, taskId, task, timestamp, state, scheduleResult);
}
use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.
the class SingularityUsagePoller method collectSlaveUage.
private void collectSlaveUage(SingularitySlave slave, long now, Map<String, RequestUtilization> utilizationPerRequestId, Map<SingularitySlaveUsage, List<TaskIdWithUsage>> overLoadedHosts, AtomicLong totalMemBytesUsed, AtomicLong totalMemBytesAvailable, AtomicDouble totalCpuUsed, AtomicDouble totalCpuAvailable, AtomicLong totalDiskBytesUsed, AtomicLong totalDiskBytesAvailable) {
Map<ResourceUsageType, Number> longRunningTasksUsage = new HashMap<>();
longRunningTasksUsage.put(ResourceUsageType.MEMORY_BYTES_USED, 0);
longRunningTasksUsage.put(ResourceUsageType.CPU_USED, 0);
longRunningTasksUsage.put(ResourceUsageType.DISK_BYTES_USED, 0);
Optional<Long> memoryMbTotal = Optional.absent();
Optional<Double> cpusTotal = Optional.absent();
Optional<Long> diskMbTotal = Optional.absent();
long memoryMbReservedOnSlave = 0;
double cpuReservedOnSlave = 0;
long diskMbReservedOnSlave = 0;
long memoryBytesUsedOnSlave = 0;
double cpusUsedOnSlave = 0;
long diskMbUsedOnSlave = 0;
try {
List<MesosTaskMonitorObject> allTaskUsage = mesosClient.getSlaveResourceUsage(slave.getHost());
MesosSlaveMetricsSnapshotObject slaveMetricsSnapshot = mesosClient.getSlaveMetricsSnapshot(slave.getHost());
double systemMemTotalBytes = 0;
double systemMemFreeBytes = 0;
double systemLoad1Min = 0;
double systemLoad5Min = 0;
double systemLoad15Min = 0;
double slaveDiskUsed = 0;
double slaveDiskTotal = 0;
double systemCpusTotal = 0;
if (slaveMetricsSnapshot != null) {
systemMemTotalBytes = slaveMetricsSnapshot.getSystemMemTotalBytes();
systemMemFreeBytes = slaveMetricsSnapshot.getSystemMemFreeBytes();
systemLoad1Min = slaveMetricsSnapshot.getSystemLoad1Min();
systemLoad5Min = slaveMetricsSnapshot.getSystemLoad5Min();
systemLoad15Min = slaveMetricsSnapshot.getSystemLoad15Min();
slaveDiskUsed = slaveMetricsSnapshot.getSlaveDiskUsed();
slaveDiskTotal = slaveMetricsSnapshot.getSlaveDiskTotal();
systemCpusTotal = slaveMetricsSnapshot.getSystemCpusTotal();
}
double systemLoad;
switch(configuration.getMesosConfiguration().getScoreUsingSystemLoad()) {
case LOAD_1:
systemLoad = systemLoad1Min;
break;
case LOAD_15:
systemLoad = systemLoad15Min;
break;
case LOAD_5:
default:
systemLoad = systemLoad5Min;
break;
}
boolean slaveOverloaded = systemCpusTotal > 0 && systemLoad / systemCpusTotal > 1.0;
List<TaskIdWithUsage> possibleTasksToShuffle = new ArrayList<>();
for (MesosTaskMonitorObject taskUsage : allTaskUsage) {
String taskId = taskUsage.getSource();
SingularityTaskId task;
try {
task = SingularityTaskId.valueOf(taskId);
} catch (InvalidSingularityTaskIdException e) {
LOG.error("Couldn't get SingularityTaskId for {}", taskUsage);
continue;
}
SingularityTaskUsage latestUsage = getUsage(taskUsage);
List<SingularityTaskUsage> pastTaskUsages = usageManager.getTaskUsage(taskId);
clearOldUsage(taskId);
usageManager.saveSpecificTaskUsage(taskId, latestUsage);
Optional<SingularityTask> maybeTask = taskManager.getTask(task);
Optional<Resources> maybeResources = Optional.absent();
if (maybeTask.isPresent()) {
maybeResources = maybeTask.get().getTaskRequest().getPendingTask().getResources().or(maybeTask.get().getTaskRequest().getDeploy().getResources());
if (maybeResources.isPresent()) {
Resources taskResources = maybeResources.get();
double memoryMbReservedForTask = taskResources.getMemoryMb();
double cpuReservedForTask = taskResources.getCpus();
double diskMbReservedForTask = taskResources.getDiskMb();
memoryMbReservedOnSlave += memoryMbReservedForTask;
cpuReservedOnSlave += cpuReservedForTask;
diskMbReservedOnSlave += diskMbReservedForTask;
updateRequestUtilization(utilizationPerRequestId, pastTaskUsages, latestUsage, task, memoryMbReservedForTask, cpuReservedForTask, diskMbReservedForTask);
}
}
memoryBytesUsedOnSlave += latestUsage.getMemoryTotalBytes();
diskMbUsedOnSlave += latestUsage.getDiskTotalBytes();
SingularityTaskCurrentUsage currentUsage = null;
if (pastTaskUsages.isEmpty()) {
Optional<SingularityTaskHistoryUpdate> maybeStartingUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_STARTING);
if (maybeStartingUpdate.isPresent()) {
long startTimestampSeconds = TimeUnit.MILLISECONDS.toSeconds(maybeStartingUpdate.get().getTimestamp());
double usedCpusSinceStart = latestUsage.getCpuSeconds() / (latestUsage.getTimestamp() - startTimestampSeconds);
if (isLongRunning(task) || isConsideredLongRunning(task)) {
updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), usedCpusSinceStart, latestUsage.getDiskTotalBytes());
}
currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, usedCpusSinceStart, latestUsage.getDiskTotalBytes());
usageManager.saveCurrentTaskUsage(taskId, currentUsage);
cpusUsedOnSlave += usedCpusSinceStart;
}
} else {
SingularityTaskUsage lastUsage = pastTaskUsages.get(pastTaskUsages.size() - 1);
double taskCpusUsed = ((latestUsage.getCpuSeconds() - lastUsage.getCpuSeconds()) / (latestUsage.getTimestamp() - lastUsage.getTimestamp()));
if (isLongRunning(task) || isConsideredLongRunning(task)) {
updateLongRunningTasksUsage(longRunningTasksUsage, latestUsage.getMemoryTotalBytes(), taskCpusUsed, latestUsage.getDiskTotalBytes());
}
currentUsage = new SingularityTaskCurrentUsage(latestUsage.getMemoryTotalBytes(), now, taskCpusUsed, latestUsage.getDiskTotalBytes());
usageManager.saveCurrentTaskUsage(taskId, currentUsage);
cpusUsedOnSlave += taskCpusUsed;
}
if (configuration.isShuffleTasksForOverloadedSlaves() && currentUsage != null && currentUsage.getCpusUsed() > 0) {
if (isLongRunning(task) && !configuration.getDoNotShuffleRequests().contains(task.getRequestId())) {
Optional<SingularityTaskHistoryUpdate> maybeCleanupUpdate = taskManager.getTaskHistoryUpdate(task, ExtendedTaskState.TASK_CLEANING);
if (maybeCleanupUpdate.isPresent() && isTaskAlreadyCleanedUpForShuffle(maybeCleanupUpdate.get())) {
LOG.trace("Task {} already being cleaned up to spread cpu usage, skipping", taskId);
} else {
if (maybeResources.isPresent()) {
possibleTasksToShuffle.add(new TaskIdWithUsage(task, maybeResources.get(), currentUsage));
}
}
}
}
}
if (!slave.getResources().isPresent() || !slave.getResources().get().getMemoryMegaBytes().isPresent() || !slave.getResources().get().getNumCpus().isPresent()) {
LOG.debug("Could not find slave or resources for slave {}", slave.getId());
} else {
memoryMbTotal = Optional.of(slave.getResources().get().getMemoryMegaBytes().get().longValue());
cpusTotal = Optional.of(slave.getResources().get().getNumCpus().get().doubleValue());
diskMbTotal = Optional.of(slave.getResources().get().getDiskSpace().get());
}
SingularitySlaveUsage slaveUsage = new SingularitySlaveUsage(cpusUsedOnSlave, cpuReservedOnSlave, cpusTotal, memoryBytesUsedOnSlave, memoryMbReservedOnSlave, memoryMbTotal, diskMbUsedOnSlave, diskMbReservedOnSlave, diskMbTotal, longRunningTasksUsage, allTaskUsage.size(), now, systemMemTotalBytes, systemMemFreeBytes, systemCpusTotal, systemLoad1Min, systemLoad5Min, systemLoad15Min, slaveDiskUsed, slaveDiskTotal);
if (slaveOverloaded) {
overLoadedHosts.put(slaveUsage, possibleTasksToShuffle);
}
List<Long> slaveTimestamps = usageManager.getSlaveUsageTimestamps(slave.getId());
if (slaveTimestamps.size() + 1 > configuration.getNumUsageToKeep()) {
usageManager.deleteSpecificSlaveUsage(slave.getId(), slaveTimestamps.get(0));
}
if (slaveUsage.getMemoryBytesTotal().isPresent() && slaveUsage.getCpusTotal().isPresent()) {
totalMemBytesUsed.getAndAdd(slaveUsage.getMemoryBytesUsed());
totalCpuUsed.getAndAdd(slaveUsage.getCpusUsed());
totalDiskBytesUsed.getAndAdd(slaveUsage.getDiskBytesUsed());
totalMemBytesAvailable.getAndAdd(slaveUsage.getMemoryBytesTotal().get());
totalCpuAvailable.getAndAdd(slaveUsage.getCpusTotal().get());
totalDiskBytesAvailable.getAndAdd(slaveUsage.getDiskBytesTotal().get());
}
LOG.debug("Saving slave {} usage {}", slave.getHost(), slaveUsage);
usageManager.saveSpecificSlaveUsageAndSetCurrent(slave.getId(), slaveUsage);
} catch (Throwable t) {
String message = String.format("Could not get slave usage for host %s", slave.getHost());
LOG.error(message, t);
exceptionNotifier.notify(message, t);
}
}
use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.
the class SingularityDeployHealthHelper method getNoHealthcheckHealthyTasks.
private List<SingularityTaskId> getNoHealthcheckHealthyTasks(final Optional<SingularityDeploy> deploy, final Collection<SingularityTaskId> matchingActiveTasks) {
final Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates = taskManager.getTaskHistoryUpdates(matchingActiveTasks);
final List<SingularityTaskId> healthyTaskIds = Lists.newArrayListWithCapacity(matchingActiveTasks.size());
for (SingularityTaskId taskId : matchingActiveTasks) {
Collection<SingularityTaskHistoryUpdate> updates = taskUpdates.get(taskId);
SimplifiedTaskState currentState = SingularityTaskHistoryUpdate.getCurrentState(updates);
if (currentState == SimplifiedTaskState.RUNNING && isRunningTaskHealthy(deploy, updates, taskId)) {
healthyTaskIds.add(taskId);
}
}
return healthyTaskIds;
}
use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.
the class SingularityDeployHealthHelper method getNonHealthcheckedTaskFailure.
private Optional<SingularityDeployFailure> getNonHealthcheckedTaskFailure(Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates, SingularityTaskId taskId) {
List<SingularityTaskHistoryUpdate> updates = taskUpdates.get(taskId);
SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
if (lastUpdate.getTaskState().isSuccess()) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId), Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
} else if (lastUpdate.getTaskState().isDone()) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
} else if (SingularityTaskHistoryUpdate.getCurrentState(updates) == SimplifiedTaskState.WAITING) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_NEVER_ENTERED_RUNNING, Optional.of(taskId), Optional.of(String.format("Task never entered running state, last state was %s (%s)", lastUpdate.getTaskState().getDisplayName(), lastUpdate.getStatusMessage().or("")))));
}
return Optional.absent();
}
use of com.hubspot.singularity.SingularityTaskHistoryUpdate in project Singularity by HubSpot.
the class SingularityDeployHealthHelper method getHealthcheckedTaskFailure.
private Optional<SingularityDeployFailure> getHealthcheckedTaskFailure(SingularityDeploy deploy, Map<SingularityTaskId, List<SingularityTaskHistoryUpdate>> taskUpdates, SingularityTaskHealthcheckResult healthcheckResult, SingularityTaskId taskId) {
Collection<SingularityTaskHistoryUpdate> updates = taskUpdates.get(taskId);
if (!healthcheckResult.isFailed()) {
return Optional.absent();
}
SingularityTaskHistoryUpdate lastUpdate = Iterables.getLast(updates);
if (lastUpdate.getTaskState().isDone()) {
if (lastUpdate.getTaskState().isSuccess()) {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_EXPECTED_RUNNING_FINISHED, Optional.of(taskId), Optional.of(String.format("Task was expected to maintain TASK_RUNNING state but finished. (%s)", lastUpdate.getStatusMessage().or("")))));
} else {
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_ON_STARTUP, Optional.of(taskId), lastUpdate.getStatusMessage()));
}
}
final Optional<Integer> healthcheckMaxRetries = deploy.getHealthcheck().isPresent() ? deploy.getHealthcheck().get().getMaxRetries().or(configuration.getHealthcheckMaxRetries()) : configuration.getHealthcheckMaxRetries();
if (healthcheckMaxRetries.isPresent() && taskManager.getNumNonstartupHealthchecks(taskId) > healthcheckMaxRetries.get()) {
String message = String.format("Instance %s failed %s healthchecks, the max for the deploy.", taskId.getInstanceNo(), healthcheckMaxRetries.get() + 1);
if (healthcheckResult.getStatusCode().isPresent()) {
message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
}
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
}
Optional<Long> runningAt = getRunningAt(updates);
if (runningAt.isPresent()) {
final long durationSinceRunning = System.currentTimeMillis() - runningAt.get();
if (healthcheckResult.isStartup() && deploy.getHealthcheck().isPresent() && durationSinceRunning > deploy.getHealthcheck().get().getStartupTimeoutSeconds().or(configuration.getStartupTimeoutSeconds())) {
String message = String.format("Instance %s has not responded to healthchecks after running for %s", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
}
if (isRunningLongerThanThreshold(deploy, durationSinceRunning)) {
String message = String.format("Instance %s has been running for %s and has yet to pass healthchecks.", taskId.getInstanceNo(), JavaUtils.durationFromMillis(durationSinceRunning));
if (healthcheckResult.getStatusCode().isPresent()) {
message = String.format("%s Last check returned with status code %s", message, healthcheckResult.getStatusCode().get());
}
return Optional.of(new SingularityDeployFailure(SingularityDeployFailureReason.TASK_FAILED_HEALTH_CHECKS, Optional.of(taskId), Optional.of(message)));
}
}
return Optional.absent();
}
Aggregations