Use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.
From the class SingularityDisasterDetectionPoller, method collectDisasterStats():
private SingularityDisasterDataPoint collectDisasterStats() {
  long now = System.currentTimeMillis();
  int numActiveTasks = taskManager.getNumActiveTasks();
  List<SingularityPendingTaskId> pendingTasks = taskManager.getPendingTaskIds();
  int numPendingTasks = pendingTasks.size();
  int numLateTasks = 0;
  long totalTaskLagMillis = 0;
  int numPastDueTasks = 0;
  // Tally lag for every pending task that is past its scheduled run time
  for (SingularityPendingTaskId pendingTask : pendingTasks) {
    long taskLagMillis = now - pendingTask.getNextRunAt();
    if (taskLagMillis > 0) {
      numPastDueTasks++;
      totalTaskLagMillis += taskLagMillis;
      if (taskLagMillis > configuration.getDeltaAfterWhichTasksAreLateMillis()) {
        numLateTasks++;
      }
    }
  }
  long avgTaskLagMillis = totalTaskLagMillis / Math.max(numPastDueTasks, 1);
  // Count agents that are neither dead nor missing on startup
  List<SingularityAgent> slaves = agentManager.getObjects();
  int numRunningSlaves = 0;
  for (SingularityAgent slave : slaves) {
    if (slave.getCurrentState().getState() != MachineState.DEAD && slave.getCurrentState().getState() != MachineState.MISSING_ON_STARTUP) {
      numRunningSlaves++;
    }
  }
  int numLostSlaves = activeSlavesLost.getAndSet(0);
  // Sum tasks lost for the configured disaster-relevant reasons, then reset the counters
  int numLostTasks = 0;
  for (Reason lostTaskReason : disasterConfiguration.getLostTaskReasons()) {
    numLostTasks += taskLostReasons.count(lostTaskReason);
  }
  taskLostReasons.clear();
  return new SingularityDisasterDataPoint(now, numActiveTasks, numPendingTasks, numLateTasks, avgTaskLagMillis, numLostTasks, numRunningSlaves, numLostSlaves);
}
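The lag arithmetic above is easy to isolate. Below is a minimal standalone sketch (not Singularity code) that computes the same three values from a list of nextRunAt timestamps: the past-due count, the late count, and the average lag. The PendingTask record and the LATE_THRESHOLD_MILLIS constant are assumptions standing in for SingularityPendingTaskId and configuration.getDeltaAfterWhichTasksAreLateMillis().

import java.util.List;

public class TaskLagSketch {
  // Hypothetical stand-in for SingularityPendingTaskId; only nextRunAt matters here
  record PendingTask(long nextRunAt) {}

  // Assumed value for the "late" threshold
  static final long LATE_THRESHOLD_MILLIS = 30_000L;

  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    List<PendingTask> pending = List.of(
        new PendingTask(now - 5_000),    // 5s past due, not yet late
        new PendingTask(now - 60_000),   // 60s past due, late
        new PendingTask(now + 10_000));  // scheduled in the future, ignored

    int numPastDue = 0;
    int numLate = 0;
    long totalLagMillis = 0;
    for (PendingTask task : pending) {
      long lag = now - task.nextRunAt();
      if (lag > 0) {
        numPastDue++;
        totalLagMillis += lag;
        if (lag > LATE_THRESHOLD_MILLIS) {
          numLate++;
        }
      }
    }
    // Math.max(.., 1) avoids division by zero when nothing is past due
    long avgLagMillis = totalLagMillis / Math.max(numPastDue, 1);
    System.out.printf("pastDue=%d late=%d avgLagMillis=%d%n", numPastDue, numLate, avgLagMillis);
  }
}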
Use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.
From the class SingularityAgentReconciliationPoller, method checkInactiveAgents():
private void checkInactiveAgents() {
  final long start = System.currentTimeMillis();
  // filter dead and missing on startup agents for cleanup
  List<SingularityAgent> deadAgents = agentManager.getObjectsFiltered(MachineState.DEAD);
  LOG.debug("Found {} dead agents", deadAgents.size());
  List<SingularityAgent> missingOnStartupAgents = agentManager.getObjectsFiltered(MachineState.MISSING_ON_STARTUP);
  LOG.debug("Found {} agents missing on startup", missingOnStartupAgents.size());
  List<SingularityAgent> inactiveAgents = new ArrayList<>();
  inactiveAgents.addAll(deadAgents);
  inactiveAgents.addAll(missingOnStartupAgents);
  if (inactiveAgents.isEmpty()) {
    LOG.trace("No inactive agents");
    return;
  }
  int deleted = 0;
  final long maxDuration = TimeUnit.HOURS.toMillis(configuration.getDeleteDeadAgentsAfterHours());
  for (SingularityAgent inactiveAgent : inactiveAgents) {
    final long duration = System.currentTimeMillis() - inactiveAgent.getCurrentState().getTimestamp();
    if (duration > maxDuration) {
      SingularityDeleteResult result = agentManager.deleteObject(inactiveAgent.getId());
      // delete agent from inactive list too
      inactiveAgentManager.cleanInactiveAgent(inactiveAgent.getHost());
      deleted++;
      LOG.info("Removing inactive agent {} ({}) after {} (max {})", inactiveAgent.getId(), result, JavaUtils.durationFromMillis(duration), JavaUtils.durationFromMillis(maxDuration));
    }
  }
  LOG.debug("Checked {} inactive agents, deleted {} in {}", inactiveAgents.size(), deleted, JavaUtils.duration(start));
}
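The deletion criterion reduces to a simple age check: an agent is removed once it has been in a dead or missing state for longer than the configured retention window. A minimal sketch of that check, with a hypothetical RETENTION_HOURS constant standing in for configuration.getDeleteDeadAgentsAfterHours():

import java.util.concurrent.TimeUnit;

public class InactiveAgentExpiry {
  // Assumed retention window; in Singularity this comes from configuration
  static final int RETENTION_HOURS = 24;

  // Returns true when the agent's last state change is older than the retention window
  static boolean shouldDelete(long stateTimestampMillis, long nowMillis) {
    long maxDurationMillis = TimeUnit.HOURS.toMillis(RETENTION_HOURS);
    return (nowMillis - stateTimestampMillis) > maxDurationMillis;
  }

  public static void main(String[] args) {
    long now = System.currentTimeMillis();
    System.out.println(shouldDelete(now - TimeUnit.HOURS.toMillis(25), now)); // true
    System.out.println(shouldDelete(now - TimeUnit.HOURS.toMillis(2), now));  // false
  }
}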
Use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.
From the class SingularityAgentAndRackManager, method checkRackAfterAgentLoss():
private void checkRackAfterAgentLoss(SingularityAgent lostAgent) {
  List<SingularityAgent> agents = agentManager.getObjectsFiltered(MachineState.ACTIVE);
  int numInRack = 0;
  for (SingularityAgent agent : agents) {
    if (agent.getRackId().equals(lostAgent.getRackId())) {
      numInRack++;
    }
  }
  LOG.info("Found {} agents left in rack {}", numInRack, lostAgent.getRackId());
  if (numInRack == 0) {
    rackManager.changeState(lostAgent.getRackId(), MachineState.DEAD, Optional.empty(), Optional.empty());
  }
}
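The pattern here is counting the remaining active members of a group and flagging the group once the count hits zero. A minimal standalone sketch of the same idea, with a hypothetical Agent record in place of SingularityAgent:

import java.util.List;

public class RackCheckSketch {
  // Hypothetical stand-in carrying just the fields the check needs
  record Agent(String id, String rackId) {}

  // Returns true when no active agent remains in the lost agent's rack
  static boolean rackIsEmpty(List<Agent> activeAgents, String lostRackId) {
    long remaining = activeAgents.stream()
        .filter(agent -> agent.rackId().equals(lostRackId))
        .count();
    return remaining == 0;
  }

  public static void main(String[] args) {
    List<Agent> active = List.of(new Agent("a1", "rack-1"), new Agent("a2", "rack-2"));
    System.out.println(rackIsEmpty(active, "rack-3")); // true -> rack would be marked DEAD
    System.out.println(rackIsEmpty(active, "rack-1")); // false
  }
}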
Use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.
From the class SingularityAgentAndRackManager, method checkDecommissionedAgentsFromMaster():
public void checkDecommissionedAgentsFromMaster(MesosMasterStateObject state, boolean isStartup) {
  Map<String, SingularityAgent> agentsById = agentManager.getObjectsByIdForState(MachineState.DECOMMISSIONED);
  // Remove every decommissioned agent the mesos master still reports
  for (MesosMasterAgentObject agentJsonObject : state.getAgents()) {
    String agentId = agentJsonObject.getId();
    agentsById.remove(agentId);
  }
  // Whatever remains no longer has mesos resources; mark it dead (or missing on startup)
  for (SingularityAgent leftOverAgent : agentsById.values()) {
    MachineState newState = isStartup ? MachineState.MISSING_ON_STARTUP : MachineState.DEAD;
    LOG.info("Marking decommissioned agent without mesos resources as {}", newState);
    agentManager.changeState(leftOverAgent, newState, Optional.empty(), Optional.empty());
  }
}
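The method is essentially a set difference: start from all decommissioned agents known to Singularity, cross off every id the mesos master reports, and treat the remainder as gone. A minimal sketch of that pattern using plain collections (the names are illustrative, not Singularity APIs):

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

public class LeftoverAgentsSketch {
  public static void main(String[] args) {
    // Decommissioned agents tracked by the scheduler, keyed by agent id
    Map<String, String> decommissionedById = new HashMap<>();
    decommissionedById.put("agent-1", "host-1");
    decommissionedById.put("agent-2", "host-2");

    // Agent ids the master still reports as present
    Set<String> reportedByMaster = Set.of("agent-1");

    // Cross off everything the master still knows about
    reportedByMaster.forEach(decommissionedById::remove);

    // The remainder has no mesos resources and would be marked DEAD / MISSING_ON_STARTUP
    System.out.println(decommissionedById.keySet()); // [agent-2]
  }
}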
Use of com.hubspot.singularity.SingularityAgent in project Singularity by HubSpot.
From the class SingularityAgentAndRackManager, method doesOfferMatch():
AgentMatchState doesOfferMatch(SingularityOfferHolder offerHolder, SingularityTaskRequest taskRequest, List<SingularityTaskId> activeTaskIdsForRequest, boolean isPreemptibleTask, RequestUtilization requestUtilization) {
  final String host = offerHolder.getHostname();
  final String rackId = offerHolder.getRackId();
  final String agentId = offerHolder.getAgentId();
  Optional<SingularityAgent> maybeAgent = agentManager.getObject(agentId);
  if (!maybeAgent.isPresent()) {
    return AgentMatchState.RESOURCES_DO_NOT_MATCH;
  }
  // Reject offers from frozen or decommissioning agents and racks up front
  final MachineState currentState = maybeAgent.get().getCurrentState().getState();
  if (currentState == MachineState.FROZEN) {
    return AgentMatchState.AGENT_FROZEN;
  }
  if (currentState.isDecommissioning()) {
    return AgentMatchState.AGENT_DECOMMISSIONING;
  }
  final MachineState currentRackState = rackManager.getObject(rackId).get().getCurrentState().getState();
  if (currentRackState == MachineState.FROZEN) {
    return AgentMatchState.RACK_FROZEN;
  }
  if (currentRackState.isDecommissioning()) {
    return AgentMatchState.RACK_DECOMMISSIONING;
  }
  // Enforce rack affinity and agent attribute constraints
  if (!taskRequest.getRequest().getRackAffinity().orElse(Collections.emptyList()).isEmpty()) {
    if (!taskRequest.getRequest().getRackAffinity().get().contains(rackId)) {
      LOG.trace("Task {} requires a rack in {} (current rack {})", taskRequest.getPendingTask().getPendingTaskId(), taskRequest.getRequest().getRackAffinity().get(), rackId);
      return AgentMatchState.RACK_AFFINITY_NOT_MATCHING;
    }
  }
  if (!isAttributesMatch(offerHolder, taskRequest, isPreemptibleTask)) {
    return AgentMatchState.AGENT_ATTRIBUTES_DO_NOT_MATCH;
  } else if (!areAttributeMinimumsFeasible(offerHolder, taskRequest, activeTaskIdsForRequest)) {
    return AgentMatchState.AGENT_ATTRIBUTES_DO_NOT_MATCH;
  }
  final AgentPlacement agentPlacement = maybeOverrideAgentPlacement(taskRequest.getRequest().getAgentPlacement().orElse(configuration.getDefaultAgentPlacement()));
  if (!taskRequest.getRequest().isRackSensitive() && agentPlacement == AgentPlacement.GREEDY) {
    // todo: account for this or let this behavior continue?
    return AgentMatchState.NOT_RACK_OR_AGENT_PARTICULAR;
  }
  final int numDesiredInstances = taskRequest.getRequest().getInstancesSafe();
  boolean allowBounceToSameHost = isAllowBounceToSameHost(taskRequest.getRequest());
  int activeRacksWithCapacityCount = getActiveRacksWithCapacityCount();
  Multiset<String> countPerRack = HashMultiset.create(activeRacksWithCapacityCount);
  double numOnAgent = 0;
  double numCleaningOnAgent = 0;
  double numFromSameBounceOnAgent = 0;
  double numOtherDeploysOnAgent = 0;
  boolean taskLaunchedFromBounceWithActionId = taskRequest.getPendingTask().getPendingTaskId().getPendingType() == PendingType.BOUNCE && taskRequest.getPendingTask().getActionId().isPresent();
  final String sanitizedHost = offerHolder.getSanitizedHost();
  final String sanitizedRackId = offerHolder.getSanitizedRackId();
  Collection<SingularityTaskId> cleaningTasks = leaderCache.getCleanupTaskIds();
  // Count this request's tasks per rack and on this agent, tracking cleaning and bounce-related tasks separately
  for (SingularityTaskId taskId : activeTaskIdsForRequest) {
    if (!cleaningTasks.contains(taskId) && !taskManager.isKilledTask(taskId) && taskRequest.getDeploy().getId().equals(taskId.getDeployId())) {
      countPerRack.add(taskId.getSanitizedRackId());
    }
    if (!taskId.getSanitizedHost().equals(sanitizedHost)) {
      continue;
    }
    if (taskRequest.getDeploy().getId().equals(taskId.getDeployId())) {
      if (cleaningTasks.contains(taskId)) {
        numCleaningOnAgent++;
      } else {
        numOnAgent++;
      }
      if (taskLaunchedFromBounceWithActionId) {
        Optional<SingularityTask> maybeTask = taskManager.getTask(taskId);
        boolean errorInTaskData = false;
        if (maybeTask.isPresent()) {
          SingularityPendingTask pendingTask = maybeTask.get().getTaskRequest().getPendingTask();
          if (pendingTask.getPendingTaskId().getPendingType() == PendingType.BOUNCE) {
            if (pendingTask.getActionId().isPresent()) {
              if (pendingTask.getActionId().get().equals(taskRequest.getPendingTask().getActionId().get())) {
                numFromSameBounceOnAgent++;
              }
            } else {
              // No actionId present on bounce, fall back to more restrictive placement strategy
              errorInTaskData = true;
            }
          }
        } else {
          // Could not find appropriate task data, fall back to more restrictive placement strategy
          errorInTaskData = true;
        }
        if (errorInTaskData) {
          allowBounceToSameHost = false;
        }
      }
    } else {
      numOtherDeploysOnAgent++;
    }
  }
  if (overrides.isAllowRackSensitivity() && taskRequest.getRequest().isRackSensitive()) {
    final boolean isRackOk = isRackOk(countPerRack, sanitizedRackId, numDesiredInstances, taskRequest.getRequest().getId(), agentId, host, numCleaningOnAgent);
    if (!isRackOk) {
      return AgentMatchState.RACK_SATURATED;
    }
  }
  // Apply the placement strategy using the per-agent counts gathered above
  switch (agentPlacement) {
    case SEPARATE:
    case SEPARATE_BY_DEPLOY:
    case SPREAD_ALL_SLAVES:
    case SPREAD_ALL_AGENTS:
      if (allowBounceToSameHost && taskLaunchedFromBounceWithActionId) {
        if (numFromSameBounceOnAgent > 0) {
          LOG.trace("Rejecting SEPARATE task {} from agent {} ({}) due to numFromSameBounceOnAgent {}", taskRequest.getRequest().getId(), agentId, host, numFromSameBounceOnAgent);
          return AgentMatchState.AGENT_SATURATED;
        }
      } else {
        if (numOnAgent > 0 || numCleaningOnAgent > 0) {
          LOG.trace("Rejecting {} task {} from agent {} ({}) due to numOnAgent {} numCleaningOnAgent {}", agentPlacement.name(), taskRequest.getRequest().getId(), agentId, host, numOnAgent, numCleaningOnAgent);
          return AgentMatchState.AGENT_SATURATED;
        }
      }
      break;
    case SEPARATE_BY_REQUEST:
      if (numOnAgent > 0 || numCleaningOnAgent > 0 || numOtherDeploysOnAgent > 0) {
        LOG.trace("Rejecting SEPARATE_BY_REQUEST task {} from agent {} ({}) due to numOnAgent {} numCleaningOnAgent {} numOtherDeploysOnAgent {}", taskRequest.getRequest().getId(), agentId, host, numOnAgent, numCleaningOnAgent, numOtherDeploysOnAgent);
        return AgentMatchState.AGENT_SATURATED;
      }
      break;
    case OPTIMISTIC:
      // If no tasks are active for this request yet, we can fall back to greedy.
      if (activeTaskIdsForRequest.size() > 0) {
        Collection<SingularityPendingTaskId> pendingTasksForRequestClusterwide = leaderCache.getPendingTaskIdsForRequest(taskRequest.getRequest().getId());
        Set<String> currentHostsForRequest = activeTaskIdsForRequest.stream().map(SingularityTaskId::getSanitizedHost).collect(Collectors.toSet());
        final double numPerAgent = activeTaskIdsForRequest.size() / (double) currentHostsForRequest.size();
        final double leniencyCoefficient = configuration.getPlacementLeniency();
        final double threshold = numPerAgent * (1 + (pendingTasksForRequestClusterwide.size() * leniencyCoefficient));
        final boolean isOk = numOnAgent <= threshold;
        if (!isOk) {
          LOG.trace("Rejecting OPTIMISTIC task {} from agent {} ({}) because numOnAgent {} violates threshold {} (based on active tasks for request {}, current hosts for request {}, pending tasks for request {})", taskRequest.getRequest().getId(), agentId, host, numOnAgent, threshold, activeTaskIdsForRequest.size(), currentHostsForRequest.size(), pendingTasksForRequestClusterwide.size());
          return AgentMatchState.AGENT_SATURATED;
        }
      }
      break;
    case GREEDY:
  }
  if (isPreferred(offerHolder, taskRequest, requestUtilization)) {
    LOG.debug("Agent {} is preferred", offerHolder.getHostname());
    return AgentMatchState.PREFERRED_AGENT;
  }
  return AgentMatchState.OK;
}
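The OPTIMISTIC branch boils down to one formula: the agent is considered saturated once it runs more tasks for the request than the cluster-wide average per host, relaxed by a leniency term proportional to the number of pending tasks. A minimal sketch of that threshold check, where the leniency value 0.5 is an illustrative assumption for what configuration.getPlacementLeniency() would return:

public class OptimisticPlacementSketch {
  // Returns true when the candidate agent may take another task for this request
  static boolean isOk(int activeTasksForRequest, int distinctHostsForRequest,
                      int pendingTasksForRequest, double leniencyCoefficient,
                      double numOnThisAgent) {
    double numPerAgent = activeTasksForRequest / (double) distinctHostsForRequest;
    double threshold = numPerAgent * (1 + (pendingTasksForRequest * leniencyCoefficient));
    return numOnThisAgent <= threshold;
  }

  public static void main(String[] args) {
    // 6 active tasks on 3 hosts -> 2 per host on average; 1 pending task, leniency 0.5
    // threshold = 2 * (1 + 1 * 0.5) = 3, so an agent already running 3 is still OK...
    System.out.println(isOk(6, 3, 1, 0.5, 3)); // true
    // ...but an agent running 4 would be rejected as AGENT_SATURATED
    System.out.println(isOk(6, 3, 1, 0.5, 4)); // false
  }
}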