use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.
the class ZooKeeperMasterModel method getTaskStatuses.
/**
 * Collects the task status of every job listed for {@code host}.
 *
 * <p>A job is left out of the result, and reported at debug level, when its status lookup
 * returns {@code null} or throws a {@link HeliosRuntimeException}. One unreadable status
 * therefore does not fail the whole lookup.
 *
 * @param client ZooKeeper client handed through to the individual lookups
 * @param host   host whose jobs are inspected
 * @return a new map from job id to task status, holding only the jobs that had a status
 */
private Map<JobId, TaskStatus> getTaskStatuses(final ZooKeeperClient client, final String host) {
  final Map<JobId, TaskStatus> found = Maps.newHashMap();
  for (final JobId id : listHostJobs(client, host)) {
    TaskStatus current = null;
    try {
      current = getTaskStatus(client, host, id);
    } catch (HeliosRuntimeException ignored) {
      // Best-effort on purpose: treat a failed lookup like a missing status so the remaining
      // jobs can still be reported.
    }
    if (current == null) {
      log.debug("Task {} status missing for host {}", id, host);
      continue;
    }
    found.put(id, current);
  }
  return found;
}
use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.
the class ZooKeeperMasterModel method rollingUpdateAwaitRunning.
/**
 * Chooses the next rolling-update operation for {@code host} while waiting for the deployment
 * group's job to be in the {@code RUNNING} state there.
 *
 * <ul>
 *   <li>No task status yet: error if the deployment is gone or the rollout has timed out,
 *       otherwise yield.</li>
 *   <li>Status present but not {@code RUNNING}: timed-out error if the rollout has timed out,
 *       otherwise yield.</li>
 *   <li>{@code RUNNING}: error if the deployment is gone or belongs to a different deployment
 *       group, otherwise move on to the next task.</li>
 * </ul>
 *
 * @param client          ZooKeeper client used for the status and timeout lookups
 * @param opFactory       factory producing the returned operation
 * @param deploymentGroup deployment group being rolled out
 * @param host            host currently being updated
 * @return the operation the rolling update should perform next
 */
private RollingUpdateOp rollingUpdateAwaitRunning(final ZooKeeperClient client, final RollingUpdateOpFactory opFactory, final DeploymentGroup deploymentGroup, final String host) {
  final JobId jobId = deploymentGroup.getJobId();
  final TaskStatus taskStatus = getTaskStatus(client, host, jobId);

  if (taskStatus == null) {
    // The agent has not written a job status to zookeeper (yet).
    // If the job is not listed under the /config/hosts node, it may have been deployed
    // successfully and then manually undeployed. It will not get redeployed, so this is a failure.
    if (getDeployment(host, jobId) == null) {
      return opFactory.error("Job unexpectedly undeployed. Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
    }
    // Give up once the rollout operation has exceeded its timeout.
    if (isRolloutTimedOut(client, deploymentGroup)) {
      return opFactory.error("timed out while retrieving job status", host, RollingUpdateError.TIMED_OUT_RETRIEVING_JOB_STATUS);
    }
    // Nothing wrong so far; assume the agent will write the status soon.
    return opFactory.yield();
  }

  if (!taskStatus.getState().equals(TaskStatus.State.RUNNING)) {
    if (isRolloutTimedOut(client, deploymentGroup)) {
      // The configured deploy timeout has passed and the job is still not running.
      return rollingUpdateTimedoutError(opFactory, host, jobId, taskStatus);
    }
    return opFactory.yield();
  }

  // The job is running on the host. The last thing to verify is that it was deployed by this
  // deployment group; otherwise a conflict has occurred and the job could not be undeployed
  // on the next update.
  final Deployment current = getDeployment(host, jobId);
  if (current == null) {
    return opFactory.error("deployment for this job not found in zookeeper. " + "Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
  }
  if (!Objects.equals(current.getDeploymentGroupName(), deploymentGroup.getName())) {
    return opFactory.error("job was already deployed, either manually or by a different deployment group", host, RollingUpdateError.JOB_ALREADY_DEPLOYED);
  }
  return opFactory.nextTask();
}
use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.
the class ZooKeeperMasterModel method getJobHistory.
/**
 * Returns the task status events stored in the history of {@code jobId}, sorted with
 * {@code EVENT_COMPARATOR}.
 *
 * <p>When {@code host} is null or empty, the events of every host listed under the job's
 * history node are returned; otherwise only those of the given host. Hosts or events whose
 * node is missing are skipped.
 *
 * @param jobId job whose history is read
 * @param host  host to restrict the history to, or null/empty for all hosts
 * @return a sorted list of the events that were read; empty if the job has no history hosts node
 * @throws JobDoesNotExistException if no job is found for {@code jobId}
 */
@Override
public List<TaskStatusEvent> getJobHistory(final JobId jobId, final String host) throws JobDoesNotExistException {
  if (getJob(jobId) == null) {
    throw new JobDoesNotExistException(jobId);
  }

  final ZooKeeperClient client = provider.get("getJobHistory");

  // Resolve which hosts to read: the requested one, or all hosts that have history for the job.
  final List<String> hosts;
  try {
    if (isNullOrEmpty(host)) {
      hosts = client.getChildren(Paths.historyJobHosts(jobId));
    } else {
      hosts = singletonList(host);
    }
  } catch (NoNodeException e) {
    return emptyList();
  } catch (KeeperException e) {
    throw Throwables.propagate(e);
  }

  final List<TaskStatusEvent> collected = Lists.newArrayList();
  for (final String historyHost : hosts) {
    final List<String> eventNames;
    try {
      eventNames = client.getChildren(Paths.historyJobHostEvents(jobId, historyHost));
    } catch (NoNodeException e) {
      // No history recorded for this host; move on to the next one.
      continue;
    } catch (KeeperException e) {
      throw Throwables.propagate(e);
    }

    for (final String eventName : eventNames) {
      // The event node name is parsed as the event's timestamp.
      final long timestamp = Long.parseLong(eventName);
      try {
        final byte[] payload = client.getData(Paths.historyJobHostEventsTimestamp(jobId, historyHost, timestamp));
        final TaskStatus status = Json.read(payload, TaskStatus.class);
        collected.add(new TaskStatusEvent(status, timestamp, historyHost));
      } catch (NoNodeException e) {
        // ignore, it went away before we read it
      } catch (KeeperException | IOException e) {
        throw Throwables.propagate(e);
      }
    }
  }

  return Ordering.from(EVENT_COMPARATOR).sortedCopy(collected);
}
use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.
the class HeliosSoloDeployment method undeployLeftoverJobs.
/**
 * Undeploy jobs left over by {@link TemporaryJobs}. TemporaryJobs should clean these up,
 * but sometimes a few are left behind for whatever reason.
 *
 * <p>For every job reported in a host's status whose goal is not {@code UNDEPLOY}, an undeploy
 * is requested; the method then waits for each reported job to be undeployed. This is
 * best-effort: any exception is logged and not rethrown. If the exception is an
 * {@link InterruptedException}, the thread's interrupt flag is restored first so that callers
 * can still observe the interruption.
 */
@VisibleForTesting
protected void undeployLeftoverJobs() {
  try {
    // See if there are jobs running on any helios agent. If we are using TemporaryJobs,
    // that class should've undeployed them at this point.
    // Any jobs still running at this point have only been partially cleaned up.
    // We look for jobs via hostStatus() because the job may have been deleted from the master,
    // but the agent may still not have had enough time to undeploy the job from itself.
    final List<String> hosts = heliosClient.listHosts().get();
    for (final String host : hosts) {
      final HostStatus hostStatus = heliosClient.hostStatus(host).get();
      final Map<JobId, TaskStatus> statuses = hostStatus.getStatuses();
      for (final Map.Entry<JobId, TaskStatus> status : statuses.entrySet()) {
        final JobId jobId = status.getKey();
        final Goal goal = status.getValue().getGoal();
        if (goal != Goal.UNDEPLOY) {
          log.info("Job {} is still set to {} on host {}. Undeploying it now.", jobId, goal, host);
          final JobUndeployResponse undeployResponse = heliosClient.undeploy(jobId, host).get();
          log.info("Undeploy response for job {} is {}.", jobId, undeployResponse.getStatus());
          if (undeployResponse.getStatus() != JobUndeployResponse.Status.OK) {
            log.warn("Undeploy response for job {} was not OK. This could mean that something " + "beat the helios-solo master in telling the helios-solo agent to " + "undeploy.", jobId);
          }
        }
        // Wait even when the goal was already UNDEPLOY: the job is still listed in the host
        // status at this point.
        log.info("Waiting for job {} to actually be undeployed...", jobId);
        awaitJobUndeployed(heliosClient, host, jobId, jobUndeployWaitSeconds, TimeUnit.SECONDS);
        log.info("Job {} successfully undeployed.", jobId);
      }
    }
  } catch (Exception e) {
    if (e instanceof InterruptedException) {
      // Catching the interruption clears the thread's interrupt status; set it again so the
      // caller is not left unaware that it was asked to stop.
      Thread.currentThread().interrupt();
    }
    log.warn("Exception occurred when trying to clean up leftover jobs.", e);
  }
}
use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.
the class HeliosSoloLogService method runOneIteration.
/**
 * Scans every host of the solo deployment and starts a {@code LogFollowJob} for each container
 * that is not already tracked in {@code logFutures}.
 *
 * <p>Hosts without a status and tasks without a container id are skipped. Exceptions raised
 * during the scan are not rethrown; they are logged unless their root cause is a
 * {@link TimeoutException}.
 */
@Override
protected void runOneIteration() throws Exception {
  try {
    // Walk all the jobs running on the solo deployment, host by host.
    for (final String host : get(heliosClient.listHosts())) {
      final HostStatus hostStatus = get(heliosClient.hostStatus(host));
      if (hostStatus == null) {
        continue;
      }

      final Map<JobId, TaskStatus> statuses = hostStatus.getStatuses();
      for (final TaskStatus taskStatus : statuses.values()) {
        final JobId jobId = taskStatus.getJob().getId();
        final String containerId = taskStatus.getContainerId();

        // Nothing to do without a container, or when its output is already being followed.
        if (isNullOrEmpty(containerId) || logFutures.containsKey(containerId)) {
          continue;
        }

        // Attach to the stdout/stderr of the newly seen container and remember the task.
        final Future<?> follower = this.executor().submit(new LogFollowJob(containerId, jobId));
        logFutures.put(containerId, follower);
      }
    }
  } catch (Exception e) {
    // A TimeoutException is to be expected sometimes, so it is not worth a warning.
    final boolean timedOut = Throwables.getRootCause(e) instanceof TimeoutException;
    if (!timedOut) {
      log.warn("Caught exception, will ignore", e);
    }
  }
}
Aggregations