Search in sources :

Example 26 with TaskStatus

use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.

the class ZooKeeperMasterModel method getTaskStatuses.

/**
 * Collects the task status of every job listed for {@code host}.
 *
 * <p>A job is left out of the result (and reported at debug level) when its status lookup
 * throws {@link HeliosRuntimeException} or yields {@code null}.
 *
 * @param client ZooKeeper client handed to the job listing and status lookups
 * @param host host whose jobs are inspected
 * @return a new map from job id to task status, containing only jobs with an available status
 */
private Map<JobId, TaskStatus> getTaskStatuses(final ZooKeeperClient client, final String host) {
    final Map<JobId, TaskStatus> result = Maps.newHashMap();
    for (final JobId id : listHostJobs(client, host)) {
        TaskStatus taskStatus = null;
        try {
            taskStatus = getTaskStatus(client, host, id);
        } catch (HeliosRuntimeException ignored) {
            // Deliberately ignored: one unreadable status must not prevent returning the
            // statuses that are available. The job is handled as "status missing" below.
        }
        if (taskStatus == null) {
            log.debug("Task {} status missing for host {}", id, host);
            continue;
        }
        result.put(id, taskStatus);
    }
    return result;
}
Also used : HeliosRuntimeException(com.spotify.helios.common.HeliosRuntimeException) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) JobId(com.spotify.helios.common.descriptors.JobId)

Example 27 with TaskStatus

use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.

the class ZooKeeperMasterModel method rollingUpdateAwaitRunning.

/**
 * Decides the next rolling-update operation while waiting for the deployment group's job to be
 * running on {@code host}.
 *
 * <ul>
 *   <li>No task status yet: error if the deployment is gone or the rollout timed out,
 *       otherwise yield.</li>
 *   <li>Status present but not {@code RUNNING}: timed-out error if the rollout timed out,
 *       otherwise yield.</li>
 *   <li>Job {@code RUNNING}: error if the deployment is missing or belongs to another
 *       deployment group, otherwise proceed to the next task.</li>
 * </ul>
 *
 * @param client ZooKeeper client used for status and timeout lookups
 * @param opFactory factory producing the resulting operation
 * @param deploymentGroup deployment group being rolled out
 * @param host host being checked
 * @return the operation describing how the rolling update should continue
 */
private RollingUpdateOp rollingUpdateAwaitRunning(final ZooKeeperClient client, final RollingUpdateOpFactory opFactory, final DeploymentGroup deploymentGroup, final String host) {
    final JobId jobId = deploymentGroup.getJobId();
    final TaskStatus taskStatus = getTaskStatus(client, host, jobId);

    if (taskStatus == null) {
        // The agent has not written a job status to zookeeper (yet).
        // A job missing from the /config/hosts node may have been deployed successfully and
        // then manually undeployed. It will not be redeployed, so this counts as a failure.
        if (getDeployment(host, jobId) == null) {
            return opFactory.error("Job unexpectedly undeployed. Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
        }
        // Give up once the rollout operation has exceeded its timeout.
        if (isRolloutTimedOut(client, deploymentGroup)) {
            return opFactory.error("timed out while retrieving job status", host, RollingUpdateError.TIMED_OUT_RETRIEVING_JOB_STATUS);
        }
        // Nothing looks wrong so far; assume the agent will write the status soon.
        return opFactory.yield();
    }

    final boolean running = taskStatus.getState().equals(TaskStatus.State.RUNNING);
    if (!running) {
        if (isRolloutTimedOut(client, deploymentGroup)) {
            // The configured deploy timeout has passed and the job is still not running.
            return rollingUpdateTimedoutError(opFactory, host, jobId, taskStatus);
        }
        return opFactory.yield();
    }

    // The job is running on the host. Finally make sure it was deployed by this deployment
    // group; otherwise a conflict has occurred and the job could not be undeployed on the
    // next update.
    final Deployment current = getDeployment(host, jobId);
    if (current == null) {
        return opFactory.error("deployment for this job not found in zookeeper. " + "Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
    }
    if (!Objects.equals(current.getDeploymentGroupName(), deploymentGroup.getName())) {
        return opFactory.error("job was already deployed, either manually or by a different deployment group", host, RollingUpdateError.JOB_ALREADY_DEPLOYED);
    }
    return opFactory.nextTask();
}
Also used : Deployment(com.spotify.helios.common.descriptors.Deployment) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) JobId(com.spotify.helios.common.descriptors.JobId)

Example 28 with TaskStatus

use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.

the class ZooKeeperMasterModel method getJobHistory.

/**
 * Returns the recorded task status events of a job, ordered by {@code EVENT_COMPARATOR}.
 *
 * <p>When {@code host} is null or empty, events are gathered from every host listed under the
 * job's history node; otherwise only from the given host. Hosts or events whose nodes do not
 * exist (or vanish while reading) are skipped.
 *
 * @param jobId id of the job whose history is requested
 * @param host host to restrict the history to, or null/empty for all hosts
 * @return the sorted events; an empty list when the job's history hosts node does not exist
 * @throws JobDoesNotExistException if no job with {@code jobId} is found
 */
@Override
public List<TaskStatusEvent> getJobHistory(final JobId jobId, final String host) throws JobDoesNotExistException {
    if (getJob(jobId) == null) {
        throw new JobDoesNotExistException(jobId);
    }
    final ZooKeeperClient client = provider.get("getJobHistory");

    final List<String> hosts;
    try {
        if (isNullOrEmpty(host)) {
            hosts = client.getChildren(Paths.historyJobHosts(jobId));
        } else {
            hosts = singletonList(host);
        }
    } catch (NoNodeException e) {
        return emptyList();
    } catch (KeeperException e) {
        throw Throwables.propagate(e);
    }

    final List<TaskStatusEvent> collected = Lists.newArrayList();
    for (final String historyHost : hosts) {
        final List<String> eventNames;
        try {
            eventNames = client.getChildren(Paths.historyJobHostEvents(jobId, historyHost));
        } catch (NoNodeException e) {
            // No history recorded for this host; move on to the next one.
            continue;
        } catch (KeeperException e) {
            throw Throwables.propagate(e);
        }

        for (final String eventName : eventNames) {
            try {
                // The child node name is the event timestamp.
                final long timestamp = Long.parseLong(eventName);
                final String eventPath = Paths.historyJobHostEventsTimestamp(jobId, historyHost, timestamp);
                final TaskStatus status = Json.read(client.getData(eventPath), TaskStatus.class);
                collected.add(new TaskStatusEvent(status, timestamp, historyHost));
            } catch (NoNodeException ignored) {
                // ignore, it went away before we read it
            } catch (KeeperException | IOException e) {
                throw Throwables.propagate(e);
            }
        }
    }
    return Ordering.from(EVENT_COMPARATOR).sortedCopy(collected);
}
Also used : TaskStatusEvent(com.spotify.helios.common.descriptors.TaskStatusEvent) NoNodeException(org.apache.zookeeper.KeeperException.NoNodeException) IOException(java.io.IOException) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) ZooKeeperClient(com.spotify.helios.servicescommon.coordination.ZooKeeperClient) Job(com.spotify.helios.common.descriptors.Job) KeeperException(org.apache.zookeeper.KeeperException)

Example 29 with TaskStatus

use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.

the class HeliosSoloDeployment method undeployLeftoverJobs.

/**
 * Undeploy jobs left over by {@link TemporaryJobs}. TemporaryJobs should clean these up,
 * but sometimes a few are left behind for whatever reason.
 *
 * <p>This is best-effort cleanup: any failure is logged as a warning and not rethrown. If the
 * calling thread is interrupted while blocked here, its interrupt flag is restored before
 * returning so that callers can still observe the interruption.
 */
@VisibleForTesting
protected void undeployLeftoverJobs() {
    try {
        // See if there are jobs running on any helios agent. If we are using TemporaryJobs,
        // that class should've undeployed them at this point.
        // Any jobs still running at this point have only been partially cleaned up.
        // We look for jobs via hostStatus() because the job may have been deleted from the master,
        // but the agent may still not have had enough time to undeploy the job from itself.
        final List<String> hosts = heliosClient.listHosts().get();
        for (final String host : hosts) {
            final HostStatus hostStatus = heliosClient.hostStatus(host).get();
            final Map<JobId, TaskStatus> statuses = hostStatus.getStatuses();
            for (final Map.Entry<JobId, TaskStatus> status : statuses.entrySet()) {
                final JobId jobId = status.getKey();
                final Goal goal = status.getValue().getGoal();
                if (goal != Goal.UNDEPLOY) {
                    log.info("Job {} is still set to {} on host {}. Undeploying it now.", jobId, goal, host);
                    final JobUndeployResponse undeployResponse = heliosClient.undeploy(jobId, host).get();
                    log.info("Undeploy response for job {} is {}.", jobId, undeployResponse.getStatus());
                    if (undeployResponse.getStatus() != JobUndeployResponse.Status.OK) {
                        log.warn("Undeploy response for job {} was not OK. This could mean that something " + "beat the helios-solo master in telling the helios-solo agent to " + "undeploy.", jobId);
                    }
                }
                // Wait even when the goal was already UNDEPLOY: the agent may not have finished yet.
                log.info("Waiting for job {} to actually be undeployed...", jobId);
                awaitJobUndeployed(heliosClient, host, jobId, jobUndeployWaitSeconds, TimeUnit.SECONDS);
                log.info("Job {} successfully undeployed.", jobId);
            }
        }
    } catch (InterruptedException e) {
        // Catching InterruptedException clears the thread's interrupt status; restore it so the
        // interruption is not silently lost to code further up the stack.
        Thread.currentThread().interrupt();
        log.warn("Exception occurred when trying to clean up leftover jobs.", e);
    } catch (Exception e) {
        log.warn("Exception occurred when trying to clean up leftover jobs.", e);
    }
}
Also used : Goal(com.spotify.helios.common.descriptors.Goal) JobUndeployResponse(com.spotify.helios.common.protocol.JobUndeployResponse) HostStatus(com.spotify.helios.common.descriptors.HostStatus) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) JobId(com.spotify.helios.common.descriptors.JobId) DockerCertificateException(com.spotify.docker.client.exceptions.DockerCertificateException) DockerException(com.spotify.docker.client.exceptions.DockerException) ImageNotFoundException(com.spotify.docker.client.exceptions.ImageNotFoundException) UnknownHostException(java.net.UnknownHostException) ExecutionException(java.util.concurrent.ExecutionException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 30 with TaskStatus

use of com.spotify.helios.common.descriptors.TaskStatus in project helios by spotify.

the class HeliosSoloLogService method runOneIteration.

/**
 * Runs one polling pass: for every host reported by the helios client, starts a
 * {@code LogFollowJob} for each container id that is not yet present in {@code logFutures}.
 *
 * <p>Hosts without a status and tasks without a container id are skipped. Exceptions are not
 * propagated; they are logged as warnings unless their root cause is a
 * {@link TimeoutException}.
 */
@Override
protected void runOneIteration() throws Exception {
    try {
        // fetch all the jobs running on the solo deployment
        for (final String host : get(heliosClient.listHosts())) {
            final HostStatus hostStatus = get(heliosClient.hostStatus(host));
            if (hostStatus == null) {
                continue;
            }
            for (final TaskStatus taskStatus : hostStatus.getStatuses().values()) {
                final JobId jobId = taskStatus.getJob().getId();
                final String containerId = taskStatus.getContainerId();
                // Skip tasks with no container yet, and containers that are already followed.
                if (isNullOrEmpty(containerId) || logFutures.containsKey(containerId)) {
                    continue;
                }
                // for any containers we're not already tracking, attach to their stdout/stderr
                final Future<?> follower = this.executor().submit(new LogFollowJob(containerId, jobId));
                logFutures.put(containerId, follower);
            }
        }
    } catch (Exception e) {
        // Ignore TimeoutException as that is to be expected sometimes
        final boolean timedOut = Throwables.getRootCause(e) instanceof TimeoutException;
        if (!timedOut) {
            log.warn("Caught exception, will ignore", e);
        }
    }
}
Also used : HostStatus(com.spotify.helios.common.descriptors.HostStatus) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) JobId(com.spotify.helios.common.descriptors.JobId) TimeoutException(java.util.concurrent.TimeoutException) DockerException(com.spotify.docker.client.exceptions.DockerException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) ConnectionClosedException(org.apache.http.ConnectionClosedException) TimeoutException(java.util.concurrent.TimeoutException)

Aggregations

TaskStatus (com.spotify.helios.common.descriptors.TaskStatus)53 JobId (com.spotify.helios.common.descriptors.JobId)40 Test (org.junit.Test)27 DockerClient (com.spotify.docker.client.DockerClient)18 Deployment (com.spotify.helios.common.descriptors.Deployment)15 Job (com.spotify.helios.common.descriptors.Job)14 HeliosClient (com.spotify.helios.client.HeliosClient)13 LogStream (com.spotify.docker.client.LogStream)10 HostStatus (com.spotify.helios.common.descriptors.HostStatus)10 JobStatus (com.spotify.helios.common.descriptors.JobStatus)8 JobDeployResponse (com.spotify.helios.common.protocol.JobDeployResponse)8 Matchers.containsString (org.hamcrest.Matchers.containsString)8 CreateJobResponse (com.spotify.helios.common.protocol.CreateJobResponse)7 PortMapping (com.spotify.helios.common.descriptors.PortMapping)6 Map (java.util.Map)6 ImmutableMap (com.google.common.collect.ImmutableMap)4 TaskStatusEvent (com.spotify.helios.common.descriptors.TaskStatusEvent)4 JobUndeployResponse (com.spotify.helios.common.protocol.JobUndeployResponse)4 IOException (java.io.IOException)4 ExecutionException (java.util.concurrent.ExecutionException)4