Search in sources:

Example 36 with JobId

use of com.spotify.helios.common.descriptors.JobId in project helios by spotify.

the class ZooKeeperMasterModel method getTaskStatuses.

private Map<JobId, TaskStatus> getTaskStatuses(final ZooKeeperClient client, final String host) {
    final Map<JobId, TaskStatus> statuses = Maps.newHashMap();
    final List<JobId> jobIds = listHostJobs(client, host);
    for (final JobId jobId : jobIds) {
        TaskStatus status;
        try {
            status = getTaskStatus(client, host, jobId);
        } catch (HeliosRuntimeException e) {
            // Skip this task status so we can return other available information instead of failing the
            // entire thing.
            status = null;
        if (status != null) {
            statuses.put(jobId, status);
        } else {
            log.debug("Task {} status missing for host {}", jobId, host);
    return statuses;
Also used : HeliosRuntimeException(com.spotify.helios.common.HeliosRuntimeException) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) JobId(com.spotify.helios.common.descriptors.JobId)

Example 37 with JobId

use of com.spotify.helios.common.descriptors.JobId in project helios by spotify.

the class TaskHistoryWriter method add.

/**
 * Handles one task status event, keyed by the id of the job the event's status belongs to.
 *
 * <p>NOTE(review): this listing appears truncated. The bodies of both {@code while} loops, the
 * remainder of the {@code synchronized} block, the body of the {@code try} and every closing
 * brace are missing, so the actual eviction and persistence statements are not visible here.
 * Confirm against the upstream TaskHistoryWriter source before relying on these comments.
 *
 * @param item the event to handle; its status, job and job id are dereferenced without null checks
 * @throws InterruptedException declared by the signature; the call that can throw it is not
 *                              visible in this listing
 */
private void add(TaskStatusEvent item) throws InterruptedException {
    // If too many "globally", toss them
    // (loops for as long as the shared counter is at or above MAX_TOTAL_SIZE; body not shown)
    while (count.get() >= MAX_TOTAL_SIZE) {
    // Events are grouped per job: the job id is the key used to look up the deque.
    final JobId key = item.getStatus().getJob().getId();
    final Deque<TaskStatusEvent> deque = getDeque(key);
    // The per-job deque itself is used as the monitor for the size check below.
    synchronized (deque) {
        // if too many in the particular deque, toss them
        // (loops while the deque holds MAX_QUEUE_SIZE or more events; body not shown)
        while (deque.size() >= MAX_QUEUE_SIZE) {
    // NOTE(review): the try body is missing; judging by the log messages below it presumably
    // writes to the backing store -- confirm.
    try {
    } catch (ClosedByInterruptException e) {
        // Interruption during the write is only logged at debug level and not rethrown.
        log.debug("Writing task status event to backing store was interrupted");
    } catch (IOException e) {
        // We are best effort after all...
        log.warn("Failed to write task status event to backing store", e);
Also used : ClosedByInterruptException(java.nio.channels.ClosedByInterruptException) TaskStatusEvent(com.spotify.helios.common.descriptors.TaskStatusEvent) IOException(java.io.IOException) JobId(com.spotify.helios.common.descriptors.JobId)

Example 38 with JobId

use of com.spotify.helios.common.descriptors.JobId in project helios by spotify.

the class ZooKeeperRegistrarUtil method deregisterHost.

/**
 * Builds the ZooKeeper delete operations needed to remove a host: its deployed jobs, per-job
 * history, host status, port allocations, host id and host config root.
 *
 * <p>NOTE(review): this listing appears truncated. The first line has lost the start of a
 * statement before the string literal (presumably a logging call such as {@code log.info(} --
 * confirm), several loop and {@code if} bodies are empty, every closing brace is missing, and
 * the statement that submits {@code operations} is not visible. Confirm against the upstream
 * ZooKeeperRegistrarUtil source.
 *
 * @param client the ZooKeeper client used for the existence checks and listings
 * @param host   the name of the host to deregister
 * @throws HostNotFoundException   if the host's config node does not exist, or a
 *                                 {@code NoNodeException} is raised while working
 * @throws HostStillInUseException declared by the signature; no throw site is visible here
 */
public static void deregisterHost(final ZooKeeperClient client, final String host) throws HostNotFoundException, HostStillInUseException {"deregistering host: {}", host);
    // TODO (dano): handle retry failures
    try {
        // Deletions are accumulated in this list rather than applied one by one.
        final List<ZooKeeperOperation> operations = Lists.newArrayList();
        // Fail fast when the host has no config node at all.
        if (client.exists(Paths.configHost(host)) == null) {
            throw new HostNotFoundException("host [" + host + "] does not exist");
        // Remove all jobs deployed to this host
        final List<String> jobs = safeGetChildren(client, Paths.configHostJobs(host));
        for (final String jobString : jobs) {
            final JobId job = JobId.fromString(jobString);
            final String hostJobPath = Paths.configHostJob(host, job);
            final List<String> nodes = safeListRecursive(client, hostJobPath);
            // Walks the listing in reverse order -- presumably so children are deleted before
            // their parents; loop body not shown, confirm.
            for (final String node : reverse(nodes)) {
            // Only queue the job->host back-reference for deletion if it actually exists.
            if (client.exists(Paths.configJobHost(job, host)) != null) {
                operations.add(delete(Paths.configJobHost(job, host)));
            // Clean out the history for each job
            final List<String> history = safeListRecursive(client, Paths.historyJobHost(job, host));
            for (final String s : reverse(history)) {
        // Remove the host status
        final List<String> nodes = safeListRecursive(client, Paths.statusHost(host));
        for (final String node : reverse(nodes)) {
        // Remove port allocations
        final List<String> ports = safeGetChildren(client, Paths.configHostPorts(host));
        for (final String port : ports) {
            // Child names are parsed as integers to rebuild each port node's path.
            operations.add(delete(Paths.configHostPort(host, Integer.valueOf(port))));
        // Remove host id
        final String idPath = Paths.configHostId(host);
        if (client.exists(idPath) != null) {
        // Remove host config root
    } catch (NoNodeException e) {
        // A node vanishing underneath us is reported as the host not being found.
        throw new HostNotFoundException(host);
    } catch (KeeperException e) {
        // Any other ZooKeeper failure is rethrown unchecked with the cause preserved.
        throw new HeliosRuntimeException(e);
Also used : NoNodeException(org.apache.zookeeper.KeeperException.NoNodeException) ZooKeeperOperation(com.spotify.helios.servicescommon.coordination.ZooKeeperOperation) HostNotFoundException(com.spotify.helios.master.HostNotFoundException) HeliosRuntimeException(com.spotify.helios.common.HeliosRuntimeException) JobId(com.spotify.helios.common.descriptors.JobId) KeeperException(org.apache.zookeeper.KeeperException)

Example 39 with JobId

use of com.spotify.helios.common.descriptors.JobId in project helios by spotify.

the class ZooKeeperMasterModel method rollingUpdateAwaitRunning.

private RollingUpdateOp rollingUpdateAwaitRunning(final ZooKeeperClient client, final RollingUpdateOpFactory opFactory, final DeploymentGroup deploymentGroup, final String host) {
    final TaskStatus taskStatus = getTaskStatus(client, host, deploymentGroup.getJobId());
    final JobId jobId = deploymentGroup.getJobId();
    if (taskStatus == null) {
        // Handle cases where agent has not written job status to zookeeper.
        // If job is not listed under /config/hosts node, it may have been deployed successfully and
        // then manually undeployed. The job will not get redeployed, so treat this as a failure.
        final Deployment deployment = getDeployment(host, jobId);
        if (deployment == null) {
            return opFactory.error("Job unexpectedly undeployed. Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
        // Check if we've exceeded the timeout for the rollout operation.
        if (isRolloutTimedOut(client, deploymentGroup)) {
            return opFactory.error("timed out while retrieving job status", host, RollingUpdateError.TIMED_OUT_RETRIEVING_JOB_STATUS);
        // We haven't detected any errors, so assume the agent will write the status soon.
        return opFactory.yield();
    } else if (!taskStatus.getState().equals(TaskStatus.State.RUNNING)) {
        if (isRolloutTimedOut(client, deploymentGroup)) {
            // We exceeded the configured deploy timeout, and this job is still not running
            return rollingUpdateTimedoutError(opFactory, host, jobId, taskStatus);
        return opFactory.yield();
    } else {
        // the job is running on the host. last thing we have to ensure is that it was
        // deployed by this deployment group. otherwise some weird conflict has occurred and we
        // won't be able to undeploy the job on the next update.
        final Deployment deployment = getDeployment(host, deploymentGroup.getJobId());
        if (deployment == null) {
            return opFactory.error("deployment for this job not found in zookeeper. " + "Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
        } else if (!Objects.equals(deployment.getDeploymentGroupName(), deploymentGroup.getName())) {
            return opFactory.error("job was already deployed, either manually or by a different deployment group", host, RollingUpdateError.JOB_ALREADY_DEPLOYED);
        return opFactory.nextTask();
Also used : Deployment(com.spotify.helios.common.descriptors.Deployment) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) JobId(com.spotify.helios.common.descriptors.JobId)

Example 40 with JobId

use of com.spotify.helios.common.descriptors.JobId in project helios by spotify.

the class ZooKeeperMasterModel method getDeployOperations.

/**
 * Builds the list of ZooKeeper operations that deploy {@code deployment}'s job onto
 * {@code host}.
 *
 * <p>The list starts with a check on the job's config path, creation of one node per static
 * port, and creation of the job->host node; the task node creation is appended once the port
 * conflict checks have passed.
 *
 * <p>NOTE(review): this listing appears truncated. Closing braces are missing, the {@code try}
 * body shows only the {@code throw} (the read that can raise {@code NoNodeException} is not
 * visible), and the comment inside the {@code NoNodeException} handler starts mid-sentence.
 * {@code taskCreationPath} is computed but not used in the visible lines. Confirm against the
 * upstream ZooKeeperMasterModel source.
 *
 * @param client     the ZooKeeper client used for the existence and conflict checks
 * @param host       the host to deploy to
 * @param deployment the deployment describing the job, goal and deployer details
 * @param token      the token verified against the job
 * @return an immutable copy of the accumulated operations
 * @throws JobDoesNotExistException           if no job is found for the deployment's job id
 * @throws JobAlreadyDeployedException        if the task node is found to exist already
 * @throws TokenVerificationException         declared; presumably raised by {@code verifyToken}
 * @throws HostNotFoundException              declared; presumably raised by {@code assertHostExists}
 * @throws JobPortAllocationConflictException declared; presumably raised by
 *                                            {@code checkForPortConflicts}
 */
private List<ZooKeeperOperation> getDeployOperations(final ZooKeeperClient client, final String host, final Deployment deployment, final String token) throws JobDoesNotExistException, JobAlreadyDeployedException, TokenVerificationException, HostNotFoundException, JobPortAllocationConflictException {
    assertHostExists(client, host);
    final JobId id = deployment.getJobId();
    final Job job = getJob(id);
    if (job == null) {
        throw new JobDoesNotExistException(id);
    verifyToken(token, job);
    // A fresh random id is generated for this deploy attempt and used in the creation path.
    final UUID operationId = UUID.randomUUID();
    final String jobPath = Paths.configJob(id);
    final String taskPath = Paths.configHostJob(host, id);
    final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId);
    final List<Integer> staticPorts = staticPorts(job);
    // Each static port gets a node under the host whose payload is the job id as JSON bytes.
    final Map<String, byte[]> portNodes = Maps.newHashMap();
    final byte[] idJson = id.toJsonBytes();
    for (final int port : staticPorts) {
        final String path = Paths.configHostPort(host, port);
        portNodes.put(path, idJson);
    final Task task = new Task(job, deployment.getGoal(), deployment.getDeployerUser(), deployment.getDeployerMaster(), deployment.getDeploymentGroupName());
    final List<ZooKeeperOperation> operations = Lists.newArrayList(check(jobPath), create(portNodes), create(Paths.configJobHost(id, host)));
    // Attempt to read a task here.
    try {
        // if we get here the node exists already
        throw new JobAlreadyDeployedException(host, id);
    } catch (NoNodeException e) {
        // if the real reason of the failure is that the job is already deployed.
        // The task node does not exist yet: check each static port for conflicts, then queue
        // creation of the task node.
        for (final int port : staticPorts) {
            checkForPortConflicts(client, host, port, id);
        operations.add(create(taskPath, task));
    } catch (KeeperException e) {
        // Any other ZooKeeper failure is rethrown unchecked with the cause preserved.
        throw new HeliosRuntimeException("reading existing task description failed", e);
    return ImmutableList.copyOf(operations);
Also used : Task(com.spotify.helios.common.descriptors.Task) RolloutTask(com.spotify.helios.common.descriptors.RolloutTask) NoNodeException(org.apache.zookeeper.KeeperException.NoNodeException) ZooKeeperOperation(com.spotify.helios.servicescommon.coordination.ZooKeeperOperation) HeliosRuntimeException(com.spotify.helios.common.HeliosRuntimeException) Job(com.spotify.helios.common.descriptors.Job) UUID(java.util.UUID) JobId(com.spotify.helios.common.descriptors.JobId) KeeperException(org.apache.zookeeper.KeeperException)


JobId (com.spotify.helios.common.descriptors.JobId)115 Test (org.junit.Test)68 TaskStatus (com.spotify.helios.common.descriptors.TaskStatus)41 Job (com.spotify.helios.common.descriptors.Job)37 HeliosClient (com.spotify.helios.client.HeliosClient)35 Deployment (com.spotify.helios.common.descriptors.Deployment)29 Matchers.containsString (org.hamcrest.Matchers.containsString)25 DockerClient (com.spotify.docker.client.DockerClient)19 JobStatus (com.spotify.helios.common.descriptors.JobStatus)19 JobDeployResponse (com.spotify.helios.common.protocol.JobDeployResponse)16 CreateJobResponse (com.spotify.helios.common.protocol.CreateJobResponse)13 IOException (java.io.IOException) HostStatus (com.spotify.helios.common.descriptors.HostStatus)11 Map (java.util.Map)11 LogStream (com.spotify.docker.client.LogStream)10 HeliosRuntimeException (com.spotify.helios.common.HeliosRuntimeException)10 KeeperException (org.apache.zookeeper.KeeperException)9 TaskStatusEvent (com.spotify.helios.common.descriptors.TaskStatusEvent)8 AgentMain (com.spotify.helios.agent.AgentMain)7 PortMapping (com.spotify.helios.common.descriptors.PortMapping)7