Use of com.spotify.helios.common.protocol.JobUndeployResponse in project helios by Spotify.
In class AgentRestartTest, method test:
/**
 * Exercises a deployed job across a sequence of agent restarts.
 *
 * <p>The scenario, in order: deploy a job and wait for it to run; restart the agent and check
 * that the same container is still running; kill the container while the agent is down and
 * check that a new container is started; kill and remove the container while the agent is down
 * and check that yet another container is started; set the goal to STOP and then START while
 * the agent is down and check that the agent applies each goal on start-up; finally undeploy
 * while the agent is down and check that the task and its container go away.
 *
 * @throws Exception if any client call, await helper or docker call fails
 */
@Test
public void test() throws Exception {
  startDefaultMaster();
  final DockerClient dockerClient = getNewDockerClient();
  final HeliosClient client = defaultClient();
  final AgentMain agent1 = startDefaultAgent(testHost());

  // Create a job
  final Job job = Job.newBuilder()
      .setName(testJobName)
      .setVersion(testJobVersion)
      .setImage(BUSYBOX)
      .setCommand(IDLE_COMMAND)
      .setCreatingUser(TEST_USER)
      .build();
  final JobId jobId = job.getId();
  final CreateJobResponse created = client.createJob(job).get();
  assertEquals(CreateJobResponse.Status.OK, created.getStatus());

  // Wait for agent to come up
  awaitHostRegistered(client, testHost(), LONG_WAIT_SECONDS, SECONDS);
  awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // Deploy the job on the agent
  final Deployment deployment = Deployment.of(jobId, START);
  final JobDeployResponse deployed = client.deploy(deployment, testHost()).get();
  assertEquals(JobDeployResponse.Status.OK, deployed.getStatus());

  // Wait for the job to run
  final TaskStatus firstTaskStatus =
      awaitJobState(client, testHost(), jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS);
  assertJobEquals(job, firstTaskStatus.getJob());
  assertEquals(1, listContainers(dockerClient, testTag).size());
  assertTrue(dockerClient.inspectContainer(firstTaskStatus.getContainerId()).state().running());

  // Stop the agent
  agent1.stopAsync().awaitTerminated();
  awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);

  // Start the agent again
  final AgentMain agent2 = startDefaultAgent(testHost());
  awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // Wait for a while and make sure that the same container is still running
  Thread.sleep(5000);
  final HostStatus hostStatus = client.hostStatus(testHost()).get();
  final TaskStatus taskStatus = hostStatus.getStatuses().get(jobId);
  // Fail with an assertion rather than a NullPointerException if the restarted agent
  // no longer reports the job at all.
  assertTrue(taskStatus != null);
  // NOTE(review): firstTaskStatus was obtained by awaiting RUNNING above, so the PULLING_IMAGE
  // branch looks unreachable -- confirm against awaitJobState before removing it.
  if (firstTaskStatus.getState() == PULLING_IMAGE) {
    final State state = taskStatus.getState();
    assertTrue(state == RUNNING || state == PULLING_IMAGE);
  } else {
    assertEquals(RUNNING, taskStatus.getState());
  }
  assertEquals(firstTaskStatus.getContainerId(), taskStatus.getContainerId());
  assertEquals(1, listContainers(dockerClient, testTag).size());
  assertTrue(dockerClient.inspectContainer(firstTaskStatus.getContainerId()).state().running());

  // Stop the agent
  agent2.stopAsync().awaitTerminated();
  awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);

  // Kill the container
  dockerClient.killContainer(firstTaskStatus.getContainerId());
  assertEquals(0, listContainers(dockerClient, testTag).size());

  // Start the agent again
  final AgentMain agent3 = startDefaultAgent(testHost());
  awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // Wait for the job to be restarted in a new container
  final TaskStatus secondTaskStatus =
      awaitJobRunningInNewContainer(client, jobId, firstTaskStatus);
  assertEquals(1, listContainers(dockerClient, testTag).size());
  assertTrue(dockerClient.inspectContainer(secondTaskStatus.getContainerId()).state().running());

  // Stop the agent
  agent3.stopAsync().awaitTerminated();
  awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);

  // Kill and destroy the container
  dockerClient.killContainer(secondTaskStatus.getContainerId());
  removeContainer(dockerClient, secondTaskStatus.getContainerId());

  // Start the agent again
  final AgentMain agent4 = startDefaultAgent(testHost());
  awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // Wait for the task to be restarted in a new container
  final TaskStatus thirdTaskStatus =
      awaitJobRunningInNewContainer(client, jobId, secondTaskStatus);
  assertEquals(1, listContainers(dockerClient, testTag).size());
  assertTrue(dockerClient.inspectContainer(thirdTaskStatus.getContainerId()).state().running());

  // Stop the agent
  agent4.stopAsync().awaitTerminated();
  awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);

  // Stop the job
  final SetGoalResponse stopped = client.setGoal(Deployment.of(jobId, STOP), testHost()).get();
  assertEquals(SetGoalResponse.Status.OK, stopped.getStatus());

  // Start the agent again
  final AgentMain agent5 = startDefaultAgent(testHost());
  awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // Verify that the task is stopped
  awaitJobState(client, testHost(), jobId, STOPPED, LONG_WAIT_SECONDS, SECONDS);
  assertEquals(0, listContainers(dockerClient, testTag).size());

  // Stop the agent
  agent5.stopAsync().awaitTerminated();
  awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);

  // Start the job
  final SetGoalResponse started = client.setGoal(Deployment.of(jobId, START), testHost()).get();
  assertEquals(SetGoalResponse.Status.OK, started.getStatus());

  // Start the agent again
  final AgentMain agent6 = startDefaultAgent(testHost());
  awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // Verify that the task is started
  awaitJobState(client, testHost(), jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS);
  assertEquals(1, listContainers(dockerClient, testTag).size());

  // Stop the agent
  agent6.stopAsync().awaitTerminated();
  awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);

  // Undeploy the job
  final JobUndeployResponse undeployed = client.undeploy(jobId, testHost()).get();
  assertEquals(JobUndeployResponse.Status.OK, undeployed.getStatus());

  // Start the agent again
  startDefaultAgent(testHost());
  awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // Wait for the task to get removed
  awaitTaskGone(client, testHost(), jobId, LONG_WAIT_SECONDS, SECONDS);
  assertEquals(0, listContainers(dockerClient, testTag).size());
}

/**
 * Polls the status of {@code jobId} on {@link #testHost()} until the task reports RUNNING with
 * a non-null container id that differs from the container id of {@code previous}.
 *
 * @param client   client used to query the host status
 * @param jobId    job whose task status is inspected
 * @param previous task status whose container id must no longer be the reported one
 * @return the first task status that satisfies the condition
 * @throws Exception whatever {@code Polling.await} or the status query throws
 */
private TaskStatus awaitJobRunningInNewContainer(final HeliosClient client,
                                                 final JobId jobId,
                                                 final TaskStatus previous) throws Exception {
  return Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<TaskStatus>() {
    @Override
    public TaskStatus call() throws Exception {
      final HostStatus hostStatus = client.hostStatus(testHost()).get();
      final TaskStatus taskStatus = hostStatus.getStatuses().get(jobId);
      final boolean runningElsewhere = taskStatus != null
          && taskStatus.getContainerId() != null
          && taskStatus.getState() == RUNNING
          && !taskStatus.getContainerId().equals(previous.getContainerId());
      // A null result presumably tells Polling.await to keep polling -- same convention as
      // the original inline callables.
      return runningElsewhere ? taskStatus : null;
    }
  });
}
Use of com.spotify.helios.common.protocol.JobUndeployResponse in project helios by Spotify.
In class TokenTest, method undeploy:
/**
 * Invokes the CLI {@code undeploy} command and asserts on the status of the JSON response.
 *
 * @param token  value forwarded to {@code buildArgs} together with the test job and host
 * @param status status the undeploy response is expected to carry
 * @throws Exception if building the arguments or running the CLI command fails
 */
private void undeploy(final String token, final JobUndeployResponse.Status status) throws Exception {
  final List<String> cliArgs = buildArgs(token, testJobNameAndVersion, testHost());
  final JobUndeployResponse.Status actual =
      cliJson(JobUndeployResponse.class, "undeploy", cliArgs).getStatus();
  assertThat(actual, equalTo(status));
}
Use of com.spotify.helios.common.protocol.JobUndeployResponse in project helios by Spotify.
In class UndeployRaceTest, method test:
/**
 * Deploys and immediately undeploys a job on a host that is registered but has no agent
 * running, then starts the agent and checks that the task disappears and the job can be
 * deleted.
 *
 * @throws Exception if any client call or await helper fails
 */
@Test
public void test() throws Exception {
  startDefaultMaster();

  final String agentId = "test-agent-id";
  final HeliosClient helios = defaultClient();

  // The host is registered up front; no agent is running for it yet.
  helios.registerHost(testHost(), agentId);

  // Create the job that will be deployed and undeployed while the agent is absent.
  final Job job = Job.newBuilder()
      .setName(testJobName)
      .setVersion(testJobVersion)
      .setImage(BUSYBOX)
      .setCommand(IDLE_COMMAND)
      .build();
  final JobId jobId = job.getId();
  assertEquals(CreateJobResponse.Status.OK, helios.createJob(job).get().getStatus());

  // The master must know the host before deploying, otherwise client.deploy() would
  // return HOST_NOT_FOUND.
  Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<String>() {
    @Override
    public String call() throws Exception {
      final List<String> knownHosts = helios.listHosts().get();
      return knownHosts.contains(testHost()) ? testHost() : null;
    }
  });

  // Deploy and undeploy back to back, still without an agent.
  assertEquals(JobDeployResponse.Status.OK,
               helios.deploy(Deployment.of(jobId, START), testHost()).get().getStatus());
  assertEquals(JobUndeployResponse.Status.OK,
               helios.undeploy(jobId, testHost()).get().getStatus());

  // Bring the agent up under the pre-registered id.
  startDefaultAgent(testHost(), "--id", agentId);
  awaitHostRegistered(helios, testHost(), LONG_WAIT_SECONDS, SECONDS);
  awaitHostStatus(helios, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);

  // The task should vanish from the host ...
  awaitTaskGone(helios, testHost(), jobId, LONG_WAIT_SECONDS, SECONDS);

  // ... after which the job itself must be deletable.
  assertEquals(JobDeleteResponse.Status.OK, helios.deleteJob(jobId).get().getStatus());
}
Use of com.spotify.helios.common.protocol.JobUndeployResponse in project helios by Spotify.
In class HeliosSoloDeployment, method undeployLeftoverJobs:
/**
 * Undeploy jobs left over by {@link TemporaryJobs}. TemporaryJobs should clean these up,
 * but sometimes a few are left behind for whatever reason.
 *
 * <p>This is a best-effort cleanup: every failure is logged and never propagated to the
 * caller. If the calling thread is interrupted while waiting on the master or on a job to be
 * undeployed, the cleanup stops and the thread's interrupt flag is restored so that callers
 * further up the stack can still observe the interruption.
 */
@VisibleForTesting
protected void undeployLeftoverJobs() {
  try {
    // See if there are jobs running on any helios agent. If we are using TemporaryJobs,
    // that class should've undeployed them at this point.
    // Any jobs still running at this point have only been partially cleaned up.
    // We look for jobs via hostStatus() because the job may have been deleted from the master,
    // but the agent may still not have had enough time to undeploy the job from itself.
    final List<String> hosts = heliosClient.listHosts().get();
    for (final String host : hosts) {
      final HostStatus hostStatus = heliosClient.hostStatus(host).get();
      final Map<JobId, TaskStatus> statuses = hostStatus.getStatuses();
      for (final Map.Entry<JobId, TaskStatus> status : statuses.entrySet()) {
        final JobId jobId = status.getKey();
        final Goal goal = status.getValue().getGoal();
        if (goal != Goal.UNDEPLOY) {
          log.info("Job {} is still set to {} on host {}. Undeploying it now.", jobId, goal, host);
          final JobUndeployResponse undeployResponse = heliosClient.undeploy(jobId, host).get();
          log.info("Undeploy response for job {} is {}.", jobId, undeployResponse.getStatus());
          if (undeployResponse.getStatus() != JobUndeployResponse.Status.OK) {
            log.warn("Undeploy response for job {} was not OK. This could mean that something "
                     + "beat the helios-solo master in telling the helios-solo agent to "
                     + "undeploy.", jobId);
          }
        }
        // Wait even when the goal already was UNDEPLOY: the agent may not have finished yet.
        log.info("Waiting for job {} to actually be undeployed...", jobId);
        awaitJobUndeployed(heliosClient, host, jobId, jobUndeployWaitSeconds, TimeUnit.SECONDS);
        log.info("Job {} successfully undeployed.", jobId);
      }
    }
  } catch (InterruptedException e) {
    // Catching InterruptedException clears the interrupt status; set it again so the
    // interruption is not silently lost by this best-effort cleanup.
    Thread.currentThread().interrupt();
    log.warn("Interrupted while trying to clean up leftover jobs.", e);
  } catch (Exception e) {
    log.warn("Exception occurred when trying to clean up leftover jobs.", e);
  }
}
Use of com.spotify.helios.common.protocol.JobUndeployResponse in project helios by Spotify.
In class ZooKeeperHeliosFailoverTest, method undeploy:
/**
 * Checks that the job is reported as RUNNING on the test host, undeploys it from that host
 * and waits until its task is gone.
 *
 * @param jobId id of the job to undeploy
 * @throws Exception if a client call or the wait fails
 */
private void undeploy(final JobId jobId) throws Exception {
  // The job status must be queryable and show the task running on the test host.
  final JobStatus current = client.jobStatus(jobId).get();
  assertEquals(RUNNING, current.getTaskStatuses().get(testHost()).getState());

  // Request the undeploy and require the master to accept it.
  assertEquals(JobUndeployResponse.Status.OK,
               client.undeploy(jobId, testHost()).get().getStatus());

  // Block until the task no longer shows up on the host.
  awaitTaskGone(client, testHost(), jobId, LONG_WAIT_SECONDS, SECONDS);
}
Aggregations