use of com.spotify.helios.common.descriptors.Deployment in project helios by spotify.
the class ZooKeeperMasterModel method getHostStatus.
/**
* Returns the current status of the host named by {@code host}.
*/
@Override
public HostStatus getHostStatus(final String host) {
final ZooKeeperClient client = provider.get("getHostStatus");
if (!ZooKeeperRegistrarUtil.isHostRegistered(client, host)) {
log.warn("Host {} isn't registered in ZooKeeper.", host);
return null;
}
final boolean up = checkHostUp(client, host);
final HostInfo hostInfo = getHostInfo(client, host);
final AgentInfo agentInfo = getAgentInfo(client, host);
final Map<JobId, Deployment> tasks = getTasks(client, host);
final Map<JobId, TaskStatus> statuses = getTaskStatuses(client, host);
final Map<String, String> environment = getEnvironment(client, host);
final Map<String, String> labels = getLabels(client, host);
return HostStatus.newBuilder().setJobs(tasks).setStatuses(fromNullable(statuses).or(EMPTY_STATUSES)).setHostInfo(hostInfo).setAgentInfo(agentInfo).setStatus(up ? UP : DOWN).setEnvironment(environment).setLabels(labels).build();
}
use of com.spotify.helios.common.descriptors.Deployment in project helios by spotify.
the class ZooKeeperMasterModel method rollingUpdateAwaitRunning.
private RollingUpdateOp rollingUpdateAwaitRunning(final ZooKeeperClient client, final RollingUpdateOpFactory opFactory, final DeploymentGroup deploymentGroup, final String host) {
final TaskStatus taskStatus = getTaskStatus(client, host, deploymentGroup.getJobId());
final JobId jobId = deploymentGroup.getJobId();
if (taskStatus == null) {
// Handle cases where agent has not written job status to zookeeper.
// If job is not listed under /config/hosts node, it may have been deployed successfully and
// then manually undeployed. The job will not get redeployed, so treat this as a failure.
final Deployment deployment = getDeployment(host, jobId);
if (deployment == null) {
return opFactory.error("Job unexpectedly undeployed. Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
}
// Check if we've exceeded the timeout for the rollout operation.
if (isRolloutTimedOut(client, deploymentGroup)) {
return opFactory.error("timed out while retrieving job status", host, RollingUpdateError.TIMED_OUT_RETRIEVING_JOB_STATUS);
}
// We haven't detected any errors, so assume the agent will write the status soon.
return opFactory.yield();
} else if (!taskStatus.getState().equals(TaskStatus.State.RUNNING)) {
if (isRolloutTimedOut(client, deploymentGroup)) {
// We exceeded the configured deploy timeout, and this job is still not running
return rollingUpdateTimedoutError(opFactory, host, jobId, taskStatus);
}
return opFactory.yield();
} else {
// the job is running on the host. last thing we have to ensure is that it was
// deployed by this deployment group. otherwise some weird conflict has occurred and we
// won't be able to undeploy the job on the next update.
final Deployment deployment = getDeployment(host, deploymentGroup.getJobId());
if (deployment == null) {
return opFactory.error("deployment for this job not found in zookeeper. " + "Perhaps it was manually undeployed?", host, RollingUpdateError.JOB_UNEXPECTEDLY_UNDEPLOYED);
} else if (!Objects.equals(deployment.getDeploymentGroupName(), deploymentGroup.getName())) {
return opFactory.error("job was already deployed, either manually or by a different deployment group", host, RollingUpdateError.JOB_ALREADY_DEPLOYED);
}
return opFactory.nextTask();
}
}
use of com.spotify.helios.common.descriptors.Deployment in project helios by spotify.
the class AgentRestartTest method test.
@Test
public void test() throws Exception {
startDefaultMaster();
final DockerClient dockerClient = getNewDockerClient();
final HeliosClient client = defaultClient();
final AgentMain agent1 = startDefaultAgent(testHost());
// Create a job
final Job job = Job.newBuilder().setName(testJobName).setVersion(testJobVersion).setImage(BUSYBOX).setCommand(IDLE_COMMAND).setCreatingUser(TEST_USER).build();
final JobId jobId = job.getId();
final CreateJobResponse created = client.createJob(job).get();
assertEquals(CreateJobResponse.Status.OK, created.getStatus());
// Wait for agent to come up
awaitHostRegistered(client, testHost(), LONG_WAIT_SECONDS, SECONDS);
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Deploy the job on the agent
final Deployment deployment = Deployment.of(jobId, START);
final JobDeployResponse deployed = client.deploy(deployment, testHost()).get();
assertEquals(JobDeployResponse.Status.OK, deployed.getStatus());
// Wait for the job to run
final TaskStatus firstTaskStatus = awaitJobState(client, testHost(), jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS);
assertJobEquals(job, firstTaskStatus.getJob());
assertEquals(1, listContainers(dockerClient, testTag).size());
assertTrue(dockerClient.inspectContainer(firstTaskStatus.getContainerId()).state().running());
// Stop the agent
agent1.stopAsync().awaitTerminated();
awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);
// Start the agent again
final AgentMain agent2 = startDefaultAgent(testHost());
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Wait for a while and make sure that the same container is still running
Thread.sleep(5000);
final HostStatus hostStatus = client.hostStatus(testHost()).get();
final TaskStatus taskStatus = hostStatus.getStatuses().get(jobId);
if (firstTaskStatus.getState() == PULLING_IMAGE) {
final State state = taskStatus.getState();
assertTrue(state == RUNNING || state == PULLING_IMAGE);
} else {
assertEquals(RUNNING, taskStatus.getState());
}
assertEquals(firstTaskStatus.getContainerId(), taskStatus.getContainerId());
assertEquals(1, listContainers(dockerClient, testTag).size());
assertTrue(dockerClient.inspectContainer(firstTaskStatus.getContainerId()).state().running());
// Stop the agent
agent2.stopAsync().awaitTerminated();
awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);
// Kill the container
dockerClient.killContainer(firstTaskStatus.getContainerId());
assertEquals(0, listContainers(dockerClient, testTag).size());
// Start the agent again
final AgentMain agent3 = startDefaultAgent(testHost());
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Wait for the job to be restarted in a new container
final TaskStatus secondTaskStatus = Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<TaskStatus>() {
@Override
public TaskStatus call() throws Exception {
final HostStatus hostStatus = client.hostStatus(testHost()).get();
final TaskStatus taskStatus = hostStatus.getStatuses().get(jobId);
return (taskStatus != null && taskStatus.getContainerId() != null && taskStatus.getState() == RUNNING && !taskStatus.getContainerId().equals(firstTaskStatus.getContainerId())) ? taskStatus : null;
}
});
assertEquals(1, listContainers(dockerClient, testTag).size());
assertTrue(dockerClient.inspectContainer(secondTaskStatus.getContainerId()).state().running());
// Stop the agent
agent3.stopAsync().awaitTerminated();
awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);
// Kill and destroy the container
dockerClient.killContainer(secondTaskStatus.getContainerId());
removeContainer(dockerClient, secondTaskStatus.getContainerId());
// Start the agent again
final AgentMain agent4 = startDefaultAgent(testHost());
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Wait for the task to be restarted in a new container
final TaskStatus thirdTaskStatus = Polling.await(LONG_WAIT_SECONDS, SECONDS, new Callable<TaskStatus>() {
@Override
public TaskStatus call() throws Exception {
final HostStatus hostStatus = client.hostStatus(testHost()).get();
final TaskStatus taskStatus = hostStatus.getStatuses().get(jobId);
return (taskStatus != null && taskStatus.getContainerId() != null && taskStatus.getState() == RUNNING && !taskStatus.getContainerId().equals(secondTaskStatus.getContainerId())) ? taskStatus : null;
}
});
assertEquals(1, listContainers(dockerClient, testTag).size());
assertTrue(dockerClient.inspectContainer(thirdTaskStatus.getContainerId()).state().running());
// Stop the agent
agent4.stopAsync().awaitTerminated();
awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);
// Stop the job
final SetGoalResponse stopped = client.setGoal(Deployment.of(jobId, STOP), testHost()).get();
assertEquals(SetGoalResponse.Status.OK, stopped.getStatus());
// Start the agent again
final AgentMain agent5 = startDefaultAgent(testHost());
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Verify that the task is stopped
awaitJobState(client, testHost(), jobId, STOPPED, LONG_WAIT_SECONDS, SECONDS);
assertEquals(0, listContainers(dockerClient, testTag).size());
// Stop the agent
agent5.stopAsync().awaitTerminated();
awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);
// Start the job
final SetGoalResponse started = client.setGoal(Deployment.of(jobId, START), testHost()).get();
assertEquals(SetGoalResponse.Status.OK, started.getStatus());
// Start the agent again
final AgentMain agent6 = startDefaultAgent(testHost());
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Verify that the task is started
awaitJobState(client, testHost(), jobId, RUNNING, LONG_WAIT_SECONDS, SECONDS);
assertEquals(1, listContainers(dockerClient, testTag).size());
// Stop the agent
agent6.stopAsync().awaitTerminated();
awaitHostStatus(client, testHost(), DOWN, LONG_WAIT_SECONDS, SECONDS);
// Undeploy the job
final JobUndeployResponse undeployed = client.undeploy(jobId, testHost()).get();
assertEquals(JobUndeployResponse.Status.OK, undeployed.getStatus());
// Start the agent again
startDefaultAgent(testHost());
awaitHostStatus(client, testHost(), UP, LONG_WAIT_SECONDS, SECONDS);
// Wait for the task to get removed
awaitTaskGone(client, testHost(), jobId, LONG_WAIT_SECONDS, SECONDS);
assertEquals(0, listContainers(dockerClient, testTag).size());
}
use of com.spotify.helios.common.descriptors.Deployment in project helios by spotify.
the class JobDeployCommand method runWithJobId.
@Override
protected int runWithJobId(final Namespace options, final HeliosClient client, final PrintStream out, final boolean json, final JobId jobId, final BufferedReader stdin) throws ExecutionException, InterruptedException {
final List<String> hosts = options.getList(hostsArg.getDest());
final Deployment job = Deployment.of(jobId, options.getBoolean(noStartArg.getDest()) ? STOP : START);
if (!json) {
out.printf("Deploying %s on %s%n", job, hosts);
}
int code = 0;
final HostResolver resolver = HostResolver.create(client);
final List<String> resolvedHosts = Lists.newArrayList();
for (final String candidateHost : hosts) {
final String host = resolver.resolveName(candidateHost);
resolvedHosts.add(host);
if (!json) {
out.printf("%s: ", host);
}
final String token = options.getString(tokenArg.getDest());
final JobDeployResponse result = client.deploy(job, host, token).get();
if (result.getStatus() == JobDeployResponse.Status.OK) {
if (!json) {
out.printf("done%n");
} else {
out.print(result.toJsonString());
}
} else {
if (!json) {
out.printf("failed: %s%n", result);
} else {
out.print(result.toJsonString());
}
code = 1;
}
}
if (code == 0 && options.getBoolean(watchArg.getDest())) {
JobWatchCommand.watchJobsOnHosts(out, true, resolvedHosts, ImmutableList.of(jobId), options.getInt(intervalArg.getDest()), client);
}
return code;
}
use of com.spotify.helios.common.descriptors.Deployment in project helios by spotify.
the class TemporaryJob method deploy.
void deploy() {
final TemporaryJobReports.Step createJob = reportWriter.step("create job").tag("jobId", job.getId());
try {
// Create job
log.info("Creating job {}", job.getId().toShortString());
final CreateJobResponse createResponse = get(client.createJob(job));
if (createResponse.getStatus() != CreateJobResponse.Status.OK) {
fail(format("Failed to create job %s - %s", job.getId(), createResponse.toString()));
}
createJob.markSuccess();
} catch (InterruptedException | ExecutionException | TimeoutException e) {
fail(format("Failed to create job %s %s - %s", job.getId(), job.toString(), e));
} finally {
createJob.finish();
}
final TemporaryJobReports.Step deployJob = reportWriter.step("deploy job").tag("jobId", job.getId());
try {
// Deploy job
final Deployment deployment = Deployment.of(job.getId(), Goal.START);
for (final String host : hosts) {
// HELIOS_HOST_ADDRESS is the IP address we should use to reach the host, instead of
// the hostname. This is used when running a helios cluster inside a VM, and the containers
// can be reached by IP address only, since DNS won't be able to resolve the host name of
// the helios agent running in the VM.
final HostStatus hostStatus = client.hostStatus(host).get();
final String hostAddress = hostStatus.getEnvironment().get("HELIOS_HOST_ADDRESS");
if (hostAddress != null) {
hostToIp.put(host, hostAddress);
}
log.info("Deploying {} to {}", getJobDescription(job), host);
final JobDeployResponse deployResponse = get(client.deploy(deployment, host));
if (deployResponse.getStatus() != JobDeployResponse.Status.OK) {
fail(format("Failed to deploy job %s %s - %s", job.getId(), job.toString(), deployResponse));
}
}
deployJob.markSuccess();
} catch (InterruptedException | ExecutionException | TimeoutException e) {
fail(format("Failed to deploy job %s %s - %s", job.getId(), job.toString(), e));
} finally {
deployJob.finish();
}
try {
// Wait for job to come up
for (final String host : hosts) {
awaitUp(host);
}
} catch (TimeoutException e) {
fail(format("Failed while probing job %s %s - %s", job.getId(), job.toString(), e));
}
}
Aggregations