Use of com.spotify.helios.common.descriptors.JobId in project helios by Spotify.
From the class ZooKeeperAgentModel, method getTaskStatuses.
/**
 * Builds a map from {@link JobId} to {@link TaskStatus} covering every task assigned to the
 * current agent.
 *
 * <p>Each stored entry's key is parsed into a {@link JobId} and its raw bytes are deserialized
 * from JSON. An {@link IOException} raised while doing so is rethrown unchecked.
 */
@Override
public Map<JobId, TaskStatus> getTaskStatuses() {
  final Map<JobId, TaskStatus> result = Maps.newHashMap();
  for (final Map.Entry<String, byte[]> rawEntry : this.taskStatuses.entrySet()) {
    try {
      // Key is parsed before the value is read, same as evaluating them on separate lines.
      result.put(
          JobId.fromString(rawEntry.getKey()),
          Json.read(rawEntry.getValue(), TaskStatus.class));
    } catch (IOException e) {
      throw Throwables.propagate(e);
    }
  }
  return result;
}
Use of com.spotify.helios.common.descriptors.JobId in project helios by Spotify.
From the class ZooKeeperMasterModel, method getJobs.
/**
 * Looks up every known job and returns them as a {@link Map} keyed by {@link JobId}.
 *
 * <p>An empty map is returned when the jobs folder has no node. A job whose node disappears
 * between listing and reading is skipped. Any other ZooKeeper or I/O failure is rethrown as a
 * {@link HeliosRuntimeException}.
 */
@Override
public Map<JobId, Job> getJobs() {
  log.debug("getting jobs");
  final String jobsFolder = Paths.configJobs();
  final ZooKeeperClient zk = provider.get("getJobs");
  try {
    final List<String> children;
    try {
      children = zk.getChildren(jobsFolder);
    } catch (NoNodeException e) {
      // The jobs folder node is missing, so there is nothing to list.
      return Maps.newHashMap();
    }

    final Map<JobId, Job> jobs = Maps.newHashMap();
    for (final String child : children) {
      final JobId jobId = JobId.fromString(child);
      final String jobPath = Paths.configJob(jobId);
      try {
        final Job job = parse(zk.getData(jobPath), Job.class);
        // Keyed by the id stored in the descriptor itself, not the id parsed from the node name.
        jobs.put(job.getId(), job);
      } catch (NoNodeException e) {
        // Ignore, the job was deleted before we had a chance to read it.
        log.debug("Ignoring deleted job {}", jobId);
      }
    }
    return jobs;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting jobs failed", e);
  }
}
Use of com.spotify.helios.common.descriptors.JobId in project helios by Spotify.
From the class ZooKeeperMasterModel, method deployJobRetry.
/**
 * Deploys a job to a host with a single ZooKeeper transaction, retrying recursively when the
 * transaction fails because a node it depends on disappeared.
 *
 * <p>The transaction checks that the job node exists and creates the static port nodes, the
 * job-to-host node, the task node and a per-operation creation marker node.
 *
 * @param client the ZooKeeper client used for the reads and the transaction
 * @param host the name of the host to deploy to
 * @param deployment the deployment supplying the job id, goal and deployer details
 * @param count the number of attempts already made; the method gives up once this reaches 3
 * @param token the token passed to {@code verifyToken} together with the job
 * @throws JobDoesNotExistException if {@code getJob} returns {@code null} for the job id
 *     (presumably also raised by {@code assertJobExists} - confirm against that helper)
 * @throws JobAlreadyDeployedException if a task node for the job already exists on the host
 * @throws HostNotFoundException if building the host task path is rejected with an
 *     {@link IllegalArgumentException} (presumably also raised by {@code assertHostExists})
 * @throws JobPortAllocationConflictException presumably raised by
 *     {@code checkForPortConflicts} - confirm against that helper
 * @throws TokenVerificationException presumably raised by {@code verifyToken} - confirm
 */
private void deployJobRetry(final ZooKeeperClient client, final String host,
                            final Deployment deployment, int count, final String token)
    throws JobDoesNotExistException, JobAlreadyDeployedException, HostNotFoundException,
           JobPortAllocationConflictException, TokenVerificationException {
  // Use >= rather than == so that a starting count above the limit still terminates.
  if (count >= 3) {
    throw new HeliosRuntimeException("3 failures (possibly concurrent modifications) while "
                                     + "deploying. Giving up.");
  }
  log.info("deploying {}: {} (retry={})", deployment, host, count);

  final JobId id = deployment.getJobId();
  final Job job = getJob(id);
  if (job == null) {
    throw new JobDoesNotExistException(id);
  }
  verifyToken(token, job);

  final UUID operationId = UUID.randomUUID();
  final String jobPath = Paths.configJob(id);

  // Build the task path once; an IllegalArgumentException here is reported as an unknown host.
  final String taskPath;
  try {
    taskPath = Paths.configHostJob(host, id);
  } catch (IllegalArgumentException e) {
    throw new HostNotFoundException("Could not find Helios host '" + host + "'");
  }
  final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId);

  // Every static port gets a node whose payload is the id of the job claiming it.
  final List<Integer> staticPorts = staticPorts(job);
  final Map<String, byte[]> portNodes = Maps.newHashMap();
  final byte[] idJson = id.toJsonBytes();
  for (final int port : staticPorts) {
    final String path = Paths.configHostPort(host, port);
    portNodes.put(path, idJson);
  }

  final Task task = new Task(job, deployment.getGoal(), deployment.getDeployerUser(),
                             deployment.getDeployerMaster(),
                             deployment.getDeploymentGroupName());
  final List<ZooKeeperOperation> operations = Lists.newArrayList(
      check(jobPath),
      create(portNodes),
      create(Paths.configJobHost(id, host)));

  // Attempt to read a task here.
  try {
    client.getNode(taskPath);
    // if we get here the node exists already
    throw new JobAlreadyDeployedException(host, id);
  } catch (NoNodeException e) {
    // No task yet: add its creation plus the marker node that identifies this operation.
    operations.add(create(taskPath, task));
    operations.add(create(taskCreationPath));
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("reading existing task description failed", e);
  }

  // TODO (dano): Failure handling is racy wrt agent and job modifications.
  try {
    client.transaction(operations);
    log.info("deployed {}: {} (retry={})", deployment, host, count);
  } catch (NoNodeException e) {
    // Either the job, the host or the task went away
    assertJobExists(client, id);
    assertHostExists(client, host);
    // If the job and host still exists, we likely tried to redeploy a job that had an UNDEPLOY
    // goal and lost the race with the agent removing the task before we could set it. Retry.
    deployJobRetry(client, host, deployment, count + 1, token);
  } catch (NodeExistsException e) {
    // Check for conflict due to transaction retry
    try {
      if (client.exists(taskCreationPath) != null) {
        // Our creation operation node existed, we're done here
        return;
      }
    } catch (KeeperException ex) {
      throw new HeliosRuntimeException("checking job deployment failed", ex);
    }
    try {
      // Check if the job was already deployed
      if (client.stat(taskPath) != null) {
        throw new JobAlreadyDeployedException(host, id);
      }
    } catch (KeeperException ex) {
      // Wrap the failure of this check (ex), not the NodeExistsException that led here.
      throw new HeliosRuntimeException("checking job deployment failed", ex);
    }
    // Check for static port collisions
    for (final int port : staticPorts) {
      checkForPortConflicts(client, host, port, id);
    }
    // Catch all for logic and ephemeral issues
    throw new HeliosRuntimeException("deploying job failed", e);
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("deploying job failed", e);
  }
}
Use of com.spotify.helios.common.descriptors.JobId in project helios by Spotify.
From the class JobValidatorTest, method testIdMismatchFails.
/**
 * Checks that validating a job constructed with an explicit id reports an "Id hash mismatch"
 * error naming the job's own hash and the hash of the id obtained by rebuilding the job.
 */
@Test
public void testIdMismatchFails() throws Exception {
  final JobId explicitId = JobId.fromString("foo:bar:badf00d");
  final Job job = new Job(
      explicitId, "bar", EMPTY_HOSTNAME, EMPTY_CREATED, EMPTY_COMMAND, EMPTY_ENV,
      EMPTY_RESOURCES, EMPTY_PORTS, EMPTY_REGISTRATION, EMPTY_GRACE_PERIOD, EMPTY_VOLUMES,
      EMPTY_EXPIRES, EMPTY_REGISTRATION_DOMAIN, EMPTY_CREATING_USER, EMPTY_TOKEN,
      EMPTY_HEALTH_CHECK, EMPTY_SECURITY_OPT, DEFAULT_NETWORK_MODE, EMPTY_METADATA,
      EMPTY_CAPS, EMPTY_CAPS, EMPTY_SECONDS_TO_WAIT);

  // Round-trip through the builder to get the id it assigns to the same job contents.
  final JobId rebuiltId = job.toBuilder().build().getId();
  final String expectedError =
      "Id hash mismatch: " + job.getId().getHash() + " != " + rebuiltId.getHash();

  assertEquals(ImmutableSet.of(expectedError), validator.validate(job));
}
Use of com.spotify.helios.common.descriptors.JobId in project helios by Spotify.
From the class SystemTestBase, method undeployJob.
/**
 * Runs the CLI {@code undeploy} command for a job on a host and verifies the result.
 *
 * <p>Asserts that the command output contains {@code "<host>: done"}, then queries the CLI
 * {@code status} for that host as JSON and asserts that the job is either absent from the
 * result or has no deployment entry for the host.
 *
 * @param jobId the job to undeploy
 * @param host the host to undeploy the job from
 * @throws Exception if a CLI invocation fails
 */
protected void undeployJob(final JobId jobId, final String host) throws Exception {
  final String undeployResult = cli("undeploy", jobId.toString(), host);
  assertThat(undeployResult, containsString(host + ": done"));

  final String statusJson = cli("status", "--host", host, "--json");
  final TypeReference<Map<JobId, JobStatus>> statusMapType =
      new TypeReference<Map<JobId, JobStatus>>() {
      };
  final Map<JobId, JobStatus> statusByJob = Json.readUnchecked(statusJson, statusMapType);

  final JobStatus jobStatus = statusByJob.get(jobId);
  final boolean goneFromHost =
      jobStatus == null || jobStatus.getDeployments().get(host) == null;
  assertTrue(goneFromHost);
}
Aggregations