use of com.spotify.helios.servicescommon.coordination.ZooKeeperOperation in project helios by spotify.
the class ZooKeeperRegistrarUtil method reRegisterHost.
/**
 * Re-registers an agent under a new host id.
 *
 * <p>All changes are submitted as one ZooKeeper transaction made of, in order: a check on the
 * host's config node, a delete for every node listed under the host's status path, creates for
 * the status node and its jobs child, and a delete followed by a create of the host id node.
 * Nothing else under the host's config path is modified by this method.
 *
 * @param client ZooKeeperClient used to list the status nodes and to run the transaction
 * @param host name of the host being re-registered
 * @param hostId the new ID of the host, stored UTF-8 encoded in the host id node
 * @throws HostNotFoundException if a {@code NoNodeException} is raised while listing the status
 *     nodes or running the transaction
 * @throws KeeperException declared on the signature; note that within this body any
 *     {@code KeeperException} other than {@code NoNodeException} is rethrown wrapped in a
 *     {@code HeliosRuntimeException}
 */
public static void reRegisterHost(final ZooKeeperClient client, final String host, final String hostId) throws HostNotFoundException, KeeperException {
  log.info("re-registering host: {}, new host id: {}", host, hostId);
  try {
    final String statusPath = Paths.statusHost(host);
    final String hostIdPath = Paths.configHostId(host);
    final List<ZooKeeperOperation> txn = Lists.newArrayList();

    // The transaction must fail if the host has no config node.
    txn.add(check(Paths.configHost(host)));

    // Wipe the status nodes. The listing is walked in reverse so that, presumably, children are
    // deleted before their parents -- NOTE(review): confirm the ordering safeListRecursive returns.
    for (final String statusNode : reverse(safeListRecursive(client, statusPath))) {
      txn.add(delete(statusNode));
    }

    // Put back the bare status node and its jobs child.
    txn.add(create(statusPath));
    txn.add(create(Paths.statusHostJobs(host)));

    // Swap the host id by delete + create rather than set, because we don't have WRITE
    // permissions to the node.
    txn.add(delete(hostIdPath));
    txn.add(create(hostIdPath, hostId.getBytes(UTF_8)));

    client.transaction(txn);
  } catch (NoNodeException e) {
    throw new HostNotFoundException(host);
  } catch (KeeperException e) {
    throw new HeliosRuntimeException(e);
  }
}
use of com.spotify.helios.servicescommon.coordination.ZooKeeperOperation in project helios by spotify.
the class ZooKeeperMasterModel method getDeployOperations.
/**
 * Builds the list of ZooKeeper operations that deploy a job to a host.
 *
 * <p>The returned list contains, in order: a check on the job's config node, creates for one
 * port node per static port of the job (each holding the job id as JSON), a create for the
 * job-to-host node, a create for the task node and a create for the task creation node. The
 * operations are only built here; this method does not run them.
 *
 * @param client ZooKeeperClient used to verify the host, probe for an existing task and check
 *     for port conflicts
 * @param host name of the host to deploy to
 * @param deployment deployment describing the job id, goal and deployer details
 * @param token token verified against the job via {@code verifyToken}
 * @return an immutable copy of the operations
 * @throws JobDoesNotExistException if {@code getJob} returns null for the deployment's job id
 * @throws JobAlreadyDeployedException if the task node for this host and job can already be read
 * @throws TokenVerificationException declared; presumably raised by {@code verifyToken}
 * @throws HostNotFoundException declared; presumably raised by {@code assertHostExists}
 * @throws JobPortAllocationConflictException declared; presumably raised by
 *     {@code checkForPortConflicts}
 */
private List<ZooKeeperOperation> getDeployOperations(final ZooKeeperClient client, final String host, final Deployment deployment, final String token) throws JobDoesNotExistException, JobAlreadyDeployedException, TokenVerificationException, HostNotFoundException, JobPortAllocationConflictException {
  assertHostExists(client, host);

  final JobId id = deployment.getJobId();
  final Job job = getJob(id);
  if (job == null) {
    throw new JobDoesNotExistException(id);
  }
  verifyToken(token, job);

  final UUID operationId = UUID.randomUUID();
  final String jobPath = Paths.configJob(id);
  final String taskPath = Paths.configHostJob(host, id);
  final String taskCreationPath = Paths.configHostJobCreation(host, id, operationId);

  // One node per static port, each carrying the id of the job that claims the port.
  final List<Integer> ports = staticPorts(job);
  final Map<String, byte[]> portNodes = Maps.newHashMap();
  final byte[] idJson = id.toJsonBytes();
  for (final int port : ports) {
    portNodes.put(Paths.configHostPort(host, port), idJson);
  }

  final Task task = new Task(job, deployment.getGoal(), deployment.getDeployerUser(), deployment.getDeployerMaster(), deployment.getDeploymentGroupName());

  final List<ZooKeeperOperation> ops = Lists.newArrayList(check(jobPath), create(portNodes), create(Paths.configJobHost(id, host)));

  // Probe for an existing task node: a successful read means the job is already deployed here.
  boolean taskExists = true;
  try {
    client.getNode(taskPath);
  } catch (NoNodeException e) {
    taskExists = false;
  } catch (KeeperException e) {
    throw new HeliosRuntimeException("reading existing task description failed", e);
  }
  if (taskExists) {
    throw new JobAlreadyDeployedException(host, id);
  }

  // No task yet: check every static port for conflicts before adding the task nodes.
  for (final int port : ports) {
    checkForPortConflicts(client, host, port, id);
  }
  ops.add(create(taskPath, task));
  ops.add(create(taskCreationPath));

  return ImmutableList.copyOf(ops);
}
use of com.spotify.helios.servicescommon.coordination.ZooKeeperOperation in project helios by spotify.
the class ZooKeeperMasterModel method updateDeploymentGroupHosts.
/**
 * Records a new host list for a deployment group and, when applicable, kicks off a rolling
 * update caused by the host change.
 *
 * <p>Returns without writing the host lists when {@code allowHostChange} rejects the group's
 * current status, or when neither the host list nor the derived removed-hosts list differs from
 * what is stored. Otherwise both lists are written in a single transaction, together with the
 * rolling-update operations when the group has a job id and {@code updateOnHostChange} agrees.
 * Events are emitted only after the transaction has gone through.
 *
 * @param groupName name of the deployment group
 * @param hosts the new list of hosts for the group
 * @throws DeploymentGroupDoesNotExistException if a {@code NoNodeException} is raised
 */
@Override
public void updateDeploymentGroupHosts(final String groupName, final List<String> hosts) throws DeploymentGroupDoesNotExistException {
  log.debug("updating deployment-group hosts: name={}", groupName);
  final ZooKeeperClient client = provider.get("updateDeploymentGroupHosts");
  try {
    final DeploymentGroupStatus status = getDeploymentGroupStatus(groupName);
    if (!allowHostChange(status)) {
      return;
    }

    // The removed-hosts node may be missing for deployment groups created before it was
    // introduced, so make sure it is there before reading it.
    final String removedHostsPath = Paths.statusDeploymentGroupRemovedHosts(groupName);
    client.ensurePathAndSetData(removedHostsPath, Json.asBytesUnchecked(emptyList()));

    final String hostsPath = Paths.statusDeploymentGroupHosts(groupName);
    final List<String> currentHosts = getHosts(client, hostsPath);
    final List<String> priorRemovedHosts = getHosts(client, removedHostsPath);
    final List<String> newRemovedHosts = removedHosts(currentHosts, hosts, priorRemovedHosts);

    // Nothing to write when both lists already match what is stored.
    if (hosts.equals(currentHosts) && newRemovedHosts.equals(priorRemovedHosts)) {
      return;
    }

    log.info("for deployment-group name={}, curHosts={}, new hosts={}, " + "previouslyRemovedHosts={}, derived removedHosts={}", groupName, currentHosts, hosts, priorRemovedHosts, newRemovedHosts);

    final List<ZooKeeperOperation> txn = Lists.newArrayList();
    txn.add(set(hostsPath, Json.asBytes(hosts)));
    txn.add(set(removedHostsPath, Json.asBytes(newRemovedHosts)));

    final String groupConfigPath = Paths.configDeploymentGroup(groupName);
    final Node groupNode = client.getNode(groupConfigPath);
    final Integer groupVersion = groupNode.getStat().getVersion();
    DeploymentGroup deploymentGroup = Json.read(groupNode.getBytes(), DeploymentGroup.class);

    List<Map<String, Object>> events = ImmutableList.of();
    if (deploymentGroup.getJobId() != null && updateOnHostChange(deploymentGroup, status)) {
      deploymentGroup = deploymentGroup.toBuilder().setRollingUpdateReason(HOSTS_CHANGED).build();

      // Version check: the transaction fails if the deployment group was updated elsewhere
      // after we read it.
      txn.add(check(groupConfigPath, groupVersion));

      // NOTE: removing the deployment group removes this node, so this set() makes the
      // transaction fail if the group is gone. It is *important* to have such an operation in
      // the transaction, or we would end up with inconsistent state.
      txn.add(set(Paths.configDeploymentGroup(deploymentGroup.getName()), deploymentGroup));

      final RollingUpdateOp rollingUpdateOp = getInitRollingUpdateOps(deploymentGroup, hosts, newRemovedHosts, client);
      txn.addAll(rollingUpdateOp.operations());
      events = rollingUpdateOp.events();
    }

    log.info("starting zookeeper transaction for updateDeploymentGroupHosts on deployment-group: " + "name={} jobId={} operations={}", groupName, deploymentGroup.getJobId(), txn);
    client.transaction(txn);
    emitEvents(deploymentGroupEventTopic, events);
  } catch (BadVersionException e) {
    // Another master got to this host update first; that is expected, not an error.
    // Ideally the path in the exception would be checked, but curator doesn't provide a path
    // for exceptions thrown as part of a transaction.
    log.info("zookeeper transaction for updateDeploymentGroupHosts on deployment-group was " + "processed by another master: name={}", groupName);
  } catch (NoNodeException e) {
    throw new DeploymentGroupDoesNotExistException(groupName, e);
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("updating deployment group hosts failed", e);
  }
}
use of com.spotify.helios.servicescommon.coordination.ZooKeeperOperation in project helios by spotify.
the class ZooKeeperMasterModel method rollingUpdate.
/**
 * Initiates a manually triggered rolling update of a deployment group to the given job.
 *
 * <p>A copy of the deployment group carrying the new job id, rollout options and the
 * {@code MANUAL} rolling-update reason is written to the group's config node in the same
 * transaction as the operations returned by {@code getInitRollingUpdateOps}. Events are emitted
 * only after the transaction has gone through.
 *
 * @param deploymentGroup the deployment group to update; must not be null
 * @param jobId id of the job to roll out
 * @param options rollout options stored on the updated deployment group
 * @throws DeploymentGroupDoesNotExistException if a {@code NoNodeException} is raised while
 *     preparing or running the transaction; the ZooKeeper exception is attached as the cause
 * @throws JobDoesNotExistException if {@code getJob} returns null for {@code jobId}
 */
@Override
public void rollingUpdate(final DeploymentGroup deploymentGroup, final JobId jobId, final RolloutOptions options) throws DeploymentGroupDoesNotExistException, JobDoesNotExistException {
  checkNotNull(deploymentGroup, "deploymentGroup");
  log.info("preparing to initiate rolling-update on deployment-group: name={}, jobId={}", deploymentGroup.getName(), jobId);
  final DeploymentGroup updated = deploymentGroup.toBuilder().setJobId(jobId).setRolloutOptions(options).setRollingUpdateReason(MANUAL).build();
  if (getJob(jobId) == null) {
    throw new JobDoesNotExistException(jobId);
  }
  final List<ZooKeeperOperation> operations = Lists.newArrayList();
  final ZooKeeperClient client = provider.get("rollingUpdate");
  // Persist the updated deployment group as the first operation of the transaction.
  operations.add(set(Paths.configDeploymentGroup(updated.getName()), updated));
  try {
    final RollingUpdateOp op = getInitRollingUpdateOps(updated, client);
    operations.addAll(op.operations());
    log.info("starting zookeeper transaction for rolling-update on " + "deployment-group name={} jobId={}. List of operations: {}", deploymentGroup.getName(), jobId, operations);
    client.transaction(operations);
    emitEvents(deploymentGroupEventTopic, op.events());
    log.info("initiated rolling-update on deployment-group: name={}, jobId={}", deploymentGroup.getName(), jobId);
  } catch (final NoNodeException e) {
    // Keep the ZooKeeper exception as the cause so the missing path is not lost, matching
    // how updateDeploymentGroupHosts reports the same condition.
    throw new DeploymentGroupDoesNotExistException(deploymentGroup.getName(), e);
  } catch (final KeeperException e) {
    throw new HeliosRuntimeException("rolling-update on deployment-group " + deploymentGroup.getName() + " failed", e);
  }
}
use of com.spotify.helios.servicescommon.coordination.ZooKeeperOperation in project helios by spotify.
the class RollingUpdateOpFactory method start.
/**
 * Builds the operations and events that start a rolling update for a deployment group.
 *
 * <p>The only ZooKeeper write performed directly is ensuring that the parent tasks path
 * exists; everything else is returned as operations for the caller to run. With no rollout
 * tasks the group's tasks node is deleted, the status is set to {@code DONE} and a "done" event
 * follows the "started" event. Otherwise the tasks node is set to the rollout tasks starting at
 * index 0 and the status is set to {@code ROLLING_OUT}.
 *
 * @param deploymentGroup the deployment group being rolled out
 * @param client ZooKeeperClient used to ensure the parent path and to test whether the group's
 *     tasks node exists
 * @return the operations and events, both as immutable copies
 * @throws KeeperException declared; presumably raised by the {@code client} calls
 */
public RollingUpdateOp start(final DeploymentGroup deploymentGroup, final ZooKeeperClient client) throws KeeperException {
  client.ensurePath(Paths.statusDeploymentGroupTasks());

  final List<ZooKeeperOperation> txnOps = Lists.newArrayList();
  final List<Map<String, Object>> emitted = Lists.newArrayList();
  final List<RolloutTask> pendingTasks = tasks.getRolloutTasks();
  emitted.add(eventFactory.rollingUpdateStarted(deploymentGroup));

  final String groupTasksPath = Paths.statusDeploymentGroupTasks(deploymentGroup.getName());
  if (client.exists(groupTasksPath) == null) {
    // The delete/set below assumes the node is present, so create it within the same
    // transaction when it is missing. Should the node be created or deleted by someone else
    // before the transaction commits, the transaction fails; that occasionally surfaces as a
    // user-visible error but is better than having inconsistent state.
    txnOps.add(create(groupTasksPath));
  }

  final DeploymentGroupStatus newStatus;
  if (!pendingTasks.isEmpty()) {
    final DeploymentGroupTasks groupTasks = DeploymentGroupTasks.newBuilder().setRolloutTasks(pendingTasks).setTaskIndex(0).setDeploymentGroup(deploymentGroup).build();
    newStatus = DeploymentGroupStatus.newBuilder().setState(ROLLING_OUT).build();
    txnOps.add(set(groupTasksPath, groupTasks));
  } else {
    // Nothing to roll out: the update is finished as soon as it starts.
    newStatus = DeploymentGroupStatus.newBuilder().setState(DONE).build();
    txnOps.add(delete(groupTasksPath));
    emitted.add(eventFactory.rollingUpdateDone(deploymentGroup));
  }

  // NOTE: removing the deployment group removes this node, so this set() makes the transaction
  // fail if the group is gone. It is *important* to have such an operation in the transaction,
  // or we would end up with inconsistent state.
  txnOps.add(set(Paths.statusDeploymentGroup(deploymentGroup.getName()), newStatus));

  return new RollingUpdateOp(ImmutableList.copyOf(txnOps), ImmutableList.copyOf(emitted));
}
Aggregations