use of com.spotify.helios.common.HeliosRuntimeException in project helios by spotify.
the class MasterService method setupZookeeperClient.
/**
 * Builds and starts the ZooKeeper client used by this master.
 *
 * <p>Steps performed here:
 * <ol>
 *   <li>If a master password is configured, digest credentials ({@code user:password}) are
 *       prepared for the connection. A password without a username is rejected.</li>
 *   <li>If ACLs are enabled, master user/password and agent user/digest must all be set; an ACL
 *       provider is then built from them.</li>
 *   <li>A Curator client is created with an exponential backoff retry policy (1000 ms base,
 *       3 retries), wrapped in a {@link ZooKeeperClient} and started.</li>
 *   <li>The {@code zkRegistrar} field is assigned a registrar service built around the new
 *       client.</li>
 *   <li>If an ACL provider is in use, the ACL on the root node {@code "/"} is compared with the
 *       desired one and updated when they differ. Failures here are logged, not propagated.</li>
 * </ol>
 *
 * @param config The service configuration.
 * @return A started zookeeper client.
 * @throws HeliosRuntimeException if the credential/ACL configuration is incomplete.
 */
private ZooKeeperClient setupZookeeperClient(final MasterConfig config) {
  ACLProvider aclProvider = null;
  List<AuthInfo> authorization = null;

  final String masterUser = config.getZookeeperAclMasterUser();
  final String masterPassword = config.getZooKeeperAclMasterPassword();
  final String agentUser = config.getZookeeperAclAgentUser();
  final String agentDigest = config.getZooKeeperAclAgentDigest();

  if (!isNullOrEmpty(masterPassword)) {
    if (isNullOrEmpty(masterUser)) {
      throw new HeliosRuntimeException("Master username must be set if a password is set");
    }
    // Encode with an explicit charset: the no-arg getBytes() uses the platform default, which
    // would make the credential bytes depend on the JVM's locale for non-ASCII input.
    final byte[] credentials = String.format("%s:%s", masterUser, masterPassword)
        .getBytes(java.nio.charset.StandardCharsets.UTF_8);
    authorization = Lists.newArrayList(new AuthInfo("digest", credentials));
  }

  if (config.isZooKeeperEnableAcls()) {
    if (isNullOrEmpty(masterUser) || isNullOrEmpty(masterPassword)) {
      throw new HeliosRuntimeException(
          "ZooKeeper ACLs enabled but master username and/or password not set");
    }
    if (isNullOrEmpty(agentUser) || isNullOrEmpty(agentDigest)) {
      throw new HeliosRuntimeException(
          "ZooKeeper ACLs enabled but agent username and/or digest not set");
    }
    aclProvider = heliosAclProvider(
        masterUser, digest(masterUser, masterPassword), agentUser, agentDigest);
  }

  final RetryPolicy zooKeeperRetryPolicy = new ExponentialBackoffRetry(1000, 3);
  final CuratorFramework curator = curatorClientFactory.newClient(
      config.getZooKeeperConnectionString(),
      config.getZooKeeperSessionTimeoutMillis(),
      config.getZooKeeperConnectionTimeoutMillis(),
      zooKeeperRetryPolicy,
      aclProvider,
      authorization);

  final ZooKeeperClient client =
      new DefaultZooKeeperClient(curator, config.getZooKeeperClusterId());
  client.start();

  zkRegistrar = ZooKeeperRegistrarService.newBuilder()
      .setZooKeeperClient(client)
      .setZooKeeperRegistrar(new MasterZooKeeperRegistrar(config.getName()))
      .build();

  // The root node's ACL is reconciled here because this is a place where we have access to
  // the ACL provider.
  if (aclProvider != null) {
    // Best effort: any failure below is only logged and the client is still returned.
    try {
      final List<ACL> curAcls = client.getAcl("/");
      final List<ACL> wantedAcls = aclProvider.getAclForPath("/");
      // Compare as sets so that ordering differences do not trigger an update.
      if (!Sets.newHashSet(curAcls).equals(Sets.newHashSet(wantedAcls))) {
        log.info(
            "Current ACL's on the zookeeper root node differ from desired, updating: {} -> {}",
            curAcls, wantedAcls);
        client.getCuratorFramework().setACL().withACL(wantedAcls).forPath("/");
      }
    } catch (Exception e) {
      log.error("Failed to get/set ACLs on the zookeeper root node", e);
    }
  }

  return client;
}
use of com.spotify.helios.common.HeliosRuntimeException in project helios by spotify.
the class ZooKeeperMasterModel method removeJob.
/**
 * Deletes a job from ZooKeeper.
 *
 * <p>The job's configuration nodes are removed in a single transaction. If that transaction
 * fails because a node still has children, the job is reported as still deployed. After a
 * successful removal the job's history is deleted on a best effort basis.
 *
 * @param id    The id of the job to remove.
 * @param token The token checked against the job via {@code verifyToken} before removal.
 * @return The job that was removed.
 * @throws JobDoesNotExistException   if the job cannot be found, or a node is missing when the
 *                                    transaction runs.
 * @throws JobStillDeployedException  if the transaction fails with a not-empty error.
 * @throws TokenVerificationException declared for the {@code verifyToken} check.
 */
@Override
public Job removeJob(final JobId id, final String token)
    throws JobDoesNotExistException, JobStillDeployedException, TokenVerificationException {
  log.info("removing job: id={}", id);

  final ZooKeeperClient client = provider.get("removeJob");
  final Job existing = getJob(client, id);
  if (existing == null) {
    throw new JobDoesNotExistException(id);
  }
  verifyToken(token, existing);

  // TODO (dano): handle retry failures
  try {
    final ImmutableList.Builder<ZooKeeperOperation> ops = ImmutableList.builder();

    // The creation marker is optional; only delete it when one is recorded.
    final UUID creationId = getJobCreation(client, id);
    if (creationId != null) {
      ops.add(delete(Paths.configJobCreation(id, creationId)));
    }

    ops.add(delete(Paths.configJobHosts(id)));
    ops.add(delete(Paths.configJobRefShort(id)));
    ops.add(delete(Paths.configJob(id)));
    // Rewrite the jobs root node with fresh random data in the same transaction, so that a
    // change down the tree also bumps the root. Effectively, make it that version == cVersion.
    final byte[] rootMarker = UUID.randomUUID().toString().getBytes();
    ops.add(set(Paths.configJobs(), rootMarker));

    client.transaction(ops.build());
  } catch (final NoNodeException e) {
    throw new JobDoesNotExistException(id);
  } catch (final NotEmptyException e) {
    throw new JobStillDeployedException(id, listJobHosts(client, id));
  } catch (final KeeperException e) {
    throw new HeliosRuntimeException("removing job " + id + " failed", e);
  }

  // Delete job history on a best effort basis
  try {
    client.deleteRecursive(Paths.historyJob(id));
  } catch (NoNodeException ignored) {
    // There's no history for this job
  } catch (KeeperException e) {
    log.warn("error removing job history for job {}", id, e);
  }

  return existing;
}
use of com.spotify.helios.common.HeliosRuntimeException in project helios by spotify.
the class ZooKeeperMasterModel method getTaskStatuses.
/**
 * Collects the task status of every job listed for {@code host}.
 *
 * <p>Jobs whose status cannot be read, or whose status is absent, are left out of the result so
 * that the remaining statuses can still be returned.
 *
 * @param client The ZooKeeper client to read with.
 * @param host   The host whose task statuses are wanted.
 * @return A mutable map from job id to task status, containing only the jobs for which a status
 *         was obtained.
 */
private Map<JobId, TaskStatus> getTaskStatuses(final ZooKeeperClient client, final String host) {
  final Map<JobId, TaskStatus> statuses = Maps.newHashMap();
  final List<JobId> jobIds = listHostJobs(client, host);
  for (final JobId jobId : jobIds) {
    final TaskStatus status;
    try {
      status = getTaskStatus(client, host, jobId);
    } catch (HeliosRuntimeException e) {
      // Skip this task status so we can return other available information instead of failing
      // the entire thing, but record the cause so the failure is not indistinguishable from a
      // status that is simply missing.
      log.warn("Failed to get task {} status for host {}", jobId, host, e);
      continue;
    }
    if (status != null) {
      statuses.put(jobId, status);
    } else {
      log.debug("Task {} status missing for host {}", jobId, host);
    }
  }
  return statuses;
}
use of com.spotify.helios.common.HeliosRuntimeException in project helios by spotify.
the class ZooKeeperMasterModel method getDeployment.
/**
 * Looks up the deployment of {@code jobId} on {@code host}.
 *
 * @param host  The host to look at.
 * @param jobId The job whose deployment is wanted.
 * @return The deployment built from the stored task, or {@code null} when no node exists for
 *         this host/job pair.
 * @throws HeliosRuntimeException if reading or parsing the stored task fails for any other
 *                                reason.
 */
@Override
public Deployment getDeployment(final String host, final JobId jobId) {
  final String taskPath = Paths.configHostJob(host, jobId);
  final ZooKeeperClient zk = provider.get("getDeployment");
  try {
    final Task stored = parse(zk.getData(taskPath), Task.class);
    return Deployment.of(
        jobId,
        stored.getGoal(),
        stored.getDeployerUser(),
        stored.getDeployerMaster(),
        stored.getDeploymentGroupName());
  } catch (KeeperException.NoNodeException e) {
    // No node at this path: the job is not deployed on the host.
    return null;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting deployment failed", e);
  }
}
use of com.spotify.helios.common.HeliosRuntimeException in project helios by spotify.
the class ZooKeeperMasterModel method getDeploymentGroupTasks.
/**
 * Reads the tasks of every deployment group found under the deployment group tasks folder.
 *
 * <p>Groups whose node is empty, or whose node disappears between listing and reading, are
 * skipped.
 *
 * @param client The ZooKeeper client to read with.
 * @return A map from deployment group name to its parsed tasks paired with the node's version;
 *         an empty map when the folder itself does not exist.
 * @throws HeliosRuntimeException if listing, reading or parsing fails for any other reason.
 */
private Map<String, VersionedValue<DeploymentGroupTasks>> getDeploymentGroupTasks(
    final ZooKeeperClient client) {
  final String tasksFolder = Paths.statusDeploymentGroupTasks();
  try {
    final List<String> groupNames;
    try {
      groupNames = client.getChildren(tasksFolder);
    } catch (NoNodeException e) {
      // The folder has not been created, so there is nothing to report.
      return Collections.emptyMap();
    }

    final Map<String, VersionedValue<DeploymentGroupTasks>> tasksByGroup = Maps.newHashMap();
    for (final String groupName : groupNames) {
      final String groupPath = Paths.statusDeploymentGroupTasks(groupName);
      try {
        final Node groupNode = client.getNode(groupPath);
        final byte[] payload = groupNode.getBytes();
        final int nodeVersion = groupNode.getStat().getVersion();
        if (payload.length == 0) {
          // This can happen because of ensurePath creates an empty node
          log.debug("Ignoring empty deployment group tasks {}", groupName);
          continue;
        }
        final DeploymentGroupTasks parsed = parse(payload, DeploymentGroupTasks.class);
        tasksByGroup.put(groupName, VersionedValue.of(parsed, nodeVersion));
      } catch (NoNodeException e) {
        // Ignore, the deployment group was deleted before we had a chance to read it.
        log.debug("Ignoring deleted deployment group tasks {}", groupName);
      }
    }
    return tasksByGroup;
  } catch (KeeperException | IOException e) {
    throw new HeliosRuntimeException("getting deployment group tasks failed", e);
  }
}
Aggregations