use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.
the class DefaultDeployer method pickHost.
@VisibleForTesting
String pickHost(final List<String> filteredHosts) {
  final List<String> mutatedList = Lists.newArrayList(filteredHosts);
  // Keep picking candidates until one of them reports Status.UP.
  while (true) {
    final String candidateHost = hostPicker.pickHost(mutatedList);
    try {
      final HostStatus hostStatus = client.hostStatus(candidateHost).get();
      if (hostStatus != null && Status.UP == hostStatus.getStatus()) {
        return candidateHost;
      }
      // The candidate is DOWN or unknown; drop it and try the remaining hosts.
      mutatedList.remove(candidateHost);
      if (mutatedList.isEmpty()) {
        fail("all hosts matching filter pattern are DOWN");
      }
    } catch (InterruptedException | ExecutionException e) {
      throw new RuntimeException(e);
    }
  }
}
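For context, hostPicker only needs to choose one candidate from the remaining list, so the picking strategy is pluggable. A minimal sketch of such a strategy, using a hypothetical interface and a random implementation (the names here are illustrative, not necessarily helios' actual types):

import java.util.List;
import java.util.Random;

// Hypothetical strategy interface; helios' actual type may be named differently.
interface HostPicker {
  String pickHost(List<String> hosts);
}

// Picks uniformly at random among the remaining candidate hosts.
class RandomHostPicker implements HostPicker {
  private final Random random = new Random();

  @Override
  public String pickHost(final List<String> hosts) {
    return hosts.get(random.nextInt(hosts.size()));
  }
}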
use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.
the class HeliosDeploymentResource method before.
/**
 * Ensure that the HeliosDeployment is up.
 */
@Override
public void before() throws Throwable {
  super.before();
  log.info("verifying connectivity to {}", deployment.address());
  // Wait for the helios master to be available.
  Polling.awaitUnchecked(30, TimeUnit.SECONDS,
      "Could not connect to HeliosDeployment at " + deployment.address() + " after %d %s",
      new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
          final HostAndPort hap = deployment.address();
          final SocketAddress address = new InetSocketAddress(hap.getHost(), hap.getPort());
          log.debug("attempting to connect to {}", address);
          // Use try-with-resources so the probe socket is always closed.
          try (final Socket s = new Socket()) {
            s.connect(address, 100);
            log.info("successfully connected to address {} for {}", address, deployment);
            return true;
          } catch (SocketTimeoutException | ConnectException e) {
            log.debug("could not yet connect to HeliosDeployment: {}", e.toString());
            return null;
          }
        }
      });
  // Ensure that at least one agent is available and UP in this HeliosDeployment.
  // This prevents continuing with the test when starting up helios-solo before the agent is
  // registered.
  final HeliosClient client = client();
  Polling.awaitUnchecked(30, TimeUnit.SECONDS,
      "No agents were available at HeliosDeployment at " + deployment.address() + " after %d %s",
      new Callable<Boolean>() {
        @Override
        public Boolean call() throws Exception {
          final ListenableFuture<List<String>> future = client.listHosts();
          final List<String> hosts;
          try {
            // Use a short timeout to allow this request to be retried a few times by the
            // Polling.await loop.
            hosts = future.get(1, TimeUnit.SECONDS);
          } catch (TimeoutException | InterruptedException e) {
            log.debug("timed out waiting for listHosts request to finish, will retry");
            return null;
          }
          if (hosts.isEmpty()) {
            log.debug("0 agents in {}, will retry", deployment);
            return null;
          }
          // Check that at least one host is UP (is maintaining a reasonably reliable
          // connection to ZK) in addition to registering.
          final ListenableFuture<Map<String, HostStatus>> statusFuture = client.hostStatuses(hosts);
          final Map<String, HostStatus> hostStatuses;
          try {
            hostStatuses = statusFuture.get(1, TimeUnit.SECONDS);
          } catch (TimeoutException | InterruptedException e) {
            log.debug("timed out waiting for hostStatuses to finish, will retry");
            return null;
          }
          for (final HostStatus hostStatus : hostStatuses.values()) {
            if (hostStatus != null && hostStatus.getStatus() == HostStatus.Status.UP) {
              log.info("Ensured that at least one agent is UP in this HeliosDeployment, "
                       + "continuing with test!");
              return true;
            }
          }
          return null;
        }
      });
}
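Both callables above rely on the same contract: returning a non-null value stops the polling, while returning null makes it retry until the deadline. A minimal sketch of an awaitUnchecked helper honoring that contract (the real helios Polling utility may differ in signature and details):

import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;

final class Polling {
  private Polling() {
  }

  // Polls the callable until it returns a non-null value, sleeping briefly
  // between attempts; throws unchecked on timeout. The message is a format
  // string that receives the timeout value and unit.
  static <T> T awaitUnchecked(final long timeout, final TimeUnit timeUnit,
                              final String message, final Callable<T> callable) {
    final long deadline = System.nanoTime() + timeUnit.toNanos(timeout);
    while (System.nanoTime() < deadline) {
      try {
        final T result = callable.call();
        if (result != null) {
          return result;
        }
        TimeUnit.MILLISECONDS.sleep(500);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }
    throw new RuntimeException(String.format(message, timeout, timeUnit));
  }
}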
use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.
the class HeliosSoloLogService method runOneIteration.
@Override
protected void runOneIteration() throws Exception {
  try {
    // Fetch all the jobs running on the solo deployment.
    for (final String host : get(heliosClient.listHosts())) {
      final HostStatus hostStatus = get(heliosClient.hostStatus(host));
      if (hostStatus == null) {
        continue;
      }
      final Map<JobId, TaskStatus> statuses = hostStatus.getStatuses();
      for (final TaskStatus status : statuses.values()) {
        final JobId jobId = status.getJob().getId();
        final String containerId = status.getContainerId();
        if (isNullOrEmpty(containerId)) {
          continue;
        }
        if (!logFutures.containsKey(containerId)) {
          // For any containers we're not already tracking, attach to their stdout/stderr.
          final Future<?> future = this.executor().submit(new LogFollowJob(containerId, jobId));
          logFutures.put(containerId, future);
        }
      }
    }
  } catch (Exception e) {
    log.debug("Caught exception, will ignore", e);
  }
}
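runOneIteration is the periodic hook of Guava's AbstractScheduledService, which this service appears to extend given the runOneIteration and executor() hooks; how often it fires is decided by the service's scheduler() override. A minimal self-contained sketch of that wiring (the one-second fixed rate is an assumption for illustration, not necessarily what helios-solo uses):

import com.google.common.util.concurrent.AbstractScheduledService;
import java.util.concurrent.TimeUnit;

class PollingLogService extends AbstractScheduledService {

  @Override
  protected void runOneIteration() throws Exception {
    // Periodic work goes here, e.g. scanning agents for new containers to follow.
  }

  @Override
  protected Scheduler scheduler() {
    // Run the first iteration immediately, then once per second thereafter.
    return Scheduler.newFixedRateSchedule(0, 1, TimeUnit.SECONDS);
  }
}

Starting such a service is then just new PollingLogService().startAsync().awaitRunning(). Note that any exception escaping runOneIteration transitions the service to FAILED and stops the schedule, which is why the method above deliberately catches and ignores all exceptions.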
use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.
the class MultipleHostsTest method testHostStatuses.
@Test
public void testHostStatuses() throws Exception {
  final String aHost = testHost() + "a";
  final String bHost = testHost() + "b";
  startDefaultMaster();
  startDefaultAgent(aHost);
  startDefaultAgent(bHost);
  awaitHostStatus(aHost, UP, LONG_WAIT_SECONDS, SECONDS);
  awaitHostStatus(bHost, UP, LONG_WAIT_SECONDS, SECONDS);
  final Map<String, HostStatus> cliStatuses = new ObjectMapper().readValue(
      cli("hosts", "--json"), new TypeReference<Map<String, HostStatus>>() {});
  assertTrue("status must contain key for " + aHost, cliStatuses.containsKey(aHost));
  assertTrue("status must contain key for " + bHost, cliStatuses.containsKey(bHost));
  final HeliosClient client = defaultClient();
  final Map<String, HostStatus> clientStatuses =
      client.hostStatuses(ImmutableList.of(aHost, bHost)).get();
  assertTrue("status must contain key for " + aHost, clientStatuses.containsKey(aHost));
  assertTrue("status must contain key for " + bHost, clientStatuses.containsKey(bHost));
}
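The new TypeReference<Map<String, HostStatus>>() {} argument is Jackson's idiom for capturing generic type information that would otherwise be erased at runtime. A small standalone illustration of the same pattern (the types here are illustrative):

import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.Map;

class JsonExample {
  public static void main(String[] args) throws Exception {
    final ObjectMapper mapper = new ObjectMapper();
    // Without a TypeReference, Jackson would fall back to Map<String, Object>
    // and lose the declared value type.
    final Map<String, Integer> counts = mapper.readValue(
        "{\"a\": 1, \"b\": 2}",
        new TypeReference<Map<String, Integer>>() {});
    System.out.println(counts.get("b")); // prints 2
  }
}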
use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.
the class HeliosSoloDeployment method undeployLeftoverJobs.
/**
 * Undeploy jobs left over by {@link TemporaryJobs}. TemporaryJobs should clean these up,
 * but sometimes a few are left behind for whatever reason.
 */
@VisibleForTesting
protected void undeployLeftoverJobs() {
  try {
    // See if there are jobs running on any helios agent. If we are using TemporaryJobs,
    // that class should've undeployed them at this point.
    // Any jobs still running at this point have only been partially cleaned up.
    // We look for jobs via hostStatus() because the job may have been deleted from the master,
    // but the agent may still not have had enough time to undeploy the job from itself.
    final List<String> hosts = heliosClient.listHosts().get();
    for (final String host : hosts) {
      final HostStatus hostStatus = heliosClient.hostStatus(host).get();
      final Map<JobId, TaskStatus> statuses = hostStatus.getStatuses();
      for (final Map.Entry<JobId, TaskStatus> status : statuses.entrySet()) {
        final JobId jobId = status.getKey();
        final Goal goal = status.getValue().getGoal();
        if (goal != Goal.UNDEPLOY) {
          log.info("Job {} is still set to {} on host {}. Undeploying it now.",
              jobId, goal, host);
          final JobUndeployResponse undeployResponse = heliosClient.undeploy(jobId, host).get();
          log.info("Undeploy response for job {} is {}.", jobId, undeployResponse.getStatus());
          if (undeployResponse.getStatus() != JobUndeployResponse.Status.OK) {
            log.warn("Undeploy response for job {} was not OK. This could mean that something "
                     + "beat the helios-solo master in telling the helios-solo agent to "
                     + "undeploy.", jobId);
          }
        }
        log.info("Waiting for job {} to actually be undeployed...", jobId);
        awaitJobUndeployed(heliosClient, host, jobId, jobUndeployWaitSeconds, TimeUnit.SECONDS);
        log.info("Job {} successfully undeployed.", jobId);
      }
    }
  } catch (Exception e) {
    log.warn("Exception occurred when trying to clean up leftover jobs.", e);
  }
}
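awaitJobUndeployed comes from helios' test utilities; conceptually it polls the agent's status until the job vanishes from its task map. A hedged sketch of that idea (the signature is taken from the call above, but the body is an assumption, not helios' implementation):

import com.spotify.helios.client.HeliosClient;
import com.spotify.helios.common.descriptors.HostStatus;
import com.spotify.helios.common.descriptors.JobId;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

final class UndeployWait {

  // Polls hostStatus() until jobId disappears from the agent's task map,
  // mirroring the idea behind awaitJobUndeployed (details assumed).
  static void awaitJobUndeployed(final HeliosClient client, final String host,
                                 final JobId jobId, final long timeout,
                                 final TimeUnit timeUnit) throws Exception {
    final long deadline = System.nanoTime() + timeUnit.toNanos(timeout);
    while (System.nanoTime() < deadline) {
      final HostStatus status = client.hostStatus(host).get();
      if (status == null || !status.getStatuses().containsKey(jobId)) {
        return; // the agent no longer reports the job
      }
      TimeUnit.MILLISECONDS.sleep(500);
    }
    throw new TimeoutException("job " + jobId + " still deployed on " + host);
  }
}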