Search in sources :

Example 21 with HostStatus

use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.

the class DefaultDeployer method pickHost.

/**
 * Picks a host whose reported status is UP from the given candidates.
 *
 * <p>Works on a private copy of {@code filteredHosts}: asks {@code hostPicker} for a candidate,
 * queries its status through {@code client}, and returns the first candidate whose status is
 * {@code Status.UP}. A candidate with no status or a non-UP status is removed from the copy;
 * once the copy is empty, {@code fail(...)} is invoked.
 *
 * @param filteredHosts the hosts to choose from; the passed list itself is not modified
 * @return the name of a host whose status is UP
 * @throws RuntimeException wrapping the {@link InterruptedException} or
 *     {@link ExecutionException} raised while waiting for a host status
 */
@VisibleForTesting
String pickHost(final List<String> filteredHosts) {
    final List<String> mutatedList = Lists.newArrayList(filteredHosts);
    while (true) {
        final String candidateHost = hostPicker.pickHost(mutatedList);
        try {
            final HostStatus hostStatus = client.hostStatus(candidateHost).get();
            if (hostStatus != null && Status.UP == hostStatus.getStatus()) {
                return candidateHost;
            }
            // Not usable: drop it so the picker cannot hand it back again.
            mutatedList.remove(candidateHost);
            if (mutatedList.isEmpty()) {
                fail("all hosts matching filter pattern are DOWN");
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so code further up the stack can still observe it.
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        }
    }
}
Also used : HostStatus(com.spotify.helios.common.descriptors.HostStatus) ExecutionException(java.util.concurrent.ExecutionException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 22 with HostStatus

use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.

the class HeliosDeploymentResource method before.

/**
 * Ensure that the HeliosDeployment is up.
 *
 * <p>Runs two polling phases, each bounded to 30 seconds: first a plain TCP connect to
 * {@code deployment.address()}, then a check through the Helios client that at least one
 * registered agent reports status UP. In both callables a {@code null} return is used for
 * "not ready yet" (presumably making {@code Polling} retry) and {@code true} for success.
 *
 * @throws Throwable if {@code super.before()} or either polling phase fails
 */
@Override
public void before() throws Throwable {
    super.before();
    log.info("verifying connectivity to {}", deployment.address());
    // wait for the helios master to be available
    Polling.awaitUnchecked(30, TimeUnit.SECONDS, "Could not connect to HeliosDeployment at " + deployment.address() + " after %d %s", new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            final HostAndPort hap = deployment.address();
            final SocketAddress address = new InetSocketAddress(hap.getHost(), hap.getPort());
            log.debug("attempting to connect to {}", address);
            // The socket is only a connectivity probe; try-with-resources closes it on every
            // path so repeated polling does not leak file descriptors.
            try (Socket s = new Socket()) {
                s.connect(address, 100);
                log.info("successfully connected to address {} for {}", address, deployment);
                return true;
            } catch (SocketTimeoutException | ConnectException e) {
                log.debug("could not yet connect to HeliosDeployment: {}", e.toString());
                return null;
            }
        }
    });
    // Ensure that at least one agent is available and UP in this HeliosDeployment.
    // This prevents continuing with the test when starting up helios-solo before the agent is
    // registered.
    final HeliosClient client = client();
    Polling.awaitUnchecked(30, TimeUnit.SECONDS, "No agents were available at HeliosDeployment at " + deployment.address() + " after %d %s", new Callable<Boolean>() {

        @Override
        public Boolean call() throws Exception {
            final ListenableFuture<List<String>> future = client.listHosts();
            final List<String> hosts;
            try {
                // use a short timeout to allow this request to be retried a few times by the
                // Polling.await loop
                hosts = future.get(1, TimeUnit.SECONDS);
            } catch (TimeoutException | InterruptedException e) {
                // An interruption is handled the same way as a timeout here: retry.
                log.debug("timed out waiting for listHosts request to finish, will retry");
                return null;
            }
            if (hosts.isEmpty()) {
                log.debug("0 agents in {}, will retry", deployment);
                return null;
            }
            // Check that at least one host is UP (is maintaining a reasonably reliable
            // connection to ZK) in addition to registering.
            final ListenableFuture<Map<String, HostStatus>> statusFuture = client.hostStatuses(hosts);
            final Map<String, HostStatus> hostStatuses;
            try {
                hostStatuses = statusFuture.get(1, TimeUnit.SECONDS);
            } catch (TimeoutException | InterruptedException e) {
                log.debug("timed out waiting for hostStatuses to finish, will retry");
                return null;
            }
            for (final HostStatus hostStatus : hostStatuses.values()) {
                if (hostStatus != null && hostStatus.getStatus() == HostStatus.Status.UP) {
                    log.info("Ensured that at least one agent is UP in this HeliosDeployment, " + "continuing with test!");
                    return true;
                }
            }
            // Agents are registered but none is UP yet.
            return null;
        }
    });
}
Also used : InetSocketAddress(java.net.InetSocketAddress) HeliosClient(com.spotify.helios.client.HeliosClient) TimeoutException(java.util.concurrent.TimeoutException) SocketTimeoutException(java.net.SocketTimeoutException) ConnectException(java.net.ConnectException) HostAndPort(com.google.common.net.HostAndPort) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) List(java.util.List) HostStatus(com.spotify.helios.common.descriptors.HostStatus) SocketAddress(java.net.SocketAddress) InetSocketAddress(java.net.InetSocketAddress) Map(java.util.Map) Socket(java.net.Socket)

Example 23 with HostStatus

use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.

the class HeliosSoloLogService method runOneIteration.

/**
 * Scans every host's task statuses and starts a log-follow job for each container id that is
 * not yet present in {@code logFutures}.
 *
 * <p>Hosts with no status and tasks without a container id are skipped. Any exception raised
 * during the scan is logged at debug level and otherwise ignored.
 */
@Override
protected void runOneIteration() throws Exception {
    try {
        // walk all hosts of the solo deployment and inspect the tasks each one reports
        for (final String agent : get(heliosClient.listHosts())) {
            final HostStatus agentStatus = get(heliosClient.hostStatus(agent));
            if (agentStatus != null) {
                final Map<JobId, TaskStatus> tasksByJob = agentStatus.getStatuses();
                for (final TaskStatus task : tasksByJob.values()) {
                    final JobId id = task.getJob().getId();
                    final String container = task.getContainerId();
                    // only containers that exist and are not tracked yet get a follower
                    if (!isNullOrEmpty(container) && !logFutures.containsKey(container)) {
                        // attach to the container's stdout/stderr and remember the handle
                        final Future<?> follower = this.executor().submit(new LogFollowJob(container, id));
                        logFutures.put(container, follower);
                    }
                }
            }
        }
    } catch (Exception e) {
        log.debug("Caught exception, will ignore", e);
    }
}
Also used : HostStatus(com.spotify.helios.common.descriptors.HostStatus) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) JobId(com.spotify.helios.common.descriptors.JobId) TimeoutException(java.util.concurrent.TimeoutException) DockerException(com.spotify.docker.client.exceptions.DockerException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) ConnectionClosedException(org.apache.http.ConnectionClosedException)

Example 24 with HostStatus

use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.

the class MultipleHostsTest method testHostStatuses.

/**
 * Starts a master and two agents, waits for both agents to be UP, and verifies that both the
 * CLI ("hosts --json") and the client's hostStatuses call report an entry for each agent.
 */
@Test
public void testHostStatuses() throws Exception {
    final String aHost = testHost() + "a";
    final String bHost = testHost() + "b";
    final ImmutableList<String> agents = ImmutableList.of(aHost, bHost);

    startDefaultMaster();
    for (final String agent : agents) {
        startDefaultAgent(agent);
    }
    for (final String agent : agents) {
        awaitHostStatus(agent, UP, LONG_WAIT_SECONDS, SECONDS);
    }

    // Statuses as reported by the command line interface.
    final ObjectMapper mapper = new ObjectMapper();
    final TypeReference<Map<String, HostStatus>> statusMapType = new TypeReference<Map<String, HostStatus>>() {
    };
    final Map<String, HostStatus> fromCli = mapper.readValue(cli("hosts", "--json"), statusMapType);
    for (final String agent : agents) {
        assertTrue("status must contain key for " + agent, fromCli.containsKey(agent));
    }

    // Statuses as reported by the client API.
    final HeliosClient heliosClient = defaultClient();
    final Map<String, HostStatus> fromClient = heliosClient.hostStatuses(agents).get();
    for (final String agent : agents) {
        assertTrue("status must contain key for " + agent, fromClient.containsKey(agent));
    }
}
Also used : HostStatus(com.spotify.helios.common.descriptors.HostStatus) HeliosClient(com.spotify.helios.client.HeliosClient) Map(java.util.Map) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Test(org.junit.Test)

Example 25 with HostStatus

use of com.spotify.helios.common.descriptors.HostStatus in project helios by spotify.

the class HeliosSoloDeployment method undeployLeftoverJobs.

/**
 * Undeploy jobs left over by {@link TemporaryJobs}. TemporaryJobs should clean these up,
 * but sometimes a few are left behind for whatever reason.
 *
 * <p>For every host, each job whose goal is not {@code Goal.UNDEPLOY} is explicitly
 * undeployed, and then every listed job is awaited until it is actually gone. Hosts for which
 * no status is returned are skipped. This is best-effort: any exception is logged as a
 * warning and not propagated.
 */
@VisibleForTesting
protected void undeployLeftoverJobs() {
    try {
        // See if there are jobs running on any helios agent. If we are using TemporaryJobs,
        // that class should've undeployed them at this point.
        // Any jobs still running at this point have only been partially cleaned up.
        // We look for jobs via hostStatus() because the job may have been deleted from the master,
        // but the agent may still not have had enough time to undeploy the job from itself.
        final List<String> hosts = heliosClient.listHosts().get();
        for (final String host : hosts) {
            final HostStatus hostStatus = heliosClient.hostStatus(host).get();
            if (hostStatus == null) {
                // Without this guard a missing status would raise an NPE that the outer catch
                // swallows, abandoning the cleanup of every remaining host.
                log.debug("No status available for host {}, skipping.", host);
                continue;
            }
            final Map<JobId, TaskStatus> statuses = hostStatus.getStatuses();
            for (final Map.Entry<JobId, TaskStatus> status : statuses.entrySet()) {
                final JobId jobId = status.getKey();
                final Goal goal = status.getValue().getGoal();
                if (goal != Goal.UNDEPLOY) {
                    log.info("Job {} is still set to {} on host {}. Undeploying it now.", jobId, goal, host);
                    final JobUndeployResponse undeployResponse = heliosClient.undeploy(jobId, host).get();
                    log.info("Undeploy response for job {} is {}.", jobId, undeployResponse.getStatus());
                    if (undeployResponse.getStatus() != JobUndeployResponse.Status.OK) {
                        log.warn("Undeploy response for job {} was not OK. This could mean that something " + "beat the helios-solo master in telling the helios-solo agent to " + "undeploy.", jobId);
                    }
                }
                // Wait regardless of the goal: a job already set to UNDEPLOY may still be present.
                log.info("Waiting for job {} to actually be undeployed...", jobId);
                awaitJobUndeployed(heliosClient, host, jobId, jobUndeployWaitSeconds, TimeUnit.SECONDS);
                log.info("Job {} successfully undeployed.", jobId);
            }
        }
    } catch (Exception e) {
        log.warn("Exception occurred when trying to clean up leftover jobs.", e);
    }
}
Also used : Goal(com.spotify.helios.common.descriptors.Goal) JobUndeployResponse(com.spotify.helios.common.protocol.JobUndeployResponse) HostStatus(com.spotify.helios.common.descriptors.HostStatus) TaskStatus(com.spotify.helios.common.descriptors.TaskStatus) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) JobId(com.spotify.helios.common.descriptors.JobId) DockerCertificateException(com.spotify.docker.client.exceptions.DockerCertificateException) DockerException(com.spotify.docker.client.exceptions.DockerException) ImageNotFoundException(com.spotify.docker.client.exceptions.ImageNotFoundException) UnknownHostException(java.net.UnknownHostException) ExecutionException(java.util.concurrent.ExecutionException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

HostStatus (com.spotify.helios.common.descriptors.HostStatus)28 Test (org.junit.Test)17 JobId (com.spotify.helios.common.descriptors.JobId)13 TaskStatus (com.spotify.helios.common.descriptors.TaskStatus)10 HeliosClient (com.spotify.helios.client.HeliosClient)9 Map (java.util.Map)8 Job (com.spotify.helios.common.descriptors.Job)6 Deployment (com.spotify.helios.common.descriptors.Deployment)5 Matchers.containsString (org.hamcrest.Matchers.containsString)5 DockerClient (com.spotify.docker.client.DockerClient)4 JobUndeployResponse (com.spotify.helios.common.protocol.JobUndeployResponse)4 ExecutionException (java.util.concurrent.ExecutionException)4 AgentMain (com.spotify.helios.agent.AgentMain)3 DeploymentGroup (com.spotify.helios.common.descriptors.DeploymentGroup)3 CreateJobResponse (com.spotify.helios.common.protocol.CreateJobResponse)3 JobDeployResponse (com.spotify.helios.common.protocol.JobDeployResponse)3 TimeoutException (java.util.concurrent.TimeoutException)3 ExceptionMetered (com.codahale.metrics.annotation.ExceptionMetered)2 Timed (com.codahale.metrics.annotation.Timed)2 TypeReference (com.fasterxml.jackson.core.type.TypeReference)2