Search in sources :

Example 1 with ServiceDiscoveryException

use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.

From the class LocalhostTfServingController, method startServingInstance.

/**
 * Starts a Tensorflow serving instance. Executes the tfserving bash script to launch a tensorflow serving
 * server as serving-user and localize the tf-model from HDFS server. It records the PID of the server for monitoring.
 *
 * @param project the project to start the serving in
 * @param user the user starting the serving
 * @param serving the serving instance to start (tfserving modelserver)
 * @throws ServingException if service discovery fails, TLS certificate materialization fails,
 *                          or the serving process cannot be started
 */
public void startServingInstance(Project project, Users user, Serving serving) throws ServingException {
    String script = settings.getSudoersDir() + "/tfserving.sh";
    // TODO(Fabio) this is bad as we don't know if the port is used or not
    // NOTE(review): nextInt(origin, bound) has an EXCLUSIVE upper bound, and grpcPort/restPort
    // can collide with each other or with an already-bound port — confirm acceptable.
    Integer grpcPort = ThreadLocalRandom.current().nextInt(40000, 59999);
    Integer restPort = ThreadLocalRandom.current().nextInt(40000, 59999);
    // Staging directory holding per-serving secrets and the pid file written by the script.
    Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
    ProcessDescriptor processDescriptor;
    try {
        // Build the sudo invocation of tfserving.sh; getFullDockerImageName may hit service
        // discovery, which is why ServiceDiscoveryException is caught here.
        processDescriptor = new ProcessDescriptor.Builder().addCommand("/usr/bin/sudo").addCommand(script).addCommand("start").addCommand(serving.getName()).addCommand(Paths.get(serving.getModelPath(), serving.getModelVersion().toString()).toString()).addCommand(String.valueOf(grpcPort)).addCommand(String.valueOf(restPort)).addCommand(secretDir.toString()).addCommand(project.getName() + USER_NAME_DELIMITER + user.getUsername()).addCommand(serving.isBatchingEnabled() ? "1" : "0").addCommand(project.getName().toLowerCase()).addCommand(projectUtils.getFullDockerImageName(project, true)).setWaitTimeout(2L, TimeUnit.MINUTES).ignoreOutErrStreams(false).build();
        logger.log(Level.INFO, processDescriptor.toString());
    } catch (ServiceDiscoveryException ex) {
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, ex.getMessage(), ex);
    }
    // Materialized TLS certificates to be able to read the model
    if (settings.getHopsRpcTls()) {
        try {
            certificateMaterializer.materializeCertificatesLocal(user.getUsername(), project.getName());
        } catch (IOException e) {
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, e.getMessage(), e);
        } finally {
            // Release lock on the serving entry
            // NOTE(review): this finally runs on SUCCESS too, and the lock is released again
            // in the outer finally below — verify releaseLock is idempotent / intended here.
            servingFacade.releaseLock(project, serving.getId());
        }
    }
    try {
        ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
        if (processResult.getExitCode() != 0) {
            // Startup process failed for some reason
            serving.setCid(CID_STOPPED);
            servingFacade.updateDbObject(serving, project);
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.INFO);
        }
        // Read the pid for TensorFlow Serving server
        Path cidFilePath = Paths.get(secretDir.toString(), "tfserving.pid");
        String cid = Files.readFirstLine(cidFilePath.toFile(), Charset.defaultCharset());
        // Update the info in the db
        serving.setCid(cid);
        serving.setLocalPort(restPort);
        serving.setDeployed(new Date());
        servingFacade.updateDbObject(serving, project);
    } catch (Exception ex) {
        // Startup process failed for some reason.
        // NOTE(review): the ServingException thrown just above on non-zero exit is also caught
        // here, re-marking the serving stopped and re-wrapping — confirm this is intended.
        serving.setCid(CID_STOPPED);
        servingFacade.updateDbObject(serving, project);
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, ex.getMessage(), ex);
    } finally {
        // Clean up materialized certificates regardless of outcome.
        if (settings.getHopsRpcTls()) {
            certificateMaterializer.removeCertificatesLocal(user.getUsername(), project.getName());
        }
        // release lock on the serving entry
        servingFacade.releaseLock(project, serving.getId());
    }
}
Also used : Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) IOException(java.io.IOException) Date(java.util.Date) IOException(java.io.IOException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) ServingException(io.hops.hopsworks.exceptions.ServingException)

Example 2 with ServiceDiscoveryException

use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.

From the class ZookeeperTopicCleanerTimer, method execute.

/**
 * Scheduled cleanup: removes Kafka topics that still exist in Zookeeper but no longer
 * have a matching row in the database (left behind by cascade-deleted projects).
 * Runs once per hour.
 *
 * @param timer the EJB timer that fired this execution
 */
@Schedule(persistent = false, minute = "0", hour = "*")
public void execute(Timer timer) {
    LOGGER.log(Level.INFO, "Running ZookeeperTopicCleanerTimer.");
    try {
        String zkConnectionString = kafkaBrokers.getZookeeperConnectionString();
        Set<String> zkTopics = new HashSet<>();
        try {
            zk = new ZooKeeper(zkConnectionString, Settings.ZOOKEEPER_SESSION_TIMEOUT_MS, new ZookeeperWatcher());
            List<String> topics = zk.getChildren("/brokers/topics", false);
            zkTopics.addAll(topics);
        } catch (IOException ex) {
            // FIX: previously logged as LOGGER.log(SEVERE, "...: ", ex.toString()) — the
            // message had no {0} placeholder, so the exception text was silently dropped.
            // Pass the Throwable itself so the stack trace is recorded.
            LOGGER.log(Level.SEVERE, "Unable to find the zookeeper server", ex);
        } catch (KeeperException ex) {
            LOGGER.log(Level.SEVERE, "Cannot retrieve topic list from Zookeeper", ex);
        } catch (InterruptedException ex) {
            // Restore the interrupt flag so the container can observe the interruption.
            Thread.currentThread().interrupt();
            LOGGER.log(Level.SEVERE, "Cannot retrieve topic list from Zookeeper", ex);
        } finally {
            // Always close the ZK session and null the field so the next run reconnects.
            if (zk != null) {
                try {
                    zk.close();
                } catch (InterruptedException ex) {
                    Thread.currentThread().interrupt();
                    LOGGER.log(Level.SEVERE, "Unable to close zookeeper connection", ex);
                }
                zk = null;
            }
        }
        List<ProjectTopics> dbProjectTopics = em.createNamedQuery("ProjectTopics.findAll").getResultList();
        Set<String> dbTopics = new HashSet<>();
        for (ProjectTopics pt : dbProjectTopics) {
            dbTopics.add(pt.getTopicName());
        }
        /*
       * To remove topics from zookeeper which do not exist in database. This
       * situation
       * happens when a hopsworks project is deleted, because all the topics in
       * the project
       * will be deleted (cascade delete) without deleting them from the Kafka
       * cluster.
       * 1. get all topics from zookeeper
       * 2. get the topics which exist in zookeeper, but not in database
       * zkTopics.removeAll(dbTopics);
       * 3. remove those topics
       */
        zkTopics.removeAll(dbTopics);
        // DON'T remove offset topic
        zkTopics.remove(offsetTopic);
        if (!zkTopics.isEmpty()) {
            // blocks until all are deleted
            try {
                hopsKafkaAdminClient.deleteTopics(zkTopics).all().get();
                LOGGER.log(Level.INFO, "Removed topics {0} from Kafka", new Object[] { zkTopics });
            } catch (ExecutionException ex) {
                LOGGER.log(Level.SEVERE, "Error dropping topics from Kafka", ex);
            } catch (InterruptedException ex) {
                Thread.currentThread().interrupt();
                LOGGER.log(Level.SEVERE, "Error dropping topics from Kafka", ex);
            }
        }
    } catch (ServiceDiscoveryException ex) {
        LOGGER.log(Level.SEVERE, "Could not discover Zookeeper server addresses", ex);
    } catch (Exception ex) {
        // Last-resort guard: a timer method must not propagate, or the container may cancel it.
        LOGGER.log(Level.SEVERE, "Got an exception while cleaning up kafka topics", ex);
    }
}
Also used : IOException(java.io.IOException) KeeperException(org.apache.zookeeper.KeeperException) IOException(java.io.IOException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) ExecutionException(java.util.concurrent.ExecutionException) ZooKeeper(org.apache.zookeeper.ZooKeeper) ProjectTopics(io.hops.hopsworks.persistence.entity.kafka.ProjectTopics) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) ExecutionException(java.util.concurrent.ExecutionException) KeeperException(org.apache.zookeeper.KeeperException) HashSet(java.util.HashSet) Schedule(javax.ejb.Schedule)

Example 3 with ServiceDiscoveryException

use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.

From the class LivyController, method getLivySession.

/**
 * Fetches a single Livy session by its identifier.
 *
 * @param sessionId the Livy session id
 * @return the session, or {@code null} if it does not exist or Livy cannot be discovered
 */
public LivyMsg.Session getLivySession(int sessionId) {
    Client restClient = ClientBuilder.newClient();
    try {
        // Resolve the Livy endpoint, then fetch the single session resource.
        WebTarget sessionTarget = restClient.target(getLivyURL()).path("/sessions/" + sessionId);
        return sessionTarget.request().get(LivyMsg.Session.class);
    } catch (NotFoundException | ServiceDiscoveryException e) {
        // Unknown session or undiscoverable Livy service — report absence with null.
        LOGGER.log(Level.WARNING, null, e);
        return null;
    } finally {
        // The JAX-RS client holds connections; always release it.
        restClient.close();
    }
}
Also used : NotFoundException(javax.ws.rs.NotFoundException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) WebTarget(javax.ws.rs.client.WebTarget) Client(javax.ws.rs.client.Client)

Example 4 with ServiceDiscoveryException

use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.

From the class LivyController, method getLivySessions.

/**
 * Lists all Livy sessions.
 *
 * @return the sessions wrapper, or {@code null} if the Livy service cannot be discovered
 */
public LivyMsg getLivySessions() {
    Client restClient = ClientBuilder.newClient();
    try {
        // Resolve the Livy endpoint and fetch the full session list in one chain.
        return restClient.target(getLivyURL()).path("/sessions").request().get(LivyMsg.class);
    } catch (ServiceDiscoveryException ex) {
        // Livy endpoint could not be resolved — report absence with null.
        LOGGER.log(Level.WARNING, null, ex);
        return null;
    } finally {
        // The JAX-RS client holds connections; always release it.
        restClient.close();
    }
}
Also used : ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) WebTarget(javax.ws.rs.client.WebTarget) Client(javax.ws.rs.client.Client)

Example 5 with ServiceDiscoveryException

use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.

From the class FlinkController, method startJob.

/**
 * Starts a Flink job as the given user.
 *
 * @param job the job to start; must be of type FLINK
 * @param user the user starting the job
 * @return the Execution created for this run
 * @throws GenericException if the Flink job object could not be instantiated
 * @throws JobException if the HDFS proxy user could not be obtained
 * @throws ServiceException if the Hopsworks REST endpoint could not be discovered
 */
public Execution startJob(final Jobs job, final Users user) throws GenericException, JobException, ServiceException {
    // First: some parameter checking.
    if (job == null) {
        throw new NullPointerException("Cannot run a null job.");
    } else if (user == null) {
        throw new NullPointerException("Cannot run a job as a null user.");
    } else if (job.getJobType() != JobType.FLINK) {
        throw new IllegalArgumentException("Job configuration is not a Flink job configuration.");
    }
    // Set Hopsworks consul service domain, don't use the address, use the name
    String username = hdfsUsersBean.getHdfsUserName(job.getProject(), user);
    FlinkJob flinkjob = null;
    try {
        String hopsworksRestEndpoint = "https://" + serviceDiscoveryController.constructServiceFQDNWithPort(ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP);
        UserGroupInformation proxyUser = ugiService.getProxyUser(username);
        try {
            // Instantiate the FlinkJob as the project user via doAs.
            flinkjob = proxyUser.doAs((PrivilegedExceptionAction<FlinkJob>) () -> new FlinkJob(job, submitter, user, hdfsUsersBean.getHdfsUserName(job.getProject(), job.getCreator()), settings, kafkaBrokers.getKafkaBrokersString(), hopsworksRestEndpoint, servingConfig, serviceDiscoveryController));
        } catch (InterruptedException ex) {
            // FIX: restore the interrupt flag instead of swallowing it; flinkjob stays
            // null and the GenericException below is raised to the caller.
            Thread.currentThread().interrupt();
            LOGGER.log(Level.SEVERE, null, ex);
        }
    } catch (IOException ex) {
        throw new JobException(RESTCodes.JobErrorCode.PROXY_ERROR, Level.SEVERE, "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
    } catch (ServiceDiscoveryException ex) {
        throw new ServiceException(RESTCodes.ServiceErrorCode.SERVICE_NOT_FOUND, Level.SEVERE, "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
    }
    if (flinkjob == null) {
        // FIX: error detail previously said "sparkjob object was null" — a copy/paste
        // leftover from the Spark controller; this is a Flink job.
        throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ERROR, Level.WARNING, "Could not instantiate job with name: " + job.getName() + " and id: " + job.getId(), "flinkjob object was null");
    }
    Execution execution = flinkjob.requestExecutionId();
    submitter.startExecution(flinkjob);
    activityFacade.persistActivity(ActivityFacade.RAN_JOB, job.getProject(), user.asUser(), ActivityFlag.JOB);
    return execution;
}
Also used : PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) GenericException(io.hops.hopsworks.exceptions.GenericException) JobException(io.hops.hopsworks.exceptions.JobException) Execution(io.hops.hopsworks.persistence.entity.jobs.history.Execution) ServiceException(io.hops.hopsworks.exceptions.ServiceException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)

Aggregations

ServiceDiscoveryException (com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException)32 IOException (java.io.IOException)16 ServiceException (io.hops.hopsworks.exceptions.ServiceException)11 Service (com.logicalclocks.servicediscoverclient.service.Service)7 FeaturestoreException (io.hops.hopsworks.exceptions.FeaturestoreException)6 SQLException (java.sql.SQLException)4 Level (java.util.logging.Level)4 TransactionAttribute (javax.ejb.TransactionAttribute)4 FeaturestoreStorageConnectorDTO (io.hops.hopsworks.common.featurestore.storageconnectors.FeaturestoreStorageConnectorDTO)3 ServiceDiscoveryController (io.hops.hopsworks.common.hosts.ServiceDiscoveryController)3 Dataset (io.hops.hopsworks.persistence.entity.dataset.Dataset)3 Featurestore (io.hops.hopsworks.persistence.entity.featurestore.Featurestore)3 ArrayList (java.util.ArrayList)3 Date (java.util.Date)3 List (java.util.List)3 HttpHost (org.apache.http.HttpHost)3 TemplateException (freemarker.template.TemplateException)2 FeaturestoreHopsfsConnectorDTO (io.hops.hopsworks.common.featurestore.storageconnectors.hopsfs.FeaturestoreHopsfsConnectorDTO)2 Settings (io.hops.hopsworks.common.util.Settings)2 ProjectException (io.hops.hopsworks.exceptions.ProjectException)2