Use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.
The class LocalhostTfServingController, method startServingInstance.
/**
 * Starts a TensorFlow Serving instance. Executes the tfserving bash script, which launches a
 * TensorFlow Serving server as the serving user and localizes the TF model from HDFS. The PID of
 * the server is recorded for monitoring.
 *
 * @param project the project to start the serving in
 * @param user the user starting the serving
 * @param serving the serving instance to start (tfserving modelserver)
 * @throws ServingException if the serving instance could not be started
 */
public void startServingInstance(Project project, Users user, Serving serving) throws ServingException {
  String script = settings.getSudoersDir() + "/tfserving.sh";
  // TODO(Fabio) this is bad as we don't know if the port is used or not
  Integer grpcPort = ThreadLocalRandom.current().nextInt(40000, 59999);
  Integer restPort = ThreadLocalRandom.current().nextInt(40000, 59999);
  Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
  ProcessDescriptor processDescriptor;
  try {
    processDescriptor = new ProcessDescriptor.Builder()
        .addCommand("/usr/bin/sudo")
        .addCommand(script)
        .addCommand("start")
        .addCommand(serving.getName())
        .addCommand(Paths.get(serving.getModelPath(), serving.getModelVersion().toString()).toString())
        .addCommand(String.valueOf(grpcPort))
        .addCommand(String.valueOf(restPort))
        .addCommand(secretDir.toString())
        .addCommand(project.getName() + USER_NAME_DELIMITER + user.getUsername())
        .addCommand(serving.isBatchingEnabled() ? "1" : "0")
        .addCommand(project.getName().toLowerCase())
        .addCommand(projectUtils.getFullDockerImageName(project, true))
        .setWaitTimeout(2L, TimeUnit.MINUTES)
        .ignoreOutErrStreams(false)
        .build();
    logger.log(Level.INFO, processDescriptor.toString());
  } catch (ServiceDiscoveryException ex) {
    throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null,
        ex.getMessage(), ex);
  }
  // Materialize TLS certificates so the serving user is able to read the model
  if (settings.getHopsRpcTls()) {
    try {
      certificateMaterializer.materializeCertificatesLocal(user.getUsername(), project.getName());
    } catch (IOException e) {
      throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null,
          e.getMessage(), e);
    } finally {
      // Release the lock on the serving entry
      servingFacade.releaseLock(project, serving.getId());
    }
  }
  try {
    ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
    if (processResult.getExitCode() != 0) {
      // The startup process failed for some reason
      serving.setCid(CID_STOPPED);
      servingFacade.updateDbObject(serving, project);
      throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.INFO);
    }
    // Read the PID of the TensorFlow Serving server
    Path cidFilePath = Paths.get(secretDir.toString(), "tfserving.pid");
    String cid = Files.readFirstLine(cidFilePath.toFile(), Charset.defaultCharset());
    // Update the info in the database
    serving.setCid(cid);
    serving.setLocalPort(restPort);
    serving.setDeployed(new Date());
    servingFacade.updateDbObject(serving, project);
  } catch (Exception ex) {
    // The startup process failed for some reason
    serving.setCid(CID_STOPPED);
    servingFacade.updateDbObject(serving, project);
    throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null,
        ex.getMessage(), ex);
  } finally {
    if (settings.getHopsRpcTls()) {
      certificateMaterializer.removeCertificatesLocal(user.getUsername(), project.getName());
    }
    // Release the lock on the serving entry
    servingFacade.releaseLock(project, serving.getId());
  }
}
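The TODO above flags a real weakness: drawing the gRPC and REST ports from ThreadLocalRandom can hand out a port that is already in use, or even the same port twice. A minimal sketch of an alternative, assuming nothing beyond java.net, is to let the OS assign a free ephemeral port. The helper name findFreePort is illustrative, not from the Hopsworks codebase, and a race between probing and binding remains possible:

import java.io.IOException;
import java.net.ServerSocket;

// Ask the OS for a currently free port by binding to port 0 and reading back
// the assigned port. The port can still be taken before tfserving binds it,
// but collisions are far less likely than with a random pick in [40000, 59999).
private static int findFreePort() throws IOException {
  try (ServerSocket socket = new ServerSocket(0)) {
    return socket.getLocalPort();
  }
}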
Use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.
The class ZookeeperTopicCleanerTimer, method execute.
// Run once per hour
@Schedule(persistent = false, minute = "0", hour = "*")
public void execute(Timer timer) {
  LOGGER.log(Level.INFO, "Running ZookeeperTopicCleanerTimer.");
  try {
    String zkConnectionString = kafkaBrokers.getZookeeperConnectionString();
    Set<String> zkTopics = new HashSet<>();
    try {
      zk = new ZooKeeper(zkConnectionString, Settings.ZOOKEEPER_SESSION_TIMEOUT_MS, new ZookeeperWatcher());
      List<String> topics = zk.getChildren("/brokers/topics", false);
      zkTopics.addAll(topics);
    } catch (IOException ex) {
      LOGGER.log(Level.SEVERE, "Unable to reach the ZooKeeper server", ex);
    } catch (KeeperException | InterruptedException ex) {
      LOGGER.log(Level.SEVERE, "Cannot retrieve topic list from ZooKeeper", ex);
    } finally {
      if (zk != null) {
        try {
          zk.close();
        } catch (InterruptedException ex) {
          LOGGER.log(Level.SEVERE, "Unable to close the ZooKeeper connection", ex);
        }
        zk = null;
      }
    }
    List<ProjectTopics> dbProjectTopics = em.createNamedQuery("ProjectTopics.findAll").getResultList();
    Set<String> dbTopics = new HashSet<>();
    for (ProjectTopics pt : dbProjectTopics) {
      dbTopics.add(pt.getTopicName());
    }
    /*
     * Remove topics from ZooKeeper that do not exist in the database. This situation
     * arises when a Hopsworks project is deleted: all the topics in the project are
     * removed from the database (cascade delete) without being deleted from the
     * Kafka cluster.
     * 1. get all topics from ZooKeeper
     * 2. keep only the topics that exist in ZooKeeper but not in the database
     * 3. remove those topics from Kafka
     */
    zkTopics.removeAll(dbTopics);
    // DON'T remove the offsets topic
    zkTopics.remove(offsetTopic);
    if (!zkTopics.isEmpty()) {
      // Blocks until all topics are deleted
      try {
        hopsKafkaAdminClient.deleteTopics(zkTopics).all().get();
        LOGGER.log(Level.INFO, "Removed topics {0} from Kafka", new Object[] { zkTopics });
      } catch (ExecutionException | InterruptedException ex) {
        LOGGER.log(Level.SEVERE, "Error dropping topics from Kafka", ex);
      }
    }
  } catch (ServiceDiscoveryException ex) {
    LOGGER.log(Level.SEVERE, "Could not discover ZooKeeper server addresses", ex);
  } catch (Exception ex) {
    LOGGER.log(Level.SEVERE, "Got an exception while cleaning up Kafka topics", ex);
  }
}
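The ZookeeperWatcher passed to the ZooKeeper constructor is not shown in this excerpt. Because the timer only issues a synchronous getChildren call and never registers watches on znodes, a no-op watcher would suffice; the stand-in below is an assumption about the real Hopsworks class, which may additionally log connection events:

import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;

// Hypothetical minimal watcher, not the actual Hopsworks implementation:
// connection events can be ignored since no znode watches are registered.
private static class ZookeeperWatcher implements Watcher {
  @Override
  public void process(WatchedEvent event) {
    // No-op
  }
}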
Use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.
The class LivyController, method getLivySession.
/**
 * Get a Livy session by id.
 *
 * @param sessionId the Livy session id
 * @return the session, or null if it does not exist or Livy could not be reached
 */
public LivyMsg.Session getLivySession(int sessionId) {
  Client client = ClientBuilder.newClient();
  LivyMsg.Session session = null;
  try {
    String livyUrl = getLivyURL();
    WebTarget target = client.target(livyUrl).path("/sessions/" + sessionId);
    session = target.request().get(LivyMsg.Session.class);
  } catch (NotFoundException | ServiceDiscoveryException e) {
    LOGGER.log(Level.WARNING, null, e);
    return null;
  } finally {
    client.close();
  }
  return session;
}
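Note that a caller cannot distinguish a missing session (404) from a service-discovery failure: both come back as null. A minimal caller sketch, where livyController is an assumed injected reference:

LivyMsg.Session session = livyController.getLivySession(sessionId);
if (session == null) {
  // Either the session does not exist or Livy could not be reached
  throw new IllegalStateException("Livy session " + sessionId + " is not available");
}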
Use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.
The class LivyController, method getLivySessions.
/**
 * Get all Livy sessions.
 *
 * @return the sessions known to Livy, or null if Livy could not be reached
 */
public LivyMsg getLivySessions() {
  LivyMsg livySession = null;
  Client client = ClientBuilder.newClient();
  try {
    WebTarget target = client.target(getLivyURL()).path("/sessions");
    livySession = target.request().get(LivyMsg.class);
  } catch (ServiceDiscoveryException ex) {
    LOGGER.log(Level.WARNING, null, ex);
    return null;
  } finally {
    client.close();
  }
  return livySession;
}
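A caller should unwrap the result defensively. The sketch below assumes LivyMsg exposes its session array via getSessions() and each session exposes an id via getId(); neither accessor is shown in this excerpt:

LivyMsg msg = livyController.getLivySessions();
if (msg != null && msg.getSessions() != null) {
  for (LivyMsg.Session s : msg.getSessions()) {
    LOGGER.log(Level.INFO, "Livy session: {0}", s.getId());
  }
}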
Use of com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException in project hopsworks by logicalclocks.
The class FlinkController, method startJob.
public Execution startJob(final Jobs job, final Users user) throws GenericException, JobException, ServiceException {
  // First: some parameter checking.
  if (job == null) {
    throw new NullPointerException("Cannot run a null job.");
  } else if (user == null) {
    throw new NullPointerException("Cannot run a job as a null user.");
  } else if (job.getJobType() != JobType.FLINK) {
    throw new IllegalArgumentException("Job configuration is not a Flink job configuration.");
  }
  // Set the Hopsworks Consul service domain; use the service name, not the address
  String username = hdfsUsersBean.getHdfsUserName(job.getProject(), user);
  FlinkJob flinkjob = null;
  try {
    String hopsworksRestEndpoint = "https://" + serviceDiscoveryController
        .constructServiceFQDNWithPort(ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP);
    UserGroupInformation proxyUser = ugiService.getProxyUser(username);
    try {
      flinkjob = proxyUser.doAs((PrivilegedExceptionAction<FlinkJob>) () ->
          new FlinkJob(job, submitter, user,
              hdfsUsersBean.getHdfsUserName(job.getProject(), job.getCreator()),
              settings, kafkaBrokers.getKafkaBrokersString(),
              hopsworksRestEndpoint, servingConfig, serviceDiscoveryController));
    } catch (InterruptedException ex) {
      LOGGER.log(Level.SEVERE, null, ex);
    }
  } catch (IOException ex) {
    throw new JobException(RESTCodes.JobErrorCode.PROXY_ERROR, Level.SEVERE,
        "job: " + job.getId() + ", user: " + user.getUsername(), ex.getMessage(), ex);
  } catch (ServiceDiscoveryException ex) {
    throw new ServiceException(RESTCodes.ServiceErrorCode.SERVICE_NOT_FOUND, Level.SEVERE,
        "job: " + job.getId() + ", user: " + user.getUsername(), ex.getMessage(), ex);
  }
  if (flinkjob == null) {
    throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ERROR, Level.WARNING,
        "Could not instantiate job with name: " + job.getName() + " and id: " + job.getId(),
        "flinkjob object was null");
  }
  Execution execution = flinkjob.requestExecutionId();
  submitter.startExecution(flinkjob);
  activityFacade.persistActivity(ActivityFacade.RAN_JOB, job.getProject(), user.asUser(), ActivityFlag.JOB);
  return execution;
}
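A minimal caller sketch, assuming flinkController, job, and user come from the enclosing context and that Execution exposes getId(); in Hopsworks the checked exceptions would normally propagate to the REST layer rather than being caught here:

try {
  Execution execution = flinkController.startJob(job, user);
  LOGGER.log(Level.INFO, "Submitted Flink job {0} as execution {1}",
      new Object[] { job.getName(), execution.getId() });
} catch (GenericException | JobException | ServiceException ex) {
  LOGGER.log(Level.SEVERE, "Failed to start Flink job", ex);
}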