use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.
the class LocalhostServingController method startOrStop.
/**
 * Starts or stops a serving instance, depending on the command issued by the user. The actual work is
 * delegated to {@code startServingInstance} / {@code killServingInstance}.
 *
 * The serving is locked first and its status is read afterwards, so the status checked here is the one
 * reported while this request holds the lock: a start is only honoured when the status is STARTING and a
 * stop only when it is UPDATING. For any other combination the lock is released and the request is rejected.
 *
 * @param project the project where the serving resides
 * @param user the user making the request
 * @param servingId the id of the serving
 * @param command the command (start or stop)
 * @throws ServingException if the lock could not be acquired, or if the instance is already in the
 * requested state
 */
@Override
public void startOrStop(Project project, Users user, Integer servingId, ServingCommands command) throws ServingException {
  Serving lockedServing = servingFacade.acquireLock(project, servingId);
  ServingStatusEnum statusUnderLock = getServingStatus(lockedServing);
  boolean startRequested = command == ServingCommands.START;

  // From here on this request owns the lock on the serving.
  if (startRequested && statusUnderLock == ServingStatusEnum.STARTING) {
    startServingInstance(project, user, lockedServing);
    return;
  }

  // Per the original note: getServingStatus returns UPDATING if the PID is different than -2 and there
  // is a lock, which is the case for a live instance now that the lock has been taken.
  if (command == ServingCommands.STOP && statusUnderLock == ServingStatusEnum.UPDATING) {
    killServingInstance(project, lockedServing, true);
    return;
  }

  // Nothing to do for this command: give the lock back before reporting the error.
  servingFacade.releaseLock(project, servingId);
  ServingStatusEnum alreadyReached = startRequested ? ServingStatusEnum.STARTED : ServingStatusEnum.STOPPED;
  String userMsg = "Instance is already " + alreadyReached.toString().toLowerCase();
  throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERROR, Level.FINE, userMsg);
}
use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.
the class LocalhostServingController method getServingInternal.
/**
 * Gets the internal representation of a serving. The internal representation contains extra information
 * that is not exposed to the user, such as status, available replicas, internal port/IP/path and extended
 * kafka details.
 *
 * @param serving the serving to get the internal representation for
 * @return internal representation of the serving
 * @throws ServingException if no internal host could be resolved through service discovery
 * @throws UnsupportedOperationException if the model server is neither Tensorflow Serving nor Python
 */
private ServingWrapper getServingInternal(Serving serving) throws ServingException {
  ServingWrapper servingWrapper = new ServingWrapper(serving);
  ServingStatusEnum status = getServingStatus(serving);
  servingWrapper.setStatus(status);
  switch (status) {
    case STOPPED:
    case STARTING:
    case UPDATING:
      // Not serving requests in these states: no replica and no port to expose.
      servingWrapper.setAvailableReplicas(0);
      servingWrapper.setInternalPort(null);
      break;
    case RUNNING:
      servingWrapper.setAvailableReplicas(1);
      servingWrapper.setInternalPort(serving.getLocalPort());
      break;
    default:
      // Any other status leaves replicas and port as initialised by the wrapper.
      break;
  }

  String internalIP;
  try {
    internalIP = serviceDiscoveryController
        .getAnyAddressOfServiceWithDNS(ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP)
        .getAddress();
  } catch (ServiceDiscoveryException e) {
    String userMsg = "Could not find internal host for serving instance '" + serving.getName() + "'";
    // Keep the discovery failure as developer message and cause so it is not lost when rethrowing.
    throw new ServingException(RESTCodes.ServingErrorCode.STATUSERROR, Level.FINE, userMsg, e.getMessage(), e);
  }
  servingWrapper.setInternalIPs(Collections.singletonList(internalIP));

  String path;
  if (serving.getModelServer() == ModelServer.TENSORFLOW_SERVING) {
    path = localhostTfInferenceUtils.getPath(serving.getName(), serving.getModelVersion(), null);
  } else if (serving.getModelServer() == ModelServer.PYTHON) {
    path = localhostSkLearnInferenceUtils.getPath(null);
  } else {
    throw new UnsupportedOperationException("Model server not supported as local serving");
  }
  servingWrapper.setInternalPath(path);

  // These values will be fetched from the location href in the UI (client-side). By doing this, we make sure
  // that we display the correct host and port to reach Hopsworks. For instance, using proxies or SSH
  // tunneling, the port might differ from the default 80 or 443 on the client side.
  servingWrapper.setExternalIP(null);
  servingWrapper.setExternalPort(null);
  servingWrapper.setKafkaTopicDTO(kafkaServingHelper.buildTopicDTO(serving));
  return servingWrapper;
}
use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.
the class LocalhostServingMonitor method monitor.
/**
 * Timer callback that checks every localhost serving reported as running by the facade. For each one it
 * takes the serving lock, runs the model server script with the "alive" command and, when that exits with
 * a non-zero code, runs the script's "kill" command and marks the serving as stopped in the database
 * (cid set to CID_STOPPED, local port set to -1).
 *
 * The lock taken for a serving is always released again, also when the check fails unexpectedly, so a
 * failure here does not leave the serving locked until the lock times out.
 *
 * @param timer the timer that triggered this invocation (not used)
 */
@Timeout
public void monitor(Timer timer) {
  try {
    // Get the list of running Localhost Serving instances
    List<Serving> servingList = servingFacade.getLocalhostRunning();
    for (Serving serving : servingList) {
      try {
        Serving dbServing = servingFacade.acquireLock(serving.getProject(), serving.getId());
        try {
          ProcessDescriptor processDescriptor = newServingScriptBuilder(serving.getModelServer())
              .addCommand("alive")
              .addCommand(dbServing.getProject().getName().toLowerCase())
              .addCommand(dbServing.getName())
              .ignoreOutErrStreams(true)
              .build();
          LOGGER.log(Level.FINE, processDescriptor.toString());
          try {
            ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
            if (processResult.getExitCode() != 0) {
              // The process is dead, run the kill script to delete the directory
              // and update the value in the db
              Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
              processDescriptor = newServingScriptBuilder(serving.getModelServer())
                  .addCommand("kill")
                  .addCommand(dbServing.getCid())
                  .addCommand(dbServing.getName())
                  .addCommand(dbServing.getProject().getName().toLowerCase())
                  .addCommand(secretDir.toString())
                  .ignoreOutErrStreams(true)
                  .build();
              LOGGER.log(Level.FINE, processDescriptor.toString());
              // NOTE(review): the exit code of the kill script is not inspected; the db is updated as
              // long as the script could be executed at all.
              osProcessExecutor.execute(processDescriptor);
              dbServing.setCid(CID_STOPPED);
              dbServing.setLocalPort(-1);
              servingFacade.updateDbObject(dbServing, dbServing.getProject());
            }
          } catch (IOException e) {
            LOGGER.log(Level.SEVERE, "Could not clean up serving instance with id: " + serving.getId(), e);
          }
        } finally {
          // Release in finally: an unchecked failure above must not leave the serving locked.
          servingFacade.releaseLock(serving.getProject(), serving.getId());
        }
      } catch (ServingException e) {
        LOGGER.log(Level.INFO, "Error processing serving instance with id: " + serving.getId(), e);
      }
    }
  } catch (Exception e) {
    LOGGER.log(Level.SEVERE, "Got an exception while monitoring servings", e);
  }
}

/**
 * Starts a sudo command line for the script that manages the given model server. No script is added
 * when the model server is neither Tensorflow Serving nor Python.
 */
private ProcessDescriptor.Builder newServingScriptBuilder(ModelServer modelServer) {
  ProcessDescriptor.Builder builder = new ProcessDescriptor.Builder().addCommand("/usr/bin/sudo");
  if (modelServer == ModelServer.TENSORFLOW_SERVING) {
    builder.addCommand(tfScript);
  }
  if (modelServer == ModelServer.PYTHON) {
    builder.addCommand(sklearnScript);
  }
  return builder;
}
use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.
the class ServingFacade method releaseLock.
/**
 * Clears the lock information (lock IP and lock timestamp) of a serving so it can be locked again.
 *
 * The serving row is loaded under a PESSIMISTIC_WRITE database lock. If obtaining that row lock times
 * out, the lookup is attempted again, up to five times in total.
 *
 * @param project the project the serving belongs to
 * @param id the id of the serving
 * @return the merged serving with its lock fields cleared
 * @throws ServingException if the row lock could not be obtained in any of the attempts
 */
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Serving releaseLock(Project project, Integer id) throws ServingException {
  for (int attemptsLeft = 5; attemptsLeft > 0; attemptsLeft--) {
    try {
      Serving lockedRow = em.createNamedQuery("Serving.findByProjectAndId", Serving.class)
          .setParameter("project", project)
          .setParameter("id", id)
          .setLockMode(LockModeType.PESSIMISTIC_WRITE)
          .getSingleResult();
      lockedRow.setLockIP(null);
      lockedRow.setLockTimestamp(null);
      return em.merge(lockedRow);
    } catch (LockTimeoutException ignored) {
      // Row lock not obtained in time; fall through to the next attempt.
    }
  }
  // Out of attempts. The serving stays marked as locked.
  // NOTE(review): presumably it becomes lockable again once the lock timestamp is older than the
  // timeout checked in acquireLock - confirm.
  throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.FINE);
}
use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.
the class ServingFacade method acquireLock.
/**
 * Marks a serving as locked by this node by storing the node IP and the current time on its row.
 *
 * The row is loaded under a PESSIMISTIC_WRITE database lock. A serving counts as locked by another
 * request when its lock IP is set and its lock timestamp is newer than LOCK_TIMEOUT milliseconds ago; in
 * that case, as well as when the row lock times out, the lookup is attempted again, up to five times in
 * total.
 *
 * @param project the project the serving belongs to
 * @param id the id of the serving
 * @return the merged serving carrying this node's lock
 * @throws ServingException if the node IP is unknown, if no serving with the given id exists in the
 * project, or if the serving is still busy after all attempts
 */
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Serving acquireLock(Project project, Integer id) throws ServingException {
  int retries = 5;
  if (nodeIP == null) {
    throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE);
  }
  while (retries > 0) {
    try {
      // getResultList() rather than getSingleResult(): the latter throws NoResultException instead of
      // returning null, so a missing serving would never be reported as INSTANCENOTFOUND.
      java.util.List<Serving> matches = em.createNamedQuery("Serving.findByProjectAndId", Serving.class)
          .setParameter("project", project)
          .setParameter("id", id)
          .setLockMode(LockModeType.PESSIMISTIC_WRITE)
          .getResultList();
      if (matches.isEmpty()) {
        throw new ServingException(RESTCodes.ServingErrorCode.INSTANCENOTFOUND, Level.WARNING);
      }
      Serving serving = matches.get(0);
      if (serving.getLockIP() != null && serving.getLockTimestamp() > System.currentTimeMillis() - LOCK_TIMEOUT) {
        // There is another request working on this entry and its lock has not expired yet.
        // NOTE(review): the retry happens immediately, without any pause - confirm this is intended.
        retries--;
        continue;
      }
      serving.setLockIP(nodeIP);
      serving.setLockTimestamp(System.currentTimeMillis());
      // Lock acquired, return
      return em.merge(serving);
    } catch (LockTimeoutException e) {
      // Row lock not obtained in time; use up one attempt and try again.
      retries--;
    }
  }
  throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.FINE, "Instance is busy. Please, " + "try later");
}
Aggregations