Search in sources :

Example 6 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class LocalhostServingController method startOrStop.

/**
 * Starts or stops a serving instance, depending on the given command.
 *
 * The serving lock is acquired first. A START command is only honoured while the serving status is
 * STARTING, and a STOP command only while it is UPDATING; any other combination releases the lock
 * again and is rejected.
 *
 * @param project the project where the serving resides
 * @param user the user making the request
 * @param servingId the id of the serving
 * @param command the command (start or stop)
 * @throws ServingException if the lock could not be acquired, or if the command does not match the
 *                          current status of the serving
 */
@Override
public void startOrStop(Project project, Users user, Integer servingId, ServingCommands command) throws ServingException {
    Serving serving = servingFacade.acquireLock(project, servingId);
    ServingStatusEnum currentStatus = getServingStatus(serving);
    boolean startRequested = command == ServingCommands.START;

    // From here on this request holds the lock on the serving.
    if (startRequested && currentStatus == ServingStatusEnum.STARTING) {
        startServingInstance(project, user, serving);
        return;
    }

    // getServingStatus returns UPDATING if the PID is different than -2 and there is a lock,
    // and the lock is the one acquired above.
    if (command == ServingCommands.STOP && currentStatus == ServingStatusEnum.UPDATING) {
        killServingInstance(project, serving, true);
        return;
    }

    // Nothing to do for this command: give the lock back before reporting the error.
    servingFacade.releaseLock(project, servingId);
    ServingStatusEnum reportedStatus = startRequested ? ServingStatusEnum.STARTED : ServingStatusEnum.STOPPED;
    String userMsg = "Instance is already " + reportedStatus.toString().toLowerCase();
    throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERROR, Level.FINE, userMsg);
}
Also used : Serving(io.hops.hopsworks.persistence.entity.serving.Serving) ServingException(io.hops.hopsworks.exceptions.ServingException)

Example 7 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class LocalhostServingController method getServingInternal.

/**
 * Builds the internal representation of a serving. The internal representation carries extra
 * information that is not exposed to the user, such as status, available replicas, internal port,
 * internal IPs, internal path and the Kafka topic details.
 *
 * @param serving the serving to get the internal representation for
 * @return internal representation of the serving
 * @throws ServingException if no internal host could be discovered for the serving instance
 * @throws UnsupportedOperationException if the model server is neither Tensorflow Serving nor Python
 */
private ServingWrapper getServingInternal(Serving serving) throws ServingException {
    ServingWrapper wrapper = new ServingWrapper(serving);

    ServingStatusEnum status = getServingStatus(serving);
    wrapper.setStatus(status);
    switch (status) {
        case RUNNING:
            // Only a running instance has a replica and a port to expose.
            wrapper.setAvailableReplicas(1);
            wrapper.setInternalPort(serving.getLocalPort());
            break;
        case STOPPED:
        case STARTING:
        case UPDATING:
            wrapper.setAvailableReplicas(0);
            wrapper.setInternalPort(null);
            break;
        default:
            // Any other status leaves replicas and port untouched.
            break;
    }

    String internalIP;
    try {
        internalIP = serviceDiscoveryController
            .getAnyAddressOfServiceWithDNS(ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP)
            .getAddress();
    } catch (ServiceDiscoveryException e) {
        String userMsg = "Could not find internal host for serving instance '" + serving.getName() + "'";
        throw new ServingException(RESTCodes.ServingErrorCode.STATUSERROR, Level.FINE, userMsg);
    }
    wrapper.setInternalIPs(Collections.singletonList(internalIP));

    // The inference path depends on which model server backs the serving.
    ModelServer modelServer = serving.getModelServer();
    String path;
    if (modelServer == ModelServer.TENSORFLOW_SERVING) {
        path = localhostTfInferenceUtils.getPath(serving.getName(), serving.getModelVersion(), null);
    } else if (modelServer == ModelServer.PYTHON) {
        path = localhostSkLearnInferenceUtils.getPath(null);
    } else {
        throw new UnsupportedOperationException("Model server not supported as local serving");
    }
    wrapper.setInternalPath(path);

    // External host and port are left unset on purpose: the UI reads them from the location href
    // (client-side), so the displayed values stay correct behind proxies or SSH tunnels, where the
    // port might differ from the default 80 or 443.
    wrapper.setExternalIP(null);
    wrapper.setExternalPort(null);

    wrapper.setKafkaTopicDTO(kafkaServingHelper.buildTopicDTO(serving));
    return wrapper;
}
Also used : ServingException(io.hops.hopsworks.exceptions.ServingException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException)

Example 8 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class LocalhostServingMonitor method monitor.

/**
 * Timer callback that checks every running localhost serving instance and cleans up the ones whose
 * process is no longer alive.
 *
 * For each serving the lock is acquired, the model-server script is run with the "alive" command
 * and, on a non-zero exit code, the script is run again with "kill" and the database entry is
 * marked as stopped. The lock is released in a finally block so that an unexpected runtime failure
 * while processing one serving does not leave it locked.
 *
 * @param timer the timer that triggered this invocation (not used by the method body)
 */
@Timeout
public void monitor(Timer timer) {
    try {
        // Get the list of running Localhost Serving instances
        List<Serving> servingList = servingFacade.getLocalhostRunning();
        for (Serving serving : servingList) {
            try {
                Serving dbServing = servingFacade.acquireLock(serving.getProject(), serving.getId());
                try {
                    ProcessDescriptor.Builder builder = new ProcessDescriptor.Builder().addCommand("/usr/bin/sudo");
                    if (serving.getModelServer() == ModelServer.TENSORFLOW_SERVING) {
                        builder.addCommand(tfScript);
                    }
                    if (serving.getModelServer() == ModelServer.PYTHON) {
                        builder.addCommand(sklearnScript);
                    }
                    ProcessDescriptor processDescriptor = builder.addCommand("alive").addCommand(dbServing.getProject().getName().toLowerCase()).addCommand(dbServing.getName()).ignoreOutErrStreams(true).build();
                    LOGGER.log(Level.FINE, processDescriptor.toString());
                    try {
                        ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
                        if (processResult.getExitCode() != 0) {
                            // The process is dead, run the kill script to delete the directory
                            // and update the value in the db
                            Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
                            builder = new ProcessDescriptor.Builder().addCommand("/usr/bin/sudo");
                            if (serving.getModelServer() == ModelServer.TENSORFLOW_SERVING) {
                                builder.addCommand(tfScript);
                            }
                            if (serving.getModelServer() == ModelServer.PYTHON) {
                                builder.addCommand(sklearnScript);
                            }
                            processDescriptor = builder.addCommand("kill").addCommand(dbServing.getCid()).addCommand(dbServing.getName()).addCommand(dbServing.getProject().getName().toLowerCase()).addCommand(secretDir.toString()).ignoreOutErrStreams(true).build();
                            LOGGER.log(Level.FINE, processDescriptor.toString());
                            osProcessExecutor.execute(processDescriptor);
                            // The kill script ran without an IOException: mark the serving as stopped in the db.
                            // NOTE(review): the exit code of the kill script is not checked here.
                            dbServing.setCid(CID_STOPPED);
                            dbServing.setLocalPort(-1);
                            servingFacade.updateDbObject(dbServing, dbServing.getProject());
                        }
                    } catch (IOException e) {
                        LOGGER.log(Level.SEVERE, "Could not clean up serving instance with id: " + serving.getId(), e);
                    }
                } finally {
                    // Always give the lock back, also when an unchecked exception escapes the block above;
                    // otherwise the serving would stay locked after a failed monitoring round.
                    servingFacade.releaseLock(serving.getProject(), serving.getId());
                }
            } catch (ServingException e) {
                LOGGER.log(Level.INFO, "Error processing serving instance with id: " + serving.getId(), e);
            }
        }
    } catch (Exception e) {
        LOGGER.log(Level.SEVERE, "Got an exception while monitoring servings", e);
    }
}
Also used : Serving(io.hops.hopsworks.persistence.entity.serving.Serving) Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException) IOException(java.io.IOException) ServingException(io.hops.hopsworks.exceptions.ServingException) Timeout(javax.ejb.Timeout)

Example 9 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class ServingFacade method releaseLock.

/**
 * Clears the lock fields (lock IP and lock timestamp) of a serving, in a new transaction.
 *
 * The row is read with a pessimistic write lock; if obtaining that row lock times out, the
 * operation is retried, up to five attempts in total.
 *
 * @param project the project the serving belongs to
 * @param id the id of the serving
 * @return the merged serving entity with its lock fields cleared
 * @throws ServingException if every attempt timed out waiting for the row lock
 */
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Serving releaseLock(Project project, Integer id) throws ServingException {
    final int maxAttempts = 5;
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
        try {
            // Take a pessimistic write lock on the row while the lock fields are reset
            Serving lockedServing = em.createNamedQuery("Serving.findByProjectAndId", Serving.class)
                .setParameter("project", project)
                .setParameter("id", id)
                .setLockMode(LockModeType.PESSIMISTIC_WRITE)
                .getSingleResult();
            lockedServing.setLockIP(null);
            lockedServing.setLockTimestamp(null);
            return em.merge(lockedServing);
        } catch (LockTimeoutException e) {
            // Row lock not obtained in time: fall through to the next attempt
        }
    }
    // All attempts timed out, the serving lock could not be released
    throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.FINE);
}
Also used : Serving(io.hops.hopsworks.persistence.entity.serving.Serving) ServingException(io.hops.hopsworks.exceptions.ServingException) LockTimeoutException(javax.persistence.LockTimeoutException) TransactionAttribute(javax.ejb.TransactionAttribute)

Example 10 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class ServingFacade method acquireLock.

/**
 * Tries to take the serving lock for this node by writing the node IP and the current time into
 * the lock fields of the serving, in a new transaction.
 *
 * The row is read with a pessimistic write lock. An attempt fails when the row lock times out or
 * when another lock is already set and is younger than LOCK_TIMEOUT; up to five attempts are made.
 *
 * @param project the project the serving belongs to
 * @param id the id of the serving
 * @return the merged serving entity, now carrying this node's lock
 * @throws ServingException if the node IP is unknown, if no serving exists for the given project
 *                          and id, or if the lock could not be obtained within the attempts
 */
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
public Serving acquireLock(Project project, Integer id) throws ServingException {
    if (nodeIP == null) {
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE);
    }
    int retries = 5;
    while (retries > 0) {
        try {
            // Take a pessimistic write lock on the row while the lock fields are inspected and updated
            Serving serving = em.createNamedQuery("Serving.findByProjectAndId", Serving.class).setParameter("project", project).setParameter("id", id).setLockMode(LockModeType.PESSIMISTIC_WRITE).getSingleResult();
            boolean heldByAnotherRequest = serving.getLockIP() != null && serving.getLockTimestamp() > System.currentTimeMillis() - LOCK_TIMEOUT;
            if (!heldByAnotherRequest) {
                serving.setLockIP(nodeIP);
                serving.setLockTimestamp(System.currentTimeMillis());
                // Lock acquired, return
                return em.merge(serving);
            }
            // There is another request working on this entry: use up one attempt and try again
            retries--;
        } catch (javax.persistence.NoResultException e) {
            // getSingleResult() never returns null; a missing row is reported through this exception,
            // so this is where "instance not found" has to be raised
            throw new ServingException(RESTCodes.ServingErrorCode.INSTANCENOTFOUND, Level.WARNING);
        } catch (LockTimeoutException e) {
            retries--;
        }
    }
    throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.FINE, "Instance is busy. Please, " + "try later");
}
Also used : Serving(io.hops.hopsworks.persistence.entity.serving.Serving) ServingException(io.hops.hopsworks.exceptions.ServingException) LockTimeoutException(javax.persistence.LockTimeoutException) TransactionAttribute(javax.ejb.TransactionAttribute)

Aggregations

ServingException (io.hops.hopsworks.exceptions.ServingException)14 IOException (java.io.IOException)7 ProcessDescriptor (io.hops.hopsworks.common.util.ProcessDescriptor)6 Path (java.nio.file.Path)6 Serving (io.hops.hopsworks.persistence.entity.serving.Serving)5 ProcessResult (io.hops.hopsworks.common.util.ProcessResult)3 ServiceDiscoveryException (com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException)2 ProjectTopics (io.hops.hopsworks.persistence.entity.kafka.ProjectTopics)2 Subjects (io.hops.hopsworks.persistence.entity.kafka.schemas.Subjects)2 Date (java.util.Date)2 Matcher (java.util.regex.Matcher)2 Pattern (java.util.regex.Pattern)2 TransactionAttribute (javax.ejb.TransactionAttribute)2 LockTimeoutException (javax.persistence.LockTimeoutException)2 Strings (com.google.common.base.Strings)1 ProjectTopicsFacade (io.hops.hopsworks.common.dao.kafka.ProjectTopicsFacade)1 TopicDTO (io.hops.hopsworks.common.dao.kafka.TopicDTO)1 ServingFacade (io.hops.hopsworks.common.dao.serving.ServingFacade)1 DistributedFileSystemOps (io.hops.hopsworks.common.hdfs.DistributedFileSystemOps)1 Utils (io.hops.hopsworks.common.hdfs.Utils)1