Search in sources :

Example 11 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class LocalhostSkLearnServingController method startServingInstance.

/**
 * Starts a SkLearn serving instance. Executes the sklearn bash script to launch a Flask server as serving-user
 * in the project's anaconda environment. It records the PID of the server for monitoring.
 *
 * <p>On success the serving entry is updated with the PID read from the pid file, the chosen local port and the
 * deployment date. On any failure the entry is marked as stopped ({@code CID_STOPPED}) before the exception is
 * propagated. The lock on the serving entry is released before returning, on both the success and failure paths.
 *
 * @param project the project to start the serving in
 * @param user the user starting the serving
 * @param serving the serving instance to start (flask server)
 * @throws ServingException if the start script exits with a non-zero code, if the pid file is still missing or
 *         empty after the wait period, or if any other error occurs while starting the instance
 */
public void startServingInstance(Project project, Users user, Serving serving) throws ServingException {
    String script = settings.getSudoersDir() + "/sklearn_serving.sh";
    Integer port = ThreadLocalRandom.current().nextInt(40000, 59999);
    Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
    // The script expects the bare file name of the predictor in addition to its full path
    String predictorFilename = serving.getPredictor();
    if (serving.getPredictor().contains("/")) {
        String[] splits = serving.getPredictor().split("/");
        predictorFilename = splits[splits.length - 1];
    }
    // Remembered so that the interrupt flag can be restored once cleanup has finished
    boolean interrupted = false;
    try {
        ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
            .addCommand("/usr/bin/sudo")
            .addCommand(script)
            .addCommand("start")
            .addCommand(predictorFilename)
            .addCommand(Paths.get(serving.getPredictor()).toString())
            .addCommand(String.valueOf(port))
            .addCommand(secretDir.toString())
            .addCommand(project.getName() + USER_NAME_DELIMITER + user.getUsername())
            .addCommand(project.getName().toLowerCase())
            .addCommand(settings.getAnacondaProjectDir() + "/bin/python")
            .addCommand(certificateMaterializer.getUserTransientKeystorePath(project, user))
            .addCommand(certificateMaterializer.getUserTransientTruststorePath(project, user))
            .addCommand(certificateMaterializer.getUserTransientPasswordPath(project, user))
            .addCommand(serving.getName())
            .addCommand(projectUtils.getFullDockerImageName(project, false))
            .setWaitTimeout(2L, TimeUnit.MINUTES)
            .ignoreOutErrStreams(true)
            .build();
        logger.log(Level.FINE, processDescriptor.toString());
        // Materialized TLS certificates so that user can read from HDFS inside python script
        // NOTE(review): materialization is unconditional while removal below depends on getHopsRpcTls() --
        // confirm whether that asymmetry is intended.
        certificateMaterializer.materializeCertificatesLocal(user.getUsername(), project.getName());
        ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
        if (processResult.getExitCode() != 0) {
            // Startup process failed for some reason; the entry is marked as stopped in the catch block below
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.WARNING, "Could not start sklearn serving", "out:" + processResult.getStdout() + ", err:" + processResult.getStderr());
        }
        // Read the pid for SkLearn Serving Flask server
        Path pidFilePath = Paths.get(secretDir.toString(), "sklearn_flask_server.pid");
        // Pid file is created by sklearn server inside the docker container.
        // That means the process that started the container returned with exit code 0 but the file might not have been
        // created yet. Therefore, we wait until the file is created and has content
        String pidContents = readPidFileIfPresent(pidFilePath);
        int pidReadCounter = 0;
        while (Strings.isNullOrEmpty(pidContents) && pidReadCounter < 10) {
            logger.log(Level.FINE, "Waiting for sklearn to start...");
            Thread.sleep(1000);
            pidContents = readPidFileIfPresent(pidFilePath);
            pidReadCounter++;
        }
        if (Strings.isNullOrEmpty(pidContents)) {
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.WARNING, "Could not start sklearn serving because pid file could not be read or was empty");
        }
        logger.log(Level.FINE, "sklearn pidContents:" + pidContents);
        // Update the info in the db
        serving.setCid(pidContents);
        serving.setLocalPort(port);
        serving.setDeployed(new Date());
        servingFacade.updateDbObject(serving, project);
    } catch (ServingException ex) {
        // Already carries a specific message: mark the entry as stopped and propagate it without re-wrapping
        serving.setCid(CID_STOPPED);
        servingFacade.updateDbObject(serving, project);
        throw ex;
    } catch (Exception ex) {
        if (ex instanceof InterruptedException) {
            interrupted = true;
        }
        // Startup process failed for some reason
        serving.setCid(CID_STOPPED);
        servingFacade.updateDbObject(serving, project);
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, ex.getMessage(), ex);
    } finally {
        if (settings.getHopsRpcTls()) {
            certificateMaterializer.removeCertificatesLocal(user.getUsername(), project.getName());
        }
        // release lock on the serving entry
        servingFacade.releaseLock(project, serving.getId());
        if (interrupted) {
            // Restore the interrupt status only after cleanup so that the cleanup calls are not affected by it
            Thread.currentThread().interrupt();
        }
    }
}

/**
 * Reads the first line of the sklearn pid file if the file already exists.
 *
 * @param pidFilePath location of the pid file
 * @return the first line of the file, or {@code null} when the file has not been created yet
 * @throws IOException if the file exists but cannot be read
 */
private String readPidFileIfPresent(Path pidFilePath) throws IOException {
    if (!pidFilePath.toFile().exists()) {
        return null;
    }
    return Files.readFirstLine(pidFilePath.toFile(), Charset.defaultCharset());
}
Also used : Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) Date(java.util.Date) IOException(java.io.IOException)

Example 12 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class LocalhostSkLearnServingController method killServingInstance.

/**
 * Stops a SKLearn serving instance by running the sklearn serving script with the {@code kill} command for the
 * PID stored in the serving entry, then marks the entry as stopped in the database.
 *
 * @param project the project where the sklearn instance is running
 * @param serving the serving instance to stop
 * @param releaseLock boolean flag deciding whether to release the lock afterwards. When {@code true} the lock is
 *                    released even if stopping the instance fails.
 * @throws ServingException if the kill script could not be executed
 */
public void killServingInstance(Project project, Serving serving, boolean releaseLock) throws ServingException {
    try {
        String script = settings.getSudoersDir() + "/sklearn_serving.sh";
        Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
        ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
            .addCommand("/usr/bin/sudo")
            .addCommand(script)
            .addCommand("kill")
            .addCommand(serving.getCid())
            .addCommand(serving.getName())
            .addCommand(serving.getProject().getName().toLowerCase())
            .addCommand(secretDir.toString())
            .ignoreOutErrStreams(true)
            .build();
        logger.log(Level.FINE, processDescriptor.toString());
        osProcessExecutor.execute(processDescriptor);
        // Reset the runtime information of the entry now that the process has been asked to stop
        serving.setCid(CID_STOPPED);
        serving.setLocalPort(-1);
        serving.setDeployed(null);
        servingFacade.updateDbObject(serving, project);
    } catch (IOException ex) {
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERROR, Level.SEVERE, "serving id: " + serving.getId(), ex.getMessage(), ex);
    } finally {
        if (releaseLock) {
            // During the restart the lock is needed until the serving instance is actually restarted.
            // The startSkLearnServingInstance method is responsible of releasing the lock on the db entry
            // During the termination phase, this method is responsible of releasing the lock, which is why it is
            // done in a finally block: a failed kill must not leave the entry locked.
            // In case of termination + deletion, we don't release the lock as the entry will be removed from the db.
            servingFacade.releaseLock(project, serving.getId());
        }
    }
}
Also used : Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException)

Example 13 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class LocalhostTfServingController method updateModelVersion.

/**
 * Updates the model version that is being served of an existing tfserving instance. The new model is copied to the
 * secret directory where the serving instance is running and then the server will automatically pick up the new
 * version.
 *
 * <p>The lock on the serving entry is held for the whole update and released exactly once before returning,
 * whether the update succeeds or fails.
 *
 * @param project the project of the serving instance
 * @param user the user making the request
 * @param serving the serving instance to update the model version for
 * @throws ServingException if the TLS certificates cannot be materialized or the update script cannot be executed
 */
public void updateModelVersion(Project project, Users user, Serving serving) throws ServingException {
    // Only remove certificates in the cleanup step if this call actually materialized them
    boolean certificatesMaterialized = false;
    try {
        // TFServing polls for new version of the model in the directory
        // if a new version is downloaded it starts serving it
        String script = settings.getSudoersDir() + "/tfserving.sh";
        Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS, serving.getLocalDir());
        ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
            .addCommand("/usr/bin/sudo")
            .addCommand(script)
            .addCommand("update")
            .addCommand(serving.getName())
            .addCommand(Paths.get(serving.getModelPath(), serving.getModelVersion().toString()).toString())
            .addCommand(secretDir.toString())
            .addCommand(project.getName() + USER_NAME_DELIMITER + user.getUsername())
            .ignoreOutErrStreams(true)
            .setWaitTimeout(2L, TimeUnit.MINUTES)
            .build();
        logger.log(Level.INFO, processDescriptor.toString());
        // Materialized TLS certificates to be able to read the model
        if (settings.getHopsRpcTls()) {
            try {
                certificateMaterializer.materializeCertificatesLocal(user.getUsername(), project.getName());
                certificatesMaterialized = true;
            } catch (IOException e) {
                throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, e.getMessage(), e);
            }
        }
        try {
            osProcessExecutor.execute(processDescriptor);
        } catch (IOException ex) {
            throw new ServingException(RESTCodes.ServingErrorCode.UPDATEERROR, Level.SEVERE, "serving id: " + serving.getId(), ex.getMessage(), ex);
        }
    } finally {
        if (certificatesMaterialized) {
            certificateMaterializer.removeCertificatesLocal(user.getUsername(), project.getName());
        }
        // Single release point: the lock must stay held while the update script is running
        servingFacade.releaseLock(project, serving.getId());
    }
}
Also used : Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException)

Example 14 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

the class LocalhostTfServingController method killServingInstance.

/**
 * Stops a Tensorflow serving instance by running the tfserving script with the {@code kill} command for the PID
 * stored in the serving entry, then marks the entry as stopped in the database.
 *
 * @param project the project where the tensorflow serving instance is running
 * @param serving the serving instance to stop
 * @param releaseLock boolean flag deciding whether to release the lock afterwards. When {@code true} the lock is
 *                    released even if stopping the instance fails.
 * @throws ServingException if the kill script could not be executed
 */
public void killServingInstance(Project project, Serving serving, boolean releaseLock) throws ServingException {
    try {
        String script = settings.getSudoersDir() + "/tfserving.sh";
        Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
        ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
            .addCommand("/usr/bin/sudo")
            .addCommand(script)
            .addCommand("kill")
            .addCommand(serving.getCid())
            .addCommand(serving.getName())
            .addCommand(serving.getProject().getName().toLowerCase())
            .addCommand(secretDir.toString())
            .ignoreOutErrStreams(true)
            .build();
        logger.log(Level.INFO, processDescriptor.toString());
        osProcessExecutor.execute(processDescriptor);
        // Reset the runtime information of the entry now that the process has been asked to stop
        serving.setCid(CID_STOPPED);
        serving.setLocalPort(-1);
        serving.setDeployed(null);
        servingFacade.updateDbObject(serving, project);
    } catch (IOException ex) {
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERROR, Level.SEVERE, "serving id: " + serving.getId(), ex.getMessage(), ex);
    } finally {
        if (releaseLock) {
            // During the restart the lock is needed until the serving instance is actually restarted.
            // The startTfServingInstance method is responsible of releasing the lock on the db entry
            // During the termination phase, this method is responsible of releasing the lock, which is why it is
            // done in a finally block: a failed kill must not leave the entry locked.
            // In case of termination + deletion, we don't release the lock as the entry will be removed from the db.
            servingFacade.releaseLock(project, serving.getId());
        }
    }
}
Also used : Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException)

Aggregations

ServingException (io.hops.hopsworks.exceptions.ServingException)14 IOException (java.io.IOException)7 ProcessDescriptor (io.hops.hopsworks.common.util.ProcessDescriptor)6 Path (java.nio.file.Path)6 Serving (io.hops.hopsworks.persistence.entity.serving.Serving)5 ProcessResult (io.hops.hopsworks.common.util.ProcessResult)3 ServiceDiscoveryException (com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException)2 ProjectTopics (io.hops.hopsworks.persistence.entity.kafka.ProjectTopics)2 Subjects (io.hops.hopsworks.persistence.entity.kafka.schemas.Subjects)2 Date (java.util.Date)2 Matcher (java.util.regex.Matcher)2 Pattern (java.util.regex.Pattern)2 TransactionAttribute (javax.ejb.TransactionAttribute)2 LockTimeoutException (javax.persistence.LockTimeoutException)2 Strings (com.google.common.base.Strings)1 ProjectTopicsFacade (io.hops.hopsworks.common.dao.kafka.ProjectTopicsFacade)1 TopicDTO (io.hops.hopsworks.common.dao.kafka.TopicDTO)1 ServingFacade (io.hops.hopsworks.common.dao.serving.ServingFacade)1 DistributedFileSystemOps (io.hops.hopsworks.common.hdfs.DistributedFileSystemOps)1 Utils (io.hops.hopsworks.common.hdfs.Utils)1