Search in sources:

Example 1 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

In the class LocalhostTfServingController, the method startServingInstance:

/**
 * Starts a TensorFlow Serving instance. Executes the tfserving bash script to launch a
 * TensorFlow Serving server as serving-user and localize the tf-model from HDFS.
 * It records the CID of the server process for monitoring.
 *
 * @param project the project to start the serving in
 * @param user the user starting the serving
 * @param serving the serving instance to start (tfserving modelserver)
 * @throws ServingException if the docker image cannot be resolved, TLS certificates cannot be
 *                          materialized, or the startup process fails
 */
public void startServingInstance(Project project, Users user, Serving serving) throws ServingException {
    String script = settings.getSudoersDir() + "/tfserving.sh";
    // TODO(Fabio) this is bad as we don't know if the port is used or not
    Integer grpcPort = ThreadLocalRandom.current().nextInt(40000, 59999);
    Integer restPort = ThreadLocalRandom.current().nextInt(40000, 59999);
    // The two ports are drawn independently and could collide, in which case the gRPC and REST
    // servers would contend for the same port and the startup script would fail. Redraw until
    // they differ.
    while (restPort.equals(grpcPort)) {
        restPort = ThreadLocalRandom.current().nextInt(40000, 59999);
    }
    Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
    ProcessDescriptor processDescriptor;
    try {
        processDescriptor = new ProcessDescriptor.Builder()
            .addCommand("/usr/bin/sudo")
            .addCommand(script)
            .addCommand("start")
            .addCommand(serving.getName())
            .addCommand(Paths.get(serving.getModelPath(), serving.getModelVersion().toString()).toString())
            .addCommand(String.valueOf(grpcPort))
            .addCommand(String.valueOf(restPort))
            .addCommand(secretDir.toString())
            .addCommand(project.getName() + USER_NAME_DELIMITER + user.getUsername())
            .addCommand(serving.isBatchingEnabled() ? "1" : "0")
            .addCommand(project.getName().toLowerCase())
            .addCommand(projectUtils.getFullDockerImageName(project, true))
            .setWaitTimeout(2L, TimeUnit.MINUTES)
            .ignoreOutErrStreams(false)
            .build();
        logger.log(Level.INFO, processDescriptor.toString());
    } catch (ServiceDiscoveryException ex) {
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, ex.getMessage(), ex);
    }
    // Materialize TLS certificates so the serving process is able to read the model
    if (settings.getHopsRpcTls()) {
        try {
            certificateMaterializer.materializeCertificatesLocal(user.getUsername(), project.getName());
        } catch (IOException e) {
            // Release the lock only on failure. Releasing it in a finally block here (as the
            // original code did) dropped the lock even on success and caused a double release
            // in the finally block at the end of this method.
            servingFacade.releaseLock(project, serving.getId());
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, e.getMessage(), e);
        }
    }
    try {
        ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
        if (processResult.getExitCode() != 0) {
            // Startup process failed for some reason
            serving.setCid(CID_STOPPED);
            servingFacade.updateDbObject(serving, project);
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.INFO);
        }
        // Read the pid for TensorFlow Serving server
        Path cidFilePath = Paths.get(secretDir.toString(), "tfserving.pid");
        String cid = Files.readFirstLine(cidFilePath.toFile(), Charset.defaultCharset());
        // Update the info in the db
        serving.setCid(cid);
        serving.setLocalPort(restPort);
        serving.setDeployed(new Date());
        servingFacade.updateDbObject(serving, project);
    } catch (Exception ex) {
        // Startup process failed for some reason
        serving.setCid(CID_STOPPED);
        servingFacade.updateDbObject(serving, project);
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, ex.getMessage(), ex);
    } finally {
        if (settings.getHopsRpcTls()) {
            certificateMaterializer.removeCertificatesLocal(user.getUsername(), project.getName());
        }
        // Release the lock on the serving entry exactly once, regardless of outcome
        servingFacade.releaseLock(project, serving.getId());
    }
}
Also used : Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) IOException(java.io.IOException) Date(java.util.Date) IOException(java.io.IOException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) ServingException(io.hops.hopsworks.exceptions.ServingException)

Example 2 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

In the class ServingUtil, the method validatePythonUserInput:

/**
 * Validates the user input for a Python deployment: the model version directory must contain
 * either a model file (e.g. joblib or pickle) or a python script, and request batching must
 * be disabled (it is not supported for Python deployments).
 *
 * @param serving the serving to validate
 * @throws IllegalArgumentException if the user input is not a valid Python deployment
 * @throws ServingException if the model path does not exist, the model files structure is not
 *                          valid, or request batching is enabled
 */
private void validatePythonUserInput(Serving serving) throws IllegalArgumentException, ServingException {
    // Check model files and/or python scripts
    try {
        List<Inode> children = inodeController.getChildren(serving.getModelVersionPath());
        long modelFiles = children.stream().filter(c -> {
            String name = c.getInodePK().getName();
            return MODEL_FILE_EXTS.stream().anyMatch(name::endsWith);
        }).count();
        if (modelFiles == 0) {
            // if no model files found
            if (children.stream().noneMatch(c -> c.getInodePK().getName().endsWith(".py"))) {
                // and no python script
                throw new ServingException(RESTCodes.ServingErrorCode.MODEL_FILES_STRUCTURE_NOT_VALID, Level.FINE, "Model" + " path requires either a python script or model file (i.e., joblib or pickle files)");
            }
        }
    } catch (FileNotFoundException e) {
        // Preserve the original exception as cause instead of dropping it, so the missing
        // path can be diagnosed from the stack trace (consistent with the other call sites).
        throw new ServingException(RESTCodes.ServingErrorCode.MODEL_PATH_NOT_FOUND, Level.FINE, null, e.getMessage(), e);
    }
    if (serving.isBatchingEnabled()) {
        throw new ServingException(RESTCodes.ServingErrorCode.REQUEST_BATCHING_NOT_SUPPORTED, Level.SEVERE, "Request " + "batching is not supported in Python deployments");
    }
}
Also used : InodeController(io.hops.hopsworks.common.hdfs.inode.InodeController) StringUtils(org.apache.commons.lang.StringUtils) Arrays(java.util.Arrays) ServingFacade(io.hops.hopsworks.common.dao.serving.ServingFacade) Project(io.hops.hopsworks.persistence.entity.project.Project) Level(java.util.logging.Level) ProjectTopicsFacade(io.hops.hopsworks.common.dao.kafka.ProjectTopicsFacade) HashSet(java.util.HashSet) Strings(com.google.common.base.Strings) Settings(io.hops.hopsworks.common.util.Settings) Matcher(java.util.regex.Matcher) ServingWrapper(io.hops.hopsworks.common.serving.ServingWrapper) Inode(io.hops.hopsworks.persistence.entity.hdfs.inode.Inode) ModelServer(io.hops.hopsworks.persistence.entity.serving.ModelServer) Utils(io.hops.hopsworks.common.hdfs.Utils) EJB(javax.ejb.EJB) TopicDTO(io.hops.hopsworks.common.dao.kafka.TopicDTO) Stateless(javax.ejb.Stateless) RESTCodes(io.hops.hopsworks.restutils.RESTCodes) ProjectTopics(io.hops.hopsworks.persistence.entity.kafka.ProjectTopics) Collectors(java.util.stream.Collectors) FileNotFoundException(java.io.FileNotFoundException) List(java.util.List) Subjects(io.hops.hopsworks.persistence.entity.kafka.schemas.Subjects) Stream(java.util.stream.Stream) Serving(io.hops.hopsworks.persistence.entity.serving.Serving) ServingException(io.hops.hopsworks.exceptions.ServingException) Pattern(java.util.regex.Pattern) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Inode(io.hops.hopsworks.persistence.entity.hdfs.inode.Inode) ServingException(io.hops.hopsworks.exceptions.ServingException) FileNotFoundException(java.io.FileNotFoundException)

Example 3 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

In the class ServingUtil, the method validateServingName:

/**
 * Validates the name of a serving: it must be non-empty, contain no spaces, match the
 * [a-zA-Z0-9]+ pattern (required by the InferenceResource, which uses the name as part of a
 * REST endpoint) and must not duplicate the name of a different serving in the same project.
 *
 * @param serving the serving whose name is being validated
 * @param dbServing an existing serving with the same name in the project, or null if none
 * @throws IllegalArgumentException if the name is missing, contains spaces, or fails the regex
 * @throws ServingException if a different serving with the same name already exists
 */
private void validateServingName(Serving serving, Serving dbServing) throws ServingException {
    String name = serving.getName();
    if (Strings.isNullOrEmpty(name)) {
        throw new IllegalArgumentException("Serving name not provided");
    }
    if (name.contains(" ")) {
        throw new IllegalArgumentException("Serving name cannot contain spaces");
    }
    // A different serving with this name already exists in the project
    if (dbServing != null && !dbServing.getId().equals(serving.getId())) {
        throw new ServingException(RESTCodes.ServingErrorCode.DUPLICATEDENTRY, Level.FINE);
    }
    // The name becomes part of a REST endpoint in the InferenceResource, so it must be
    // restricted to URL-safe alphanumeric characters.
    if (!Pattern.compile("[a-zA-Z0-9]+").matcher(name).matches()) {
        throw new IllegalArgumentException("Serving name must follow regex: \"[a-zA-Z0-9]+\"");
    }
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) ServingException(io.hops.hopsworks.exceptions.ServingException)

Example 4 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

In the class ServingUtil, the method validateKafkaTopicSchema:

/**
 * Validates the Kafka topic selected for inference logging: when an existing topic is chosen,
 * it must exist in the project and use the inference schema.
 *
 * @param project the project the serving belongs to
 * @param serving the serving being configured
 * @param topic the selected Kafka topic, or a NONE/CREATE placeholder, or null
 * @throws ServingException if the topic cannot be found or does not use the inference schema
 */
private void validateKafkaTopicSchema(Project project, Serving serving, TopicDTO topic) throws ServingException {
    // Nothing to check unless an already-existing topic was selected
    if (topic == null || topic.getName().equals("NONE") || topic.getName().equals("CREATE")) {
        return;
    }
    ProjectTopics projectTopic = projectTopicsFacade
        .findTopicByNameAndProject(project, topic.getName())
        .orElseThrow(() -> new ServingException(RESTCodes.ServingErrorCode.KAFKA_TOPIC_NOT_FOUND, Level.SEVERE, null));
    Subjects schemaSubject = projectTopic.getSubjects();
    if (!schemaSubject.getSubject().equalsIgnoreCase(Settings.INFERENCE_SCHEMANAME)) {
        throw new ServingException(RESTCodes.ServingErrorCode.KAFKA_TOPIC_NOT_VALID, Level.FINE, "Inference logging" + " requires a Kafka topic with schema '" + Settings.INFERENCE_SCHEMANAME + "'");
    }
}
Also used : ProjectTopics(io.hops.hopsworks.persistence.entity.kafka.ProjectTopics) ServingException(io.hops.hopsworks.exceptions.ServingException) Subjects(io.hops.hopsworks.persistence.entity.kafka.schemas.Subjects)

Example 5 with ServingException

use of io.hops.hopsworks.exceptions.ServingException in project hopsworks by logicalclocks.

In the class ProjectController, the method removeProjectInt:

/**
 * Performs the actual removal of a project: runs pre-delete handlers, reclaims ownership of the
 * project folders, removes Kafka topics, certificates, TensorBoards, Jupyter, quotas, the online
 * feature store, Hive databases, elastic indices, HDFS users/groups, servings, Airflow artifacts
 * and finally the project folder itself. The cleanup order is significant; steps must not be
 * reordered.
 *
 * @param project the project to remove
 * @param usersToClean HDFS users to remove together with the project
 * @param groupsToClean HDFS groups to remove together with the project
 * @param projectCreationFutures pending creation futures to await first; null during plain deletion
 * @param decreaseCreatedProj whether to decrement the owner's created-projects counter
 * @param owner the owner of the project, used for certificate revocation
 */
private void removeProjectInt(Project project, List<HdfsUsers> usersToClean, List<HdfsGroups> groupsToClean, List<Future<?>> projectCreationFutures, boolean decreaseCreatedProj, Users owner) throws IOException, InterruptedException, HopsSecurityException, ServiceException, ProjectException, GenericException, TensorBoardException, FeaturestoreException {
    DistributedFileSystemOps dfso = null;
    try {
        dfso = dfs.getDfsOps();
        // Run custom handlers for project deletion
        ProjectHandler.runProjectPreDeleteHandlers(projectHandlers, project);
        // log removal to notify elastic search
        logProject(project, OperationType.Delete);
        // change the owner and group of the project folder to hdfs super user
        Path location = new Path(Utils.getProjectPath(project.getName()));
        changeOwnershipToSuperuser(location, dfso);
        Path dummy = new Path("/tmp/" + project.getName());
        changeOwnershipToSuperuser(dummy, dfso);
        // remove kafka topics
        removeKafkaTopics(project);
        // projectCreationFutures will be null during project deletion.
        if (projectCreationFutures != null) {
            // Use the parameterized Future<?> rather than the raw type
            for (Future<?> f : projectCreationFutures) {
                if (f != null) {
                    try {
                        f.get();
                    } catch (ExecutionException ex) {
                        LOGGER.log(Level.SEVERE, "Error while waiting for ProjectCreationFutures to finish for Project " + project.getName(), ex);
                    }
                }
            }
        }
        try {
            certificatesController.revokeProjectCertificates(project, owner);
        } catch (HopsSecurityException ex) {
            // A missing certificate is expected during cleanup; anything else is fatal
            if (ex.getErrorCode() != RESTCodes.SecurityErrorCode.CERTIFICATE_NOT_FOUND) {
                LOGGER.log(Level.SEVERE, "Could not delete certificates during cleanup for project " + project.getName() + ". Manual cleanup is needed!!!", ex);
                throw ex;
            }
        } catch (IOException | GenericException ex) {
            LOGGER.log(Level.SEVERE, "Could not delete certificates during cleanup for project " + project.getName() + ". Manual cleanup is needed!!!", ex);
            throw ex;
        }
        // remove running tensorboards
        removeTensorBoard(project);
        // remove jupyter
        removeJupyter(project);
        removeProjectRelatedFiles(usersToClean, dfso);
        // remove quota
        removeQuotas(project);
        // change owner for files in shared datasets
        fixSharedDatasets(project, dfso);
        // Delete online featurestore database
        onlineFeaturestoreController.removeOnlineFeatureStore(project);
        // Delete Hive database - will automatically cleanup all the Hive's metadata
        hiveController.dropDatabases(project, dfso, false);
        try {
            // Delete elasticsearch template for this project
            removeElasticsearch(project);
        } catch (ElasticException ex) {
            // Best-effort: a failed index removal should not block project deletion
            LOGGER.log(Level.WARNING, "Failure while removing elasticsearch indices", ex);
        }
        // delete project group and users
        removeGroupAndUsers(groupsToClean, usersToClean);
        // remove dummy inode
        dfso.rm(dummy, true);
        // Remove servings
        try {
            servingController.deleteAll(project);
        } catch (ServingException e) {
            // Wrap so the declared throws clause stays unchanged for callers
            throw new IOException(e);
        }
        // Remove Airflow DAGs from local filesystem,
        // JWT renewal monitors and materialized X.509
        airflowManager.onProjectRemoval(project);
        // remove folder
        removeProjectFolder(project.getName(), dfso);
        if (decreaseCreatedProj) {
            usersController.decrementNumProjectsCreated(project.getOwner().getUid());
        }
        usersController.decrementNumActiveProjects(project.getOwner().getUid());
        // Run custom handlers for project deletion
        ProjectHandler.runProjectPostDeleteHandlers(projectHandlers, project);
        LOGGER.log(Level.INFO, "{0} - project removed.", project.getName());
    } finally {
        // Always return the DFS handle, even when a cleanup step failed
        if (dfso != null) {
            dfso.close();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ElasticException(io.hops.hopsworks.exceptions.ElasticException) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) ServingException(io.hops.hopsworks.exceptions.ServingException) Future(java.util.concurrent.Future) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) GenericException(io.hops.hopsworks.exceptions.GenericException) HopsSecurityException(io.hops.hopsworks.exceptions.HopsSecurityException)

Aggregations

ServingException (io.hops.hopsworks.exceptions.ServingException)14 IOException (java.io.IOException)7 ProcessDescriptor (io.hops.hopsworks.common.util.ProcessDescriptor)6 Path (java.nio.file.Path)6 Serving (io.hops.hopsworks.persistence.entity.serving.Serving)5 ProcessResult (io.hops.hopsworks.common.util.ProcessResult)3 ServiceDiscoveryException (com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException)2 ProjectTopics (io.hops.hopsworks.persistence.entity.kafka.ProjectTopics)2 Subjects (io.hops.hopsworks.persistence.entity.kafka.schemas.Subjects)2 Date (java.util.Date)2 Matcher (java.util.regex.Matcher)2 Pattern (java.util.regex.Pattern)2 TransactionAttribute (javax.ejb.TransactionAttribute)2 LockTimeoutException (javax.persistence.LockTimeoutException)2 Strings (com.google.common.base.Strings)1 ProjectTopicsFacade (io.hops.hopsworks.common.dao.kafka.ProjectTopicsFacade)1 TopicDTO (io.hops.hopsworks.common.dao.kafka.TopicDTO)1 ServingFacade (io.hops.hopsworks.common.dao.serving.ServingFacade)1 DistributedFileSystemOps (io.hops.hopsworks.common.hdfs.DistributedFileSystemOps)1 Utils (io.hops.hopsworks.common.hdfs.Utils)1