Search in sources :

Example 1 with ProcessDescriptor

use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.

In the class AsynchronousGitCommandExecutor, the method execute.

/**
 * Starts the git command service for {@code gitOpExecution} by running the sudo-wrapped
 * {@code git.sh start} script, then records the pid read from {@code git.pid} as the cid of the
 * execution's repository.
 * <p>
 * The launch is attempted at most five times. An attempt counts as failed when the script exits
 * with a non-zero code, when the first line of the pid file is missing or empty, or when anything
 * else in the attempt throws. If no pid has been obtained once the attempts are used up, the
 * execution is marked as failed through {@code updateExecutionStateToFail}.
 *
 * @param gitOpExecution the execution whose command type and repository are used
 * @param gitPaths supplies the configuration, run and git working directories
 */
@Asynchronous
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public void execute(GitOpExecution gitOpExecution, GitPaths gitPaths) {
    int maxTries = 5;
    String pid = "";
    String gitCommand = gitOpExecution.getGitCommandConfiguration().getCommandType().getGitCommand();
    String prog = settings.getSudoersDir() + "/git.sh";
    String commandArgumentsFile = gitPaths.getConfDirPath() + File.separator + GitContainerLaunchScriptArgumentsTemplate.FILE_NAME;
    while (maxTries > 0 && Strings.isNullOrEmpty(pid)) {
        try {
            ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
                    .addCommand("/usr/bin/sudo")
                    .addCommand(prog)
                    .addCommand("start")
                    .addCommand(commandArgumentsFile)
                    .redirectErrorStream(true)
                    .setCurrentWorkingDirectory(new File(gitPaths.getGitPath()))
                    .setWaitTimeout(60L, TimeUnit.SECONDS)
                    .build();
            String pidFile = gitPaths.getRunDirPath() + "/git.pid";
            ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
            if (processResult.getExitCode() != 0) {
                String errorMsg = "Could not start git service to execute command " + gitCommand + " . " + "Exit code: " + processResult.getExitCode() + " Error: stdout: " + processResult.getStdout() + " stderr: " + processResult.getStderr();
                LOGGER.log(Level.SEVERE, errorMsg);
                throw new IOException(errorMsg);
            }
            String launchedPid = com.google.common.io.Files.readFirstLine(new File(pidFile), Charset.defaultCharset());
            if (Strings.isNullOrEmpty(launchedPid)) {
                // Without this, an empty pid file would neither end the loop nor consume an attempt,
                // so the script could be relaunched without bound.
                throw new IOException("Git service for command " + gitCommand + " exited with code 0 but no pid could be read from " + pidFile);
            }
            pid = launchedPid;
            // Get the updated repository
            GitRepository repository = gitRepositoryFacade.findById(gitOpExecution.getRepository().getId())
                    .orElseThrow(() -> new IllegalStateException("Git repository with id " + gitOpExecution.getRepository().getId() + " not found, cannot record pid of the git service"));
            gitRepositoryFacade.updateRepositoryCid(repository, pid);
        } catch (Exception ex) {
            // NOTE(review): if the failure happens after the pid was read (e.g. the repository lookup),
            // the loop still ends because pid is non-empty and the execution is not marked as failed.
            LOGGER.log(Level.SEVERE, "Problem executing shell script to start git command service", ex);
            maxTries--;
        }
    }
    if (Strings.isNullOrEmpty(pid)) {
        updateExecutionStateToFail(gitOpExecution);
    }
}
Also used : GitRepository(io.hops.hopsworks.persistence.entity.git.GitRepository) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException) File(java.io.File) IOException(java.io.IOException) Asynchronous(javax.ejb.Asynchronous) TransactionAttribute(javax.ejb.TransactionAttribute)

Example 2 with ProcessDescriptor

use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.

In the class LocalhostTfServingController, the method startServingInstance.

/**
 * Starts a TensorFlow Serving instance on the local host by running the sudo-wrapped
 * {@code tfserving.sh start} script. On success the id read from {@code tfserving.pid}, the REST
 * port and the deployment date are stored on the serving entry.
 * <p>
 * The gRPC and REST ports are picked at random. When {@code settings.getHopsRpcTls()} is true the
 * user's certificates are materialized locally before the launch and removed again afterwards.
 *
 * @param project the project to start the serving in
 * @param user the user starting the serving
 * @param serving the serving instance to start (tfserving modelserver)
 * @throws ServingException if the docker image name cannot be resolved, the certificates cannot be
 *                          materialized, or launching the serving fails
 */
public void startServingInstance(Project project, Users user, Serving serving) throws ServingException {
    String launchScript = settings.getSudoersDir() + "/tfserving.sh";
    // TODO(Fabio) this is bad as we don't know if the port is used or not
    Integer grpcPort = ThreadLocalRandom.current().nextInt(40000, 59999);
    Integer restPort = ThreadLocalRandom.current().nextInt(40000, 59999);
    Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());

    // Assemble the command line argument by argument, in the order the script expects them.
    ProcessDescriptor launchDescriptor;
    try {
        ProcessDescriptor.Builder launch = new ProcessDescriptor.Builder();
        launch.addCommand("/usr/bin/sudo");
        launch.addCommand(launchScript);
        launch.addCommand("start");
        launch.addCommand(serving.getName());
        launch.addCommand(Paths.get(serving.getModelPath(), serving.getModelVersion().toString()).toString());
        launch.addCommand(String.valueOf(grpcPort));
        launch.addCommand(String.valueOf(restPort));
        launch.addCommand(secretDir.toString());
        launch.addCommand(project.getName() + USER_NAME_DELIMITER + user.getUsername());
        if (serving.isBatchingEnabled()) {
            launch.addCommand("1");
        } else {
            launch.addCommand("0");
        }
        launch.addCommand(project.getName().toLowerCase());
        launch.addCommand(projectUtils.getFullDockerImageName(project, true));
        launchDescriptor = launch.setWaitTimeout(2L, TimeUnit.MINUTES).ignoreOutErrStreams(false).build();
        logger.log(Level.INFO, launchDescriptor.toString());
    } catch (ServiceDiscoveryException ex) {
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, ex.getMessage(), ex);
    }

    // Materialized TLS certificates to be able to read the model
    if (settings.getHopsRpcTls()) {
        try {
            certificateMaterializer.materializeCertificatesLocal(user.getUsername(), project.getName());
        } catch (IOException e) {
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, e.getMessage(), e);
        } finally {
            // NOTE(review): this releases the serving lock on success as well as on failure, and it
            // is released again after the launch below - confirm that this is intended.
            servingFacade.releaseLock(project, serving.getId());
        }
    }

    try {
        ProcessResult launchResult = osProcessExecutor.execute(launchDescriptor);
        if (launchResult.getExitCode() != 0) {
            // The script reported a failure: mark the serving as stopped.
            // NOTE(review): the exception thrown here is caught by the generic handler below, which
            // repeats the DB update and rewraps it at SEVERE level.
            serving.setCid(CID_STOPPED);
            servingFacade.updateDbObject(serving, project);
            throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.INFO);
        }

        // The launch script leaves the id of the started server in tfserving.pid
        Path cidFile = Paths.get(secretDir.toString(), "tfserving.pid");
        serving.setCid(Files.readFirstLine(cidFile.toFile(), Charset.defaultCharset()));
        serving.setLocalPort(restPort);
        serving.setDeployed(new Date());
        // Persist the updated serving entry
        servingFacade.updateDbObject(serving, project);
    } catch (Exception ex) {
        // Anything going wrong during startup leaves the serving marked as stopped
        serving.setCid(CID_STOPPED);
        servingFacade.updateDbObject(serving, project);
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null, ex.getMessage(), ex);
    } finally {
        if (settings.getHopsRpcTls()) {
            certificateMaterializer.removeCertificatesLocal(user.getUsername(), project.getName());
        }
        // release lock on the serving entry
        servingFacade.releaseLock(project, serving.getId());
    }
}
Also used : Path(java.nio.file.Path) ServingException(io.hops.hopsworks.exceptions.ServingException) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) IOException(java.io.IOException) Date(java.util.Date) IOException(java.io.IOException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) ServingException(io.hops.hopsworks.exceptions.ServingException)

Example 3 with ProcessDescriptor

use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.

In the class DatasetController, the method unzip.

/**
 * Unzips the file at {@code path} by running the {@code unzip-background.sh} script with a staging
 * directory created under the configured staging dir.
 * <p>
 * Before the script is started, the file's existence is checked for the project user's HDFS
 * identity and the compression is registered through {@code settings.addUnzippingState}.
 *
 * @param project the project the file belongs to
 * @param user the user requesting the unzip
 * @param path the file to unzip
 * @param destPath optional destination; only passed to the script when not {@code null}
 * @throws DatasetException with {@code COMPRESSION_SIZE_ERROR} when the script exits with code 2,
 *                          with {@code COMPRESSION_ERROR} for any other non-zero exit code or when
 *                          executing the script fails with an {@link IOException}
 */
public void unzip(Project project, Users user, Path path, Path destPath) throws DatasetException {
    String hdfsUser = hdfsUsersController.getHdfsUserName(project, user);
    checkFileExists(path, hdfsUser);

    CompressionInfo compressionInfo = new CompressionInfo(path, destPath);
    String stagingDir = settings.getStagingDir() + File.separator + compressionInfo.getStagingDirectory();
    new File(stagingDir).mkdirs();
    settings.addUnzippingState(compressionInfo);

    ProcessDescriptor.Builder unzipCommand = new ProcessDescriptor.Builder();
    unzipCommand.addCommand(settings.getHopsworksDomainDir() + "/bin/unzip-background.sh");
    unzipCommand.addCommand(stagingDir);
    unzipCommand.addCommand(path.toString());
    unzipCommand.addCommand(hdfsUser);
    if (destPath != null) {
        unzipCommand.addCommand(destPath.toString());
    }
    ProcessDescriptor unzipDescriptor = unzipCommand.ignoreOutErrStreams(true).build();

    try {
        int exitCode = osProcessExecutor.execute(unzipDescriptor).getExitCode();
        switch (exitCode) {
            case 0:
                break;
            case 2:
                throw new DatasetException(RESTCodes.DatasetErrorCode.COMPRESSION_SIZE_ERROR, Level.WARNING);
            default:
                throw new DatasetException(RESTCodes.DatasetErrorCode.COMPRESSION_ERROR, Level.WARNING, "path: " + path.toString() + ", result: " + exitCode);
        }
    } catch (IOException ex) {
        throw new DatasetException(RESTCodes.DatasetErrorCode.COMPRESSION_ERROR, Level.SEVERE, "path: " + path.toString(), ex.getMessage(), ex);
    }
}
Also used : ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException) CompressionInfo(io.hops.hopsworks.common.dataset.util.CompressionInfo) File(java.io.File) DatasetException(io.hops.hopsworks.exceptions.DatasetException)

Example 4 with ProcessDescriptor

use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.

In the class LocalHostJupyterProcessMgr, the method startJupyterServer.

/**
 * Starts a Jupyter server for the given project user by running the sudo-wrapped
 * {@code jupyter.sh start} script, trying up to five times with the same port and token.
 * <p>
 * The port is picked at random and the configuration files are generated once, before the first
 * attempt. An attempt fails when the script exits with a non-zero code or when anything in the
 * attempt throws; the failure is logged and the next attempt is made.
 *
 * @param project the project the server is started for
 * @param secretConfig passed to the configuration generator and returned in the DTO
 * @param hdfsUser the HDFS user name handed to the script
 * @param user the user starting the server
 * @param js the Jupyter settings (secret, mode, git backend flag)
 * @param allowOrigin passed to the configuration generator
 * @return a DTO holding the port, token, the first line of {@code jupyter.pid}, the secret config
 *         and the certificates directory
 * @throws ServiceException with {@code JUPYTER_START_ERROR} when all attempts have failed
 * @throws JobException declared by the signature; not thrown directly in this method
 */
@Override
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public JupyterDTO startJupyterServer(Project project, String secretConfig, String hdfsUser, Users user, JupyterSettings js, String allowOrigin) throws ServiceException, JobException {
    String prog = settings.getSudoersDir() + "/jupyter.sh";
    Integer port = ThreadLocalRandom.current().nextInt(40000, 59999);
    JupyterPaths jp = jupyterConfigFilesGenerator.generateConfiguration(project, secretConfig, hdfsUser, user, js, port, allowOrigin);
    String secretDir = settings.getStagingDir() + Settings.PRIVATE_DIRS + js.getSecret();
    String token = TokenGenerator.generateToken(TOKEN_LENGTH);

    // The counter only advances after a failed attempt; a successful attempt returns directly.
    for (int attemptsLeft = 5; attemptsLeft > 0; attemptsLeft--) {
        try {
            // Positional arguments for "jupyter.sh start"
            ProcessDescriptor.Builder launch = new ProcessDescriptor.Builder();
            launch.addCommand("/usr/bin/sudo");
            launch.addCommand(prog);
            launch.addCommand("start");
            launch.addCommand(jp.getNotebookPath());
            launch.addCommand(settings.getHadoopSymbolicLinkDir() + "-" + settings.getHadoopVersion());
            launch.addCommand(hdfsUser);
            launch.addCommand(settings.getAnacondaProjectDir());
            launch.addCommand(port.toString());
            launch.addCommand(HopsUtils.getJupyterLogName(hdfsUser, port));
            launch.addCommand(secretDir);
            launch.addCommand(jp.getCertificatesDir());
            launch.addCommand(hdfsUser);
            launch.addCommand(token);
            launch.addCommand(js.getMode().getValue());
            launch.addCommand(projectUtils.getFullDockerImageName(project, false));
            launch.addCommand(Boolean.toString(js.isGitBackend()));
            ProcessDescriptor launchDescriptor = launch
                    .redirectErrorStream(true)
                    .setCurrentWorkingDirectory(new File(jp.getNotebookPath()))
                    .setWaitTimeout(60L, TimeUnit.SECONDS)
                    .build();

            String pidfile = jp.getRunDirPath() + "/jupyter.pid";
            ProcessResult launchResult = osProcessExecutor.execute(launchDescriptor);
            if (launchResult.getExitCode() != 0) {
                String launchError = "Could not start Jupyter server. Exit code: " + launchResult.getExitCode() + " Error: stdout: " + launchResult.getStdout() + " stderr: " + launchResult.getStderr();
                LOGGER.log(Level.SEVERE, launchError);
                throw new IOException(launchError);
            }

            // Read the pid for Jupyter Notebook
            String cid = com.google.common.io.Files.readFirstLine(new File(pidfile), Charset.defaultCharset());
            return new JupyterDTO(port, token, cid, secretConfig, jp.getCertificatesDir());
        } catch (Exception ex) {
            LOGGER.log(Level.SEVERE, "Problem executing shell script to start Jupyter server", ex);
        }
    }

    String errorMsg = "Failed to start Jupyter";
    throw new ServiceException(RESTCodes.ServiceErrorCode.JUPYTER_START_ERROR, Level.SEVERE, errorMsg, errorMsg + " for project " + project);
}
Also used : ServiceException(io.hops.hopsworks.exceptions.ServiceException) JupyterPaths(io.hops.hopsworks.common.dao.jupyter.config.JupyterPaths) URIBuilder(org.apache.http.client.utils.URIBuilder) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException) File(java.io.File) ClientProtocolException(org.apache.http.client.ClientProtocolException) URISyntaxException(java.net.URISyntaxException) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) ServiceException(io.hops.hopsworks.exceptions.ServiceException) JobException(io.hops.hopsworks.exceptions.JobException) JupyterDTO(io.hops.hopsworks.common.dao.jupyter.config.JupyterDTO) TransactionAttribute(javax.ejb.TransactionAttribute)

Example 5 with ProcessDescriptor

use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.

In the class TensorBoardProcessMgr, the method startTensorBoard.

/**
 * Starts a TensorBoard process by running the sudo-wrapped {@code tensorboard.sh start} script.
 * <p>
 * The user's certificates are first materialized into {@code <tbSecretDir>/certs}. The script is
 * then run up to three times, each time on a freshly picked random port. An attempt succeeds when
 * the script exits with code 0 and a pid could be read from {@code <port>.pid} in the TensorBoard
 * directory. If every attempt fails, the certificates are cleaned up and the TensorBoard directory
 * is removed before the exception is thrown.
 *
 * @param project the project the TensorBoard belongs to
 * @param user the user starting the TensorBoard
 * @param hdfsUser the HDFS user whose name is passed to the script
 * @param hdfsLogdir the log directory passed to the script
 * @param tensorBoardDirectory appended to the TensorBoard staging base path to form the secret dir
 * @return a DTO holding the {@code host:port} endpoint and the pid read from the pid file
 * @throws TensorBoardException if the certificates cannot be materialized or all start attempts fail
 */
public TensorBoardDTO startTensorBoard(Project project, Users user, HdfsUsers hdfsUser, String hdfsLogdir, String tensorBoardDirectory) throws TensorBoardException {
    String prog = settings.getSudoersDir() + "/tensorboard.sh";
    String tbBasePath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS;
    String tbSecretDir = tbBasePath + tensorBoardDirectory;
    // Single source of truth for where the certificates are materialized and later cleaned up.
    String certsPath = tbSecretDir + "/certs";
    File tbDir = new File(tbSecretDir);
    if (!tbDir.exists()) {
        tbDir.mkdirs();
    }
    DistributedFileSystemOps dfso = dfsService.getDfsOps();
    try {
        File certsDir = new File(certsPath);
        certsDir.mkdirs();
        HopsUtils.materializeCertificatesForUserCustomDir(project.getName(), user.getUsername(), settings.getHdfsTmpCertDir(), dfso, certificateMaterializer, settings, certsPath);
    } catch (IOException ioe) {
        LOGGER.log(Level.SEVERE, "Failed in materializing certificates for " + hdfsUser + " in directory " + certsPath, ioe);
        HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(), settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
        throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE, "Failed to start TensorBoard", "An exception occurred while materializing certificates", ioe);
    } finally {
        if (dfso != null) {
            dfsService.closeDfsClient(dfso);
        }
    }
    String anacondaEnvironmentPath = settings.getAnacondaProjectDir();
    for (int retries = 3; retries > 0; retries--) {
        try {
            // Every attempt uses a new random port; the pid file is named after it.
            Integer port = ThreadLocalRandom.current().nextInt(40000, 59999);
            ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
                    .addCommand("/usr/bin/sudo")
                    .addCommand(prog)
                    .addCommand("start")
                    .addCommand(hdfsUser.getName())
                    .addCommand(hdfsLogdir)
                    .addCommand(tbSecretDir)
                    .addCommand(port.toString())
                    .addCommand(anacondaEnvironmentPath)
                    .addCommand(projectUtils.getFullDockerImageName(project, true))
                    .addCommand(Integer.toString(settings.getTensorBoardMaxReloadThreads()))
                    .ignoreOutErrStreams(true)
                    .build();
            LOGGER.log(Level.FINE, processDescriptor.toString());
            ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
            if (!processResult.processExited()) {
                throw new IOException("TensorBoard start process timed out!");
            }
            int exitValue = processResult.getExitCode();
            // The pid is scoped to this attempt so that a pid read during an earlier, failed attempt
            // can never be returned for a later one.
            String cid = null;
            File pidFile = new File(tbSecretDir + File.separator + port + ".pid");
            // Read the pid for TensorBoard server
            if (pidFile.exists()) {
                cid = Files.readFirstLine(pidFile, Charset.defaultCharset());
            }
            if (exitValue == 0 && cid != null) {
                TensorBoardDTO tensorBoardDTO = new TensorBoardDTO();
                String host = null;
                try {
                    host = InetAddress.getLocalHost().getHostAddress();
                } catch (UnknownHostException ex) {
                    LOGGER.log(Level.SEVERE, null, ex);
                }
                tensorBoardDTO.setEndpoint(host + ":" + port);
                tensorBoardDTO.setCid(cid);
                return tensorBoardDTO;
            }
            LOGGER.log(Level.SEVERE, "Failed starting TensorBoard got exitcode " + exitValue + " retrying on new port");
            if (cid != null) {
                // A pid was written although the start failed: make sure nothing is left running.
                this.killTensorBoard(cid);
            }
        } catch (Exception ex) {
            // The Throwable overload of log() does not substitute "{0}", so no placeholder is used.
            LOGGER.log(Level.SEVERE, "Problem starting TensorBoard", ex);
        }
    }
    // All attempts failed: clean up the certificates in the directory they were materialized into.
    HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(), settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
    removeTensorBoardDirectory(tbSecretDir);
    throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE, "Failed to start TensorBoard after exhausting retry attempts");
}
Also used : UnknownHostException(java.net.UnknownHostException) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) IOException(java.io.IOException) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) File(java.io.File) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException)

Aggregations

ProcessDescriptor (io.hops.hopsworks.common.util.ProcessDescriptor)23 IOException (java.io.IOException)21 ProcessResult (io.hops.hopsworks.common.util.ProcessResult)20 File (java.io.File)9 ServingException (io.hops.hopsworks.exceptions.ServingException)6 Path (java.nio.file.Path)6 ProjectException (io.hops.hopsworks.exceptions.ProjectException)3 ServiceException (io.hops.hopsworks.exceptions.ServiceException)3 Project (io.hops.hopsworks.persistence.entity.project.Project)3 BufferedWriter (java.io.BufferedWriter)3 FileWriter (java.io.FileWriter)3 TransactionAttribute (javax.ejb.TransactionAttribute)3 ServiceDiscoveryException (com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException)2 CompressionInfo (io.hops.hopsworks.common.dataset.util.CompressionInfo)2 DatasetException (io.hops.hopsworks.exceptions.DatasetException)2 PythonException (io.hops.hopsworks.exceptions.PythonException)2 TensorBoardException (io.hops.hopsworks.exceptions.TensorBoardException)2 ArrayList (java.util.ArrayList)2 Date (java.util.Date)2 JupyterDTO (io.hops.hopsworks.common.dao.jupyter.config.JupyterDTO)1