Search in sources :

Example 1 with TensorBoardException

use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.

the class TensorBoardKillTimer method rotate.

/**
 * Scheduled sweep, fired every ten minutes, that cleans up TensorBoards.
 *
 * <p>The sweep has two phases:
 * <ol>
 *   <li>every TensorBoard returned by the facade whose last access is older than
 *       {@code settings.getTensorBoardMaxLastAccessed()} milliseconds is cleaned up;</li>
 *   <li>every {@code .pid} file found one level below the TensorBoard staging directory is
 *       compared against the TensorBoards still known to the facade; a pid without a
 *       matching entry is treated as a stray process, which is killed and whose directory
 *       is removed.</li>
 * </ol>
 *
 * <p>No exception escapes this method: failures are logged so that the timer keeps firing.
 *
 * @param timer the EJB timer that triggered this invocation (not used)
 */
@Schedule(persistent = false, minute = "*/10", hour = "*")
public void rotate(Timer timer) {
    try {
        LOGGER.log(Level.INFO, "Running TensorBoardKillTimer.");
        int tensorBoardMaxLastAccessed = settings.getTensorBoardMaxLastAccessed();
        Collection<TensorBoard> tensorBoardCollection = tensorBoardFacade.findAll();
        for (TensorBoard tensorBoard : tensorBoardCollection) {
            // Standard case, TB have been idle for a given amount of time
            Date accessed = tensorBoard.getLastAccessed();
            Date current = Calendar.getInstance().getTime();
            if ((current.getTime() - accessed.getTime()) > tensorBoardMaxLastAccessed) {
                try {
                    tensorBoardController.cleanup(tensorBoard);
                    LOGGER.log(Level.FINE, "Killed TensorBoard " + tensorBoard.toString() + " not accessed in the last " + tensorBoardMaxLastAccessed + " milliseconds");
                } catch (TensorBoardException ex) {
                    // A failure on one TensorBoard must not stop the others from being cleaned up
                    LOGGER.log(Level.SEVERE, "Failed to clean up running TensorBoard", ex);
                }
            }
        }
        // sanity check to make sure that all .pid files have a corresponding TB
        try {
            // Re-read the facade so that TensorBoards cleaned up above are no longer listed
            List<TensorBoard> TBs = tensorBoardFacade.findAll();
            String tbDirPath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS;
            File tbDir = new File(tbDirPath);
            // File.listFiles() returns null when the path is not a directory or cannot be read
            File[] tbDirs = tbDir.listFiles();
            if (tbDirs == null) {
                LOGGER.log(Level.FINE, "TensorBoard staging directory " + tbDirPath + " cannot be listed, skipping stray TensorBoard check");
                return;
            }
            // For each project_projectmember directory try to find .pid file
            for (File currentTbDir : tbDirs) {
                File[] candidates = currentTbDir.listFiles();
                if (candidates == null) {
                    // Not a directory (or unreadable): nothing to inspect, keep scanning the others
                    continue;
                }
                for (File possiblePidFile : candidates) {
                    if (!possiblePidFile.getName().endsWith(".pid")) {
                        continue;
                    }
                    String cid = com.google.common.io.Files.readFirstLine(possiblePidFile, Charset.defaultCharset());
                    if (cid == null) {
                        // Empty .pid file, there is no process id to check
                        continue;
                    }
                    // do not kill TBs which are in the DB
                    boolean tbExists = false;
                    for (TensorBoard tb : TBs) {
                        // cid is known to be non-null here, so compare from its side
                        if (cid.equals(tb.getCid())) {
                            tbExists = true;
                            break;
                        }
                    }
                    if (!tbExists) {
                        LOGGER.log(Level.WARNING, "Detected a stray TensorBoard with pid " + cid + " in directory " + currentTbDir.getAbsolutePath() + ", cleaning up...");
                        tensorBoardProcessMgr.killTensorBoard(cid);
                        tensorBoardProcessMgr.removeTensorBoardDirectory(currentTbDir.getAbsolutePath());
                    }
                }
            }
        } catch (IOException | NumberFormatException e) {
            LOGGER.log(Level.SEVERE, "Exception while reading .pid files", e);
        }
    } catch (Exception e) {
        // Last line of defence: anything unexpected is logged instead of propagating to the container
        LOGGER.log(Level.SEVERE, "An error occurred while checking for expired TensorBoards to be cleaned up", e);
    }
}
Also used : TensorBoard(io.hops.hopsworks.persistence.entity.tensorflow.TensorBoard) IOException(java.io.IOException) Date(java.util.Date) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException) IOException(java.io.IOException) File(java.io.File) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException) Schedule(javax.ejb.Schedule)

Example 2 with TensorBoardException

use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.

the class TensorBoardProcessMgr method startTensorBoard.

/**
 * Start the TensorBoard process.
 *
 * <p>The user's certificates are first materialized into a {@code certs} directory below the
 * TensorBoard directory. The {@code tensorboard.sh start} script is then run through sudo on a
 * randomly chosen port, for at most three attempts. An attempt succeeds when the script exits
 * with code 0 and a {@code <port>.pid} file with a first line exists in the TensorBoard
 * directory. If every attempt fails, the materialized certificates and the TensorBoard
 * directory are cleaned up.
 *
 * @param project project the TensorBoard belongs to; its name is used for the certificates and
 *                it determines the docker image passed to the script
 * @param user user whose certificates are materialized
 * @param hdfsUser HDFS user whose name is passed to the start script
 * @param hdfsLogdir log directory passed to the start script
 * @param tensorBoardDirectory path appended to the TensorBoard staging base directory to form
 *                             the directory holding this TensorBoard's pid file and certificates
 * @return a DTO carrying the {@code host:port} endpoint and the id read from the pid file
 * @throws TensorBoardException if the certificates cannot be materialized, if no attempt
 *                              succeeds, or if removing the TensorBoard directory fails
 */
public TensorBoardDTO startTensorBoard(Project project, Users user, HdfsUsers hdfsUser, String hdfsLogdir, String tensorBoardDirectory) throws TensorBoardException {
    String prog = settings.getSudoersDir() + "/tensorboard.sh";
    String tbBasePath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS;
    String tbSecretDir = tbBasePath + tensorBoardDirectory;
    // Single source of truth for the certificate location: used for materialization AND cleanup
    String certsPath = tbSecretDir + "/certs";
    File tbDir = new File(tbSecretDir);
    if (!tbDir.exists()) {
        tbDir.mkdirs();
    }
    DistributedFileSystemOps dfso = dfsService.getDfsOps();
    try {
        File certsDir = new File(certsPath);
        certsDir.mkdirs();
        HopsUtils.materializeCertificatesForUserCustomDir(project.getName(), user.getUsername(), settings.getHdfsTmpCertDir(), dfso, certificateMaterializer, settings, certsPath);
    } catch (IOException ioe) {
        LOGGER.log(Level.SEVERE, "Failed in materializing certificates for " + hdfsUser + " in directory " + certsPath, ioe);
        HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(), settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
        throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE, "Failed to start TensorBoard", "An exception occurred while materializing certificates", ioe);
    } finally {
        if (dfso != null) {
            dfsService.closeDfsClient(dfso);
        }
    }
    String anacondaEnvironmentPath = settings.getAnacondaProjectDir();
    final int maxAttempts = 3;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
        try {
            // Each attempt starts with no known pid, so that a pid read by an earlier, failed
            // attempt can never be reported as the pid of a later one
            String cid = null;
            // use pidfile to kill any running servers
            int port = ThreadLocalRandom.current().nextInt(40000, 59999);
            ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder().addCommand("/usr/bin/sudo").addCommand(prog).addCommand("start").addCommand(hdfsUser.getName()).addCommand(hdfsLogdir).addCommand(tbSecretDir).addCommand(Integer.toString(port)).addCommand(anacondaEnvironmentPath).addCommand(projectUtils.getFullDockerImageName(project, true)).addCommand(Integer.toString(settings.getTensorBoardMaxReloadThreads())).ignoreOutErrStreams(true).build();
            LOGGER.log(Level.FINE, processDescriptor.toString());
            ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
            if (!processResult.processExited()) {
                throw new IOException("TensorBoard start process timed out!");
            }
            int exitValue = processResult.getExitCode();
            String pidPath = tbSecretDir + File.separator + port + ".pid";
            File pidFile = new File(pidPath);
            // Read the pid for TensorBoard server
            if (pidFile.exists()) {
                cid = Files.readFirstLine(pidFile, Charset.defaultCharset());
            }
            if (exitValue == 0 && cid != null) {
                TensorBoardDTO tensorBoardDTO = new TensorBoardDTO();
                String host = null;
                try {
                    host = InetAddress.getLocalHost().getHostAddress();
                } catch (UnknownHostException ex) {
                    // NOTE(review): the endpoint becomes "null:<port>" in this case - confirm that callers cope
                    LOGGER.log(Level.SEVERE, null, ex);
                }
                tensorBoardDTO.setEndpoint(host + ":" + port);
                tensorBoardDTO.setCid(cid);
                return tensorBoardDTO;
            } else {
                LOGGER.log(Level.SEVERE, "Failed starting TensorBoard got exitcode " + exitValue + " retrying on new port");
                if (cid != null) {
                    // The script reported failure but left a pid behind: make sure nothing keeps running
                    this.killTensorBoard(cid);
                }
            }
        } catch (Exception ex) {
            // Logger.log(Level, String, Throwable) does not substitute placeholders, so the
            // message carries none; the exception is attached to the log record instead
            LOGGER.log(Level.SEVERE, "Problem starting TensorBoard", ex);
        }
    }
    // All attempts failed: release the certificates from the directory they were materialized in
    HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(), settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
    removeTensorBoardDirectory(tbSecretDir);
    throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE, "Failed to start TensorBoard after exhausting retry attempts");
}
Also used : UnknownHostException(java.net.UnknownHostException) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) ProcessResult(io.hops.hopsworks.common.util.ProcessResult) IOException(java.io.IOException) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) File(java.io.File) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException)

Example 3 with TensorBoardException

use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.

the class TensorBoardProcessMgr method removeTensorBoardDirectory.

/**
 * Deletes a TensorBoard directory by running {@code tensorboard.sh cleanup} through sudo.
 *
 * @param tensorBoardDirectoryPath path handed to the cleanup command
 * @throws TensorBoardException if the command cannot be executed, does not exit, or exits
 *                              with a non-zero code
 */
public void removeTensorBoardDirectory(String tensorBoardDirectoryPath) throws TensorBoardException {
    final String script = settings.getSudoersDir() + "/tensorboard.sh";
    final ProcessDescriptor cleanupCommand = new ProcessDescriptor.Builder()
        .addCommand("/usr/bin/sudo")
        .addCommand(script)
        .addCommand("cleanup")
        .addCommand(tensorBoardDirectoryPath)
        .ignoreOutErrStreams(true)
        .build();
    LOGGER.log(Level.FINE, cleanupCommand.toString());

    // Both failure modes report the same user and developer messages
    final String userMessage = "Failed to cleanup TensorBoard";
    final String developerMessage = "Could not delete TensorBoard directory: " + tensorBoardDirectoryPath;
    try {
        final ProcessResult outcome = osProcessExecutor.execute(cleanupCommand);
        // Success requires that the process finished AND reported exit code 0
        final boolean cleanedUp = outcome.processExited() && outcome.getExitCode() == 0;
        if (!cleanedUp) {
            throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_CLEANUP_ERROR, Level.SEVERE, userMessage, developerMessage);
        }
    } catch (IOException ex) {
        throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_CLEANUP_ERROR, Level.SEVERE, userMessage, developerMessage, ex);
    }
}
Also used : ProcessResult(io.hops.hopsworks.common.util.ProcessResult) ProcessDescriptor(io.hops.hopsworks.common.util.ProcessDescriptor) IOException(java.io.IOException) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException)

Example 4 with TensorBoardException

use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.

the class TensorBoardResource method getTensorBoard.

/**
 * REST endpoint returning the TensorBoard of the authenticated user in the current project.
 *
 * @param sc security context from which the calling user is resolved
 * @return a 200 response whose entity is the TensorBoard DTO
 * @throws TensorBoardException with {@code TENSORBOARD_NOT_FOUND} when the controller returns
 *                              no TensorBoard, or {@code TENSORBOARD_FETCH_ERROR} when the
 *                              lookup raises a persistence error
 */
@ApiOperation(value = "Get the TensorBoard", response = TensorBoardDTO.class)
@GET
@Produces(MediaType.APPLICATION_JSON)
@AllowedProjectRoles({ AllowedProjectRoles.DATA_OWNER, AllowedProjectRoles.DATA_SCIENTIST })
@JWTRequired(acceptedTokens = { Audience.API }, allowedUserRoles = { "HOPS_ADMIN", "HOPS_USER" })
public Response getTensorBoard(@Context SecurityContext sc) throws TensorBoardException {
    try {
        final Users caller = jWTHelper.getUserPrincipal(sc);
        final TensorBoardDTO found = tensorBoardController.getTensorBoard(project, caller);
        if (found != null) {
            return Response.ok().entity(found).build();
        }
        // Nothing registered for this user in this project
        throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_NOT_FOUND, Level.FINE);
    } catch (PersistenceException pe) {
        // Surface persistence failures as a fetch error, keeping the original cause
        throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_FETCH_ERROR, Level.SEVERE, null, pe.getMessage(), pe);
    }
}
Also used : TensorBoardDTO(io.hops.hopsworks.common.dao.tensorflow.config.TensorBoardDTO) PersistenceException(javax.persistence.PersistenceException) Users(io.hops.hopsworks.persistence.entity.user.Users) TensorBoardException(io.hops.hopsworks.exceptions.TensorBoardException) Produces(javax.ws.rs.Produces) GET(javax.ws.rs.GET) JWTRequired(io.hops.hopsworks.jwt.annotation.JWTRequired) ApiOperation(io.swagger.annotations.ApiOperation) AllowedProjectRoles(io.hops.hopsworks.api.filter.AllowedProjectRoles)

Aggregations

TensorBoardException (io.hops.hopsworks.exceptions.TensorBoardException)4 IOException (java.io.IOException)3 ProcessDescriptor (io.hops.hopsworks.common.util.ProcessDescriptor)2 ProcessResult (io.hops.hopsworks.common.util.ProcessResult)2 File (java.io.File)2 AllowedProjectRoles (io.hops.hopsworks.api.filter.AllowedProjectRoles)1 TensorBoardDTO (io.hops.hopsworks.common.dao.tensorflow.config.TensorBoardDTO)1 DistributedFileSystemOps (io.hops.hopsworks.common.hdfs.DistributedFileSystemOps)1 JWTRequired (io.hops.hopsworks.jwt.annotation.JWTRequired)1 TensorBoard (io.hops.hopsworks.persistence.entity.tensorflow.TensorBoard)1 Users (io.hops.hopsworks.persistence.entity.user.Users)1 ApiOperation (io.swagger.annotations.ApiOperation)1 UnknownHostException (java.net.UnknownHostException)1 Date (java.util.Date)1 Schedule (javax.ejb.Schedule)1 PersistenceException (javax.persistence.PersistenceException)1 GET (javax.ws.rs.GET)1 Produces (javax.ws.rs.Produces)1