use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.
the class TensorBoardKillTimer method rotate.
/**
 * Periodic TensorBoard housekeeping, scheduled every 10 minutes.
 *
 * <p>Runs two passes:
 * <ol>
 *   <li>Cleans up every TensorBoard found by the facade whose last-accessed timestamp is older than
 *       the configured maximum (compared in milliseconds).</li>
 *   <li>Scans the TensorBoard staging directory for {@code .pid} files whose content does not match
 *       the cid of any TensorBoard found by the facade, kills that process and removes the directory
 *       the pid file was found in.</li>
 * </ol>
 * All failures are logged; nothing is propagated to the timer service.
 *
 * @param timer the timer that triggered this invocation (unused)
 */
@Schedule(persistent = false, minute = "*/10", hour = "*")
public void rotate(Timer timer) {
  try {
    LOGGER.log(Level.INFO, "Running TensorBoardKillTimer.");
    int tensorBoardMaxLastAccessed = settings.getTensorBoardMaxLastAccessed();
    Collection<TensorBoard> tensorBoardCollection = tensorBoardFacade.findAll();
    for (TensorBoard tensorBoard : tensorBoardCollection) {
      // Standard case, TB have been idle for a given amount of time
      Date accessed = tensorBoard.getLastAccessed();
      long idleMillis = System.currentTimeMillis() - accessed.getTime();
      if (idleMillis > tensorBoardMaxLastAccessed) {
        try {
          tensorBoardController.cleanup(tensorBoard);
          LOGGER.log(Level.FINE, "Killed TensorBoard " + tensorBoard.toString() + " not accessed in the last "
              + tensorBoardMaxLastAccessed + " milliseconds");
        } catch (TensorBoardException ex) {
          LOGGER.log(Level.SEVERE, "Failed to clean up running TensorBoard", ex);
        }
      }
    }
    // sanity check to make sure that all .pid files have a corresponding TB
    try {
      // Re-read after the idle cleanup above so that TBs just cleaned up are no longer considered known
      List<TensorBoard> knownTensorBoards = tensorBoardFacade.findAll();
      String tbDirPath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS;
      File tbDir = new File(tbDirPath);
      // listFiles() returns null when the path is not a directory or cannot be read
      File[] tbDirs = tbDir.listFiles();
      if (tbDirs == null) {
        LOGGER.log(Level.FINE, "TensorBoard staging directory " + tbDirPath + " could not be listed, skipping scan");
        return;
      }
      // For each project_projectmember directory try to find .pid file
      for (File currentTbDir : tbDirs) {
        File[] candidates = currentTbDir.listFiles();
        if (candidates == null) {
          // Not a directory (or unreadable): nothing to inspect here
          continue;
        }
        for (File possiblePidFile : candidates) {
          if (!possiblePidFile.getName().endsWith(".pid")) {
            continue;
          }
          String cid = com.google.common.io.Files.readFirstLine(possiblePidFile, Charset.defaultCharset());
          if (cid == null) {
            continue;
          }
          // do not kill TBs which are in the DB
          boolean tbExists = false;
          for (TensorBoard tb : knownTensorBoards) {
            // cid is known non-null here, so compare from its side to tolerate a null cid in the DB
            if (cid.equals(tb.getCid())) {
              tbExists = true;
              break;
            }
          }
          if (!tbExists) {
            LOGGER.log(Level.WARNING, "Detected a stray TensorBoard with pid " + cid + " in directory "
                + currentTbDir.getAbsolutePath() + ", cleaning up...");
            try {
              tensorBoardProcessMgr.killTensorBoard(cid);
              tensorBoardProcessMgr.removeTensorBoardDirectory(currentTbDir.getAbsolutePath());
            } catch (TensorBoardException ex) {
              // Keep scanning: one stray TB that cannot be cleaned up must not block the others
              LOGGER.log(Level.SEVERE, "Failed to clean up stray TensorBoard with pid " + cid, ex);
            }
          }
        }
      }
    } catch (IOException | NumberFormatException e) {
      LOGGER.log(Level.SEVERE, "Exception while reading .pid files", e);
    }
  } catch (Exception e) {
    LOGGER.log(Level.SEVERE, "An error occurred while checking for expired TensorBoards to be cleaned up", e);
  }
}
use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.
the class TensorBoardProcessMgr method startTensorBoard.
/**
 * Start the TensorBoard process.
 *
 * <p>Creates the TensorBoard directory under the staging area, materializes the user's certificates
 * into its {@code certs} sub-directory, and then runs {@code tensorboard.sh start} through sudo on a
 * randomly chosen port in [40000, 59999). Up to three attempts are made, each on a new random port.
 * If every attempt fails, the certificates are cleaned up and the TensorBoard directory is removed.
 *
 * @param project project the TensorBoard is started for
 * @param user user the certificates are materialized for
 * @param hdfsUser HDFS user whose name is passed to the start script
 * @param hdfsLogdir log directory passed to the start script
 * @param tensorBoardDirectory path appended to the staging TensorBoard base path to form the
 *     working directory of this TensorBoard
 * @return DTO carrying the endpoint ({@code host:port}) and the cid read from the pid file
 * @throws TensorBoardException if the certificates cannot be materialized, if all start attempts
 *     fail, or if the directory cleanup after the failed attempts fails
 */
public TensorBoardDTO startTensorBoard(Project project, Users user, HdfsUsers hdfsUser, String hdfsLogdir,
    String tensorBoardDirectory) throws TensorBoardException {
  String prog = settings.getSudoersDir() + "/tensorboard.sh";
  String tbBasePath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS;
  String tbSecretDir = tbBasePath + tensorBoardDirectory;
  // Single source of truth for the certificate location: used for materialization and for both cleanups
  String certsPath = tbSecretDir + "/certs";
  File tbDir = new File(tbSecretDir);
  if (!tbDir.exists()) {
    tbDir.mkdirs();
  }
  DistributedFileSystemOps dfso = dfsService.getDfsOps();
  try {
    File certsDir = new File(certsPath);
    certsDir.mkdirs();
    HopsUtils.materializeCertificatesForUserCustomDir(project.getName(), user.getUsername(),
        settings.getHdfsTmpCertDir(), dfso, certificateMaterializer, settings, certsPath);
  } catch (IOException ioe) {
    LOGGER.log(Level.SEVERE, "Failed in materializing certificates for " + hdfsUser + " in directory " + certsPath,
        ioe);
    HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(),
        settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
    throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE,
        "Failed to start TensorBoard", "An exception occurred while materializing certificates", ioe);
  } finally {
    if (dfso != null) {
      dfsService.closeDfsClient(dfso);
    }
  }
  String anacondaEnvironmentPath = settings.getAnacondaProjectDir();
  for (int retries = 3; retries > 0; retries--) {
    try {
      // The cid is scoped to the attempt so a pid read by a failed (and killed) earlier attempt can
      // never be returned for a later attempt that produced no pid file
      String cid = null;
      // use pidfile to kill any running servers
      int port = ThreadLocalRandom.current().nextInt(40000, 59999);
      ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
          .addCommand("/usr/bin/sudo")
          .addCommand(prog)
          .addCommand("start")
          .addCommand(hdfsUser.getName())
          .addCommand(hdfsLogdir)
          .addCommand(tbSecretDir)
          .addCommand(Integer.toString(port))
          .addCommand(anacondaEnvironmentPath)
          .addCommand(projectUtils.getFullDockerImageName(project, true))
          .addCommand(Integer.toString(settings.getTensorBoardMaxReloadThreads()))
          .ignoreOutErrStreams(true)
          .build();
      LOGGER.log(Level.FINE, processDescriptor.toString());
      ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
      if (!processResult.processExited()) {
        throw new IOException("TensorBoard start process timed out!");
      }
      int exitValue = processResult.getExitCode();
      String pidPath = tbSecretDir + File.separator + port + ".pid";
      File pidFile = new File(pidPath);
      // Read the pid for TensorBoard server
      if (pidFile.exists()) {
        cid = Files.readFirstLine(pidFile, Charset.defaultCharset());
      }
      if (exitValue == 0 && cid != null) {
        TensorBoardDTO tensorBoardDTO = new TensorBoardDTO();
        String host = null;
        try {
          host = InetAddress.getLocalHost().getHostAddress();
        } catch (UnknownHostException ex) {
          // NOTE(review): host stays null here, so the endpoint becomes "null:<port>" — confirm
          // whether this should instead be treated as a start failure
          LOGGER.log(Level.SEVERE, null, ex);
        }
        tensorBoardDTO.setEndpoint(host + ":" + port);
        tensorBoardDTO.setCid(cid);
        return tensorBoardDTO;
      } else {
        LOGGER.log(Level.SEVERE, "Failed starting TensorBoard got exitcode " + exitValue + " retrying on new port");
        if (cid != null) {
          this.killTensorBoard(cid);
        }
      }
    } catch (Exception ex) {
      // log(Level, String, Throwable) does not substitute {0}-style placeholders, so none is used
      LOGGER.log(Level.SEVERE, "Problem starting TensorBoard", ex);
    }
  }
  // All attempts failed: undo the certificate materialization and remove the working directory
  HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(),
      settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
  removeTensorBoardDirectory(tbSecretDir);
  throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE,
      "Failed to start TensorBoard after exhausting retry attempts");
}
use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.
the class TensorBoardProcessMgr method removeTensorBoardDirectory.
/**
 * Remove a TensorBoard directory by running {@code tensorboard.sh cleanup} through sudo.
 *
 * @param tensorBoardDirectoryPath path handed to the cleanup script
 * @throws TensorBoardException if the script cannot be executed, does not exit, or exits with a
 *     non-zero code
 */
public void removeTensorBoardDirectory(String tensorBoardDirectoryPath) throws TensorBoardException {
  String cleanupScript = settings.getSudoersDir() + "/tensorboard.sh";
  ProcessDescriptor cleanupCommand = new ProcessDescriptor.Builder()
      .addCommand("/usr/bin/sudo")
      .addCommand(cleanupScript)
      .addCommand("cleanup")
      .addCommand(tensorBoardDirectoryPath)
      .ignoreOutErrStreams(true)
      .build();
  LOGGER.log(Level.FINE, cleanupCommand.toString());

  // Developer-facing detail shared by both failure paths
  String failureDetail = "Could not delete TensorBoard directory: " + tensorBoardDirectoryPath;

  final ProcessResult outcome;
  try {
    outcome = osProcessExecutor.execute(cleanupCommand);
  } catch (IOException ioFailure) {
    throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_CLEANUP_ERROR, Level.SEVERE,
        "Failed to cleanup TensorBoard", failureDetail, ioFailure);
  }

  // Success requires the script to have exited, and with exit code 0
  boolean cleanedUp = outcome.processExited() && outcome.getExitCode() == 0;
  if (!cleanedUp) {
    throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_CLEANUP_ERROR, Level.SEVERE,
        "Failed to cleanup TensorBoard", failureDetail);
  }
}
use of io.hops.hopsworks.exceptions.TensorBoardException in project hopsworks by logicalclocks.
the class TensorBoardResource method getTensorBoard.
/**
 * Return the TensorBoard that the controller resolves for the calling user in this resource's project.
 *
 * @param sc security context the calling user is resolved from
 * @return 200 response whose entity is the TensorBoard DTO
 * @throws TensorBoardException with TENSORBOARD_NOT_FOUND if the controller returns no TensorBoard,
 *     or TENSORBOARD_FETCH_ERROR if a persistence failure occurs while looking it up
 */
@ApiOperation(value = "Get the TensorBoard", response = TensorBoardDTO.class)
@GET
@Produces(MediaType.APPLICATION_JSON)
@AllowedProjectRoles({ AllowedProjectRoles.DATA_OWNER, AllowedProjectRoles.DATA_SCIENTIST })
@JWTRequired(acceptedTokens = { Audience.API }, allowedUserRoles = { "HOPS_ADMIN", "HOPS_USER" })
public Response getTensorBoard(@Context SecurityContext sc) throws TensorBoardException {
  final TensorBoardDTO found;
  try {
    Users caller = jWTHelper.getUserPrincipal(sc);
    found = tensorBoardController.getTensorBoard(project, caller);
  } catch (PersistenceException persistenceFailure) {
    // Translate persistence failures into the REST-level fetch error, keeping the cause
    throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_FETCH_ERROR, Level.SEVERE, null,
        persistenceFailure.getMessage(), persistenceFailure);
  }

  if (found != null) {
    Response.ResponseBuilder ok = Response.ok();
    return ok.entity(found).build();
  }
  throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_NOT_FOUND, Level.FINE);
}
Aggregations