use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.
the class AsynchronousGitCommandExecutor method execute.
/**
 * Starts the git command service for the given execution by running the sudo-enabled
 * {@code git.sh start} script, retrying failed attempts up to five times.
 * <p>
 * After a successful start the pid is read from {@code git.pid} in the run directory and stored as the
 * cid of the execution's repository. If no pid could be obtained after all attempts, the execution is
 * marked as failed through {@code updateExecutionStateToFail}.
 *
 * @param gitOpExecution the git operation execution to start
 * @param gitPaths paths (conf, run and git directories) used by this execution
 */
@Asynchronous
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public void execute(GitOpExecution gitOpExecution, GitPaths gitPaths) {
  int maxTries = 5;
  String pid = "";
  String gitCommand = gitOpExecution.getGitCommandConfiguration().getCommandType().getGitCommand();
  String prog = settings.getSudoersDir() + "/git.sh";
  String commandArgumentsFile = gitPaths.getConfDirPath() + File.separator
      + GitContainerLaunchScriptArgumentsTemplate.FILE_NAME;
  String pidFile = gitPaths.getRunDirPath() + "/git.pid";
  while (maxTries > 0 && Strings.isNullOrEmpty(pid)) {
    try {
      ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
          .addCommand("/usr/bin/sudo")
          .addCommand(prog)
          .addCommand("start")
          .addCommand(commandArgumentsFile)
          .redirectErrorStream(true)
          .setCurrentWorkingDirectory(new File(gitPaths.getGitPath()))
          .setWaitTimeout(60L, TimeUnit.SECONDS)
          .build();
      ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
      if (processResult.getExitCode() != 0) {
        String errorMsg = "Could not start git service to execute command " + gitCommand + " . " + "Exit code: "
            + processResult.getExitCode() + " Error: stdout: " + processResult.getStdout() + " stderr: "
            + processResult.getStderr();
        LOGGER.log(Level.SEVERE, errorMsg);
        throw new IOException(errorMsg);
      }
      String startedPid = com.google.common.io.Files.readFirstLine(new File(pidFile), Charset.defaultCharset());
      // An empty pid file must count as a failed attempt. Without this check no exception is raised, the
      // attempt counter is never decremented and the loop would spin forever (while also storing a null cid).
      if (Strings.isNullOrEmpty(startedPid)) {
        throw new IOException("Git service for command " + gitCommand + " exited with code 0 but pid file "
            + pidFile + " is empty");
      }
      pid = startedPid;
      // Get the updated repository
      GitRepository repository = gitRepositoryFacade.findById(gitOpExecution.getRepository().getId())
          .orElseThrow(() -> new IllegalStateException(
              "Git repository with id " + gitOpExecution.getRepository().getId() + " not found"));
      // NOTE(review): if the lookup or update fails here, pid is already set, so the loop ends without a
      // retry and the execution is not marked as failed - confirm this is intended.
      gitRepositoryFacade.updateRepositoryCid(repository, pid);
      // gitOpExecutionFacade.updateState(gitOpExecution, GitOpExecutionState.SUBMITTED);
    } catch (Exception ex) {
      LOGGER.log(Level.SEVERE, "Problem executing shell script to start git command service", ex);
      maxTries--;
    }
  }
  if (Strings.isNullOrEmpty(pid)) {
    updateExecutionStateToFail(gitOpExecution);
  }
}
use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.
the class LocalhostTfServingController method startServingInstance.
/**
 * Starts a Tensorflow serving instance. Executes the tfserving bash script to launch a tensorflow serving
 * server and records the PID of the server (read from {@code tfserving.pid}) for monitoring.
 * <p>
 * The lock on the serving entry is released exactly once, when this method returns or throws, whichever
 * step failed. If the start script fails, the serving is marked as stopped before the exception is thrown.
 *
 * @param project the project to start the serving in
 * @param user the user starting the serving
 * @param serving the serving instance to start (tfserving modelserver)
 * @throws ServingException if the start command cannot be built, the TLS certificates cannot be
 *                          materialized, or the start script fails
 */
public void startServingInstance(Project project, Users user, Serving serving) throws ServingException {
  String script = settings.getSudoersDir() + "/tfserving.sh";
  // TODO(Fabio) this is bad as we don't know if the port is used or not
  Integer grpcPort = ThreadLocalRandom.current().nextInt(40000, 59999);
  Integer restPort = ThreadLocalRandom.current().nextInt(40000, 59999);
  Path secretDir = Paths.get(settings.getStagingDir(), SERVING_DIRS + serving.getLocalDir());
  try {
    ProcessDescriptor processDescriptor;
    try {
      processDescriptor = new ProcessDescriptor.Builder()
          .addCommand("/usr/bin/sudo")
          .addCommand(script)
          .addCommand("start")
          .addCommand(serving.getName())
          .addCommand(Paths.get(serving.getModelPath(), serving.getModelVersion().toString()).toString())
          .addCommand(String.valueOf(grpcPort))
          .addCommand(String.valueOf(restPort))
          .addCommand(secretDir.toString())
          .addCommand(project.getName() + USER_NAME_DELIMITER + user.getUsername())
          .addCommand(serving.isBatchingEnabled() ? "1" : "0")
          .addCommand(project.getName().toLowerCase())
          .addCommand(projectUtils.getFullDockerImageName(project, true))
          .setWaitTimeout(2L, TimeUnit.MINUTES)
          .ignoreOutErrStreams(false)
          .build();
      logger.log(Level.INFO, processDescriptor.toString());
    } catch (ServiceDiscoveryException ex) {
      throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null,
          ex.getMessage(), ex);
    }
    // Materialized TLS certificates to be able to read the model
    if (settings.getHopsRpcTls()) {
      try {
        certificateMaterializer.materializeCertificatesLocal(user.getUsername(), project.getName());
      } catch (IOException e) {
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null,
            e.getMessage(), e);
      }
    }
    try {
      ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
      if (processResult.getExitCode() != 0) {
        // Startup process failed for some reason; the catch below marks the serving as stopped
        throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.INFO);
      }
      // Read the pid for TensorFlow Serving server
      Path cidFilePath = Paths.get(secretDir.toString(), "tfserving.pid");
      String cid = Files.readFirstLine(cidFilePath.toFile(), Charset.defaultCharset());
      // Update the info in the db
      serving.setCid(cid);
      serving.setLocalPort(restPort);
      serving.setDeployed(new Date());
      servingFacade.updateDbObject(serving, project);
    } catch (Exception ex) {
      // Startup process failed for some reason
      serving.setCid(CID_STOPPED);
      servingFacade.updateDbObject(serving, project);
      throw new ServingException(RESTCodes.ServingErrorCode.LIFECYCLEERRORINT, Level.SEVERE, null,
          ex.getMessage(), ex);
    } finally {
      if (settings.getHopsRpcTls()) {
        certificateMaterializer.removeCertificatesLocal(user.getUsername(), project.getName());
      }
    }
  } finally {
    // Release lock on the serving entry on every exit path, and only once the start attempt is over
    servingFacade.releaseLock(project, serving.getId());
  }
}
use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.
the class DatasetController method unzip.
/**
 * Unzips the file at {@code path} by running the {@code unzip-background.sh} script with a staging
 * directory derived from the compression info.
 * <p>
 * The unzipping state is registered in the settings before the script is executed. The optional
 * destination is passed to the script only when it is not null.
 *
 * @param project the project used to resolve the HDFS user name
 * @param user the user used to resolve the HDFS user name
 * @param path the file to unzip; must pass {@code checkFileExists}
 * @param destPath optional destination path, may be null
 * @throws DatasetException with COMPRESSION_SIZE_ERROR when the script exits with code 2, or with
 *                          COMPRESSION_ERROR for any other non-zero exit code or an IOException
 */
public void unzip(Project project, Users user, Path path, Path destPath) throws DatasetException {
  final String hdfsUsername = hdfsUsersController.getHdfsUserName(project, user);
  checkFileExists(path, hdfsUsername);

  final CompressionInfo info = new CompressionInfo(path, destPath);
  final String stagingPath = settings.getStagingDir() + File.separator + info.getStagingDirectory();
  new File(stagingPath).mkdirs();
  settings.addUnzippingState(info);

  // Assemble the script invocation one argument at a time
  final ProcessDescriptor.Builder builder = new ProcessDescriptor.Builder();
  builder.addCommand(settings.getHopsworksDomainDir() + "/bin/unzip-background.sh");
  builder.addCommand(stagingPath);
  builder.addCommand(path.toString());
  builder.addCommand(hdfsUsername);
  if (destPath != null) {
    builder.addCommand(destPath.toString());
  }
  final ProcessDescriptor descriptor = builder.ignoreOutErrStreams(true).build();

  try {
    final int exitCode = osProcessExecutor.execute(descriptor).getExitCode();
    switch (exitCode) {
      case 0:
        return;
      case 2:
        throw new DatasetException(RESTCodes.DatasetErrorCode.COMPRESSION_SIZE_ERROR, Level.WARNING);
      default:
        throw new DatasetException(RESTCodes.DatasetErrorCode.COMPRESSION_ERROR, Level.WARNING,
            "path: " + path.toString() + ", result: " + exitCode);
    }
  } catch (IOException ioe) {
    throw new DatasetException(RESTCodes.DatasetErrorCode.COMPRESSION_ERROR, Level.SEVERE,
        "path: " + path.toString(), ioe.getMessage(), ioe);
  }
}
use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.
the class LocalHostJupyterProcessMgr method startJupyterServer.
/**
 * Starts a Jupyter server by running the sudo-enabled {@code jupyter.sh start} script.
 * <p>
 * The port is picked at random once and the configuration is generated for it; the script is then
 * attempted up to five times. An attempt fails on a non-zero exit code or on any exception. On the first
 * successful attempt the pid is read from {@code jupyter.pid} in the run directory and returned in the DTO.
 *
 * @param project the project the server is started for
 * @param secretConfig secret configuration, passed to the configuration generator and the returned DTO
 * @param hdfsUser HDFS user name passed to the start script
 * @param user the user passed to the configuration generator
 * @param js Jupyter settings (secret, mode, git backend flag)
 * @param allowOrigin value passed to the configuration generator
 * @return a DTO carrying the port, token, pid, secret configuration and certificates directory
 * @throws ServiceException with JUPYTER_START_ERROR when all attempts failed
 * @throws JobException not thrown directly here; presumably raised while generating the configuration
 */
@Override
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public JupyterDTO startJupyterServer(Project project, String secretConfig, String hdfsUser, Users user, JupyterSettings js, String allowOrigin) throws ServiceException, JobException {
  final String launcher = settings.getSudoersDir() + "/jupyter.sh";
  final Integer port = ThreadLocalRandom.current().nextInt(40000, 59999);
  final JupyterPaths paths = jupyterConfigFilesGenerator.generateConfiguration(project, secretConfig, hdfsUser,
      user, js, port, allowOrigin);
  final String secretDir = settings.getStagingDir() + Settings.PRIVATE_DIRS + js.getSecret();
  final String token = TokenGenerator.generateToken(TOKEN_LENGTH);

  // Every iteration either returns the DTO or ends in the catch block, so a counted loop is sufficient
  for (int attemptsLeft = 5; attemptsLeft > 0; attemptsLeft--) {
    try {
      ProcessDescriptor descriptor = new ProcessDescriptor.Builder()
          .addCommand("/usr/bin/sudo")
          .addCommand(launcher)
          .addCommand("start")
          .addCommand(paths.getNotebookPath())
          .addCommand(settings.getHadoopSymbolicLinkDir() + "-" + settings.getHadoopVersion())
          .addCommand(hdfsUser)
          .addCommand(settings.getAnacondaProjectDir())
          .addCommand(port.toString())
          .addCommand(HopsUtils.getJupyterLogName(hdfsUser, port))
          .addCommand(secretDir)
          .addCommand(paths.getCertificatesDir())
          .addCommand(hdfsUser)
          .addCommand(token)
          .addCommand(js.getMode().getValue())
          .addCommand(projectUtils.getFullDockerImageName(project, false))
          .addCommand(Boolean.toString(js.isGitBackend()))
          .redirectErrorStream(true)
          .setCurrentWorkingDirectory(new File(paths.getNotebookPath()))
          .setWaitTimeout(60L, TimeUnit.SECONDS)
          .build();
      String pidFilePath = paths.getRunDirPath() + "/jupyter.pid";
      ProcessResult outcome = osProcessExecutor.execute(descriptor);
      if (outcome.getExitCode() == 0) {
        // Read the pid for Jupyter Notebook
        String cid = com.google.common.io.Files.readFirstLine(new File(pidFilePath), Charset.defaultCharset());
        return new JupyterDTO(port, token, cid, secretConfig, paths.getCertificatesDir());
      }
      String failure = "Could not start Jupyter server. Exit code: " + outcome.getExitCode()
          + " Error: stdout: " + outcome.getStdout() + " stderr: " + outcome.getStderr();
      LOGGER.log(Level.SEVERE, failure);
      throw new IOException(failure);
    } catch (Exception ex) {
      LOGGER.log(Level.SEVERE, "Problem executing shell script to start Jupyter server", ex);
    }
  }

  final String errorMsg = "Failed to start Jupyter";
  throw new ServiceException(RESTCodes.ServiceErrorCode.JUPYTER_START_ERROR, Level.SEVERE, errorMsg,
      errorMsg + " for project " + project);
}
use of io.hops.hopsworks.common.util.ProcessDescriptor in project hopsworks by logicalclocks.
the class TensorBoardProcessMgr method startTensorBoard.
/**
 * Start the TensorBoard process.
 * <p>
 * Materializes the user's certificates into {@code <tbSecretDir>/certs}, then runs the sudo-enabled
 * {@code tensorboard.sh start} script on a freshly chosen random port, for at most three attempts. An
 * attempt succeeds when the script exits with code 0 and a {@code <port>.pid} file was written in the
 * TensorBoard directory. When every attempt fails, the certificates and the TensorBoard directory are
 * cleaned up.
 *
 * @param project the project the TensorBoard belongs to
 * @param user the user whose certificates are materialized
 * @param hdfsUser the HDFS user whose name is passed to the start script
 * @param hdfsLogdir the log directory passed to the start script
 * @param tensorBoardDirectory directory name appended to the TensorBoard staging base path to form the
 *                             working directory of this TensorBoard
 * @return a DTO holding the endpoint ({@code host:port}) and the pid read from the pid file
 * @throws TensorBoardException if the certificates cannot be materialized or all start attempts fail
 */
public TensorBoardDTO startTensorBoard(Project project, Users user, HdfsUsers hdfsUser, String hdfsLogdir, String tensorBoardDirectory) throws TensorBoardException {
  String prog = settings.getSudoersDir() + "/tensorboard.sh";
  String tbBasePath = settings.getStagingDir() + Settings.TENSORBOARD_DIRS;
  String tbSecretDir = tbBasePath + tensorBoardDirectory;
  // Single location of the materialized certificates, used for materialization and for both cleanups
  String certsPath = tbSecretDir + "/certs";
  File tbDir = new File(tbSecretDir);
  if (!tbDir.exists()) {
    tbDir.mkdirs();
  }
  DistributedFileSystemOps dfso = dfsService.getDfsOps();
  try {
    File certsDir = new File(certsPath);
    certsDir.mkdirs();
    HopsUtils.materializeCertificatesForUserCustomDir(project.getName(), user.getUsername(),
        settings.getHdfsTmpCertDir(), dfso, certificateMaterializer, settings, certsPath);
  } catch (IOException ioe) {
    LOGGER.log(Level.SEVERE, "Failed in materializing certificates for " + hdfsUser + " in directory "
        + certsPath, ioe);
    HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(),
        settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
    throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE,
        "Failed to start TensorBoard", "An exception occurred while materializing certificates", ioe);
  } finally {
    if (dfso != null) {
      dfsService.closeDfsClient(dfso);
    }
  }
  String anacondaEnvironmentPath = settings.getAnacondaProjectDir();
  for (int retries = 3; retries > 0; retries--) {
    try {
      // Reset per attempt so a pid read during an earlier, failed attempt is never returned for this one
      String cid = null;
      // Each attempt uses a new random port
      int port = ThreadLocalRandom.current().nextInt(40000, 59999);
      ProcessDescriptor processDescriptor = new ProcessDescriptor.Builder()
          .addCommand("/usr/bin/sudo")
          .addCommand(prog)
          .addCommand("start")
          .addCommand(hdfsUser.getName())
          .addCommand(hdfsLogdir)
          .addCommand(tbSecretDir)
          .addCommand(Integer.toString(port))
          .addCommand(anacondaEnvironmentPath)
          .addCommand(projectUtils.getFullDockerImageName(project, true))
          .addCommand(Integer.toString(settings.getTensorBoardMaxReloadThreads()))
          .ignoreOutErrStreams(true)
          .build();
      LOGGER.log(Level.FINE, processDescriptor.toString());
      ProcessResult processResult = osProcessExecutor.execute(processDescriptor);
      if (!processResult.processExited()) {
        throw new IOException("TensorBoard start process timed out!");
      }
      int exitValue = processResult.getExitCode();
      String pidPath = tbSecretDir + File.separator + port + ".pid";
      File pidFile = new File(pidPath);
      // Read the pid for TensorBoard server
      if (pidFile.exists()) {
        cid = Files.readFirstLine(pidFile, Charset.defaultCharset());
      }
      if (exitValue == 0 && cid != null) {
        TensorBoardDTO tensorBoardDTO = new TensorBoardDTO();
        String host = null;
        try {
          host = InetAddress.getLocalHost().getHostAddress();
        } catch (UnknownHostException ex) {
          LOGGER.log(Level.SEVERE, null, ex);
        }
        tensorBoardDTO.setEndpoint(host + ":" + port);
        tensorBoardDTO.setCid(cid);
        return tensorBoardDTO;
      } else {
        LOGGER.log(Level.SEVERE, "Failed starting TensorBoard got exitcode " + exitValue
            + " retrying on new port");
        if (cid != null) {
          this.killTensorBoard(cid);
        }
      }
    } catch (Exception ex) {
      // log(Level, String, Throwable) does not substitute placeholders, so the message carries none
      LOGGER.log(Level.SEVERE, "Problem starting TensorBoard", ex);
    }
  }
  // All attempts failed: remove the certificates from the directory they were materialized in
  HopsUtils.cleanupCertificatesForUserCustomDir(user.getUsername(), project.getName(),
      settings.getHdfsTmpCertDir(), certificateMaterializer, certsPath, settings);
  removeTensorBoardDirectory(tbSecretDir);
  throw new TensorBoardException(RESTCodes.TensorBoardErrorCode.TENSORBOARD_START_ERROR, Level.SEVERE,
      "Failed to start TensorBoard after exhausting retry attempts");
}
Aggregations