Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.
The class SparkController, method createSparkJob.
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
private SparkJob createSparkJob(String username, Jobs job, Users user)
    throws JobException, GenericException, ServiceException {
  SparkJob sparkjob = null;
  try {
    // Set Hopsworks consul service domain, don't use the address, use the name
    String hopsworksRestEndpoint = "https://" + serviceDiscoveryController
        .constructServiceFQDNWithPort(ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP);
    UserGroupInformation proxyUser = ugiService.getProxyUser(username);
    try {
      sparkjob = proxyUser.doAs((PrivilegedExceptionAction<SparkJob>) () ->
          new SparkJob(job, submitter, user, settings.getHadoopSymbolicLinkDir(),
              hdfsUsersBean.getHdfsUserName(job.getProject(), user), settings,
              kafkaBrokers.getKafkaBrokersString(), hopsworksRestEndpoint,
              servingConfig, serviceDiscoveryController));
    } catch (InterruptedException ex) {
      LOGGER.log(Level.SEVERE, null, ex);
    }
  } catch (IOException ex) {
    throw new JobException(RESTCodes.JobErrorCode.PROXY_ERROR, Level.SEVERE,
        "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
  } catch (ServiceDiscoveryException ex) {
    throw new ServiceException(RESTCodes.ServiceErrorCode.SERVICE_NOT_FOUND, Level.SEVERE,
        "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
  }
  if (sparkjob == null) {
    throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ERROR, Level.WARNING,
        "Could not instantiate job with name: " + job.getName() + " and id: " + job.getId(),
        "sparkjob object was null");
  }
  return sparkjob;
}
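In createSparkJob the SparkJob is constructed inside proxyUser.doAs(...) so that the HDFS and YARN calls it triggers run as the project user rather than the Hopsworks service user. Below is a minimal sketch of that impersonation pattern using only Hadoop's UserGroupInformation API; ugiService.getProxyUser is a Hopsworks bean, so the plain createProxyUser call and the user name are illustrative assumptions:

import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.security.UserGroupInformation;

public class ProxyUserSketch {
  public static void main(String[] args) throws Exception {
    // "project_user" is a placeholder; Hopsworks resolves the real name via hdfsUsersBean.
    UserGroupInformation proxyUser =
        UserGroupInformation.createProxyUser("project_user", UserGroupInformation.getLoginUser());
    String effectiveUser = proxyUser.doAs((PrivilegedExceptionAction<String>) () ->
        // Anything executed inside doAs is attributed to the proxied user by Hadoop's security layer.
        UserGroupInformation.getCurrentUser().getUserName());
    System.out.println(effectiveUser); // prints project_user
  }
}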
Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.
The class SparkController, method startJob.
/**
 * Start the Spark job as the given user.
 * <p/>
 * @param job the job to start
 * @param args runtime arguments for this execution
 * @param user the user starting the job
 * @return the Execution created for this run of the Spark job
 * @throws IllegalStateException If Spark is not set up properly.
 * @throws JobException If starting the job fails.
 */
public Execution startJob(final Jobs job, String args, final Users user)
    throws ServiceException, GenericException, JobException, ProjectException {
  // First: some parameter checking.
  sanityCheck(job, user);
  String username = hdfsUsersBean.getHdfsUserName(job.getProject(), user);
  SparkJobConfiguration sparkConfig = (SparkJobConfiguration) job.getJobConfig();
  String appPath = sparkConfig.getAppPath();
  if (job.getJobType().equals(JobType.PYSPARK)) {
    if (job.getProject().getPythonEnvironment() == null) {
      // Throw error in Hopsworks UI to notify user to enable Anaconda
      throw new JobException(RESTCodes.JobErrorCode.JOB_START_FAILED, Level.SEVERE,
          "PySpark job needs to have Python Anaconda environment enabled");
    }
  }
  SparkJob sparkjob = createSparkJob(username, job, user);
  Execution exec = sparkjob.requestExecutionId(args);
  if (job.getJobType().equals(JobType.PYSPARK) && appPath.endsWith(".ipynb")) {
    submitter.getExecutionFacade().updateState(exec, JobState.CONVERTING_NOTEBOOK);
    String pyAppPath = HopsUtils.prepJupyterNotebookConversion(exec, username, dfs);
    sparkConfig.setAppPath(pyAppPath);
    jupyterController.convertIPythonNotebook(username, appPath, job.getProject(), pyAppPath,
        jupyterController.getNotebookConversionType(appPath, user, job.getProject()));
  }
  submitter.startExecution(sparkjob, args);
  activityFacade.persistActivity(ActivityFacade.RAN_JOB + job.getName(), job.getProject(),
      user.asUser(), ActivityFlag.JOB);
  return exec;
}
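A minimal sketch of how a caller might invoke startJob and react to the JobException raised, for example, when a PySpark job has no Python environment enabled. The enclosing method, the injected SparkController, Jobs and Users instances, and the "--epochs 10" argument string are illustrative assumptions, not Hopsworks code:

// Hypothetical caller sketch; sparkController, job and user are assumed to be injected elsewhere.
public Execution launchTrainingJob(SparkController sparkController, Jobs job, Users user) {
  java.util.logging.Logger logger = java.util.logging.Logger.getLogger(getClass().getName());
  try {
    return sparkController.startJob(job, "--epochs 10", user);
  } catch (JobException je) {
    // Surfaces as JOB_START_FAILED when, for instance, the project has no Python environment.
    logger.log(Level.WARNING, "Could not start job " + job.getName() + ": " + je.getMessage(), je);
    return null;
  } catch (ServiceException | GenericException | ProjectException e) {
    logger.log(Level.SEVERE, "Unexpected failure starting job " + job.getName(), e);
    return null;
  }
}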
Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.
The class AbstractExecutionController, method retryLogAggregation.
@Override
public JobLogDTO retryLogAggregation(Execution execution, JobLogDTO.LogType type) throws JobException {
  if (!execution.getState().isFinalState()) {
    throw new JobException(RESTCodes.JobErrorCode.JOB_EXECUTION_INVALID_STATE, Level.FINE, "Job still running.");
  }
  DistributedFileSystemOps dfso = null;
  DistributedFileSystemOps udfso = null;
  Users user = execution.getUser();
  String hdfsUser = hdfsUsersController.getHdfsUserName(execution.getJob().getProject(), user);
  String aggregatedLogPath = settings.getAggregatedLogPath(hdfsUser, execution.getAppId());
  if (aggregatedLogPath == null) {
    throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, Level.INFO, "Log aggregation is not enabled");
  }
  try {
    dfso = dfs.getDfsOps();
    udfso = dfs.getDfsOps(hdfsUser);
    if (!dfso.exists(aggregatedLogPath)) {
      throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, Level.WARNING,
          "Logs not available. This could be caused by the retention policy.");
    }
    String hdfsLogPath = null;
    String[] desiredLogTypes = null;
    switch (type) {
      case OUT:
        hdfsLogPath = REMOTE_PROTOCOL + execution.getStdoutPath();
        desiredLogTypes = new String[] { type.name() };
        break;
      case ERR:
        hdfsLogPath = REMOTE_PROTOCOL + execution.getStderrPath();
        desiredLogTypes = new String[] { type.name(), ".log" };
        break;
      default:
        break;
    }
    if (!Strings.isNullOrEmpty(hdfsLogPath)) {
      YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
      ApplicationId applicationId = ConverterUtils.toApplicationId(execution.getAppId());
      YarnMonitor monitor = new YarnMonitor(applicationId, yarnClientWrapper, ycs);
      try {
        YarnLogUtil.copyAggregatedYarnLogs(udfso, aggregatedLogPath, hdfsLogPath, desiredLogTypes, monitor);
      } catch (IOException | InterruptedException | YarnException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, null, ex.getMessage());
      } finally {
        monitor.close();
      }
    }
  } catch (IOException ex) {
    LOGGER.log(Level.SEVERE, null, ex);
  } finally {
    if (dfso != null) {
      dfso.close();
    }
    if (udfso != null) {
      dfs.closeDfsClient(udfso);
    }
  }
  return getLog(execution, type);
}
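retryLogAggregation converts the execution's appId string into a YARN ApplicationId before copying the aggregated logs. A minimal sketch of that parsing step with the stock Hadoop API (ApplicationId.fromString is the non-deprecated equivalent of ConverterUtils.toApplicationId and is also used in stopExecution below); the id string is a made-up example of the standard application_<clusterTimestamp>_<sequence> format:

import org.apache.hadoop.yarn.api.records.ApplicationId;

public class AppIdParseSketch {
  public static void main(String[] args) {
    // Example id in the standard YARN format; real ids come from Execution.getAppId().
    ApplicationId appId = ApplicationId.fromString("application_1631234567890_0042");
    System.out.println(appId.getClusterTimestamp()); // 1631234567890
    System.out.println(appId.getId());               // 42
  }
}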
Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.
The class AbstractExecutionController, method stopExecution.
public Execution stopExecution(Execution execution) throws JobException {
  // An execution when it's initializing might not have an appId in hopsworks
  if (execution.getAppId() != null && JobState.getRunningStates().contains(execution.getState())) {
    YarnClientWrapper yarnClientWrapper = null;
    try {
      yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
      yarnClientWrapper.getYarnClient().killApplication(ApplicationId.fromString(execution.getAppId()));
      yarnExecutionFinalizer.removeAllNecessary(execution);
      return executionFacade.findById(execution.getId())
          .orElseThrow(() -> new JobException(RESTCodes.JobErrorCode.JOB_EXECUTION_NOT_FOUND, FINE,
              "Execution: " + execution.getId()));
    } catch (IOException | YarnException ex) {
      LOGGER.log(Level.SEVERE, "Could not kill job for job:" + execution.getJob().getName()
          + "with appId:" + execution.getAppId(), ex);
      throw new JobException(RESTCodes.JobErrorCode.JOB_STOP_FAILED, Level.WARNING, ex.getMessage(), null, ex);
    } finally {
      ycs.closeYarnClient(yarnClientWrapper);
    }
  }
  return execution;
}
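Under the Hopsworks wrapper, stopping an execution amounts to a plain YARN kill call. A minimal sketch using the stock YarnClient API; the configuration and application id are placeholders, and the YarnClientWrapper pooling and execution finalizer logic are intentionally omitted:

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class KillAppSketch {
  public static void main(String[] args) throws Exception {
    YarnClient yarnClient = YarnClient.createYarnClient();
    yarnClient.init(new YarnConfiguration()); // picks up yarn-site.xml from the classpath
    yarnClient.start();
    try {
      // Placeholder id; in AbstractExecutionController it comes from execution.getAppId().
      yarnClient.killApplication(ApplicationId.fromString("application_1631234567890_0042"));
    } finally {
      yarnClient.stop();
    }
  }
}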
Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.
The class AbstractExecutionController, method getTensorBoardUrls.
// ====================================================================================================================
// TensorBoard
// ====================================================================================================================
@Override
public List<YarnAppUrlsDTO> getTensorBoardUrls(Users user, String appId, Project project) throws JobException {
  List<YarnAppUrlsDTO> urls = new ArrayList<>();
  DistributedFileSystemOps udfso = null;
  try {
    String hdfsUser = hdfsUsersController.getHdfsUserName(project, user);
    udfso = dfs.getDfsOps(hdfsUser);
    FileStatus[] statuses = udfso.getFilesystem().globStatus(new org.apache.hadoop.fs.Path(
        "/Projects/" + project.getName() + "/Experiments/" + appId + "*/TensorBoard.*"));
    for (FileStatus status : statuses) {
      LOGGER.log(Level.FINE, "Reading TensorBoard for: {0}", status.getPath());
      FSDataInputStream in = null;
      try {
        in = udfso.open(new org.apache.hadoop.fs.Path(status.getPath().toString()));
        String url = IOUtils.toString(in, "UTF-8");
        int prefix = url.indexOf("http://");
        if (prefix != -1) {
          url = url.substring("http://".length());
        }
        String name = status.getPath().getName();
        urls.add(new YarnAppUrlsDTO(name, url));
      } catch (Exception e) {
        LOGGER.log(Level.WARNING, "Problem reading file with TensorBoard address from HDFS: " + e.getMessage());
      } finally {
        org.apache.hadoop.io.IOUtils.closeStream(in);
      }
    }
  } catch (Exception e) {
    throw new JobException(RESTCodes.JobErrorCode.TENSORBOARD_ERROR, Level.SEVERE, null, e.getMessage(), e);
  } finally {
    if (udfso != null) {
      dfs.closeDfsClient(udfso);
    }
  }
  return urls;
}
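Stripped of the Hopsworks beans, the TensorBoard lookup is a glob over HDFS followed by reading each matched file's contents. A minimal sketch with the plain Hadoop FileSystem and Commons IO APIs; the project name and application id in the path are placeholders for the values the controller builds from its arguments:

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TensorBoardGlobSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration()); // uses core-site.xml / hdfs-site.xml
    // Placeholder project and application id; the controller derives these from its parameters.
    Path pattern = new Path("/Projects/demo_project/Experiments/application_1631234567890_0042*/TensorBoard.*");
    FileStatus[] matches = fs.globStatus(pattern);
    if (matches != null) {
      for (FileStatus status : matches) {
        try (FSDataInputStream in = fs.open(status.getPath())) {
          String url = IOUtils.toString(in, "UTF-8"); // the file contains the TensorBoard endpoint
          System.out.println(status.getPath().getName() + " -> " + url);
        }
      }
    }
  }
}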