
Example 11 with JobException

Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.

The class SparkController, method createSparkJob:

@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
private SparkJob createSparkJob(String username, Jobs job, Users user) throws JobException, GenericException, ServiceException {
    SparkJob sparkjob = null;
    try {
        // Set the Hopsworks Consul service domain; use the service name, not the address
        String hopsworksRestEndpoint = "https://"
            + serviceDiscoveryController.constructServiceFQDNWithPort(
                ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP);
        UserGroupInformation proxyUser = ugiService.getProxyUser(username);
        try {
            sparkjob = proxyUser.doAs((PrivilegedExceptionAction<SparkJob>) () ->
                new SparkJob(job, submitter, user, settings.getHadoopSymbolicLinkDir(),
                    hdfsUsersBean.getHdfsUserName(job.getProject(), user), settings,
                    kafkaBrokers.getKafkaBrokersString(), hopsworksRestEndpoint,
                    servingConfig, serviceDiscoveryController));
        } catch (InterruptedException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    } catch (IOException ex) {
        throw new JobException(RESTCodes.JobErrorCode.PROXY_ERROR, Level.SEVERE,
            "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
    } catch (ServiceDiscoveryException ex) {
        throw new ServiceException(RESTCodes.ServiceErrorCode.SERVICE_NOT_FOUND, Level.SEVERE,
            "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
    }
    if (sparkjob == null) {
        throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ERROR, Level.WARNING,
            "Could not instantiate job with name: " + job.getName() + " and id: " + job.getId(),
            "sparkjob object was null");
    }
    return sparkjob;
}
Also used: JobException(io.hops.hopsworks.exceptions.JobException) ServiceException(io.hops.hopsworks.exceptions.ServiceException) ServiceDiscoveryException(com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException) PrivilegedExceptionAction(java.security.PrivilegedExceptionAction) IOException(java.io.IOException) GenericException(io.hops.hopsworks.exceptions.GenericException) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) TransactionAttribute(javax.ejb.TransactionAttribute)
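
For context, the heart of createSparkJob is Hadoop's proxy-user pattern: obtain a UserGroupInformation for the target user and run the privileged work inside doAs. A minimal standalone sketch of that pattern, assuming a cluster configured for impersonation (hadoop.proxyuser.* settings); the username and path below are invented:

import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.security.UserGroupInformation;

public class ProxyUserSketch {
    public static void main(String[] args) throws Exception {
        // Impersonate "alice" from the logged-in superuser, mirroring
        // ugiService.getProxyUser(username) in the example above
        UserGroupInformation proxyUser = UserGroupInformation.createProxyUser(
            "alice", UserGroupInformation.getLoginUser());
        // All filesystem calls inside doAs run as the proxied user,
        // just as the SparkJob above is constructed under doAs
        boolean exists = proxyUser.doAs((PrivilegedExceptionAction<Boolean>) () ->
            FileSystem.get(new Configuration()).exists(new Path("/user/alice")));
        System.out.println("/user/alice exists: " + exists);
    }
}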

Example 12 with JobException

Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.

The class SparkController, method startJob:

/**
 * Start the Spark job as the given user.
 * <p/>
 * @param job the job to start
 * @param args the runtime arguments passed to the application
 * @param user the user starting the job
 * @return the Execution tracking the started Spark job.
 * @throws IllegalStateException If Spark is not set up properly.
 * @throws IOException If starting the job fails.
 */
public Execution startJob(final Jobs job, String args, final Users user) throws ServiceException, GenericException, JobException, ProjectException {
    // First: some parameter checking.
    sanityCheck(job, user);
    String username = hdfsUsersBean.getHdfsUserName(job.getProject(), user);
    SparkJobConfiguration sparkConfig = (SparkJobConfiguration) job.getJobConfig();
    String appPath = sparkConfig.getAppPath();
    if (job.getJobType().equals(JobType.PYSPARK)
        && job.getProject().getPythonEnvironment() == null) {
        // Surface an error in the Hopsworks UI telling the user to enable the Anaconda environment
        throw new JobException(RESTCodes.JobErrorCode.JOB_START_FAILED, Level.SEVERE,
            "PySpark job needs to have Python Anaconda environment enabled");
    }
    SparkJob sparkjob = createSparkJob(username, job, user);
    Execution exec = sparkjob.requestExecutionId(args);
    if (job.getJobType().equals(JobType.PYSPARK) && appPath.endsWith(".ipynb")) {
        submitter.getExecutionFacade().updateState(exec, JobState.CONVERTING_NOTEBOOK);
        String pyAppPath = HopsUtils.prepJupyterNotebookConversion(exec, username, dfs);
        sparkConfig.setAppPath(pyAppPath);
        jupyterController.convertIPythonNotebook(username, appPath, job.getProject(), pyAppPath,
            jupyterController.getNotebookConversionType(appPath, user, job.getProject()));
    }
    submitter.startExecution(sparkjob, args);
    activityFacade.persistActivity(ActivityFacade.RAN_JOB + job.getName(), job.getProject(),
        user.asUser(), ActivityFlag.JOB);
    return exec;
}
Also used: JobException(io.hops.hopsworks.exceptions.JobException) Execution(io.hops.hopsworks.persistence.entity.jobs.history.Execution) SparkJobConfiguration(io.hops.hopsworks.persistence.entity.jobs.configuration.spark.SparkJobConfiguration)
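
A hypothetical caller of startJob could look like the sketch below. The bean, the argument string, and the package paths for SparkController and Jobs are assumptions (the other packages are taken from the "Also used" lists on this page); in Hopsworks the container would wire the injection points:

import javax.ejb.EJB;
import javax.ejb.Stateless;
import io.hops.hopsworks.common.jobs.spark.SparkController; // package path assumed
import io.hops.hopsworks.exceptions.GenericException;
import io.hops.hopsworks.exceptions.JobException;
import io.hops.hopsworks.exceptions.ProjectException;
import io.hops.hopsworks.exceptions.ServiceException;
import io.hops.hopsworks.persistence.entity.jobs.description.Jobs; // package path assumed
import io.hops.hopsworks.persistence.entity.jobs.history.Execution;
import io.hops.hopsworks.persistence.entity.user.Users;

@Stateless
public class SparkJobLauncher {

    @EJB
    private SparkController sparkController;

    public Execution launch(Jobs job, Users user)
        throws ServiceException, GenericException, JobException, ProjectException {
        // The second argument is the free-form string handed to the Spark application
        return sparkController.startJob(job, "--rounds 10", user);
    }
}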

Example 13 with JobException

Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.

The class AbstractExecutionController, method retryLogAggregation:

@Override
public JobLogDTO retryLogAggregation(Execution execution, JobLogDTO.LogType type) throws JobException {
    if (!execution.getState().isFinalState()) {
        throw new JobException(RESTCodes.JobErrorCode.JOB_EXECUTION_INVALID_STATE, Level.FINE, "Job still running.");
    }
    DistributedFileSystemOps dfso = null;
    DistributedFileSystemOps udfso = null;
    Users user = execution.getUser();
    String hdfsUser = hdfsUsersController.getHdfsUserName(execution.getJob().getProject(), user);
    String aggregatedLogPath = settings.getAggregatedLogPath(hdfsUser, execution.getAppId());
    if (aggregatedLogPath == null) {
        throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, Level.INFO, "Log aggregation is not enabled");
    }
    try {
        dfso = dfs.getDfsOps();
        udfso = dfs.getDfsOps(hdfsUser);
        if (!dfso.exists(aggregatedLogPath)) {
            throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, Level.WARNING,
                "Logs not available. This could be caused by the retention policy.");
        }
        String hdfsLogPath = null;
        String[] desiredLogTypes = null;
        switch(type) {
            case OUT:
                hdfsLogPath = REMOTE_PROTOCOL + execution.getStdoutPath();
                desiredLogTypes = new String[] { type.name() };
                break;
            case ERR:
                hdfsLogPath = REMOTE_PROTOCOL + execution.getStderrPath();
                desiredLogTypes = new String[] { type.name(), ".log" };
                break;
            default:
                break;
        }
        if (!Strings.isNullOrEmpty(hdfsLogPath)) {
            YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
            ApplicationId applicationId = ConverterUtils.toApplicationId(execution.getAppId());
            YarnMonitor monitor = new YarnMonitor(applicationId, yarnClientWrapper, ycs);
            try {
                YarnLogUtil.copyAggregatedYarnLogs(udfso, aggregatedLogPath, hdfsLogPath, desiredLogTypes, monitor);
            } catch (IOException | InterruptedException | YarnException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
                throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, null, ex.getMessage());
            } finally {
                monitor.close();
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    } finally {
        if (dfso != null) {
            dfso.close();
        }
        if (udfso != null) {
            dfs.closeDfsClient(udfso);
        }
    }
    return getLog(execution, type);
}
Also used: JobException(io.hops.hopsworks.exceptions.JobException) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) Users(io.hops.hopsworks.persistence.entity.user.Users) IOException(java.io.IOException) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) YarnMonitor(io.hops.hopsworks.common.jobs.yarn.YarnMonitor) YarnClientWrapper(io.hops.hopsworks.common.yarn.YarnClientWrapper) YarnException(org.apache.hadoop.yarn.exceptions.YarnException)
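
The isFinalState() guard at the top of retryLogAggregation has a stock-YARN analogue: ask the ResourceManager for the application's state before touching aggregated logs. A minimal sketch using only Hadoop APIs; the application id is made up:

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class LogReadinessSketch {
    public static void main(String[] args) throws Exception {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(new YarnConfiguration());
        yarnClient.start();
        try {
            ApplicationId appId = ApplicationId.fromString("application_1600000000000_0001");
            ApplicationReport report = yarnClient.getApplicationReport(appId);
            YarnApplicationState state = report.getYarnApplicationState();
            // Aggregated logs are only complete once the application reaches a final state
            if (state == YarnApplicationState.FINISHED
                || state == YarnApplicationState.FAILED
                || state == YarnApplicationState.KILLED) {
                System.out.println("Safe to read aggregated logs for " + appId);
            } else {
                System.out.println("Application still running, state: " + state);
            }
        } finally {
            yarnClient.stop();
        }
    }
}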

Example 14 with JobException

Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.

The class AbstractExecutionController, method stopExecution:

public Execution stopExecution(Execution execution) throws JobException {
    // While initializing, an execution might not yet have an appId in Hopsworks
    if (execution.getAppId() != null && JobState.getRunningStates().contains(execution.getState())) {
        YarnClientWrapper yarnClientWrapper = null;
        try {
            yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
            yarnClientWrapper.getYarnClient().killApplication(ApplicationId.fromString(execution.getAppId()));
            yarnExecutionFinalizer.removeAllNecessary(execution);
            return executionFacade.findById(execution.getId())
                .orElseThrow(() -> new JobException(RESTCodes.JobErrorCode.JOB_EXECUTION_NOT_FOUND,
                    FINE, "Execution: " + execution.getId()));
        } catch (IOException | YarnException ex) {
            LOGGER.log(Level.SEVERE, "Could not kill job for job:" + execution.getJob().getName() + "with appId:" + execution.getAppId(), ex);
            throw new JobException(RESTCodes.JobErrorCode.JOB_STOP_FAILED, Level.WARNING, ex.getMessage(), null, ex);
        } finally {
            ycs.closeYarnClient(yarnClientWrapper);
        }
    }
    return execution;
}
Also used: JobException(io.hops.hopsworks.exceptions.JobException) IOException(java.io.IOException) YarnClientWrapper(io.hops.hopsworks.common.yarn.YarnClientWrapper) YarnException(org.apache.hadoop.yarn.exceptions.YarnException)
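
The kill itself needs nothing beyond the stock YARN client; a minimal sketch (same client setup as the previous sketch, application id again made up):

import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

public class KillAppSketch {
    public static void main(String[] args) throws Exception {
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(new YarnConfiguration());
        yarnClient.start();
        try {
            // Asks the ResourceManager to kill the application, as stopExecution does above
            yarnClient.killApplication(ApplicationId.fromString("application_1600000000000_0001"));
        } finally {
            yarnClient.stop();
        }
    }
}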

Example 15 with JobException

Use of io.hops.hopsworks.exceptions.JobException in project hopsworks by logicalclocks.

The class AbstractExecutionController, method getTensorBoardUrls:

// ====================================================================================================================
// TensorBoard
// ====================================================================================================================
@Override
public List<YarnAppUrlsDTO> getTensorBoardUrls(Users user, String appId, Project project) throws JobException {
    List<YarnAppUrlsDTO> urls = new ArrayList<>();
    DistributedFileSystemOps udfso = null;
    try {
        String hdfsUser = hdfsUsersController.getHdfsUserName(project, user);
        udfso = dfs.getDfsOps(hdfsUser);
        FileStatus[] statuses = udfso.getFilesystem().globStatus(new org.apache.hadoop.fs.Path(
            "/Projects/" + project.getName() + "/Experiments/" + appId + "*/TensorBoard.*"));
        for (FileStatus status : statuses) {
            LOGGER.log(Level.FINE, "Reading TensorBoard for: {0}", status.getPath());
            FSDataInputStream in = null;
            try {
                in = udfso.open(new org.apache.hadoop.fs.Path(status.getPath().toString()));
                String url = IOUtils.toString(in, "UTF-8");
                int prefix = url.indexOf("http://");
                if (prefix != -1) {
                    url = url.substring("http://".length());
                }
                String name = status.getPath().getName();
                urls.add(new YarnAppUrlsDTO(name, url));
            } catch (Exception e) {
                LOGGER.log(Level.WARNING, "Problem reading file with TensorBoard address from HDFS: " + e.getMessage());
            } finally {
                org.apache.hadoop.io.IOUtils.closeStream(in);
            }
        }
    } catch (Exception e) {
        throw new JobException(RESTCodes.JobErrorCode.TENSORBOARD_ERROR, Level.SEVERE, null, e.getMessage(), e);
    } finally {
        if (udfso != null) {
            dfs.closeDfsClient(udfso);
        }
    }
    return urls;
}
Also used: YarnAppUrlsDTO(io.hops.hopsworks.common.dao.jobs.description.YarnAppUrlsDTO) FileStatus(org.apache.hadoop.fs.FileStatus) DistributedFileSystemOps(io.hops.hopsworks.common.hdfs.DistributedFileSystemOps) ArrayList(java.util.ArrayList) ProjectException(io.hops.hopsworks.exceptions.ProjectException) JobException(io.hops.hopsworks.exceptions.JobException) GenericException(io.hops.hopsworks.exceptions.GenericException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) IOException(java.io.IOException) ServiceException(io.hops.hopsworks.exceptions.ServiceException) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)
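
The TensorBoard discovery above reduces to a glob over marker files plus a small-file read. A minimal sketch with plain Hadoop and commons-io APIs; the path layout follows the example but the project name and glob values are invented:

import java.nio.charset.StandardCharsets;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class TensorBoardGlobSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // Match every TensorBoard marker file under the (hypothetical) experiments dir
        FileStatus[] statuses = fs.globStatus(
            new Path("/Projects/demo/Experiments/application_*/TensorBoard.*"));
        if (statuses != null) {
            for (FileStatus status : statuses) {
                try (FSDataInputStream in = fs.open(status.getPath())) {
                    // Each marker file holds a single endpoint, e.g. "http://host:6006"
                    String url = IOUtils.toString(in, StandardCharsets.UTF_8).trim();
                    System.out.println(status.getPath().getName() + " -> " + url);
                }
            }
        }
    }
}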

Aggregations

JobException (io.hops.hopsworks.exceptions.JobException): 23 uses
IOException (java.io.IOException): 11 uses
Path (javax.ws.rs.Path): 8 uses
Produces (javax.ws.rs.Produces): 8 uses
AllowedProjectRoles (io.hops.hopsworks.api.filter.AllowedProjectRoles): 7 uses
JWTRequired (io.hops.hopsworks.jwt.annotation.JWTRequired): 7 uses
GenericException (io.hops.hopsworks.exceptions.GenericException): 6 uses
ServiceException (io.hops.hopsworks.exceptions.ServiceException): 6 uses
Users (io.hops.hopsworks.persistence.entity.user.Users): 6 uses
DistributedFileSystemOps (io.hops.hopsworks.common.hdfs.DistributedFileSystemOps): 5 uses
ProjectException (io.hops.hopsworks.exceptions.ProjectException): 4 uses
SparkJobConfiguration (io.hops.hopsworks.persistence.entity.jobs.configuration.spark.SparkJobConfiguration): 4 uses
Execution (io.hops.hopsworks.persistence.entity.jobs.history.Execution): 4 uses
ApiOperation (io.swagger.annotations.ApiOperation): 4 uses
TransactionAttribute (javax.ejb.TransactionAttribute): 4 uses
ApiKeyRequired (io.hops.hopsworks.api.filter.apiKey.ApiKeyRequired): 3 uses
YarnAppUrlsDTO (io.hops.hopsworks.common.dao.jobs.description.YarnAppUrlsDTO): 3 uses
DatasetException (io.hops.hopsworks.exceptions.DatasetException): 3 uses
ArrayList (java.util.ArrayList): 3 uses
YarnException (org.apache.hadoop.yarn.exceptions.YarnException): 3 uses