
Example 1 with Execution

use of io.hops.hopsworks.persistence.entity.jobs.history.Execution in project hopsworks by logicalclocks.

the class YarnExecutionFinalizer method copyLogs.

@Asynchronous
public Future<Execution> copyLogs(Execution exec) {
    DistributedFileSystemOps udfso = dfs.getDfsOps(exec.getHdfsUser());
    ApplicationId applicationId = ApplicationId.fromString(exec.getAppId());
    YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
    YarnMonitor monitor = new YarnMonitor(applicationId, yarnClientWrapper, ycs);
    try {
        String stdOutPath = settings.getAggregatedLogPath(exec.getHdfsUser(), exec.getAppId());
        String[] logOutputPaths = Utils.getJobLogLocation(exec.getJob().getProject().getName(), exec.getJob().getJobType());
        String stdOutFinalDestination = logOutputPaths[0] + exec.getAppId() + File.separator + "stdout.log";
        String stdErrFinalDestination = logOutputPaths[1] + exec.getAppId() + File.separator + "stderr.log";
        try {
            String[] desiredOutLogTypes = { "out" };
            YarnLogUtil.copyAggregatedYarnLogs(udfso, stdOutPath, stdOutFinalDestination, desiredOutLogTypes, monitor);
            String[] desiredErrLogTypes = { "err", ".log" };
            YarnLogUtil.copyAggregatedYarnLogs(udfso, stdOutPath, stdErrFinalDestination, desiredErrLogTypes, monitor);
        } catch (IOException | InterruptedException | YarnException ex) {
            LOGGER.log(Level.SEVERE, "error while aggregation logs" + ex.toString());
        }
        Execution execution = updateExecutionSTDPaths(stdOutFinalDestination, stdErrFinalDestination, exec);
        finalize(exec, exec.getState());
        return new AsyncResult<>(execution);
    } finally {
        dfs.closeDfsClient(udfso);
        monitor.close();
    }
}
Also used: Execution (io.hops.hopsworks.persistence.entity.jobs.history.Execution), DistributedFileSystemOps (io.hops.hopsworks.common.hdfs.DistributedFileSystemOps), IOException (java.io.IOException), ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId), AsyncResult (javax.ejb.AsyncResult), YarnClientWrapper (io.hops.hopsworks.common.yarn.YarnClientWrapper), YarnException (org.apache.hadoop.yarn.exceptions.YarnException), Asynchronous (javax.ejb.Asynchronous)
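
Because copyLogs is annotated @Asynchronous, the EJB container returns to the caller immediately and hands back a Future that completes once the log copy finishes. A minimal sketch of a hypothetical caller follows; the injected field name, the method name, and the five-minute timeout are illustrative assumptions, not Hopsworks code:

import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

@EJB
private YarnExecutionFinalizer yarnExecutionFinalizer;

public void finalizeExecution(Execution exec) {
    // The container proxies this call onto a separate thread and returns at once.
    Future<Execution> future = yarnExecutionFinalizer.copyLogs(exec);
    try {
        // Illustrative timeout: wait up to five minutes for the copy to finish.
        Execution updated = future.get(5, TimeUnit.MINUTES);
        LOGGER.log(Level.INFO, "Logs copied for app {0}", updated.getAppId());
    } catch (InterruptedException ex) {
        Thread.currentThread().interrupt();
    } catch (ExecutionException | TimeoutException ex) {
        LOGGER.log(Level.SEVERE, "Log copy failed for app " + exec.getAppId(), ex);
    }
}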

Example 2 with Execution

use of io.hops.hopsworks.persistence.entity.jobs.history.Execution in project hopsworks by logicalclocks.

the class YarnJobsMonitor method monitor.

@Schedule(persistent = false, second = "*/5", minute = "*", hour = "*")
public synchronized void monitor(Timer timer) {
    try {
        Map<String, Execution> executions = new HashMap<>();
        List<Execution> execs = executionFacade.findNotFinished();
        if (execs != null && !execs.isEmpty()) {
            for (Execution exec : execs) {
                if (exec.getAppId() != null) {
                    executions.put(exec.getAppId(), exec);
                }
            }
            // Remove (Close) all monitors of deleted jobs
            Iterator<Map.Entry<String, YarnMonitor>> monitorsIter = monitors.entrySet().iterator();
            while (monitorsIter.hasNext()) {
                Map.Entry<String, YarnMonitor> entry = monitorsIter.next();
                // Close and drop the monitor if its application no longer has an unfinished execution
                if (!executions.containsKey(entry.getKey())) {
                    // Remove the element
                    entry.getValue().close();
                    monitorsIter.remove();
                }
            }
            maxStatusPollRetry = settings.getMaxStatusPollRetry();
            List<String> toRemove = new ArrayList<>();
            for (Map.Entry<String, Execution> entry : executions.entrySet()) {
                YarnMonitor monitor = monitors.get(entry.getKey());
                if (monitor == null) {
                    ApplicationId appId = ApplicationId.fromString(entry.getKey());
                    YarnClientWrapper newYarnclientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
                    monitor = new YarnMonitor(appId, newYarnclientWrapper, ycs);
                    monitors.put(entry.getKey(), monitor);
                }
                Execution exec = internalMonitor(executions.get(entry.getKey()), monitor);
                if (exec == null) {
                    toRemove.add(entry.getKey());
                    monitor.close();
                }
            }
            for (String appID : toRemove) {
                failures.remove(appID);
                monitors.remove(appID);
            }
            // Bookkeeping: drop from the map every execution whose log copy has completed
            copyLogsFutures.entrySet().removeIf(futureResult -> futureResult.getValue().isDone());
        }
    } catch (Exception ex) {
        LOGGER.log(Level.SEVERE, "Error while monitoring jobs", ex);
    }
}
Also used: HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), YarnException (org.apache.hadoop.yarn.exceptions.YarnException), IOException (java.io.IOException), Execution (io.hops.hopsworks.persistence.entity.jobs.history.Execution), ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId), Map (java.util.Map), YarnClientWrapper (io.hops.hopsworks.common.yarn.YarnClientWrapper), Schedule (javax.ejb.Schedule)
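
The final removeIf line is the entire bookkeeping step: it prunes every completed log-copy future so the map only tracks in-flight work. A self-contained sketch of the same idiom, with made-up map contents for illustration:

import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;

public class FuturePruningDemo {
    public static void main(String[] args) {
        Map<String, Future<String>> copyLogsFutures = new HashMap<>();
        copyLogsFutures.put("application_1_0001", CompletableFuture.completedFuture("done"));
        copyLogsFutures.put("application_1_0002", new CompletableFuture<>()); // still running

        // Same idiom as the monitor: isDone() never blocks, and it also returns
        // true for cancelled or exceptionally completed futures, so finished
        // entries are dropped regardless of how they finished.
        copyLogsFutures.entrySet().removeIf(entry -> entry.getValue().isDone());

        System.out.println(copyLogsFutures.keySet()); // prints [application_1_0002]
    }
}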

Example 3 with Execution

use of io.hops.hopsworks.persistence.entity.jobs.history.Execution in project hopsworks by logicalclocks.

the class ExecutionFacade method findByJob.

public CollectionInfo findByJob(Integer offset, Integer limit, Set<? extends AbstractFacade.FilterBy> filters, Set<? extends AbstractFacade.SortBy> sorts, Jobs job) {
    // Placeholder for an optional computed column; empty here, so the query selects only the entity
    String duration = "";
    String queryStr = buildQuery("SELECT e" + duration + " FROM Execution e ", filters, sorts, "e.job = :job ");
    String queryCountStr = buildQuery("SELECT COUNT(e.id) FROM Execution e ", filters, sorts, "e.job = :job ");
    Query query = em.createQuery(queryStr, Execution.class).setParameter("job", job);
    Query queryCount = em.createQuery(queryCountStr, Execution.class).setParameter("job", job);
    setFilter(filters, query);
    setFilter(filters, queryCount);
    setOffsetAndLim(offset, limit, query);
    return new CollectionInfo((Long) queryCount.getSingleResult(), query.getResultList());
}
Also used: Execution (io.hops.hopsworks.persistence.entity.jobs.history.Execution), TypedQuery (javax.persistence.TypedQuery), Query (javax.persistence.Query)
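
The setOffsetAndLim helper is inherited from AbstractFacade and not shown on this page. A plausible body, assuming it wraps standard JPA pagination (this is a guess at the implementation, not the actual Hopsworks source):

// Hypothetical sketch of the inherited helper using standard JPA pagination.
private void setOffsetAndLim(Integer offset, Integer limit, javax.persistence.Query query) {
    if (offset != null && offset > 0) {
        query.setFirstResult(offset);  // skip the first 'offset' rows
    }
    if (limit != null && limit > 0) {
        query.setMaxResults(limit);    // cap the number of rows returned
    }
}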

Example 4 with Execution

use of io.hops.hopsworks.persistence.entity.jobs.history.Execution in project hopsworks by logicalclocks.

the class FlinkController method startJob.

public Execution startJob(final Jobs job, final Users user) throws GenericException, JobException, ServiceException {
    // First: some parameter checking.
    if (job == null) {
        throw new NullPointerException("Cannot run a null job.");
    } else if (user == null) {
        throw new NullPointerException("Cannot run a job as a null user.");
    } else if (job.getJobType() != JobType.FLINK) {
        throw new IllegalArgumentException("Job configuration is not a Flink job configuration.");
    }
    // Set Hopsworks consul service domain, don't use the address, use the name
    String username = hdfsUsersBean.getHdfsUserName(job.getProject(), user);
    FlinkJob flinkjob = null;
    try {
        String hopsworksRestEndpoint = "https://" + serviceDiscoveryController.constructServiceFQDNWithPort(ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP);
        UserGroupInformation proxyUser = ugiService.getProxyUser(username);
        try {
            flinkjob = proxyUser.doAs((PrivilegedExceptionAction<FlinkJob>) () -> new FlinkJob(job, submitter, user, hdfsUsersBean.getHdfsUserName(job.getProject(), job.getCreator()), settings, kafkaBrokers.getKafkaBrokersString(), hopsworksRestEndpoint, servingConfig, serviceDiscoveryController));
        } catch (InterruptedException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    } catch (IOException ex) {
        throw new JobException(RESTCodes.JobErrorCode.PROXY_ERROR, Level.SEVERE, "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
    } catch (ServiceDiscoveryException ex) {
        throw new ServiceException(RESTCodes.ServiceErrorCode.SERVICE_NOT_FOUND, Level.SEVERE, "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
    }
    if (flinkjob == null) {
        throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ERROR, Level.WARNING, "Could not instantiate job with name: " + job.getName() + " and id: " + job.getId(), "flinkjob object was null");
    }
    Execution execution = flinkjob.requestExecutionId();
    submitter.startExecution(flinkjob);
    activityFacade.persistActivity(ActivityFacade.RAN_JOB, job.getProject(), user.asUser(), ActivityFlag.JOB);
    return execution;
}
Also used: PrivilegedExceptionAction (java.security.PrivilegedExceptionAction), IOException (java.io.IOException), GenericException (io.hops.hopsworks.exceptions.GenericException), JobException (io.hops.hopsworks.exceptions.JobException), Execution (io.hops.hopsworks.persistence.entity.jobs.history.Execution), ServiceException (io.hops.hopsworks.exceptions.ServiceException), ServiceDiscoveryException (com.logicalclocks.servicediscoverclient.exceptions.ServiceDiscoveryException), UserGroupInformation (org.apache.hadoop.security.UserGroupInformation)
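
The doAs call is Hadoop's standard impersonation mechanism: the lambda runs with the proxy user's credentials rather than the service account's. A standalone sketch of the pattern; Hopsworks obtains its proxy user through ugiService, while createProxyUser below is the underlying Hadoop API, and the username handling is illustrative:

import java.security.PrivilegedExceptionAction;
import org.apache.hadoop.security.UserGroupInformation;

public String runAsHdfsUser(String username) throws Exception {
    // Requires hadoop.proxyuser.<service-user>.* to be configured on the cluster.
    UserGroupInformation proxyUser = UserGroupInformation.createProxyUser(
        username, UserGroupInformation.getLoginUser());
    return proxyUser.doAs((PrivilegedExceptionAction<String>) () -> {
        // Everything in this block executes as 'username'.
        return UserGroupInformation.getCurrentUser().getUserName();
    });
}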

Example 5 with Execution

use of io.hops.hopsworks.persistence.entity.jobs.history.Execution in project hopsworks by logicalclocks.

the class AbstractExecutionController method start.

@Override
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public Execution start(Jobs job, String args, Users user) throws JobException, GenericException, ServiceException, ProjectException {
    // If the limit for the number of executions for this job has been reached, return an error
    checkExecutionLimit(job);
    // A user should not be able to start a job if the project is prepaid and it doesn't have quota.
    if (job.getProject().getPaymentType().equals(PaymentType.PREPAID)) {
        YarnProjectsQuota projectQuota = yarnProjectsQuotaFacade.findByProjectName(job.getProject().getName());
        if (projectQuota == null || projectQuota.getQuotaRemaining() <= 0) {
            throw new ProjectException(RESTCodes.ProjectErrorCode.PROJECT_QUOTA_ERROR, Level.FINE);
        }
    }
    // If the check is enabled and all nodemanagers are offline, throw a JobException
    if (settings.isCheckingForNodemanagerStatusEnabled() && job.getJobType() != JobType.PYTHON) {
        hostServicesFacade.findServices("nodemanager").stream().filter(s -> s.getStatus() == ServiceStatus.Started).findFirst().orElseThrow(() -> new JobException(RESTCodes.JobErrorCode.NODEMANAGERS_OFFLINE, Level.SEVERE));
    }
    Execution exec;
    switch(job.getJobType()) {
        case FLINK:
            // Materialize certs
            return flinkController.startJob(job, user);
        case SPARK:
            exec = sparkController.startJob(job, args, user);
            if (exec == null) {
                throw new IllegalArgumentException("Problem getting execution object for: " + job.getJobType());
            }
            SparkJobConfiguration config = (SparkJobConfiguration) job.getJobConfig();
            String path = config.getAppPath();
            String pathOfInode;
            try {
                pathOfInode = Utils.prepPath(path);
            } catch (UnsupportedEncodingException ex) {
                throw new JobException(RESTCodes.JobErrorCode.JOB_START_FAILED, Level.FINE, "Job name: " + job.getName(), ex.getMessage(), ex);
            }
            Inode inode = inodeController.getInodeAtPath(pathOfInode);
            String inodeName = inode.getInodePK().getName();
            activityFacade.persistActivity(ActivityFacade.EXECUTED_JOB + inodeName, job.getProject(), user, ActivityFlag.JOB);
            break;
        case PYSPARK:
            if (job.getProject().getPythonEnvironment() == null) {
                throw new ProjectException(RESTCodes.ProjectErrorCode.ANACONDA_NOT_ENABLED, Level.FINEST);
            }
            exec = sparkController.startJob(job, args, user);
            if (exec == null) {
                throw new IllegalArgumentException("Error while getting execution object for: " + job.getJobType());
            }
            break;
        default:
            throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ACTION, Level.FINE, "Unsupported job type: " + job.getJobType());
    }
    return exec;
}
Also used: ProjectException (io.hops.hopsworks.exceptions.ProjectException), JobException (io.hops.hopsworks.exceptions.JobException), Execution (io.hops.hopsworks.persistence.entity.jobs.history.Execution), Inode (io.hops.hopsworks.persistence.entity.hdfs.inode.Inode), SparkJobConfiguration (io.hops.hopsworks.persistence.entity.jobs.configuration.spark.SparkJobConfiguration), UnsupportedEncodingException (java.io.UnsupportedEncodingException), GenericException (io.hops.hopsworks.exceptions.GenericException), YarnProjectsQuota (io.hops.hopsworks.persistence.entity.jobs.quota.YarnProjectsQuota), TransactionAttribute (javax.ejb.TransactionAttribute)
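
A hypothetical caller for the controller above, e.g. from a REST resource layer; the injected field (the concrete bean name is assumed) and the argument string are illustrative, not taken from the Hopsworks source:

@EJB
private ExecutionController executionController; // assumed concrete bean behind AbstractExecutionController

public Execution launch(Jobs job, Users user)
    throws JobException, GenericException, ServiceException, ProjectException {
    // start() dispatches on job.getJobType(): FLINK returns directly from
    // flinkController, while SPARK and PYSPARK go through sparkController.
    return executionController.start(job, "--input /Projects/demo/data.csv", user);
}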

Aggregations

Execution (io.hops.hopsworks.persistence.entity.jobs.history.Execution): 17 usages
AllowedProjectRoles (io.hops.hopsworks.api.filter.AllowedProjectRoles): 6 usages
ApiKeyRequired (io.hops.hopsworks.api.filter.apiKey.ApiKeyRequired): 6 usages
JWTRequired (io.hops.hopsworks.jwt.annotation.JWTRequired): 6 usages
ApiOperation (io.swagger.annotations.ApiOperation): 6 usages
Path (javax.ws.rs.Path): 5 usages
Produces (javax.ws.rs.Produces): 5 usages
JobException (io.hops.hopsworks.exceptions.JobException): 4 usages
IOException (java.io.IOException): 4 usages
ResourceRequest (io.hops.hopsworks.common.api.ResourceRequest): 3 usages
YarnException (org.apache.hadoop.yarn.exceptions.YarnException): 3 usages
JobLogDTO (io.hops.hopsworks.common.jobs.JobLogDTO): 2 usages
YarnClientWrapper (io.hops.hopsworks.common.yarn.YarnClientWrapper): 2 usages
GenericException (io.hops.hopsworks.exceptions.GenericException): 2 usages
SparkJobConfiguration (io.hops.hopsworks.persistence.entity.jobs.configuration.spark.SparkJobConfiguration): 2 usages
Users (io.hops.hopsworks.persistence.entity.user.Users): 2 usages
TransactionAttribute (javax.ejb.TransactionAttribute): 2 usages
GET (javax.ws.rs.GET): 2 usages
POST (javax.ws.rs.POST): 2 usages
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 2 usages