Usage of io.hops.hopsworks.persistence.entity.jobs.history.Execution in the hopsworks project by logicalclocks.
Class YarnExecutionFinalizer, method copyLogs:
/**
 * Asynchronously copies the YARN-aggregated logs of a finished execution into the
 * job's final stdout/stderr locations in HDFS, updates the execution's log paths,
 * and finalizes the execution.
 *
 * @param exec the execution whose aggregated YARN logs should be copied; its appId
 *             must be a valid YARN ApplicationId string
 * @return a Future wrapping the execution with its stdout/stderr paths updated
 */
@Asynchronous
public Future<Execution> copyLogs(Execution exec) {
  DistributedFileSystemOps udfso = dfs.getDfsOps(exec.getHdfsUser());
  ApplicationId applicationId = ApplicationId.fromString(exec.getAppId());
  YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
  YarnMonitor monitor = new YarnMonitor(applicationId, yarnClientWrapper, ycs);
  try {
    String stdOutPath = settings.getAggregatedLogPath(exec.getHdfsUser(), exec.getAppId());
    String[] logOutputPaths = Utils.getJobLogLocation(exec.getJob().getProject().getName(), exec.getJob().getJobType());
    String stdOutFinalDestination = logOutputPaths[0] + exec.getAppId() + File.separator + "stdout.log";
    String stdErrFinalDestination = logOutputPaths[1] + exec.getAppId() + File.separator + "stderr.log";
    try {
      // "out" matches stdout log types; "err"/".log" match stderr and generic log files.
      String[] desiredOutLogTypes = { "out" };
      YarnLogUtil.copyAggregatedYarnLogs(udfso, stdOutPath, stdOutFinalDestination, desiredOutLogTypes, monitor);
      String[] desiredErrLogTypes = { "err", ".log" };
      YarnLogUtil.copyAggregatedYarnLogs(udfso, stdOutPath, stdErrFinalDestination, desiredErrLogTypes, monitor);
    } catch (IOException | YarnException ex) {
      // Pass the Throwable to the logger so the stack trace is preserved.
      LOGGER.log(Level.SEVERE, "Error while copying aggregated logs for application " + exec.getAppId(), ex);
    } catch (InterruptedException ex) {
      // Restore the interrupt status so callers/executors can observe it.
      Thread.currentThread().interrupt();
      LOGGER.log(Level.SEVERE, "Interrupted while copying aggregated logs for application " + exec.getAppId(), ex);
    }
    // Log copying is best-effort: finalize the execution even if it failed.
    Execution execution = updateExecutionSTDPaths(stdOutFinalDestination, stdErrFinalDestination, exec);
    finalize(exec, exec.getState());
    return new AsyncResult<>(execution);
  } finally {
    dfs.closeDfsClient(udfso);
    monitor.close();
  }
}
Usage of io.hops.hopsworks.persistence.entity.jobs.history.Execution in the hopsworks project by logicalclocks.
Class YarnJobsMonitor, method monitor:
/**
 * Timer-driven job monitor: every 5 seconds it reconciles the set of YARN monitors
 * with the executions that are not yet finished, polls each application's state via
 * {@code internalMonitor}, and cleans up monitors/bookkeeping for completed jobs.
 *
 * @param timer the EJB timer that triggered this run (unused except as the schedule hook)
 */
@Schedule(persistent = false, second = "*/5", minute = "*", hour = "*")
public synchronized void monitor(Timer timer) {
  try {
    Map<String, Execution> executions = new HashMap<>();
    List<Execution> execs = executionFacade.findNotFinished();
    if (execs != null && !execs.isEmpty()) {
      // Index unfinished executions by their YARN application id (skip those not yet submitted).
      for (Execution exec : execs) {
        if (exec.getAppId() != null) {
          executions.put(exec.getAppId(), exec);
        }
      }
      // Close and remove monitors whose execution is no longer unfinished (e.g. deleted jobs).
      Iterator<Map.Entry<String, YarnMonitor>> monitorsIter = monitors.entrySet().iterator();
      while (monitorsIter.hasNext()) {
        Map.Entry<String, YarnMonitor> entry = monitorsIter.next();
        if (!executions.containsKey(entry.getKey())) {
          entry.getValue().close();
          monitorsIter.remove();
        }
      }
      // Refresh the retry limit each round so settings changes take effect without restart.
      maxStatusPollRetry = settings.getMaxStatusPollRetry();
      List<String> toRemove = new ArrayList<>();
      for (Map.Entry<String, Execution> entry : executions.entrySet()) {
        YarnMonitor monitor = monitors.get(entry.getKey());
        if (monitor == null) {
          // First time we see this application: create and cache a monitor for it.
          ApplicationId appId = ApplicationId.fromString(entry.getKey());
          YarnClientWrapper newYarnclientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
          monitor = new YarnMonitor(appId, newYarnclientWrapper, ycs);
          monitors.put(entry.getKey(), monitor);
        }
        // internalMonitor returns null when the execution has reached a terminal state.
        Execution exec = internalMonitor(entry.getValue(), monitor);
        if (exec == null) {
          toRemove.add(entry.getKey());
          monitor.close();
        }
      }
      for (String appID : toRemove) {
        failures.remove(appID);
        monitors.remove(appID);
      }
      // Bookkeeping: drop futures of log-copy tasks that have completed.
      copyLogsFutures.entrySet().removeIf(futureResult -> futureResult.getValue().isDone());
    }
  } catch (Exception ex) {
    LOGGER.log(Level.SEVERE, "Error while monitoring jobs", ex);
  }
}
Usage of io.hops.hopsworks.persistence.entity.jobs.history.Execution in the hopsworks project by logicalclocks.
Class ExecutionFacade, method findByJob:
/**
 * Returns a page of executions belonging to the given job, together with the total
 * count matching the same filters.
 *
 * @param offset  zero-based index of the first result, or null for no offset
 * @param limit   maximum number of results, or null for no limit
 * @param filters optional filter criteria applied to both the data and count queries
 * @param sorts   optional sort criteria (only meaningful for the data query)
 * @param job     the job whose executions are listed
 * @return a CollectionInfo holding the total count and the requested page of executions
 */
public CollectionInfo findByJob(Integer offset, Integer limit, Set<? extends AbstractFacade.FilterBy> filters, Set<? extends AbstractFacade.SortBy> sorts, Jobs job) {
  String queryStr = buildQuery("SELECT e FROM Execution e ", filters, sorts, "e.job = :job ");
  String queryCountStr = buildQuery("SELECT COUNT(e.id) FROM Execution e ", filters, sorts, "e.job = :job ");
  Query query = em.createQuery(queryStr, Execution.class).setParameter("job", job);
  // COUNT(...) yields a Long, so the count query must not be typed as Execution.
  Query queryCount = em.createQuery(queryCountStr, Long.class).setParameter("job", job);
  setFilter(filters, query);
  setFilter(filters, queryCount);
  // Pagination applies only to the data query; the count covers the full result set.
  setOffsetAndLim(offset, limit, query);
  return new CollectionInfo((Long) queryCount.getSingleResult(), query.getResultList());
}
Usage of io.hops.hopsworks.persistence.entity.jobs.history.Execution in the hopsworks project by logicalclocks.
Class FlinkController, method startJob:
/**
 * Starts a Flink job on behalf of the given user: builds a FlinkJob under the user's
 * Hadoop UGI proxy, requests an execution id, submits it, and records the activity.
 *
 * @param job  the Flink job to start; must be non-null and of type FLINK
 * @param user the user starting the job; must be non-null
 * @return the newly created execution for the submitted job
 * @throws GenericException if the FlinkJob could not be instantiated
 * @throws JobException     if the proxy-user setup fails
 * @throws ServiceException if the Hopsworks service endpoint cannot be discovered
 */
public Execution startJob(final Jobs job, final Users user) throws GenericException, JobException, ServiceException {
  // First: some parameter checking.
  if (job == null) {
    throw new NullPointerException("Cannot run a null job.");
  } else if (user == null) {
    throw new NullPointerException("Cannot run a job as a null user.");
  } else if (job.getJobType() != JobType.FLINK) {
    throw new IllegalArgumentException("Job configuration is not a Flink job configuration.");
  }
  // Set Hopsworks consul service domain, don't use the address, use the name
  String username = hdfsUsersBean.getHdfsUserName(job.getProject(), user);
  FlinkJob flinkjob = null;
  try {
    String hopsworksRestEndpoint = "https://" + serviceDiscoveryController.constructServiceFQDNWithPort(ServiceDiscoveryController.HopsworksService.HOPSWORKS_APP);
    UserGroupInformation proxyUser = ugiService.getProxyUser(username);
    try {
      flinkjob = proxyUser.doAs((PrivilegedExceptionAction<FlinkJob>) () -> new FlinkJob(job, submitter, user, hdfsUsersBean.getHdfsUserName(job.getProject(), job.getCreator()), settings, kafkaBrokers.getKafkaBrokersString(), hopsworksRestEndpoint, servingConfig, serviceDiscoveryController));
    } catch (InterruptedException ex) {
      // Restore the interrupt status; flinkjob stays null and is reported below.
      Thread.currentThread().interrupt();
      LOGGER.log(Level.SEVERE, null, ex);
    }
  } catch (IOException ex) {
    throw new JobException(RESTCodes.JobErrorCode.PROXY_ERROR, Level.SEVERE, "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
  } catch (ServiceDiscoveryException ex) {
    throw new ServiceException(RESTCodes.ServiceErrorCode.SERVICE_NOT_FOUND, Level.SEVERE, "job: " + job.getId() + ", user:" + user.getUsername(), ex.getMessage(), ex);
  }
  if (flinkjob == null) {
    // Fixed copy-paste in the dev message: this is the Flink controller, not Spark.
    throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ERROR, Level.WARNING, "Could not instantiate job with name: " + job.getName() + " and id: " + job.getId(), "flinkjob object was null");
  }
  Execution execution = flinkjob.requestExecutionId();
  submitter.startExecution(flinkjob);
  activityFacade.persistActivity(ActivityFacade.RAN_JOB, job.getProject(), user.asUser(), ActivityFlag.JOB);
  return execution;
}
Usage of io.hops.hopsworks.persistence.entity.jobs.history.Execution in the hopsworks project by logicalclocks.
Class AbstractExecutionController, method start:
/**
 * Starts an execution of the given job for the given user, after enforcing the
 * per-job execution limit, the project's payment quota, and (optionally) that at
 * least one YARN nodemanager is up.
 *
 * @param job  the job to execute
 * @param args runtime arguments forwarded to the job (Spark/PySpark)
 * @param user the user starting the execution
 * @return the created execution
 * @throws JobException     if the job cannot be started (limits, offline nodemanagers, bad app path)
 * @throws GenericException for unsupported job types
 * @throws ServiceException propagated from the Flink controller
 * @throws ProjectException if the project lacks quota or a Python environment
 */
@Override
@TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED)
public Execution start(Jobs job, String args, Users user) throws JobException, GenericException, ServiceException, ProjectException {
  // Reject the request up front if this job already has too many executions.
  checkExecutionLimit(job);
  // Prepaid projects may only run jobs while they still have quota left.
  if (job.getProject().getPaymentType().equals(PaymentType.PREPAID)) {
    YarnProjectsQuota projectQuota = yarnProjectsQuotaFacade.findByProjectName(job.getProject().getName());
    boolean quotaExhausted = projectQuota == null || projectQuota.getQuotaRemaining() <= 0;
    if (quotaExhausted) {
      throw new ProjectException(RESTCodes.ProjectErrorCode.PROJECT_QUOTA_ERROR, Level.FINE);
    }
  }
  // When enabled, require at least one started nodemanager for YARN-backed job types.
  if (settings.isCheckingForNodemanagerStatusEnabled() && job.getJobType() != JobType.PYTHON) {
    hostServicesFacade.findServices("nodemanager").stream()
        .filter(service -> service.getStatus() == ServiceStatus.Started)
        .findFirst()
        .orElseThrow(() -> new JobException(RESTCodes.JobErrorCode.NODEMANAGERS_OFFLINE, Level.SEVERE));
  }
  Execution execution;
  switch(job.getJobType()) {
    case FLINK:
      // Materialize certs
      return flinkController.startJob(job, user);
    case SPARK:
      execution = sparkController.startJob(job, args, user);
      if (execution == null) {
        throw new IllegalArgumentException("Problem getting execution object for: " + job.getJobType());
      }
      // Record the executed application (by its inode name) in the project activity log.
      SparkJobConfiguration sparkConfig = (SparkJobConfiguration) job.getJobConfig();
      String appPath = sparkConfig.getAppPath();
      String inodePath;
      try {
        inodePath = Utils.prepPath(appPath);
      } catch (UnsupportedEncodingException ex) {
        throw new JobException(RESTCodes.JobErrorCode.JOB_START_FAILED, Level.FINE, "Job name: " + job.getName(), ex.getMessage(), ex);
      }
      Inode appInode = inodeController.getInodeAtPath(inodePath);
      String appName = appInode.getInodePK().getName();
      activityFacade.persistActivity(ActivityFacade.EXECUTED_JOB + appName, job.getProject(), user, ActivityFlag.JOB);
      break;
    case PYSPARK:
      // PySpark additionally requires the project's Python environment to exist.
      if (job.getProject().getPythonEnvironment() == null) {
        throw new ProjectException(RESTCodes.ProjectErrorCode.ANACONDA_NOT_ENABLED, Level.FINEST);
      }
      execution = sparkController.startJob(job, args, user);
      if (execution == null) {
        throw new IllegalArgumentException("Error while getting execution object for: " + job.getJobType());
      }
      break;
    default:
      throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ACTION, Level.FINE, "Unsupported job type: " + job.getJobType());
  }
  return execution;
}
Aggregations