use of io.hops.hopsworks.common.yarn.YarnClientWrapper in project hopsworks by logicalclocks.
the class ProjectController method cleanup.
public void cleanup(Project project, String sessionId, List<Future<?>> projectCreationFutures, boolean decreaseCreatedProj, Users owner) throws GenericException {
  if (project == null) {
    return;
  }
  int nbTry = 0;
  while (nbTry < 2) {
    YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
    YarnClient client = yarnClientWrapper.getYarnClient();
    try {
      // remove from project_team so that nobody can see the project anymore
      updateProjectTeamRole(project, ProjectRoleTypes.UNDER_REMOVAL);
      /*
       * Get all running YARN applications owned by any of the project members.
       * We will check later that these applications have stopped and that their log aggregation has finished.
       * It would be better to check all applications (even the ones that have finished running),
       * but the log aggregation status is not recovered when the ResourceManager restarts. As a result,
       * we can't know whether the status is "NOT_START" because we should wait for it or because the
       * ResourceManager restarted.
       */
      Collection<ProjectTeam> team = project.getProjectTeamCollection();
      Set<String> hdfsUsers = new HashSet<>();
      for (ProjectTeam pt : team) {
        String hdfsUsername = hdfsUsersController.getHdfsUserName(project, pt.getUser());
        hdfsUsers.add(hdfsUsername);
      }
      List<ApplicationReport> projectsApps = getYarnApplications(hdfsUsers, client);
      // try to close all the Jupyter jobs
      removeJupyter(project);
      removeAnacondaEnv(project);
      removeAlertConfigs(project);
      // kill jobs
      killYarnJobs(project);
      waitForJobLogs(projectsApps, client);
      List<HdfsUsers> usersToClean = getUsersToClean(project);
      List<HdfsGroups> groupsToClean = getGroupsToClean(project);
      removeProjectInt(project, usersToClean, groupsToClean, projectCreationFutures, decreaseCreatedProj, owner);
      removeCertificatesFromMaterializer(project);
      // Delete the online feature store database
      onlineFeaturestoreController.removeOnlineFeatureStore(project);
      break;
    } catch (Exception ex) {
      nbTry++;
      if (nbTry < 2) {
        try {
          Thread.sleep(nbTry * 1000);
        } catch (InterruptedException ex1) {
          LOGGER.log(Level.SEVERE, null, ex1);
        }
      } else {
        throw new GenericException(RESTCodes.GenericErrorCode.UNKNOWN_ERROR, Level.SEVERE, null, ex.getMessage(), ex);
      }
    } finally {
      ycs.closeYarnClient(yarnClientWrapper);
    }
  }
}
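Stripped of the project-specific teardown steps, cleanup() follows a plain acquire/use/close lifecycle around the YarnClientWrapper. The sketch below only illustrates that pattern under the assumption that ycs (YarnClientService) and settings are the injected beans used above; the getApplications call stands in for the project's own getYarnApplications helper, which is not shown here.

// Minimal sketch of the wrapper lifecycle (assumptions: ycs and settings are the injected beans above).
YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
try {
  YarnClient client = yarnClientWrapper.getYarnClient();
  // List running applications; cleanup() does this per project member via getYarnApplications.
  List<ApplicationReport> running = client.getApplications(EnumSet.of(YarnApplicationState.RUNNING));
  // ... project-specific teardown would go here ...
} catch (YarnException | IOException ex) {
  // cleanup() retries once and then rethrows as a GenericException
} finally {
  ycs.closeYarnClient(yarnClientWrapper);
}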
use of io.hops.hopsworks.common.yarn.YarnClientWrapper in project hopsworks by logicalclocks.
the class YarnExecutionFinalizer method copyLogs.
@Asynchronous
public Future<Execution> copyLogs(Execution exec) {
  DistributedFileSystemOps udfso = dfs.getDfsOps(exec.getHdfsUser());
  ApplicationId applicationId = ApplicationId.fromString(exec.getAppId());
  YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
  YarnMonitor monitor = new YarnMonitor(applicationId, yarnClientWrapper, ycs);
  try {
    String stdOutPath = settings.getAggregatedLogPath(exec.getHdfsUser(), exec.getAppId());
    String[] logOutputPaths = Utils.getJobLogLocation(exec.getJob().getProject().getName(), exec.getJob().getJobType());
    String stdOutFinalDestination = logOutputPaths[0] + exec.getAppId() + File.separator + "stdout.log";
    String stdErrFinalDestination = logOutputPaths[1] + exec.getAppId() + File.separator + "stderr.log";
    try {
      String[] desiredOutLogTypes = { "out" };
      YarnLogUtil.copyAggregatedYarnLogs(udfso, stdOutPath, stdOutFinalDestination, desiredOutLogTypes, monitor);
      String[] desiredErrLogTypes = { "err", ".log" };
      YarnLogUtil.copyAggregatedYarnLogs(udfso, stdOutPath, stdErrFinalDestination, desiredErrLogTypes, monitor);
    } catch (IOException | InterruptedException | YarnException ex) {
      LOGGER.log(Level.SEVERE, "Error while aggregating logs", ex);
    }
    Execution execution = updateExecutionSTDPaths(stdOutFinalDestination, stdErrFinalDestination, exec);
    finalize(exec, exec.getState());
    return new AsyncResult<>(execution);
  } finally {
    dfs.closeDfsClient(udfso);
    monitor.close();
  }
}
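Because copyLogs is annotated @Asynchronous, callers receive a Future they can poll later; this is what YarnJobsMonitor does with its copyLogsFutures map in the next snippet. A hedged caller-side sketch, where yarnExecutionFinalizer is an assumed injected bean name, not something taken from the source:

// Hypothetical caller-side sketch; yarnExecutionFinalizer is an assumed injected bean.
Future<Execution> logsCopied = yarnExecutionFinalizer.copyLogs(execution);
// ... later, e.g. from a periodic timer ...
if (logsCopied.isDone()) {
  try {
    Execution updated = logsCopied.get();
  } catch (InterruptedException | ExecutionException ex) {
    // handle or log the failed copy
  }
}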
use of io.hops.hopsworks.common.yarn.YarnClientWrapper in project hopsworks by logicalclocks.
the class YarnJobsMonitor method monitor.
@Schedule(persistent = false, second = "*/5", minute = "*", hour = "*")
public synchronized void monitor(Timer timer) {
  try {
    Map<String, Execution> executions = new HashMap<>();
    List<Execution> execs = executionFacade.findNotFinished();
    if (execs != null && !execs.isEmpty()) {
      for (Execution exec : execs) {
        if (exec.getAppId() != null) {
          executions.put(exec.getAppId(), exec);
        }
      }
      // Close and remove the monitors of executions that no longer exist
      Iterator<Map.Entry<String, YarnMonitor>> monitorsIter = monitors.entrySet().iterator();
      while (monitorsIter.hasNext()) {
        Map.Entry<String, YarnMonitor> entry = monitorsIter.next();
        if (!executions.containsKey(entry.getKey())) {
          entry.getValue().close();
          monitorsIter.remove();
        }
      }
      maxStatusPollRetry = settings.getMaxStatusPollRetry();
      List<String> toRemove = new ArrayList<>();
      for (Map.Entry<String, Execution> entry : executions.entrySet()) {
        YarnMonitor monitor = monitors.get(entry.getKey());
        if (monitor == null) {
          ApplicationId appId = ApplicationId.fromString(entry.getKey());
          YarnClientWrapper newYarnclientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
          monitor = new YarnMonitor(appId, newYarnclientWrapper, ycs);
          monitors.put(entry.getKey(), monitor);
        }
        Execution exec = internalMonitor(executions.get(entry.getKey()), monitor);
        if (exec == null) {
          toRemove.add(entry.getKey());
          monitor.close();
        }
      }
      for (String appID : toRemove) {
        failures.remove(appID);
        monitors.remove(appID);
      }
      // Bookkeeping: drop the entries of executions whose log copying has finished
      copyLogsFutures.entrySet().removeIf(futureResult -> futureResult.getValue().isDone());
    }
  } catch (Exception ex) {
    LOGGER.log(Level.SEVERE, "Error while monitoring jobs", ex);
  }
}
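The iterator-based cleanup of stale monitors can also be written with removeIf, which makes the close-then-remove intent more visible. This is only an equivalent sketch of the same housekeeping over the monitors and executions maps used above, not the project's code:

// Equivalent housekeeping sketch: close and drop monitors with no backing execution.
monitors.entrySet().removeIf(entry -> {
  if (!executions.containsKey(entry.getKey())) {
    entry.getValue().close();
    return true;
  }
  return false;
});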
use of io.hops.hopsworks.common.yarn.YarnClientWrapper in project hopsworks by logicalclocks.
the class FlinkController method getFlinkMasterAddr.
/**
* Retrieves the Flink master address from a running Flink session in YARN.
*
* @param appId flink ApplicationId in YARN
* @return String of ip:port of flink master
*/
@TransactionAttribute(TransactionAttributeType.NEVER)
public String getFlinkMasterAddr(String appId) {
  LOGGER.log(Level.INFO, "Getting Flink Master Addr for: " + appId);
  Configuration conf = settings.getConfiguration();
  org.apache.flink.configuration.Configuration flinkConf = org.apache.flink.configuration.GlobalConfiguration.loadConfiguration(settings.getFlinkConfDir());
  YarnConfiguration yarnConf = new YarnConfiguration(conf);
  YarnClientWrapper yarnClientWrapper = null;
  YarnClusterDescriptor cluster = null;
  String flinkMasterURL = null;
  try {
    yarnConf.addResource(new File(settings.getHadoopConfDir() + "/yarn-site.xml").toURI().toURL());
    yarnClientWrapper = ycs.getYarnClientSuper();
    YarnClient yarnClient = yarnClientWrapper.getYarnClient();
    cluster = new YarnClusterDescriptor(flinkConf, yarnConf, settings.getFlinkConfDir(), yarnClient, true);
    ClusterClient<ApplicationId> clusterClient = cluster.retrieve(ApplicationId.fromString(appId));
    flinkMasterURL = clusterClient.getClusterConnectionInfo().getHostname() + ":" + clusterClient.getClusterConnectionInfo().getPort();
  } catch (Exception ex) {
    LOGGER.log(Level.FINE, "Could not retrieve Flink Master URL for applicationID: " + appId, ex);
  } finally {
    if (cluster != null) {
      cluster.close();
    }
    if (yarnClientWrapper != null) {
      ycs.closeYarnClient(yarnClientWrapper);
    }
  }
  return flinkMasterURL;
}
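On the caller side, the method returns null when the Flink master cannot be resolved (the failure is only logged at FINE), so that case has to be handled explicitly. A hedged sketch, where flinkController is an assumed injected bean name:

// Hypothetical caller-side sketch; flinkController is an assumed injected bean.
String flinkMaster = flinkController.getFlinkMasterAddr(appId);
if (flinkMaster == null) {
  throw new IllegalStateException("Could not resolve the Flink master for application " + appId);
}
// flinkMaster holds the "ip:port" of the running Flink session's master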
use of io.hops.hopsworks.common.yarn.YarnClientWrapper in project hopsworks by logicalclocks.
the class AbstractExecutionController method retryLogAggregation.
@Override
public JobLogDTO retryLogAggregation(Execution execution, JobLogDTO.LogType type) throws JobException {
  if (!execution.getState().isFinalState()) {
    throw new JobException(RESTCodes.JobErrorCode.JOB_EXECUTION_INVALID_STATE, Level.FINE, "Job still running.");
  }
  DistributedFileSystemOps dfso = null;
  DistributedFileSystemOps udfso = null;
  Users user = execution.getUser();
  String hdfsUser = hdfsUsersController.getHdfsUserName(execution.getJob().getProject(), user);
  String aggregatedLogPath = settings.getAggregatedLogPath(hdfsUser, execution.getAppId());
  if (aggregatedLogPath == null) {
    throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, Level.INFO, "Log aggregation is not enabled");
  }
  try {
    dfso = dfs.getDfsOps();
    udfso = dfs.getDfsOps(hdfsUser);
    if (!dfso.exists(aggregatedLogPath)) {
      throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, Level.WARNING, "Logs not available. This could be caused by the retention policy.");
    }
    String hdfsLogPath = null;
    String[] desiredLogTypes = null;
    switch (type) {
      case OUT:
        hdfsLogPath = REMOTE_PROTOCOL + execution.getStdoutPath();
        desiredLogTypes = new String[] { type.name() };
        break;
      case ERR:
        hdfsLogPath = REMOTE_PROTOCOL + execution.getStderrPath();
        desiredLogTypes = new String[] { type.name(), ".log" };
        break;
      default:
        break;
    }
    if (!Strings.isNullOrEmpty(hdfsLogPath)) {
      YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
      ApplicationId applicationId = ConverterUtils.toApplicationId(execution.getAppId());
      YarnMonitor monitor = new YarnMonitor(applicationId, yarnClientWrapper, ycs);
      try {
        YarnLogUtil.copyAggregatedYarnLogs(udfso, aggregatedLogPath, hdfsLogPath, desiredLogTypes, monitor);
      } catch (IOException | InterruptedException | YarnException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        throw new JobException(RESTCodes.JobErrorCode.JOB_LOG, null, ex.getMessage());
      } finally {
        monitor.close();
      }
    }
  } catch (IOException ex) {
    LOGGER.log(Level.SEVERE, null, ex);
  } finally {
    if (dfso != null) {
      dfso.close();
    }
    if (udfso != null) {
      dfs.closeDfsClient(udfso);
    }
  }
  return getLog(execution, type);
}
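The copy step inside retryLogAggregation follows the same wrapper-plus-monitor lifecycle seen in copyLogs above. A minimal sketch of just that step, assuming the same injected ycs and settings collaborators, a udfso handle, and an appId plus source/destination paths supplied by the caller:

// Minimal sketch of the copy step (assumptions: ycs, settings, udfso and the paths come from the caller).
YarnClientWrapper yarnClientWrapper = ycs.getYarnClientSuper(settings.getConfiguration());
YarnMonitor monitor = new YarnMonitor(ApplicationId.fromString(appId), yarnClientWrapper, ycs);
try {
  YarnLogUtil.copyAggregatedYarnLogs(udfso, aggregatedLogPath, hdfsLogPath, new String[] { "out" }, monitor);
} catch (IOException | InterruptedException | YarnException ex) {
  // surface as a JobException, as retryLogAggregation does
} finally {
  monitor.close(); // release the monitor; copyLogs() above also uses this as its only close call
}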