use of com.vip.saturn.job.console.repository.zookeeper.CuratorRepository.CuratorFrameworkOp in project Saturn by vipshop.
the class DashboardServiceImpl method refreshStatistics2DB.
/**
 * Collects dashboard statistics for one ZooKeeper cluster and persists them.
 *
 * <p>For every registry-center configuration of {@code zkCluster} whose zk address list equals
 * the cluster address, the method reads domain, executor, job and container data from ZooKeeper,
 * accumulates it into per-domain / per-job / per-executor statistics, and finally hands all
 * aggregates to the {@code saveOrUpdate*} methods.
 *
 * <p>Failures are handled best-effort: an exception while processing one job, one server or one
 * domain is logged at info level and the remaining items are still processed.
 *
 * @param zkCluster the cluster whose registry-center configurations are analysed
 */
private void refreshStatistics2DB(ZkCluster zkCluster) {
    // key: {jobname}-{domain}
    HashMap<String, JobStatistics> jobMap = new HashMap<>();
    // key: {executorName}-{domain}
    HashMap<String, ExecutorStatistics> executorMap = new HashMap<>();
    List<JobStatistics> jobList = new ArrayList<>();
    List<ExecutorStatistics> executorList = new ArrayList<>();
    List<AbnormalJob> unnormalJobList = new ArrayList<>();
    List<AbnormalJob> unableFailoverJobList = new ArrayList<>();
    List<Timeout4AlarmJob> timeout4AlarmJobList = new ArrayList<>();
    List<DomainStatistics> domainList = new ArrayList<>();
    List<AbnormalContainer> abnormalContainerList = new ArrayList<>();
    // number of domains per version
    Map<String, Long> versionDomainNumber = new HashMap<>();
    // number of executors per version
    Map<String, Long> versionExecutorNumber = new HashMap<>();
    int exeInDocker = 0;
    int exeNotInDocker = 0;
    int totalCount = 0;
    int errorCount = 0;
    for (RegistryCenterConfiguration config : zkCluster.getRegCenterConfList()) {
        // skip configurations that do not belong to the current zk connection
        if (zkCluster.getZkAddr().equals(config.getZkAddressList())) {
            int processCountOfThisDomainAllTime = 0;
            int errorCountOfThisDomainAllTime = 0;
            int processCountOfThisDomainThisDay = 0;
            int errorCountOfThisDomainThisDay = 0;
            DomainStatistics domain = new DomainStatistics(config.getNamespace(), zkCluster.getZkAddr(), config.getNameAndNamespace());
            RegistryCenterClient registryCenterClient = registryCenterService.connect(config.getNameAndNamespace());
            try {
                if (registryCenterClient != null && registryCenterClient.isConnected()) {
                    CuratorFramework curatorClient = registryCenterClient.getCuratorClient();
                    CuratorFrameworkOp curatorFrameworkOp = curatorRepository.newCuratorFrameworkOp(curatorClient);
                    // stability statistics: sharding count of the domain
                    if (checkExists(curatorClient, ExecutorNodePath.SHARDING_COUNT_PATH)) {
                        String countStr = getData(curatorClient, ExecutorNodePath.SHARDING_COUNT_PATH);
                        domain.setShardingCount(Integer.valueOf(countStr));
                    }
                    // version of this domain
                    String version = null;
                    // executor count of this domain (set to the number of children of the executor node)
                    long executorNumber = 0L;
                    // physical machine / container statistics and version statistics
                    if (null != curatorClient.checkExists().forPath(ExecutorNodePath.getExecutorNodePath())) {
                        List<String> executors = curatorClient.getChildren().forPath(ExecutorNodePath.getExecutorNodePath());
                        if (executors != null) {
                            for (String exe : executors) {
                                // only executors with an ip node (online) are counted
                                if (null != curatorClient.checkExists().forPath(ExecutorNodePath.getExecutorIpNodePath(exe))) {
                                    // physical machine or container?
                                    String executorMapKey = exe + "-" + config.getNamespace();
                                    ExecutorStatistics executorStatistics = executorMap.get(executorMapKey);
                                    if (executorStatistics == null) {
                                        executorStatistics = new ExecutorStatistics(exe, config.getNamespace());
                                        executorStatistics.setNns(domain.getNns());
                                        executorStatistics.setIp(getData(curatorClient, ExecutorNodePath.getExecutorIpNodePath(exe)));
                                        executorMap.put(executorMapKey, executorStatistics);
                                    }
                                    // set runInDocker field
                                    if (checkExists(curatorClient, ExecutorNodePath.get$ExecutorTaskNodePath(exe))) {
                                        executorStatistics.setRunInDocker(true);
                                        exeInDocker++;
                                    } else {
                                        exeNotInDocker++;
                                    }
                                }
                                // the first version value found becomes the version of the domain
                                if (version == null) {
                                    version = getData(curatorClient, ExecutorNodePath.getExecutorVersionNodePath(exe));
                                }
                            }
                            executorNumber = executors.size();
                        }
                    }
                    // version statistics
                    if (version == null) {
                        // unknown version
                        version = "-1";
                    }
                    if (versionDomainNumber.containsKey(version)) {
                        Long domainNumber = versionDomainNumber.get(version);
                        versionDomainNumber.put(version, domainNumber + 1);
                    } else {
                        versionDomainNumber.put(version, 1L);
                    }
                    if (versionExecutorNumber.containsKey(version)) {
                        Long executorNumber0 = versionExecutorNumber.get(version);
                        versionExecutorNumber.put(version, executorNumber0 + executorNumber);
                    } else {
                        if (executorNumber != 0) {
                            versionExecutorNumber.put(version, executorNumber);
                        }
                    }
                    // iterate over all non-system jobs under $Jobs
                    List<String> jobs = jobDimensionService.getAllUnSystemJobs(curatorFrameworkOp);
                    // previously stored abnormal jobs, passed on to the abnormal-job check below
                    SaturnStatistics saturnStatistics = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.UNNORMAL_JOB, zkCluster.getZkAddr());
                    List<AbnormalJob> oldAbnormalJobs = new ArrayList<>();
                    if (saturnStatistics != null) {
                        String result = saturnStatistics.getResult();
                        if (StringUtils.isNotBlank(result)) {
                            oldAbnormalJobs = JSON.parseArray(result, AbnormalJob.class);
                        }
                    }
                    // previously stored timeout-alarm jobs, passed on to the timeout check below
                    saturnStatistics = saturnStatisticsService.findStatisticsByNameAndZkList(StatisticsTableKeyConstant.TIMEOUT_4_ALARM_JOB, zkCluster.getZkAddr());
                    List<Timeout4AlarmJob> oldTimeout4AlarmJobs = new ArrayList<>();
                    if (saturnStatistics != null) {
                        String result = saturnStatistics.getResult();
                        if (StringUtils.isNotBlank(result)) {
                            oldTimeout4AlarmJobs = JSON.parseArray(result, Timeout4AlarmJob.class);
                        }
                    }
                    for (String job : jobs) {
                        try {
                            Boolean localMode = Boolean.valueOf(getData(curatorClient, JobNodePath.getConfigNodePath(job, "localMode")));
                            String jobDomainKey = job + "-" + config.getNamespace();
                            JobStatistics jobStatistics = jobMap.get(jobDomainKey);
                            if (jobStatistics == null) {
                                jobStatistics = new JobStatistics(job, config.getNamespace(), config.getNameAndNamespace());
                                jobMap.put(jobDomainKey, jobStatistics);
                            }
                            String jobDegree = getData(curatorClient, JobNodePath.getConfigNodePath(job, "jobDegree"));
                            if (Strings.isNullOrEmpty(jobDegree)) {
                                jobDegree = "0";
                            }
                            jobStatistics.setJobDegree(Integer.parseInt(jobDegree));
                            // only non-local jobs take part in the abnormal-job check
                            if (!localMode) {
                                AbnormalJob unnormalJob = new AbnormalJob(job, config.getNamespace(), config.getNameAndNamespace(), config.getDegree());
                                checkJavaOrShellJobHasProblem(oldAbnormalJobs, curatorClient, unnormalJob, jobDegree, unnormalJobList);
                            }
                            // look for timeout-alarm jobs
                            Timeout4AlarmJob timeout4AlarmJob = new Timeout4AlarmJob(job, config.getNamespace(), config.getNameAndNamespace(), config.getDegree());
                            if (isTimeout4AlarmJob(oldTimeout4AlarmJobs, timeout4AlarmJob, curatorFrameworkOp) != null) {
                                timeout4AlarmJob.setJobDegree(jobDegree);
                                timeout4AlarmJobList.add(timeout4AlarmJob);
                            }
                            // look for jobs that cannot fail over
                            AbnormalJob unableFailoverJob = new AbnormalJob(job, config.getNamespace(), config.getNameAndNamespace(), config.getDegree());
                            if (isUnableFailoverJob(curatorClient, unableFailoverJob, curatorFrameworkOp) != null) {
                                unableFailoverJob.setJobDegree(jobDegree);
                                unableFailoverJobList.add(unableFailoverJob);
                            }
                            String processCountOfThisJobAllTimeStr = getData(curatorClient, JobNodePath.getProcessCountPath(job));
                            String errorCountOfThisJobAllTimeStr = getData(curatorClient, JobNodePath.getErrorCountPath(job));
                            int processCountOfThisJobAllTime = processCountOfThisJobAllTimeStr == null ? 0 : Integer.parseInt(processCountOfThisJobAllTimeStr);
                            // each counter is null-checked on its own string: a missing error-count node
                            // must yield 0 instead of failing the statistics of the whole job
                            int errorCountOfThisJobAllTime = errorCountOfThisJobAllTimeStr == null ? 0 : Integer.parseInt(errorCountOfThisJobAllTimeStr);
                            processCountOfThisDomainAllTime += processCountOfThisJobAllTime;
                            errorCountOfThisDomainAllTime += errorCountOfThisJobAllTime;
                            int processCountOfThisJobThisDay = 0;
                            int errorCountOfThisJobThisDay = 0;
                            // loadLevel of this job
                            int loadLevel = Integer.parseInt(getData(curatorClient, JobNodePath.getConfigNodePath(job, "loadLevel")));
                            int shardingTotalCount = Integer.parseInt(getData(curatorClient, JobNodePath.getConfigNodePath(job, "shardingTotalCount")));
                            List<String> servers = null;
                            if (null != curatorClient.checkExists().forPath(JobNodePath.getServerNodePath(job))) {
                                servers = curatorClient.getChildren().forPath(JobNodePath.getServerNodePath(job));
                                for (String server : servers) {
                                    // only servers that have a status node are considered:
                                    // 1. daily process/failure counts; 2. loadLevel of the executor
                                    if (checkExists(curatorClient, JobNodePath.getServerStatus(job, server))) {
                                        // 1. daily process/failure counts of the job on this server
                                        try {
                                            String processSuccessCountOfThisExeStr = getData(curatorClient, JobNodePath.getProcessSucessCount(job, server));
                                            String processFailureCountOfThisExeStr = getData(curatorClient, JobNodePath.getProcessFailureCount(job, server));
                                            int processSuccessCountOfThisExe = processSuccessCountOfThisExeStr == null ? 0 : Integer.parseInt(processSuccessCountOfThisExeStr);
                                            int processFailureCountOfThisExe = processFailureCountOfThisExeStr == null ? 0 : Integer.parseInt(processFailureCountOfThisExeStr);
                                            int processCountOfThisExe = processSuccessCountOfThisExe + processFailureCountOfThisExe;
                                            // daily totals of this job
                                            processCountOfThisJobThisDay += processCountOfThisExe;
                                            errorCountOfThisJobThisDay += processFailureCountOfThisExe;
                                            // daily totals over all domains of the cluster
                                            totalCount += processCountOfThisExe;
                                            errorCount += processFailureCountOfThisExe;
                                            // daily totals of this domain: add only this executor's share; adding
                                            // the job's running totals here would count earlier servers repeatedly
                                            processCountOfThisDomainThisDay += processCountOfThisExe;
                                            errorCountOfThisDomainThisDay += processFailureCountOfThisExe;
                                            // daily process/failure counts of the executor
                                            String executorMapKey = server + "-" + config.getNamespace();
                                            ExecutorStatistics executorStatistics = executorMap.get(executorMapKey);
                                            if (executorStatistics == null) {
                                                executorStatistics = new ExecutorStatistics(server, config.getNamespace());
                                                executorStatistics.setNns(domain.getNns());
                                                executorStatistics.setIp(getData(curatorClient, ExecutorNodePath.getExecutorIpNodePath(server)));
                                                executorMap.put(executorMapKey, executorStatistics);
                                            }
                                            executorStatistics.setFailureCountOfTheDay(executorStatistics.getFailureCountOfTheDay() + processFailureCountOfThisExe);
                                            executorStatistics.setProcessCountOfTheDay(executorStatistics.getProcessCountOfTheDay() + processCountOfThisExe);
                                        } catch (Exception e) {
                                            // best effort: keep going with the load-level statistics
                                            log.info(e.getMessage());
                                        }
                                        // 2. loadLevel of the executor
                                        try {
                                            // only enabled jobs contribute to the load
                                            if (Boolean.valueOf(getData(curatorClient, JobNodePath.getConfigNodePath(job, "enabled")))) {
                                                String sharding = getData(curatorClient, JobNodePath.getServerSharding(job, server));
                                                if (StringUtils.isNotEmpty(sharding)) {
                                                    // append "server:sharding; " to the job's executorsAndShards
                                                    String exesAndShards = (jobStatistics.getExecutorsAndShards() == null ? "" : jobStatistics.getExecutorsAndShards()) + server + ":" + sharding + "; ";
                                                    jobStatistics.setExecutorsAndShards(exesAndShards);
                                                    // physical machine or container?
                                                    String executorMapKey = server + "-" + config.getNamespace();
                                                    ExecutorStatistics executorStatistics = executorMap.get(executorMapKey);
                                                    if (executorStatistics == null) {
                                                        executorStatistics = new ExecutorStatistics(server, config.getNamespace());
                                                        executorStatistics.setNns(domain.getNns());
                                                        executorStatistics.setIp(getData(curatorClient, ExecutorNodePath.getExecutorIpNodePath(server)));
                                                        executorMap.put(executorMapKey, executorStatistics);
                                                        // set runInDocker field
                                                        if (checkExists(curatorClient, ExecutorNodePath.get$ExecutorTaskNodePath(server))) {
                                                            executorStatistics.setRunInDocker(true);
                                                            exeInDocker++;
                                                        } else {
                                                            exeNotInDocker++;
                                                        }
                                                    }
                                                    if (executorStatistics.getJobAndShardings() != null) {
                                                        executorStatistics.setJobAndShardings(executorStatistics.getJobAndShardings() + job + ":" + sharding + ";");
                                                    } else {
                                                        executorStatistics.setJobAndShardings(job + ":" + sharding + ";");
                                                    }
                                                    // load added by this job = loadLevel * number of comma-separated shard items
                                                    int newLoad = executorStatistics.getLoadLevel() + (loadLevel * sharding.split(",").length);
                                                    executorStatistics.setLoadLevel(newLoad);
                                                }
                                            }
                                        } catch (Exception e) {
                                            // best effort: continue with the next server
                                            log.info(e.getMessage());
                                        }
                                    }
                                }
                            }
                            // local-mode job = server count(regardless server status)
                            if (localMode) {
                                jobStatistics.setTotalLoadLevel(servers == null ? 0 : (servers.size() * loadLevel));
                            } else {
                                jobStatistics.setTotalLoadLevel(loadLevel * shardingTotalCount);
                            }
                            jobStatistics.setErrorCountOfAllTime(errorCountOfThisJobAllTime);
                            jobStatistics.setProcessCountOfAllTime(processCountOfThisJobAllTime);
                            jobStatistics.setFailureCountOfTheDay(errorCountOfThisJobThisDay);
                            jobStatistics.setProcessCountOfTheDay(processCountOfThisJobThisDay);
                            jobMap.put(jobDomainKey, jobStatistics);
                        } catch (Exception e) {
                            // best effort: a failing job must not stop the statistics of the other jobs
                            log.info("statistics namespace:{} ,jobName:{} ,exception:{}", domain.getNns(), job, e.getMessage());
                        }
                    }
                    // iterate over container resources and collect the abnormal ones
                    String dcosTasksNodePath = ContainerNodePath.getDcosTasksNodePath();
                    List<String> tasks = curatorFrameworkOp.getChildren(dcosTasksNodePath);
                    if (tasks != null && !tasks.isEmpty()) {
                        for (String taskId : tasks) {
                            AbnormalContainer abnormalContainer = new AbnormalContainer(taskId, config.getNamespace(), config.getNameAndNamespace(), config.getDegree());
                            if (isContainerInstanceMismatch(abnormalContainer, curatorFrameworkOp) != null) {
                                abnormalContainerList.add(abnormalContainer);
                            }
                        }
                    }
                }
            } catch (Exception e) {
                // best effort: a failing domain must not stop the statistics of the other domains
                log.info("refreshStatistics2DB namespace:{} ,exception:{}", domain.getNns(), e.getMessage());
            }
            domain.setErrorCountOfAllTime(errorCountOfThisDomainAllTime);
            domain.setProcessCountOfAllTime(processCountOfThisDomainAllTime);
            domain.setErrorCountOfTheDay(errorCountOfThisDomainThisDay);
            domain.setProcessCountOfTheDay(processCountOfThisDomainThisDay);
            domainList.add(domain);
        }
    }
    jobList.addAll(jobMap.values());
    executorList.addAll(executorMap.values());
    // today's total process count and failure count over all domains
    saveOrUpdateDomainProcessCount(new ZkStatistics(totalCount, errorCount), zkCluster.getZkAddr());
    // top 10 domains by failure rate
    saveOrUpdateTop10FailDomain(domainList, zkCluster.getZkAddr());
    // top 10 least stable domains
    saveOrUpdateTop10UnstableDomain(domainList, zkCluster.getZkAddr());
    // top 10 least stable executors
    saveOrUpdateTop10FailExecutor(executorList, zkCluster.getZkAddr());
    // top 10 jobs by failure rate
    saveOrUpdateTop10FailJob(jobList, zkCluster.getZkAddr());
    // top 10 most active jobs (most executions today)
    saveOrUpdateTop10ActiveJob(jobList, zkCluster.getZkAddr());
    // top 10 jobs by load
    saveOrUpdateTop10LoadJob(jobList, zkCluster.getZkAddr());
    // top 10 executors by load
    saveOrUpdateTop10LoadExecutor(executorList, zkCluster.getZkAddr());
    // abnormal jobs (e.g. the next fire time has passed but the job was not scheduled)
    saveOrUpdateAbnormalJob(unnormalJobList, zkCluster.getZkAddr());
    // jobs with a timeout alarm
    saveOrUpdateTimeout4AlarmJob(timeout4AlarmJobList, zkCluster.getZkAddr());
    // jobs that cannot fail over
    saveOrUpdateUnableFailoverJob(unableFailoverJobList, zkCluster.getZkAddr());
    // abnormal container resources, including those with an instance-count mismatch
    saveOrUpdateAbnormalContainer(abnormalContainerList, zkCluster.getZkAddr());
    // number of domains per version
    saveOrUpdateVersionDomainNumber(versionDomainNumber, zkCluster.getZkAddr());
    // number of executors per version
    saveOrUpdateVersionExecutorNumber(versionExecutorNumber, zkCluster.getZkAddr());
    // number of jobs per job degree
    saveOrUpdateJobRankDistribution(jobList, zkCluster.getZkAddr());
    // number of executors running in containers
    saveOrUpdateExecutorInDockerCount(exeInDocker, zkCluster.getZkAddr());
    // number of executors running on physical machines
    saveOrUpdateExecutorNotInDockerCount(exeNotInDocker, zkCluster.getZkAddr());
    // number of jobs
    saveOrUpdateJobCount(jobList.size(), zkCluster.getZkAddr());
}
use of com.vip.saturn.job.console.repository.zookeeper.CuratorRepository.CuratorFrameworkOp in project Saturn by vipshop.
the class ExecutorServiceImpl method shardAllAtOnce.
/**
 * Re-creates the {@code shardAllAtOnce} node below the executor sharding path: an existing
 * node is deleted recursively first, then the node is created again.
 *
 * @return a successful {@link RequestResult} with an empty message
 * @throws SaturnJobConsoleException wrapping any exception raised while doing so
 */
@Override
public RequestResult shardAllAtOnce() throws SaturnJobConsoleException {
    try {
        RequestResult result = new RequestResult();
        CuratorFrameworkOp zkOp = curatorRepository.inSessionClient();
        String triggerPath = ExecutorNodePath.getExecutorShardingNodePath("shardAllAtOnce");
        // drop a leftover node so that the create below always produces a fresh one
        boolean leftoverPresent = zkOp.checkExists(triggerPath);
        if (leftoverPresent) {
            zkOp.deleteRecursive(triggerPath);
        }
        zkOp.create(triggerPath);
        result.setSuccess(true);
        result.setMessage("");
        return result;
    } catch (Exception ex) {
        log.error(ex.getMessage(), ex);
        throw new SaturnJobConsoleException(ex);
    }
}
use of com.vip.saturn.job.console.repository.zookeeper.CuratorRepository.CuratorFrameworkOp in project Saturn by vipshop.
the class ExecutorServiceImpl method jobIncExceeds.
/**
 * Tells whether adding {@code inc} jobs to the current non-system jobs would exceed
 * {@code maxJobNum}.
 *
 * @param maxJobNum the job limit; a value of zero or less disables the check
 * @param inc the number of jobs about to be added
 * @return {@code true} if current job count plus {@code inc} is greater than {@code maxJobNum};
 *         always {@code false} when {@code maxJobNum <= 0}
 * @throws SaturnJobConsoleException declared by the signature; propagated from the calls made here
 */
@Override
public boolean jobIncExceeds(int maxJobNum, int inc) throws SaturnJobConsoleException {
    boolean limitEnabled = maxJobNum > 0;
    if (!limitEnabled) {
        return false;
    }
    // count the non-system jobs visible through the in-session client
    int existingJobs = jobDimensionService.getAllUnSystemJobs(curatorRepository.inSessionClient()).size();
    int jobsAfterIncrease = existingJobs + inc;
    return jobsAfterIncrease > maxJobNum;
}
use of com.vip.saturn.job.console.repository.zookeeper.CuratorRepository.CuratorFrameworkOp in project Saturn by vipshop.
the class JobOperationServiceImpl method updateJobCron.
/**
 * Updates the cron expression and/or the custom context of an existing job.
 *
 * <p>A blank or {@code null} {@code cron} is treated as the empty string; any other value is
 * trimmed and validated. The entries of {@code customContext} are merged into the job's stored
 * custom context. Changed values are first passed to {@code saveCronToDb} and then written to
 * the job's {@code customContext} / {@code cron} config nodes.
 *
 * @param jobName the job to update
 * @param cron the new cron expression; may be {@code null} or blank
 * @param customContext entries to merge into the stored custom context; may be {@code null} or empty
 * @throws SaturnJobConsoleException if the cron expression cannot be parsed, the merged custom
 *         context is larger than 1 MB, or the job's config node does not exist
 */
@Transactional
@Override
public void updateJobCron(String jobName, String cron, Map<String, String> customContext) throws SaturnJobConsoleException {
    String cron0 = cron;
    if (cron0 != null && !cron0.trim().isEmpty()) {
        try {
            cron0 = cron0.trim();
            CronExpression.validateExpression(cron0);
        } catch (ParseException e) {
            throw new SaturnJobConsoleException("The cron expression is invalid: " + cron);
        }
    } else {
        cron0 = "";
    }
    CuratorRepository.CuratorFrameworkOp curatorFrameworkOp = curatorRepository.inSessionClient();
    if (!curatorFrameworkOp.checkExists(JobNodePath.getConfigNodePath(jobName))) {
        throw new SaturnJobConsoleException("The job is not found: " + jobName);
    }
    String newCustomContextStr = null;
    String newCron = null;
    String oldCustomContextStr = curatorFrameworkOp.getData(JobNodePath.getConfigNodePath(jobName, "customContext"));
    Map<String, String> oldCustomContextMap = toCustomContext(oldCustomContextStr);
    if (customContext != null && !customContext.isEmpty()) {
        // new entries override stored entries with the same key
        oldCustomContextMap.putAll(customContext);
        newCustomContextStr = toCustomContext(oldCustomContextMap);
        // NOTE(review): getBytes() uses the platform charset; confirm it matches the encoding used when writing to zk
        if (newCustomContextStr.getBytes().length > 1024 * 1024) {
            throw new SaturnJobConsoleException("The all customContext is out of zk limit memory(1M)");
        }
    }
    String oldCron = curatorFrameworkOp.getData(JobNodePath.getConfigNodePath(jobName, "cron"));
    // cron0 is never null here; the cron is only replaced when a stored value exists and differs
    if (oldCron != null && !cron0.equals(oldCron.trim())) {
        newCron = cron0;
    }
    if (newCustomContextStr != null || newCron != null) {
        saveCronToDb(jobName, curatorFrameworkOp, newCustomContextStr, newCron);
    }
    if (newCustomContextStr != null) {
        curatorFrameworkOp.update(JobNodePath.getConfigNodePath(jobName, "customContext"), newCustomContextStr);
    }
    if (newCron != null) {
        curatorFrameworkOp.update(JobNodePath.getConfigNodePath(jobName, "cron"), newCron);
    }
}
use of com.vip.saturn.job.console.repository.zookeeper.CuratorRepository.CuratorFrameworkOp in project Saturn by vipshop.
the class JobServiceImpl method persistJob.
/**
 * Persists a job configuration for the given namespace.
 *
 * <p>An already existing job node is deleted recursively; afterwards the configuration is passed
 * through {@code correctConfigValueIfNeeded} and saved to the database and then to ZooKeeper.
 *
 * @param namespace the namespace the job belongs to
 * @param jobConfig the job configuration to persist
 * @param createdBy passed on to the database save
 * @throws SaturnJobConsoleException declared by the signature; propagated from the calls made here
 */
private void persistJob(String namespace, JobConfig jobConfig, String createdBy) throws SaturnJobConsoleException {
    CuratorRepository.CuratorFrameworkOp zkOp = registryCenterService.getCuratorFrameworkOp(namespace);
    String jobNodePath = JobNodePath.getJobNodePath(jobConfig.getJobName());
    // remove a stale job node before the configuration is written again
    boolean staleNodePresent = zkOp.checkExists(jobNodePath);
    if (staleNodePresent) {
        zkOp.deleteRecursive(jobNodePath);
    }
    correctConfigValueIfNeeded(jobConfig);
    saveJobConfigToDb(namespace, jobConfig, createdBy);
    saveJobConfigToZk(jobConfig, zkOp);
}
Aggregations