use of com.tencent.angel.master.worker.WorkerManager in project angel by Tencent.
the class TaskManagerTest method testTaskIteration.
/**
 * Checks how iteration counters change when {@code MasterClient.taskIteration} is called
 * for the two tasks of worker 0.
 *
 * <p>After the call for task 0 only, task 0 is expected to be at iteration 1 while task 1 and
 * the minimum iteration of the attempt, worker and worker group are still 0. After the call for
 * task 1, all of them are expected to be 1.
 *
 * @throws Exception any failure raised by the steps below; it is logged and rethrown so the
 *     test still fails
 */
@Test
public void testTaskIteration() throws Exception {
  try {
    LOG.info("===========================testTaskIteration===============================");
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertTrue(angelAppMaster != null);
    AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
    WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
    assertTrue(workerManager != null);

    AMWorkerGroup workerGroup0 = workerManager.getWorkGroup(worker0Id);
    AMWorker worker0 = workerGroup0.getWorker(worker0Id);
    WorkerAttempt worker0Attempt0 = worker0.getWorkerAttempt(worker0Attempt0Id);
    Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    MasterClient masterClient = worker.getPSAgent().getMasterClient();

    // Only task 0 advances: the minimum over both tasks must stay at 0.
    masterClient.taskIteration(task0Id.getIndex(), 1);
    AMTask task0 = taskManager.getTask(task0Id);
    AMTask task1 = taskManager.getTask(task1Id);
    // JUnit's contract is assertEquals(expected, actual); keeping that order makes the
    // failure message label the two values correctly.
    assertEquals(1, task0.getIteration());
    assertEquals(0, task1.getIteration());
    assertEquals(0, worker0Attempt0.getMinIteration());
    assertEquals(0, worker0.getMinIteration());
    assertEquals(0, workerGroup0.getMinIteration());

    // Task 1 catches up: every minimum is now expected to be 1.
    masterClient.taskIteration(task1Id.getIndex(), 1);
    assertEquals(1, task0.getIteration());
    assertEquals(1, task1.getIteration());
    assertEquals(1, worker0Attempt0.getMinIteration());
    assertEquals(1, worker0.getMinIteration());
    assertEquals(1, workerGroup0.getMinIteration());
  } catch (Exception x) {
    LOG.error("run testTaskIteration failed ", x);
    throw x;
  }
}
use of com.tencent.angel.master.worker.WorkerManager in project angel by Tencent.
the class TaskCalPerfChecker method check.
/**
 * Finds workers that own a task whose calculation rate is slow compared with the average.
 *
 * <p>For every task that reports both {@code TaskCounter.TOTAL_CALCULATE_SAMPLES} and
 * {@code TaskCounter.TOTAL_CALCULATE_TIME_MS}, the samples and time are added to the global
 * totals. Tasks with more than 5,000,000 samples additionally get a rate, expressed as
 * calculation time (ms) per 10,000 samples. A task is considered slow when
 * {@code averageRate < rate * slowestDiscount}, i.e. its rate exceeds
 * {@code averageRate / slowestDiscount}.
 *
 * @param context application master context supplying the configuration, the task manager and
 *     the worker manager
 * @return ids of the workers that own at least one slow task, without duplicates; the order
 *     follows the iteration order of a {@code HashSet} and should not be relied upon
 */
@Override
public List<Id> check(AMContext context) {
  // A rate is only computed for tasks that processed more than this many samples.
  final long minSamplesForRate = 5000000;
  // Rates are expressed as calculation time (ms) per this many samples.
  final int samplesPerRateUnit = 10000;

  double slowestDiscount = context.getConf().getDouble(AngelConf.ANGEL_AM_TASK_SLOWEST_DISCOUNT, AngelConf.DEFAULT_ANGEL_AM_TASK_SLOWEST_DISCOUNT);
  LOG.info("start to check slow workers use TaskCalPerfChecker policy, slowestDiscount = " + slowestDiscount);

  Set<Id> slowWorkers = new HashSet<>();
  AMTaskManager taskManager = context.getTaskManager();
  WorkerManager workerManager = context.getWorkerManager();
  Collection<AMTask> tasks = taskManager.getTasks();

  long totalSamples = 0;
  long totalCalTimeMs = 0;
  double averageRate = 0.0;
  Map<TaskId, Double> taskIdToRateMap = new HashMap<>(tasks.size());

  for (AMTask task : tasks) {
    if (task.getMetrics().containsKey(TaskCounter.TOTAL_CALCULATE_SAMPLES) && task.getMetrics().containsKey(TaskCounter.TOTAL_CALCULATE_TIME_MS)) {
      long sampleNum = Long.valueOf(task.getMetrics().get(TaskCounter.TOTAL_CALCULATE_SAMPLES));
      // Kept as double so the rate below uses floating-point division.
      double calTimeMs = Long.valueOf(task.getMetrics().get(TaskCounter.TOTAL_CALCULATE_TIME_MS));
      LOG.info("for task " + task.getTaskId() + ", sampleNum = " + sampleNum + ", calTimeMs = " + calTimeMs);
      totalSamples += sampleNum;
      // Compound assignment narrows the double back to long.
      totalCalTimeMs += calTimeMs;
      if (sampleNum > minSamplesForRate) {
        double rate = calTimeMs * samplesPerRateUnit / sampleNum;
        LOG.info("task " + task.getTaskId() + " calculate rate = " + rate);
        taskIdToRateMap.put(task.getTaskId(), rate);
      }
    }
  }

  // With no samples at all the average stays 0.0; no task has a rate in that case either.
  if (totalSamples != 0) {
    averageRate = (double) totalCalTimeMs * samplesPerRateUnit / totalSamples;
  }
  LOG.info("totalSamples = " + totalSamples + ", totalCalTimeMs = " + totalCalTimeMs + ", average calulate time for 10000 samples = " + averageRate + ", the maximum calulate time for 10000 sample = " + averageRate / slowestDiscount);

  for (Map.Entry<TaskId, Double> rateEntry : taskIdToRateMap.entrySet()) {
    if (averageRate < rateEntry.getValue() * slowestDiscount) {
      // The condition means the task's rate exceeds averageRate / slowestDiscount, so the
      // message reports that threshold and direction.
      LOG.info("task " + rateEntry.getKey() + " rate = " + rateEntry.getValue() + " is > " + averageRate / slowestDiscount);
      AMWorker worker = workerManager.getWorker(rateEntry.getKey());
      if (worker != null) {
        LOG.info("put worker " + worker.getId() + " to slow worker list");
        slowWorkers.add(worker.getId());
      }
    }
  }

  return new ArrayList<>(slowWorkers);
}
Aggregations