Search in sources :

Example 6 with AMWorker

use of com.tencent.angel.master.worker.worker.AMWorker in project angel by Tencent.

the class TaskManagerTest method testTaskIteration.

/**
 * Checks that iteration progress reported through {@code MasterClient.taskIteration} is
 * reflected on the master side.
 *
 * <p>After only task0 reports iteration 1, task0 is at iteration 1 while task1 and the
 * minimum iteration of the worker attempt, worker and worker group are all still 0. After
 * task1 also reports iteration 1, every one of those values is expected to be 1.
 *
 * @throws Exception any failure is logged and then rethrown so the test fails
 */
@Test
public void testTaskIteration() throws Exception {
    try {
        LOG.info("===========================testTaskIteration===============================");
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);

        // Master-side views of worker 0: its group, the worker itself and its first attempt.
        AMWorkerGroup workerGroup0 = workerManager.getWorkGroup(worker0Id);
        AMWorker worker0 = workerGroup0.getWorker(worker0Id);
        WorkerAttempt worker0Attempt0 = worker0.getWorkerAttempt(worker0Attempt0Id);

        // Worker-side client used to report task iterations to the master.
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        MasterClient masterClient = worker.getPSAgent().getMasterClient();

        // Only task0 advances: the minimum iteration must stay at 0.
        masterClient.taskIteration(task0Id.getIndex(), 1);
        AMTask task0 = taskManager.getTask(task0Id);
        AMTask task1 = taskManager.getTask(task1Id);
        // JUnit's contract is assertEquals(expected, actual); keep that order so failure
        // messages report the right side as "expected".
        assertEquals(1, task0.getIteration());
        assertEquals(0, task1.getIteration());
        assertEquals(0, worker0Attempt0.getMinIteration());
        assertEquals(0, worker0.getMinIteration());
        assertEquals(0, workerGroup0.getMinIteration());

        // task1 catches up: the minimum iteration moves to 1 at every level.
        masterClient.taskIteration(task1Id.getIndex(), 1);
        assertEquals(1, task0.getIteration());
        assertEquals(1, task1.getIteration());
        assertEquals(1, worker0Attempt0.getMinIteration());
        assertEquals(1, worker0.getMinIteration());
        assertEquals(1, workerGroup0.getMinIteration());
    } catch (Exception x) {
        LOG.error("run testTaskIteration failed ", x);
        throw x;
    }
}
Also used : WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) MasterClient(com.tencent.angel.psagent.client.MasterClient) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) Worker(com.tencent.angel.worker.Worker) AMTask(com.tencent.angel.master.task.AMTask) ServiceException(com.google.protobuf.ServiceException) AngelException(com.tencent.angel.exception.AngelException) Test(org.junit.Test)

Example 7 with AMWorker

use of com.tencent.angel.master.worker.worker.AMWorker in project angel by Tencent.

the class WorkerGroupBlock method render.

@Override
protected void render(Block html) {
    String workerGroupIdSr = $(WORKERGROUP_ID);
    if (workerGroupIdSr.isEmpty()) {
        html.p()._("Sorry, can't do anything without a WorkerGroupId.")._();
        return;
    }
    WorkerGroupId workerGroupId;
    try {
        workerGroupId = new WorkerGroupId(workerGroupIdSr);
    } catch (UnvalidIdStrException e) {
        LOG.error("unvalid id string, ", e);
        return;
    }
    AMWorkerGroup workerGroup = amContext.getWorkerManager().getWorkerGroup(workerGroupId);
    if (workerGroup == null) {
        html.p()._("Sorry, can't find group " + workerGroupId)._();
        return;
    }
    set(TITLE, join("Angel WorkerGroup ", $(WORKERGROUP_ID)));
    html.h1(workerGroupIdSr);
    TABLE<DIV<Hamlet>> table = html.div(_INFO_WRAP).table("#job");
    TR<THEAD<TABLE<DIV<Hamlet>>>> headTr = table.thead().tr();
    headTr.th(_TH, "id").th(_TH, "state").th(_TH, "node address").th(_TH, "start time").th(_TH, "end time").th(_TH, "elapsed time").th(_TH, "log").th(_TH, "threadstack").th(_TH, "workercounter");
    headTr._()._();
    TBODY<TABLE<DIV<Hamlet>>> tbody = table.tbody();
    for (AMWorker worker : workerGroup.getWorkerSet()) {
        Map<WorkerAttemptId, WorkerAttempt> workerAttempts = worker.getAttempts();
        for (WorkerAttempt workerAttempt : workerAttempts.values()) {
            TR<TBODY<TABLE<DIV<Hamlet>>>> tr = tbody.tr();
            long elaspedTs = 0;
            if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() != 0) {
                elaspedTs = workerAttempt.getFinishTime() - workerAttempt.getLaunchTime();
            } else if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() == 0) {
                elaspedTs = System.currentTimeMillis() - workerAttempt.getLaunchTime();
            }
            if (workerAttempt.getNodeHttpAddr() == null) {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td("N/A").td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td("N/A").td("N/A").td("N/A");
            } else {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr()), workerAttempt.getNodeHttpAddr())._().td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr(), "node", "containerlogs", workerAttempt.getContainerIdStr(), amContext.getUser().toString()), "log")._().td().a(url("angel/workerThreadStackPage/", workerAttempt.getId().toString()), "workerthreadstack")._().td().a(url("angel/workerCounterPage/", workerAttempt.getId().toString()), "workercounter")._();
            }
            tr._();
        }
    }
    tbody._()._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) Date(java.util.Date) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) UnvalidIdStrException(com.tencent.angel.exception.UnvalidIdStrException) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt)

Example 8 with AMWorker

use of com.tencent.angel.master.worker.worker.AMWorker in project angel by Tencent.

the class TaskCalPerfChecker method check.

/**
 * Finds slow workers by comparing each task's calculation rate with the overall average.
 *
 * <p>For every task that reports both {@code TOTAL_CALCULATE_SAMPLES} and
 * {@code TOTAL_CALCULATE_TIME_MS}, the totals are accumulated. Tasks that have processed
 * more than 5,000,000 samples additionally get a rate, expressed as calculation time in
 * milliseconds per 10,000 samples. A task is considered slow when
 * {@code averageRate < taskRate * slowestDiscount}; the worker that
 * {@code WorkerManager.getWorker} returns for such a task is added to the result.
 *
 * @param context application master context providing the configuration, task manager
 *                and worker manager
 * @return ids of the workers considered slow, without duplicates; empty if there are none
 */
@Override
public List<Id> check(AMContext context) {
    // Rates are expressed per this many samples.
    final int rateSampleUnit = 10000;
    // A task only gets a rate once it has processed more samples than this.
    final long minSamplesForRate = 5000000;

    double slowestDiscount = context.getConf().getDouble(AngelConf.ANGEL_AM_TASK_SLOWEST_DISCOUNT, AngelConf.DEFAULT_ANGEL_AM_TASK_SLOWEST_DISCOUNT);
    LOG.info("start to check slow workers use TaskCalPerfChecker policy, slowestDiscount = " + slowestDiscount);
    Set<Id> slowWorkers = new HashSet<Id>();
    AMTaskManager taskManager = context.getTaskManager();
    WorkerManager workerManager = context.getWorkerManager();
    Collection<AMTask> tasks = taskManager.getTasks();
    long totalSamples = 0;
    long totalCalTimeMs = 0;
    double averageRate = 0.0;
    Map<TaskId, Double> taskIdToRateMap = new HashMap<TaskId, Double>(tasks.size());
    for (AMTask task : tasks) {
        if (task.getMetrics().containsKey(TaskCounter.TOTAL_CALCULATE_SAMPLES) && task.getMetrics().containsKey(TaskCounter.TOTAL_CALCULATE_TIME_MS)) {
            long sampleNum = Long.valueOf(task.getMetrics().get(TaskCounter.TOTAL_CALCULATE_SAMPLES));
            // Kept as double so the rate below is computed in floating point.
            double calTimeMs = Long.valueOf(task.getMetrics().get(TaskCounter.TOTAL_CALCULATE_TIME_MS));
            LOG.info("for task " + task.getTaskId() + ", sampleNum = " + sampleNum + ", calTimeMs = " + calTimeMs);
            totalSamples += sampleNum;
            // Explicit narrowing; the compound assignment used to hide this cast.
            totalCalTimeMs += (long) calTimeMs;
            if (sampleNum > minSamplesForRate) {
                double rate = calTimeMs * rateSampleUnit / sampleNum;
                LOG.info("task " + task.getTaskId() + " calculate rate = " + rate);
                taskIdToRateMap.put(task.getTaskId(), rate);
            }
        }
    }
    if (totalSamples != 0) {
        averageRate = (double) totalCalTimeMs * rateSampleUnit / totalSamples;
    }
    LOG.info("totalSamples = " + totalSamples + ", totalCalTimeMs = " + totalCalTimeMs + ", average calulate time for 10000 samples = " + averageRate + ", the maximum calulate time for 10000 sample = " + averageRate / slowestDiscount);
    for (Map.Entry<TaskId, Double> rateEntry : taskIdToRateMap.entrySet()) {
        if (averageRate < rateEntry.getValue() * slowestDiscount) {
            // The condition means the task's rate exceeds the allowed maximum
            // (averageRate / slowestDiscount), so the message reports exactly that.
            LOG.info("task " + rateEntry.getKey() + " rate = " + rateEntry.getValue() + " is > " + averageRate / slowestDiscount);
            AMWorker worker = workerManager.getWorker(rateEntry.getKey());
            if (worker != null) {
                LOG.info("put worker " + worker.getId() + " to slow worker list");
                slowWorkers.add(worker.getId());
            }
        }
    }
    return new ArrayList<>(slowWorkers);
}
Also used : TaskId(com.tencent.angel.worker.task.TaskId) WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) Id(com.tencent.angel.common.Id) TaskId(com.tencent.angel.worker.task.TaskId) AMTask(com.tencent.angel.master.task.AMTask)

Example 9 with AMWorker

use of com.tencent.angel.master.worker.worker.AMWorker in project angel by Tencent.

the class WorkerManager method startAllWorker.

/**
 * Init and start all workers.
 *
 * <p>While holding the write lock, this calls {@code initWorkers()}, then sends a
 * {@code SCHEDULE} event to every worker of each worker group (groups are looked up by
 * index {@code 0 .. workerGroupMap.size() - 1}), and finally marks the manager as
 * initialized.
 */
public void startAllWorker() {
    LOG.info("to start all workers.....");
    // Acquire the lock before entering the try block: if lock() itself failed, the finally
    // clause must not call unlock() on a lock this thread does not hold.
    writeLock.lock();
    try {
        initWorkers();
        for (int i = 0; i < workerGroupMap.size(); i++) {
            AMWorkerGroup group = workerGroupMap.get(new WorkerGroupId(i));
            for (AMWorker worker : group.getWorkerSet()) {
                worker.handle(new AMWorkerEvent(AMWorkerEventType.SCHEDULE, worker.getId()));
            }
        }
        isInited = true;
    } finally {
        writeLock.unlock();
    }
}
Also used : AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) AMWorkerEvent(com.tencent.angel.master.worker.worker.AMWorkerEvent) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId)

Aggregations

AMWorker (com.tencent.angel.master.worker.worker.AMWorker)9 WorkerAttempt (com.tencent.angel.master.worker.attempt.WorkerAttempt)5 AMWorkerGroup (com.tencent.angel.master.worker.workergroup.AMWorkerGroup)5 AMTask (com.tencent.angel.master.task.AMTask)4 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)3 Hamlet (org.apache.hadoop.yarn.webapp.hamlet.Hamlet)3 ServiceException (com.google.protobuf.ServiceException)2 UnvalidIdStrException (com.tencent.angel.exception.UnvalidIdStrException)2 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)2 WorkerManager (com.tencent.angel.master.worker.WorkerManager)2 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)2 WorkerId (com.tencent.angel.worker.WorkerId)2 TaskId (com.tencent.angel.worker.task.TaskId)2 Id (com.tencent.angel.common.Id)1 Location (com.tencent.angel.common.location.Location)1 AngelException (com.tencent.angel.exception.AngelException)1 AMWorkerEvent (com.tencent.angel.master.worker.worker.AMWorkerEvent)1 AMWorkerGroupEvent (com.tencent.angel.master.worker.workergroup.AMWorkerGroupEvent)1 SplitInfoProto (com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.SplitInfoProto)1 WorkerGroupMetaInfoProto (com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerGroupMetaInfoProto)1