Search in sources :

Example 1 with AMWorkerGroup

use of com.tencent.angel.master.worker.workergroup.AMWorkerGroup in project angel by Tencent.

In class AngelAppBlock, method render:

@Override
protected void render(Block html) {
    set(TITLE, join("Angel Application", amContext.getApplicationId()));
    App app = amContext.getApp();
    long elaspedTs = 0;
    if (app.getLaunchTime() != 0 && app.getFinishTime() != 0) {
        elaspedTs = app.getFinishTime() - app.getLaunchTime();
    } else if (app.getLaunchTime() != 0 && app.getFinishTime() == 0) {
        elaspedTs = System.currentTimeMillis() - app.getLaunchTime();
    }
    info("Job Overview")._("Job Name:", amContext.getApplicationName())._("State:", app.getExternAppState().toString())._("Started:", new Date(app.getLaunchTime()))._("Elapsed:", StringUtils.formatTime(elaspedTs))._("Environment:", "nomeaning" == null ? "#" : "angel/EnvironmentPage", "Runtime Information And Properties")._("Task Progress:", "nomeaning" == null ? "#" : "angel/ProgressPage", "progress")._("Master Threaddump:", "nomeaning" == null ? "#" : "angel/ExecutorsPage", "threaddump");
    DIV<Hamlet> div = html._(InfoBlock.class).div(_INFO_WRAP);
    TABLE<DIV<Hamlet>> table = div.table("#job");
    table.tr().th(_TH, "module").th(_TH, "new").th(_TH, "running").th(_TH, "failed").th(_TH, "killed").th(_TH, "success")._();
    int newGroupNum = 0;
    int runningGroupNum = 0;
    int failedGroupNum = 0;
    int killedGroupNum = 0;
    int successGroupNum = 0;
    int newPSNum = 0;
    int runningPSNum = 0;
    int failedPSNum = 0;
    int killedPSNum = 0;
    int successPSNum = 0;
    LOG.info("before compute worker state items");
    if (amContext.getWorkerManager() != null) {
        for (AMWorkerGroup group : amContext.getWorkerManager().getWorkerGroupMap().values()) {
            switch(group.getState()) {
                case NEW:
                case INITED:
                    newGroupNum += 1;
                    break;
                case RUNNING:
                    runningGroupNum += 1;
                    break;
                case KILLED:
                    killedGroupNum += 1;
                    break;
                case FAILED:
                    failedGroupNum += 1;
                    break;
                case SUCCESS:
                    successGroupNum += 1;
                    break;
                default:
                    break;
            }
        }
    }
    for (AMParameterServer ps : amContext.getParameterServerManager().getParameterServerMap().values()) {
        for (PSAttempt psAttemp : ps.getPSAttempts().values()) {
            switch(psAttemp.getInternalState()) {
                case NEW:
                case SCHEDULED:
                case LAUNCHED:
                    newPSNum += 1;
                    break;
                case RUNNING:
                case COMMITTING:
                    runningPSNum += 1;
                    break;
                case KILLED:
                    killedPSNum += 1;
                    break;
                case FAILED:
                    failedPSNum += 1;
                    break;
                case SUCCESS:
                    successPSNum += 1;
                    break;
                default:
                    break;
            }
        }
    }
    table.tr().td("workergroups").td().a(url("angel/workerGroupsPage", "NEW"), String.valueOf(newGroupNum))._().td().a(url("angel/workerGroupsPage", "RUNNING"), String.valueOf(runningGroupNum))._().td().a(url("angel/workerGroupsPage", "FAILED"), String.valueOf(failedGroupNum))._().td().a(url("angel/workerGroupsPage", "KILLED"), String.valueOf(killedGroupNum))._().td().a(url("angel/workerGroupsPage", "SUCCESS"), String.valueOf(successGroupNum))._()._().tr().td("parameterservers").td().a(url("angel/parameterServersPage", "NEW"), String.valueOf(newPSNum))._().td().a(url("angel/parameterServersPage", "RUNNING"), String.valueOf(runningPSNum))._().td().a(url("angel/parameterServersPage", "FAILED"), String.valueOf(failedPSNum))._().td().a(url("angel/parameterServersPage", "KILLED"), String.valueOf(killedPSNum))._().td().a(url("angel/parameterServersPage", "SUCCESS"), String.valueOf(successPSNum))._()._();
    table._();
    div._();
}
Also used : App(com.tencent.angel.master.app.App) Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) InfoBlock(org.apache.hadoop.yarn.webapp.view.InfoBlock) DIV(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.DIV) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) Date(java.util.Date)

Example 2 with AMWorkerGroup

use of com.tencent.angel.master.worker.workergroup.AMWorkerGroup in project angel by Tencent.

In class ProgressBlock, method render:

/**
 * Renders the task progress page: one table row per task, ordered by task
 * index, showing the task id, its state, the current iteration out of the
 * total iteration number, and a link to the container log of the worker
 * attempt that holds the task.
 *
 * <p>When no worker attempt is known for a task the log cell shows "N/A";
 * when the attempt has no node HTTP address yet the attempt id is shown as
 * plain text instead of a link.
 *
 * @param html the block the page content is written to
 */
@Override
protected void render(Block html) {
    set(TITLE, join("Angel Progress"));
    Hamlet.TABLE<Hamlet.DIV<Hamlet>> table = html.div(_INFO_WRAP).table("#job");
    Hamlet.TR<Hamlet.THEAD<Hamlet.TABLE<Hamlet.DIV<Hamlet>>>> headTr = table.thead().tr();
    headTr.th(_TH, "taskid").th(_TH, "state").th(_TH, "current iteration").th(_TH, "workerlog");
    headTr._()._();
    Hamlet.TBODY<Hamlet.TABLE<Hamlet.DIV<Hamlet>>> tbody = table.tbody();

    // Map every task to the worker attempt that holds it. If a task appears in several
    // attempts, the attempt visited last wins (same as before).
    Map<AMTask, WorkerAttempt> taskToAttempt = new HashMap<>();
    // The worker manager may not exist yet; then no task has a known attempt.
    if (amContext.getWorkerManager() != null) {
        for (AMWorkerGroup amWorkerGroup : amContext.getWorkerManager().getWorkerGroupMap().values()) {
            for (AMWorker amWorker : amWorkerGroup.getWorkerSet()) {
                for (WorkerAttempt workerAttempt : amWorker.getAttempts().values()) {
                    for (AMTask amTask : workerAttempt.getTaskMap().values()) {
                        taskToAttempt.put(amTask, workerAttempt);
                    }
                }
            }
        }
    }

    List<AMTask> amTaskList = new ArrayList<>();
    for (AMTask amTask : amContext.getTaskManager().getTasks()) {
        amTaskList.add(amTask);
    }
    Collections.sort(amTaskList, new Comparator<AMTask>() {

        @Override
        public int compare(AMTask task1, AMTask task2) {
            // Integer.compare avoids the overflow that subtracting two ints can cause.
            return Integer.compare(task1.getTaskId().getIndex(), task2.getTaskId().getIndex());
        }
    });

    for (AMTask task : amTaskList) {
        WorkerAttempt workerAttempt = taskToAttempt.get(task);
        Hamlet.TR<Hamlet.TBODY<Hamlet.TABLE<Hamlet.DIV<Hamlet>>>> tr = tbody.tr();
        tr.td(task.getTaskId().toString())
            .td(task.getState().toString())
            .td(String.valueOf(task.getIteration()) + "/" + amContext.getTotalIterationNum());
        if (workerAttempt == null) {
            // The task is not attached to any worker attempt: nothing to link to.
            tr.td("N/A");
        } else if (workerAttempt.getNodeHttpAddr() == null) {
            // The attempt has no node address yet, so a log URL cannot be built.
            tr.td(workerAttempt.getId().toString());
        } else {
            tr.td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr(), "node", "containerlogs", workerAttempt.getContainerIdStr(), amContext.getUser().toString()), workerAttempt.getId().toString())._();
        }
        tr._();
    }
    tbody._()._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) AMTask(com.tencent.angel.master.task.AMTask)

Example 3 with AMWorkerGroup

use of com.tencent.angel.master.worker.workergroup.AMWorkerGroup in project angel by Tencent.

In class WorkerManager, method initWorkers:

/**
 * Creates every worker group with its workers and task ids, registers them in
 * the lookup maps of this manager and sends each new group an INIT event.
 *
 * <p>Task ids are handed out in consecutive ranges of
 * {@code taskNumberInEachWorker} per worker and never reach
 * {@code totalTaskNumber}; the first worker created in a group becomes its leader.
 */
private void initWorkers() {
    for (int groupIndex = 0; groupIndex < workergroupNumber; groupIndex++) {
        WorkerGroupId groupId = new WorkerGroupId(groupIndex);
        Map<WorkerId, AMWorker> groupWorkers = new HashMap<WorkerId, AMWorker>();
        WorkerId leader = null;

        for (int j = 0; j < workersInGroup; j++) {
            // Global worker index across all groups.
            int workerIndex = groupIndex * workersInGroup + j;

            // Assign this worker its slice of task ids, capped by the total task count.
            int nextTaskIndex = workerIndex * taskNumberInEachWorker;
            List<TaskId> taskIds = new ArrayList<TaskId>(taskNumberInEachWorker);
            int assigned = 0;
            while (assigned < taskNumberInEachWorker && nextTaskIndex < totalTaskNumber) {
                taskIds.add(new TaskId(nextTaskIndex));
                assigned++;
                nextTaskIndex++;
            }

            WorkerId workerId = new WorkerId(groupId, workerIndex);
            AMWorker worker = new AMWorker(workerId, context, taskIds);
            workersMap.put(workerId, worker);
            groupWorkers.put(workerId, worker);
            if (leader == null) {
                // Only true for the first worker of the group.
                leader = workerId;
            }
        }

        AMWorkerGroup group = new AMWorkerGroup(groupId, context, groupWorkers, leader, groupIndex);
        for (Map.Entry<WorkerId, AMWorker> entry : groupWorkers.entrySet()) {
            AMWorker worker = entry.getValue();
            findWorkerGroupMap.put(entry.getKey(), group);
            for (TaskId taskId : worker.getTaskIds()) {
                taskIdToWorkerMap.put(taskId, worker);
            }
        }
        workerGroupMap.put(groupId, group);
        group.handle(new AMWorkerGroupEvent(AMWorkerGroupEventType.INIT, groupId));
    }
    LOG.info("to init taskClockManager!");
}
Also used : AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) TaskId(com.tencent.angel.worker.task.TaskId) AMWorkerGroupEvent(com.tencent.angel.master.worker.workergroup.AMWorkerGroupEvent) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerId(com.tencent.angel.worker.WorkerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId)

Example 4 with AMWorkerGroup

use of com.tencent.angel.master.worker.workergroup.AMWorkerGroup in project angel by Tencent.

In class WorkerGroupBlock, method render:

@Override
protected void render(Block html) {
    String workerGroupIdSr = $(WORKERGROUP_ID);
    if (workerGroupIdSr.isEmpty()) {
        html.p()._("Sorry, can't do anything without a WorkerGroupId.")._();
        return;
    }
    WorkerGroupId workerGroupId;
    try {
        workerGroupId = new WorkerGroupId(workerGroupIdSr);
    } catch (UnvalidIdStrException e) {
        LOG.error("unvalid id string, ", e);
        return;
    }
    AMWorkerGroup workerGroup = amContext.getWorkerManager().getWorkerGroup(workerGroupId);
    if (workerGroup == null) {
        html.p()._("Sorry, can't find group " + workerGroupId)._();
        return;
    }
    set(TITLE, join("Angel WorkerGroup ", $(WORKERGROUP_ID)));
    html.h1(workerGroupIdSr);
    TABLE<DIV<Hamlet>> table = html.div(_INFO_WRAP).table("#job");
    TR<THEAD<TABLE<DIV<Hamlet>>>> headTr = table.thead().tr();
    headTr.th(_TH, "id").th(_TH, "state").th(_TH, "node address").th(_TH, "start time").th(_TH, "end time").th(_TH, "elapsed time").th(_TH, "log").th(_TH, "threadstack").th(_TH, "workercounter");
    headTr._()._();
    TBODY<TABLE<DIV<Hamlet>>> tbody = table.tbody();
    for (AMWorker worker : workerGroup.getWorkerSet()) {
        Map<WorkerAttemptId, WorkerAttempt> workerAttempts = worker.getAttempts();
        for (WorkerAttempt workerAttempt : workerAttempts.values()) {
            TR<TBODY<TABLE<DIV<Hamlet>>>> tr = tbody.tr();
            long elaspedTs = 0;
            if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() != 0) {
                elaspedTs = workerAttempt.getFinishTime() - workerAttempt.getLaunchTime();
            } else if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() == 0) {
                elaspedTs = System.currentTimeMillis() - workerAttempt.getLaunchTime();
            }
            if (workerAttempt.getNodeHttpAddr() == null) {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td("N/A").td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td("N/A").td("N/A").td("N/A");
            } else {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr()), workerAttempt.getNodeHttpAddr())._().td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr(), "node", "containerlogs", workerAttempt.getContainerIdStr(), amContext.getUser().toString()), "log")._().td().a(url("angel/workerThreadStackPage/", workerAttempt.getId().toString()), "workerthreadstack")._().td().a(url("angel/workerCounterPage/", workerAttempt.getId().toString()), "workercounter")._();
            }
            tr._();
        }
    }
    tbody._()._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) Date(java.util.Date) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) UnvalidIdStrException(com.tencent.angel.exception.UnvalidIdStrException) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt)

Example 5 with AMWorkerGroup

use of com.tencent.angel.master.worker.workergroup.AMWorkerGroup in project angel by Tencent.

In class WorkerGroupsBlock, method render:

/**
 * Renders the worker group list page for the state given in the
 * {@code WORKERGROUP_STATE} request parameter: one table row (id link, state,
 * leader, start/end/elapsed time) for every worker group whose state is in
 * the set returned by {@code transformToInternalState} for that parameter.
 *
 * @param html the block the page content is written to
 */
@Override
protected void render(Block html) {
    String requestedState = $(WORKERGROUP_STATE);
    set(TITLE, join("Angel WorkerGroups ", requestedState));

    TABLE<Hamlet> table = html.table("#job");
    TR<THEAD<TABLE<Hamlet>>> headRow = table.thead().tr();
    headRow.th(_TH, "id");
    headRow.th(_TH, "state");
    headRow.th(_TH, "leader");
    headRow.th(_TH, "start time");
    headRow.th(_TH, "end time");
    headRow.th(_TH, "elapsed time");
    headRow._()._();

    Set<AMWorkerGroupState> wantedStates = transformToInternalState(requestedState);
    TBODY<TABLE<Hamlet>> tbody = table.tbody();
    LOG.info("before get groups, group size is " + amContext.getWorkerManager().getWorkerGroupMap().size());
    for (AMWorkerGroupState wanted : wantedStates) {
        LOG.info("s = " + wanted);
    }

    for (AMWorkerGroup workerGroup : amContext.getWorkerManager().getWorkerGroupMap().values()) {
        LOG.info("group state is " + workerGroup.getState());
        if (!wantedStates.contains(workerGroup.getState())) {
            // Not one of the requested states: no row for this group.
            continue;
        }
        TR<TBODY<TABLE<Hamlet>>> row = tbody.tr();

        long elapsedTs = 0;
        if (workerGroup.getLaunchTime() != 0) {
            if (workerGroup.getFinishTime() != 0) {
                elapsedTs = workerGroup.getFinishTime() - workerGroup.getLaunchTime();
            } else {
                // Launched but not finished: measure against the current time.
                elapsedTs = System.currentTimeMillis() - workerGroup.getLaunchTime();
            }
        }

        row.td().a(url("angel/workerGroupPage/", workerGroup.getId().toString()), workerGroup.getId().toString())._();
        // The state cell echoes the requested state parameter, not the group's own state.
        row.td(requestedState);
        row.td(workerGroup.getLeader().toString());

        String startText;
        if (workerGroup.getLaunchTime() == 0) {
            startText = "N/A";
        } else {
            startText = new Date(workerGroup.getLaunchTime()).toString();
        }
        row.td(startText);

        String endText;
        if (workerGroup.getFinishTime() == 0) {
            endText = "N/A";
        } else {
            endText = new Date(workerGroup.getFinishTime()).toString();
        }
        row.td(endText);

        String elapsedText;
        if (elapsedTs == 0) {
            elapsedText = "N/A";
        } else {
            elapsedText = StringUtils.formatTime(elapsedTs);
        }
        row.td(elapsedText);
        row._();
    }
    tbody._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) TBODY(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.TBODY) Date(java.util.Date) TABLE(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.TABLE) THEAD(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.THEAD) AMWorkerGroupState(com.tencent.angel.master.worker.workergroup.AMWorkerGroupState)

Aggregations

AMWorkerGroup (com.tencent.angel.master.worker.workergroup.AMWorkerGroup)8 AMWorker (com.tencent.angel.master.worker.worker.AMWorker)5 Hamlet (org.apache.hadoop.yarn.webapp.hamlet.Hamlet)4 WorkerAttempt (com.tencent.angel.master.worker.attempt.WorkerAttempt)3 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)3 Date (java.util.Date)3 ServiceException (com.google.protobuf.ServiceException)2 AMTask (com.tencent.angel.master.task.AMTask)2 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)2 AngelException (com.tencent.angel.exception.AngelException)1 UnvalidIdStrException (com.tencent.angel.exception.UnvalidIdStrException)1 App (com.tencent.angel.master.app.App)1 PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt)1 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)1 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)1 WorkerManager (com.tencent.angel.master.worker.WorkerManager)1 AMWorkerEvent (com.tencent.angel.master.worker.worker.AMWorkerEvent)1 AMWorkerGroupEvent (com.tencent.angel.master.worker.workergroup.AMWorkerGroupEvent)1 AMWorkerGroupState (com.tencent.angel.master.worker.workergroup.AMWorkerGroupState)1 MasterClient (com.tencent.angel.psagent.client.MasterClient)1