Search in sources :

Example 1 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class ProgressBlock method render.

/**
 * Renders the "Angel Progress" page: a table with one row per task known to the task manager,
 * ordered by task index, showing the task id, its state, its iteration counter and a link to
 * the container logs of a worker attempt that hosts the task.
 *
 * @param html the block the table is written to
 */
@Override
protected void render(Block html) {
    set(TITLE, join("Angel Progress"));
    Hamlet.TABLE<Hamlet.DIV<Hamlet>> table = html.div(_INFO_WRAP).table("#job");
    Hamlet.TR<Hamlet.THEAD<Hamlet.TABLE<Hamlet.DIV<Hamlet>>>> headTr = table.thead().tr();
    headTr.th(_TH, "taskid").th(_TH, "state").th(_TH, "current iteration").th(_TH, "workerlog");
    headTr._()._();
    Hamlet.TBODY<Hamlet.TABLE<Hamlet.DIV<Hamlet>>> tbody = table.tbody();

    // Index every task by a worker attempt that lists it in its task map. When several attempts
    // list the same task, the one visited last wins.
    Map<AMTask, WorkerAttempt> attemptByTask = new HashMap<>();
    Collection<AMWorkerGroup> amWorkerGroupSet = amContext.getWorkerManager().getWorkerGroupMap().values();
    for (AMWorkerGroup amWorkerGroup : amWorkerGroupSet) {
        Collection<AMWorker> amWorkerSet = amWorkerGroup.getWorkerSet();
        for (AMWorker amWorker : amWorkerSet) {
            Collection<WorkerAttempt> workerAttempts = amWorker.getAttempts().values();
            for (WorkerAttempt workerAttempt : workerAttempts) {
                Collection<AMTask> amTasks = workerAttempt.getTaskMap().values();
                for (AMTask amTask : amTasks) {
                    attemptByTask.put(amTask, workerAttempt);
                }
            }
        }
    }

    List<AMTask> amTaskList = new ArrayList<>();
    for (AMTask amTask : amContext.getTaskManager().getTasks()) {
        amTaskList.add(amTask);
    }
    Collections.sort(amTaskList, new Comparator<AMTask>() {

        @Override
        public int compare(AMTask task1, AMTask task2) {
            // Integer.compare avoids the overflow a plain subtraction can produce.
            return Integer.compare(task1.getTaskId().getIndex(), task2.getTaskId().getIndex());
        }
    });

    for (AMTask task : amTaskList) {
        WorkerAttempt workerAttempt = attemptByTask.get(task);
        Hamlet.TR<Hamlet.TBODY<Hamlet.TABLE<Hamlet.DIV<Hamlet>>>> tr = tbody.tr();
        tr.td(task.getTaskId().toString()).td(task.getState().toString()).td(String.valueOf(task.getIteration()) + "/" + amContext.getTotalIterationNum());
        if (workerAttempt == null) {
            // The task is not listed by any worker attempt, so there is no log to link to.
            tr.td("N/A");
        } else {
            tr.td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr(), "node", "containerlogs", workerAttempt.getContainerIdStr(), amContext.getUser().toString()), workerAttempt.getId().toString())._();
        }
        tr._();
    }
    tbody._()._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) AMTask(com.tencent.angel.master.task.AMTask)

Example 2 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class WorkerBlock method render.

/**
 * Renders the "Angel Worker Attempt" page: a table with one row per task of the worker attempt
 * named by the {@code WORKER_ATTEMPT_ID} request parameter, showing the task's state, its
 * iteration counter and progress (both also as progress bars) and a link to its counters page.
 * If the parameter is missing or invalid, or the worker or the attempt cannot be found, a short
 * message is rendered instead of the table.
 *
 * @param html the block the page is written to
 */
@Override
protected void render(Block html) {
    set(TITLE, join("Angel Worker Attempt ", $(WORKER_ATTEMPT_ID)));
    String workerAttemptIdStr = $(WORKER_ATTEMPT_ID);
    if (workerAttemptIdStr == null || workerAttemptIdStr.isEmpty()) {
        html.p()._("Sorry, can't do anything without a WorkerId.")._();
        return;
    }
    WorkerAttemptId workerAttemptId;
    try {
        workerAttemptId = new WorkerAttemptId(workerAttemptIdStr);
    } catch (UnvalidIdStrException e) {
        LOG.error("unvalid id string, ", e);
        // Tell the user as well, instead of returning an empty page.
        html.p()._("Sorry, the WorkerAttemptId is not valid.")._();
        return;
    }
    AMWorker worker = amContext.getWorkerManager().getWorker(workerAttemptId.getWorkerId());
    if (worker == null) {
        html.p()._("Sorry, can't find worker " + workerAttemptId.getWorkerId())._();
        return;
    }
    WorkerAttempt workerAttempt = worker.getWorkerAttempt(workerAttemptId);
    if (workerAttempt == null) {
        // The worker exists but has no attempt with this id.
        html.p()._("Sorry, can't find worker attempt " + workerAttemptId)._();
        return;
    }
    TABLE<DIV<Hamlet>> table = html.div(_INFO_WRAP).table("#job");
    TR<THEAD<TABLE<DIV<Hamlet>>>> headTr = table.thead().tr();
    headTr.th(_TH, "taskid").th(_TH, "state").th(_TH, "current iteration").th(_TH, "current iteration bar").th(_TH, "current progress").th(_TH, "current progress bar").th(_TH, "taskcounters");
    headTr._()._();
    TBODY<TABLE<DIV<Hamlet>>> tbody = table.tbody();
    float totalIterations = (float) amContext.getTotalIterationNum();
    for (AMTask task : workerAttempt.getTaskMap().values()) {
        // Progress is computed per task; a value outside [0, 1] is shown as 0 rather than
        // silently reusing the progress of the previously rendered task.
        float reportedProgress = task.getProgress();
        float iterationProgress = (reportedProgress >= 0 && reportedProgress <= 1) ? reportedProgress : 0.0f;
        // Guard the division so an unset total does not yield a NaN/Infinity bar width.
        float clockProgress = totalIterations > 0 ? ((float) task.getIteration()) / totalIterations : 0.0f;
        TR<TBODY<TABLE<DIV<Hamlet>>>> tr = tbody.tr();
        tr.td(task.getTaskId().toString()).td(task.getState().toString()).td(String.valueOf(task.getIteration()) + "/" + amContext.getTotalIterationNum()).td().div(_PROGRESSBAR).$title(// tooltip
        join(String.valueOf(clockProgress * 100), '%')).div(_PROGRESSBAR_VALUE).$style(join("width:", String.valueOf(clockProgress * 100), '%'))._()._()._().td(String.valueOf(iterationProgress)).td().div(_PROGRESSBAR).$title(join(String.valueOf(iterationProgress * 100), '%')).div(_PROGRESSBAR_VALUE).$style(join("width:", String.valueOf(iterationProgress * 100), '%'))._()._()._().td().a(url("angel/taskCountersPage/", task.getTaskId().toString()), "taskcounters")._();
        tr._();
    }
    tbody._()._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) UnvalidIdStrException(com.tencent.angel.exception.UnvalidIdStrException) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMTask(com.tencent.angel.master.task.AMTask)

Example 3 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class AMWorker method createWorkerAttempt.

/**
 * Builds the next attempt for this worker. The attempt registered under {@code lastAttemptId}
 * is handed to the new attempt when such an id exists; otherwise {@code null} is passed.
 * The attempt counter is advanced after the attempt has been constructed.
 *
 * @return the newly constructed attempt
 */
private WorkerAttempt createWorkerAttempt() {
    final WorkerAttempt created = new WorkerAttempt(
        id,
        nextAttemptNumber,
        context,
        taskIds,
        (lastAttemptId == null) ? null : attempts.get(lastAttemptId));
    nextAttemptNumber++;
    return created;
}
Also used : WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt)

Example 4 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class AMWorker method addAndScheduleAttempt.

/**
 * Creates a new attempt for this worker, resets the counters of the worker's tasks, records the
 * attempt as both the running and the last attempt, and then emits a SCHEDULE event for it.
 * The bookkeeping runs under the write lock; the event is dispatched after the lock is released.
 */
@SuppressWarnings("unchecked")
private void addAndScheduleAttempt() {
    final WorkerAttempt newAttempt;
    writeLock.lock();
    try {
        // create the attempt first, then reset the counters of every task that still exists
        newAttempt = createWorkerAttempt();
        for (TaskId taskId : taskIds) {
            AMTask amTask = context.getTaskManager().getTask(taskId);
            if (amTask == null) {
                continue;
            }
            amTask.resetCounters();
        }

        attempts.put(newAttempt.getId(), newAttempt);
        LOG.info("scheduling " + newAttempt.getId());
        lastAttemptId = newAttempt.getId();
        runningAttemptId = newAttempt.getId();
    } finally {
        writeLock.unlock();
    }

    // hand the attempt over to the event handler for scheduling
    WorkerAttemptEvent scheduleEvent =
        new WorkerAttemptEvent(WorkerAttemptEventType.SCHEDULE, newAttempt.getId());
    context.getEventHandler().handle(scheduleEvent);
}
Also used : TaskId(com.tencent.angel.worker.task.TaskId) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) WorkerAttemptEvent(com.tencent.angel.master.worker.attempt.WorkerAttemptEvent) AMTask(com.tencent.angel.master.task.AMTask)

Example 5 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class ProtobufUtil method buildWorkerMetaProto.

/**
 * Builds the protobuf description of a worker from its running attempt: the attempt id, the
 * attempt's location when one is set, and one task entry (task id, iteration, matrix clocks)
 * for every task in the attempt's task map.
 *
 * @param worker the worker to describe
 * @return the worker meta message
 */
private static WorkerMetaInfoProto buildWorkerMetaProto(AMWorker worker) {
    WorkerMetaInfoProto.Builder builder = WorkerMetaInfoProto.newBuilder();
    // NOTE(review): getRunningAttempt() is dereferenced without a null check - confirm that
    // callers only pass workers that currently have a running attempt.
    WorkerAttempt attempt = worker.getRunningAttempt();
    WorkerAttemptIdProto workerAttemptIdProto = convertToIdProto(attempt.getId());
    Location location = attempt.getLocation();
    WorkerLocationProto.Builder locBuilder = WorkerLocationProto.newBuilder();
    locBuilder.setWorkerAttemptId(workerAttemptIdProto);
    if (location != null) {
        locBuilder.setLocation(buildLocation(location));
    }
    builder.setWorkerLocation(locBuilder.build());
    MatrixClock.Builder clockBuilder = MatrixClock.newBuilder();
    for (Entry<TaskId, AMTask> taskEntry : attempt.getTaskMap().entrySet()) {
        AMTask task = taskEntry.getValue();
        // Use a fresh builder per task: addMatrixClock appends to a repeated field, so a
        // shared builder would leak the matrix clocks of earlier tasks into later ones.
        TaskMetaInfoProto.Builder taskMetaBuilder = TaskMetaInfoProto.newBuilder();
        taskMetaBuilder.setTaskId(convertToIdProto(taskEntry.getKey()));
        taskMetaBuilder.setIteration(task.getIteration());
        Int2IntOpenHashMap matrixClocks = task.getMatrixClocks();
        for (it.unimi.dsi.fastutil.ints.Int2IntMap.Entry clockEntry : matrixClocks.int2IntEntrySet()) {
            // Both clock fields are overwritten on every pass, so this builder can be shared.
            taskMetaBuilder.addMatrixClock(clockBuilder.setMatrixId(clockEntry.getIntKey()).setClock(clockEntry.getIntValue()).build());
        }
        // Build once and reuse the message for both the result and the log line.
        TaskMetaInfoProto taskMeta = taskMetaBuilder.build();
        builder.addTasks(taskMeta);
        if (LOG.isDebugEnabled()) {
            LOG.debug("task meta=" + taskMeta);
        }
    }
    return builder.build();
}
Also used : TaskId(com.tencent.angel.worker.task.TaskId) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMTask(com.tencent.angel.master.task.AMTask) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) Location(com.tencent.angel.common.location.Location)

Aggregations

WorkerAttempt (com.tencent.angel.master.worker.attempt.WorkerAttempt): 8, AMTask (com.tencent.angel.master.task.AMTask): 6, AMWorker (com.tencent.angel.master.worker.worker.AMWorker): 4, AMWorkerGroup (com.tencent.angel.master.worker.workergroup.AMWorkerGroup): 3, WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId): 3, Hamlet (org.apache.hadoop.yarn.webapp.hamlet.Hamlet): 3, Location (com.tencent.angel.common.location.Location): 2, UnvalidIdStrException (com.tencent.angel.exception.UnvalidIdStrException): 2, AMTaskManager (com.tencent.angel.master.task.AMTaskManager): 2, Worker (com.tencent.angel.worker.Worker): 2, WorkerGroupId (com.tencent.angel.worker.WorkerGroupId): 2, TaskId (com.tencent.angel.worker.task.TaskId): 2, Test (org.junit.Test): 2, ServiceException (com.google.protobuf.ServiceException): 1, AngelException (com.tencent.angel.exception.AngelException): 1, TConnection (com.tencent.angel.ipc.TConnection): 1, WorkerManager (com.tencent.angel.master.worker.WorkerManager): 1, WorkerAttemptEvent (com.tencent.angel.master.worker.attempt.WorkerAttemptEvent): 1, PSLocation (com.tencent.angel.ml.matrix.transport.PSLocation): 1, Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair): 1