Search in sources :

Example 1 with PSAttempt

use of com.tencent.angel.master.ps.attempt.PSAttempt in project angel by Tencent.

the class AngelAppBlock method render.

@Override
protected void render(Block html) {
    set(TITLE, join("Angel Application", amContext.getApplicationId()));
    App app = amContext.getApp();
    long elaspedTs = 0;
    if (app.getLaunchTime() != 0 && app.getFinishTime() != 0) {
        elaspedTs = app.getFinishTime() - app.getLaunchTime();
    } else if (app.getLaunchTime() != 0 && app.getFinishTime() == 0) {
        elaspedTs = System.currentTimeMillis() - app.getLaunchTime();
    }
    info("Job Overview")._("Job Name:", amContext.getApplicationName())._("State:", app.getExternAppState().toString())._("Started:", new Date(app.getLaunchTime()))._("Elapsed:", StringUtils.formatTime(elaspedTs))._("Environment:", "nomeaning" == null ? "#" : "angel/EnvironmentPage", "Runtime Information And Properties")._("Task Progress:", "nomeaning" == null ? "#" : "angel/ProgressPage", "progress")._("Master Threaddump:", "nomeaning" == null ? "#" : "angel/ExecutorsPage", "threaddump");
    DIV<Hamlet> div = html._(InfoBlock.class).div(_INFO_WRAP);
    TABLE<DIV<Hamlet>> table = div.table("#job");
    table.tr().th(_TH, "module").th(_TH, "new").th(_TH, "running").th(_TH, "failed").th(_TH, "killed").th(_TH, "success")._();
    int newGroupNum = 0;
    int runningGroupNum = 0;
    int failedGroupNum = 0;
    int killedGroupNum = 0;
    int successGroupNum = 0;
    int newPSNum = 0;
    int runningPSNum = 0;
    int failedPSNum = 0;
    int killedPSNum = 0;
    int successPSNum = 0;
    LOG.info("before compute worker state items");
    if (amContext.getWorkerManager() != null) {
        for (AMWorkerGroup group : amContext.getWorkerManager().getWorkerGroupMap().values()) {
            switch(group.getState()) {
                case NEW:
                case INITED:
                    newGroupNum += 1;
                    break;
                case RUNNING:
                    runningGroupNum += 1;
                    break;
                case KILLED:
                    killedGroupNum += 1;
                    break;
                case FAILED:
                    failedGroupNum += 1;
                    break;
                case SUCCESS:
                    successGroupNum += 1;
                    break;
                default:
                    break;
            }
        }
    }
    for (AMParameterServer ps : amContext.getParameterServerManager().getParameterServerMap().values()) {
        for (PSAttempt psAttemp : ps.getPSAttempts().values()) {
            switch(psAttemp.getInternalState()) {
                case NEW:
                case SCHEDULED:
                case LAUNCHED:
                    newPSNum += 1;
                    break;
                case RUNNING:
                case COMMITTING:
                    runningPSNum += 1;
                    break;
                case KILLED:
                    killedPSNum += 1;
                    break;
                case FAILED:
                    failedPSNum += 1;
                    break;
                case SUCCESS:
                    successPSNum += 1;
                    break;
                default:
                    break;
            }
        }
    }
    table.tr().td("workergroups").td().a(url("angel/workerGroupsPage", "NEW"), String.valueOf(newGroupNum))._().td().a(url("angel/workerGroupsPage", "RUNNING"), String.valueOf(runningGroupNum))._().td().a(url("angel/workerGroupsPage", "FAILED"), String.valueOf(failedGroupNum))._().td().a(url("angel/workerGroupsPage", "KILLED"), String.valueOf(killedGroupNum))._().td().a(url("angel/workerGroupsPage", "SUCCESS"), String.valueOf(successGroupNum))._()._().tr().td("parameterservers").td().a(url("angel/parameterServersPage", "NEW"), String.valueOf(newPSNum))._().td().a(url("angel/parameterServersPage", "RUNNING"), String.valueOf(runningPSNum))._().td().a(url("angel/parameterServersPage", "FAILED"), String.valueOf(failedPSNum))._().td().a(url("angel/parameterServersPage", "KILLED"), String.valueOf(killedPSNum))._().td().a(url("angel/parameterServersPage", "SUCCESS"), String.valueOf(successPSNum))._()._();
    table._();
    div._();
}
Also used : App(com.tencent.angel.master.app.App) Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) InfoBlock(org.apache.hadoop.yarn.webapp.view.InfoBlock) DIV(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.DIV) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) Date(java.util.Date)

Example 2 with PSAttempt

use of com.tencent.angel.master.ps.attempt.PSAttempt in project angel by Tencent.

the class ParameterServersBlock method render.

/**
 * Renders the parameter-server attempts table.
 *
 * <p>Emits one row for every PS attempt whose internal state is contained in the set
 * derived from the {@code PARAMETERSERVER_STATE} request value. Each row shows the attempt
 * id, state, node address, start/end time, elapsed time, and links to the container log and
 * the thread-stack page. Attempts without a node HTTP address get "N/A" in the address,
 * log and thread-stack columns.
 *
 * @param html the block the page content is written to
 */
@Override
protected void render(Block html) {
    set(TITLE, join("Angel ParameterServers"));
    TABLE<Hamlet> table = html.table("#job");
    TR<THEAD<TABLE<Hamlet>>> headTr = table.thead().tr();
    headTr.th(_TH, "id")
        .th(_TH, "state")
        .th(_TH, "node address")
        .th(_TH, "start time")
        .th(_TH, "end time")
        .th(_TH, "elapsed time")
        .th(_TH, "log")
        .th(_TH, "threadstack");
    headTr._()._();

    Set<PSAttemptStateInternal> stateSet = transformToInternalState($(PARAMETERSERVER_STATE));
    TBODY<TABLE<Hamlet>> tbody = table.tbody();
    for (AMParameterServer ps : amContext.getParameterServerManager().getParameterServerMap().values()) {
        Map<PSAttemptId, PSAttempt> psAttempts = ps.getPSAttempts();
        for (PSAttempt psAttempt : psAttempts.values()) {
            if (!stateSet.contains(psAttempt.getInternalState())) {
                continue;
            }

            long launchTs = psAttempt.getLaunchTime();
            long finishTs = psAttempt.getFinishTime();

            // Elapsed time only exists once the attempt has launched; while it is still
            // running (no finish time recorded) measure against the current wall clock.
            long elapsedTs = 0;
            if (launchTs != 0) {
                elapsedTs = (finishTs != 0 ? finishTs : System.currentTimeMillis()) - launchTs;
            }

            String idText = psAttempt.getId().toString();
            String startText = launchTs == 0 ? "N/A" : new Date(launchTs).toString();
            String endText = finishTs == 0 ? "N/A" : new Date(finishTs).toString();
            // A duration must be formatted as a duration in both branches; the original
            // no-address branch rendered it through java.util.Date, which prints a
            // calendar date near the epoch instead of the elapsed time.
            String elapsedText = elapsedTs == 0 ? "N/A" : StringUtils.formatTime(elapsedTs);
            String nodeHttpAddr = psAttempt.getNodeHttpAddr();

            // NOTE(review): the state column echoes the requested state filter rather than
            // the attempt's own internal state - confirm this is intended.
            TR<TBODY<TABLE<Hamlet>>> tr = tbody.tr();
            if (nodeHttpAddr == null) {
                // No node address yet: nothing to link to for address, log or thread stack.
                tr.td(idText)
                    .td($(PARAMETERSERVER_STATE))
                    .td("N/A")
                    .td(startText)
                    .td(endText)
                    .td(elapsedText)
                    .td("N/A")
                    .td("N/A");
            } else {
                tr.td(idText)
                    .td($(PARAMETERSERVER_STATE))
                    .td().a(url(MRWebAppUtil.getYARNWebappScheme(), nodeHttpAddr), nodeHttpAddr)._()
                    .td(startText)
                    .td(endText)
                    .td(elapsedText)
                    .td().a(url(MRWebAppUtil.getYARNWebappScheme(), nodeHttpAddr, "node", "containerlogs",
                        psAttempt.getContainerIdStr(), amContext.getUser().toString()), "log")._()
                    .td().a(url("/angel/parameterServerThreadStackPage/", idText), "psthreadstack")._();
            }
            tr._();
        }
    }
    tbody._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) TBODY(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.TBODY) PSAttemptStateInternal(com.tencent.angel.master.ps.attempt.PSAttemptStateInternal) Date(java.util.Date) TABLE(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.TABLE) PSAttemptId(com.tencent.angel.ps.PSAttemptId) THEAD(org.apache.hadoop.yarn.webapp.hamlet.Hamlet.THEAD) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt)

Example 3 with PSAttempt

use of com.tencent.angel.master.ps.attempt.PSAttempt in project angel by Tencent.

the class AMParameterServer method addAndScheduleAttempt.

/**
 * Creates a new PS attempt, registers it as the running attempt and asks the event
 * handler to schedule it.
 *
 * <p>The attempt is created and recorded while holding the write lock; the
 * {@code PA_SCHEDULE} event is dispatched after the lock has been released.
 */
@SuppressWarnings("unchecked")
private void addAndScheduleAttempt() {
    final PSAttempt newAttempt;

    writeLock.lock();
    try {
        newAttempt = createPSAttempt();
        attempts.put(newAttempt.getId(), newAttempt);
        LOG.info("scheduling " + newAttempt.getId());
        runningPSAttemptId = newAttempt.getId();
    } finally {
        writeLock.unlock();
    }

    // getContext().getLocationManager().setPsLocation(id, null);
    PSAttemptEvent scheduleEvent =
        new PSAttemptEvent(PSAttemptEventType.PA_SCHEDULE, newAttempt.getId());
    getContext().getEventHandler().handle(scheduleEvent);
}
Also used : PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) PSAttemptEvent(com.tencent.angel.master.ps.attempt.PSAttemptEvent)

Example 4 with PSAttempt

use of com.tencent.angel.master.ps.attempt.PSAttempt in project angel by Tencent.

the class AMParameterServer method createPSAttempt.

/**
 * Builds a new attempt for this parameter server using the current attempt number,
 * then advances the attempt counter by one.
 *
 * @return the newly constructed attempt
 */
private PSAttempt createPSAttempt() {
    PSAttempt created = new PSAttempt(ip, id, nextAttemptNumber, context);
    // Advance only after construction so the counter is untouched if the constructor throws.
    nextAttemptNumber += 1;
    return created;
}
Also used : PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt)

Example 5 with PSAttempt

use of com.tencent.angel.master.ps.attempt.PSAttempt in project angel by Tencent.

the class PSManagerTest method testPSDone.

/**
 * Checks the master-side state after a parameter server reports completion.
 *
 * <p>The test reports worker-done for worker attempt 0, fires an application COMMIT event,
 * reports ps-done for PS attempt 0, and then asserts that the attempt and its parameter
 * server are in SUCCESS, that no attempt is running, and that exactly one attempt exists.
 *
 * @throws Exception if any RPC or assertion setup fails; the failure is logged and rethrown
 */
@SuppressWarnings("unchecked")
@Test
public void testPSDone() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Location masterLoc = ps.getMasterLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        WorkerDoneRequest workerRequest = WorkerDoneRequest.newBuilder()
            .setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id))
            .build();
        WorkerDoneResponse workerResponse = master.workerDone(null, workerRequest);
        // JUnit's assertEquals takes (expected, actual); keeping that order makes the
        // failure message report the right value as "expected".
        assertEquals(WorkerCommandProto.W_SUCCESS, workerResponse.getCommand());
        // NOTE(review): presumably waits for the master to process the event asynchronously - confirm.
        Thread.sleep(5000);

        angelAppMaster.getAppContext().getEventHandler().handle(new AppEvent(AppEventType.COMMIT));
        PSDoneRequest request = PSDoneRequest.newBuilder()
            .setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id))
            .build();
        master.psDone(null, request);
        // NOTE(review): same fixed wait as above before inspecting master state - confirm.
        Thread.sleep(5000);

        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
        assertEquals(PSAttemptStateInternal.SUCCESS, psAttempt.getInternalState());
        // assertEquals instead of assertTrue(a == b) so a failure shows both states.
        assertEquals(AMParameterServerState.SUCCESS, amPs.getState());
        assertEquals(1, amPs.getNextAttemptNumber());
        assertNull(amPs.getRunningAttemptId());
        assertEquals(psAttempt0Id, amPs.getSuccessAttemptId());
        assertEquals(1, amPs.getPSAttempts().size());
    } catch (Exception x) {
        LOG.error("run testPSDone failed ", x);
        throw x;
    }
}
Also used : WorkerDoneRequest(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneRequest) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) AngelException(com.tencent.angel.exception.AngelException) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) AppEvent(com.tencent.angel.master.app.AppEvent) TConnection(com.tencent.angel.ipc.TConnection) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) WorkerDoneResponse(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneResponse) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Aggregations

PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt)9 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)6 AngelException (com.tencent.angel.exception.AngelException)4 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)4 PSAttemptId (com.tencent.angel.ps.PSAttemptId)4 Test (org.junit.Test)4 Location (com.tencent.angel.common.location.Location)3 TConnection (com.tencent.angel.ipc.TConnection)3 ParameterServer (com.tencent.angel.ps.impl.ParameterServer)3 Date (java.util.Date)2 Hamlet (org.apache.hadoop.yarn.webapp.hamlet.Hamlet)2 App (com.tencent.angel.master.app.App)1 AppEvent (com.tencent.angel.master.app.AppEvent)1 PSAttemptEvent (com.tencent.angel.master.ps.attempt.PSAttemptEvent)1 PSAttemptStateInternal (com.tencent.angel.master.ps.attempt.PSAttemptStateInternal)1 AMWorkerGroup (com.tencent.angel.master.worker.workergroup.AMWorkerGroup)1 DenseIntVector (com.tencent.angel.ml.math.vector.DenseIntVector)1 Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair)1 WorkerDoneRequest (com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneRequest)1 WorkerDoneResponse (com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneResponse)1