Search in sources :

Example 6 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class MasterServiceTest method testMasterService.

/**
 * Talks to the master RPC service of the local test cluster over a real connection and checks
 * worker registration, worker/task reporting, and that the reported metrics and progress
 * become visible on the master-side worker attempt and task objects.
 *
 * @throws Exception if any RPC or assertion fails; the failure is logged and rethrown
 */
@Test
public void testMasterService() throws Exception {
    try {
        LOG.info("===========================testMasterService===============================");
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(worker.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int psAgentId = master.getPSAgentId(null, PSAgentMasterServiceProtos.GetPSAgentIdRequest.getDefaultInstance()).getPsAgentId();

        // worker register: an attempt from worker group 1 is expected to be told to shut down
        // NOTE(review): presumably because the master never allocated this attempt - confirm against MasterService
        WorkerAttemptId worker1Attempt0Id = new WorkerAttemptId(new WorkerId(new WorkerGroupId(1), 0), 0);
        WorkerRegisterRequest registerRequest = WorkerRegisterRequest.newBuilder().setPsAgentId(psAgentId).setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker1Attempt0Id)).setLocation(LocationProto.newBuilder().setIp("0.0.0.0").setPort(10000).build()).build();
        WorkerRegisterResponse registerResponse = master.workerRegister(null, registerRequest);
        assertEquals(WorkerCommandProto.W_SHUTDOWN, registerResponse.getCommand());

        // worker report: two task reports plus two worker-level key/value pairs for worker0Attempt0
        WorkerReportRequest.Builder reportBuilder = WorkerReportRequest.newBuilder();
        Pair.Builder kvBuilder = Pair.newBuilder();
        TaskStateProto.Builder taskBuilder = TaskStateProto.newBuilder();
        reportBuilder.setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id));

        // report for task0
        taskBuilder.setProgress(0.20f);
        taskBuilder.setState("RUNNING");
        taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task0Id));
        kvBuilder.setKey("task_key1");
        kvBuilder.setValue("100");
        taskBuilder.addCounters(kvBuilder.build());
        kvBuilder.setKey("task_key2");
        kvBuilder.setValue("200");
        taskBuilder.addCounters(kvBuilder.build());
        reportBuilder.addTaskReports(taskBuilder.build());

        // report for task1
        // NOTE(review): taskBuilder is reused without clearing its counters, so this report also
        // carries the two counters added for task0 in addition to the two added below.
        taskBuilder.setProgress(0.30f);
        taskBuilder.setState("RUNNING");
        taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task1Id));
        kvBuilder.setKey("task_key1");
        kvBuilder.setValue("1000");
        taskBuilder.addCounters(kvBuilder.build());
        kvBuilder.setKey("task_key2");
        kvBuilder.setValue("2000");
        taskBuilder.addCounters(kvBuilder.build());
        reportBuilder.addTaskReports(taskBuilder.build());

        // worker-level pairs
        kvBuilder.setKey("worker_key1");
        kvBuilder.setValue("100");
        reportBuilder.addPairs(kvBuilder.build());
        kvBuilder.setKey("worker_key2");
        kvBuilder.setValue("200");
        reportBuilder.addPairs(kvBuilder.build());

        WorkerReportResponse reportResponse = master.workerReport(null, reportBuilder.build());
        assertEquals(WorkerCommandProto.W_SUCCESS, reportResponse.getCommand());
        assertEquals(2, reportResponse.getActiveTaskNum());

        // the worker-level pairs must show up as metrics of the master-side worker attempt
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        WorkerAttempt worker0Attempt = angelAppMaster.getAppContext().getWorkerManager().getWorker(worker0Attempt0Id.getWorkerId()).getWorkerAttempt(worker0Attempt0Id);
        assertNotNull(worker0Attempt);
        Map<String, String> workerMetrics = worker0Attempt.getMetrics();
        String valueForWorkerKey1 = workerMetrics.get("worker_key1");
        String valueForWorkerKey2 = workerMetrics.get("worker_key2");
        assertNotNull(valueForWorkerKey1);
        assertNotNull(valueForWorkerKey2);
        assertEquals("100", valueForWorkerKey1);
        assertEquals("200", valueForWorkerKey2);

        // the task counters and progress must show up on the master-side tasks
        AMTaskManager amTaskManager = angelAppMaster.getAppContext().getTaskManager();
        AMTask task0 = amTaskManager.getTask(task0Id);
        AMTask task1 = amTaskManager.getTask(task1Id);
        assertNotNull(task0);
        assertNotNull(task1);
        Map<String, String> task0Metrics = task0.getMetrics();
        Map<String, String> task1Metrics = task1.getMetrics();
        String valueForTask0Key1 = task0Metrics.get("task_key1");
        String valueForTask0Key2 = task0Metrics.get("task_key2");
        String valueForTask1Key1 = task1Metrics.get("task_key1");
        String valueForTask1Key2 = task1Metrics.get("task_key2");
        assertNotNull(valueForTask0Key1);
        assertNotNull(valueForTask0Key2);
        assertNotNull(valueForTask1Key1);
        assertNotNull(valueForTask1Key2);
        assertEquals("100", valueForTask0Key1);
        assertEquals("200", valueForTask0Key2);
        assertEquals("1000", valueForTask1Key1);
        assertEquals("2000", valueForTask1Key2);
        assertEquals(0.20f, task0.getProgress(), 0.000001);
        assertEquals(0.30f, task1.getProgress(), 0.000001);
    } catch (Exception x) {
        // log with the test name so the failure is easy to locate in the cluster log, then fail the test
        LOG.error("run testMasterService failed ", x);
        throw x;
    }
}
Also used : WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) WorkerId(com.tencent.angel.worker.WorkerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) TConnection(com.tencent.angel.ipc.TConnection) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) Worker(com.tencent.angel.worker.Worker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMTask(com.tencent.angel.master.task.AMTask) Location(com.tencent.angel.common.location.Location) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) Test(org.junit.Test)

Example 7 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class MasterService method getWorkerLogDir.

/**
 * Builds the URL under which the syslog of a worker's running attempt can be viewed.
 *
 * <p>The URL points at the attempt's node address on port {@code yarnNMWebPort} and names the
 * attempt's container. An empty log dir is returned when the worker has no running attempt, or
 * when that attempt has no location or no container yet.
 *
 * @param controller rpc controller (not used by this method)
 * @param request rpc request carrying the worker id
 * @return response holding the log URL, or an empty string if it cannot be determined yet
 * @throws ServiceException if no worker is registered under the requested id
 */
@Override
public GetWorkerLogDirResponse getWorkerLogDir(RpcController controller, GetWorkerLogDirRequest request) throws ServiceException {
    final WorkerId requestedId = ProtobufUtil.convertToId(request.getWorkerId());
    final AMWorker amWorker = context.getWorkerManager().getWorker(requestedId);
    if (amWorker == null) {
        throw new ServiceException("can not find worker " + requestedId);
    }

    // Default answer; only replaced once attempt, location and container are all known.
    String logDir = "";
    final WorkerAttempt runningAttempt = amWorker.getRunningAttempt();
    if (runningAttempt != null) {
        final Location nodeLocation = runningAttempt.getLocation();
        final Container yarnContainer = runningAttempt.getContainer();
        if (nodeLocation != null && yarnContainer != null) {
            logDir = "http://" + nodeLocation.getIp() + ":" + yarnNMWebPort
                + "/node/containerlogs/" + yarnContainer.getId() + "/angel/syslog/?start=0";
        }
    }
    return GetWorkerLogDirResponse.newBuilder().setLogDir(logDir).build();
}
Also used : Container(org.apache.hadoop.yarn.api.records.Container) ServiceException(com.google.protobuf.ServiceException) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) WorkerId(com.tencent.angel.worker.WorkerId) PSLocation(com.tencent.angel.ps.server.data.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 8 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class TaskManagerTest method testTaskIteration.

/**
 * Reports iteration progress for task0 and then task1 through the worker's master client and
 * checks the iteration counters seen on the master-side tasks, worker attempt, worker and
 * worker group after each report.
 *
 * @throws Exception if any call or assertion fails; the failure is logged and rethrown
 */
@Test
public void testTaskIteration() throws Exception {
    try {
        LOG.info("===========================testTaskIteration===============================");
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue("application master is null", angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue("worker manager is null", workerManager != null);
        AMWorkerGroup workerGroup0 = workerManager.getWorkGroup(worker0Id);
        AMWorker worker0 = workerGroup0.getWorker(worker0Id);
        WorkerAttempt worker0Attempt0 = worker0.getWorkerAttempt(worker0Attempt0Id);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        MasterClient masterClient = worker.getPSAgent().getMasterClient();

        // Only task0 has advanced: the minimum iteration at every aggregation level stays at 0.
        masterClient.taskIteration(task0Id.getIndex(), 1);
        AMTask task0 = taskManager.getTask(task0Id);
        AMTask task1 = taskManager.getTask(task1Id);
        assertEquals(1, task0.getIteration());
        assertEquals(0, task1.getIteration());
        assertEquals(0, worker0Attempt0.getMinIteration());
        assertEquals(0, worker0.getMinIteration());
        assertEquals(0, workerGroup0.getMinIteration());

        // Both tasks have advanced: the minimum iteration moves to 1 at every level.
        masterClient.taskIteration(task1Id.getIndex(), 1);
        assertEquals(1, task0.getIteration());
        assertEquals(1, task1.getIteration());
        assertEquals(1, worker0Attempt0.getMinIteration());
        assertEquals(1, worker0.getMinIteration());
        assertEquals(1, workerGroup0.getMinIteration());
    } catch (Exception x) {
        // log with the test name so the failure is easy to locate in the cluster log, then fail the test
        LOG.error("run testTaskIteration failed ", x);
        throw x;
    }
}
Also used : WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) MasterClient(com.tencent.angel.psagent.client.MasterClient) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) Worker(com.tencent.angel.worker.Worker) AMTask(com.tencent.angel.master.task.AMTask) ServiceException(com.google.protobuf.ServiceException) AngelException(com.tencent.angel.exception.AngelException) Test(org.junit.Test)

Example 9 with WorkerAttempt

use of com.tencent.angel.master.worker.attempt.WorkerAttempt in project angel by Tencent.

the class WorkerGroupBlock method render.

@Override
protected void render(Block html) {
    String workerGroupIdSr = $(WORKERGROUP_ID);
    if (workerGroupIdSr.isEmpty()) {
        html.p()._("Sorry, can't do anything without a WorkerGroupId.")._();
        return;
    }
    WorkerGroupId workerGroupId;
    try {
        workerGroupId = new WorkerGroupId(workerGroupIdSr);
    } catch (UnvalidIdStrException e) {
        LOG.error("unvalid id string, ", e);
        return;
    }
    AMWorkerGroup workerGroup = amContext.getWorkerManager().getWorkerGroup(workerGroupId);
    if (workerGroup == null) {
        html.p()._("Sorry, can't find group " + workerGroupId)._();
        return;
    }
    set(TITLE, join("Angel WorkerGroup ", $(WORKERGROUP_ID)));
    html.h1(workerGroupIdSr);
    TABLE<DIV<Hamlet>> table = html.div(_INFO_WRAP).table("#job");
    TR<THEAD<TABLE<DIV<Hamlet>>>> headTr = table.thead().tr();
    headTr.th(_TH, "id").th(_TH, "state").th(_TH, "node address").th(_TH, "start time").th(_TH, "end time").th(_TH, "elapsed time").th(_TH, "log").th(_TH, "threadstack").th(_TH, "workercounter");
    headTr._()._();
    TBODY<TABLE<DIV<Hamlet>>> tbody = table.tbody();
    for (AMWorker worker : workerGroup.getWorkerSet()) {
        Map<WorkerAttemptId, WorkerAttempt> workerAttempts = worker.getAttempts();
        for (WorkerAttempt workerAttempt : workerAttempts.values()) {
            TR<TBODY<TABLE<DIV<Hamlet>>>> tr = tbody.tr();
            long elaspedTs = 0;
            if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() != 0) {
                elaspedTs = workerAttempt.getFinishTime() - workerAttempt.getLaunchTime();
            } else if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() == 0) {
                elaspedTs = System.currentTimeMillis() - workerAttempt.getLaunchTime();
            }
            if (workerAttempt.getNodeHttpAddr() == null) {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td("N/A").td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td("N/A").td("N/A").td("N/A");
            } else {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr()), workerAttempt.getNodeHttpAddr())._().td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr(), "node", "containerlogs", workerAttempt.getContainerIdStr(), amContext.getUser().toString()), "log")._().td().a(url("angel/workerThreadStackPage/", workerAttempt.getId().toString()), "workerthreadstack")._().td().a(url("angel/workerCounterPage/", workerAttempt.getId().toString()), "workercounter")._();
            }
            tr._();
        }
    }
    tbody._()._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) Date(java.util.Date) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) UnvalidIdStrException(com.tencent.angel.exception.UnvalidIdStrException) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt)

Aggregations

WorkerAttempt (com.tencent.angel.master.worker.attempt.WorkerAttempt)9 AMTask (com.tencent.angel.master.task.AMTask)6 AMWorker (com.tencent.angel.master.worker.worker.AMWorker)5 Location (com.tencent.angel.common.location.Location)3 AMWorkerGroup (com.tencent.angel.master.worker.workergroup.AMWorkerGroup)3 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)3 Hamlet (org.apache.hadoop.yarn.webapp.hamlet.Hamlet)3 ServiceException (com.google.protobuf.ServiceException)2 UnvalidIdStrException (com.tencent.angel.exception.UnvalidIdStrException)2 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)2 PSLocation (com.tencent.angel.ps.server.data.PSLocation)2 Worker (com.tencent.angel.worker.Worker)2 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)2 WorkerId (com.tencent.angel.worker.WorkerId)2 TaskId (com.tencent.angel.worker.task.TaskId)2 Test (org.junit.Test)2 AngelException (com.tencent.angel.exception.AngelException)1 TConnection (com.tencent.angel.ipc.TConnection)1 WorkerManager (com.tencent.angel.master.worker.WorkerManager)1 WorkerAttemptEvent (com.tencent.angel.master.worker.attempt.WorkerAttemptEvent)1