Search in sources :

Example 31 with WorkerGroupId

use of com.tencent.angel.worker.WorkerGroupId in project angel by Tencent.

the class WorkerGroupBlock method render.

@Override
protected void render(Block html) {
    String workerGroupIdSr = $(WORKERGROUP_ID);
    if (workerGroupIdSr.isEmpty()) {
        html.p()._("Sorry, can't do anything without a WorkerGroupId.")._();
        return;
    }
    WorkerGroupId workerGroupId;
    try {
        workerGroupId = new WorkerGroupId(workerGroupIdSr);
    } catch (UnvalidIdStrException e) {
        LOG.error("unvalid id string, ", e);
        return;
    }
    AMWorkerGroup workerGroup = amContext.getWorkerManager().getWorkerGroup(workerGroupId);
    if (workerGroup == null) {
        html.p()._("Sorry, can't find group " + workerGroupId)._();
        return;
    }
    set(TITLE, join("Angel WorkerGroup ", $(WORKERGROUP_ID)));
    html.h1(workerGroupIdSr);
    TABLE<DIV<Hamlet>> table = html.div(_INFO_WRAP).table("#job");
    TR<THEAD<TABLE<DIV<Hamlet>>>> headTr = table.thead().tr();
    headTr.th(_TH, "id").th(_TH, "state").th(_TH, "node address").th(_TH, "start time").th(_TH, "end time").th(_TH, "elapsed time").th(_TH, "log").th(_TH, "threadstack").th(_TH, "workercounter");
    headTr._()._();
    TBODY<TABLE<DIV<Hamlet>>> tbody = table.tbody();
    for (AMWorker worker : workerGroup.getWorkerSet()) {
        Map<WorkerAttemptId, WorkerAttempt> workerAttempts = worker.getAttempts();
        for (WorkerAttempt workerAttempt : workerAttempts.values()) {
            TR<TBODY<TABLE<DIV<Hamlet>>>> tr = tbody.tr();
            long elaspedTs = 0;
            if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() != 0) {
                elaspedTs = workerAttempt.getFinishTime() - workerAttempt.getLaunchTime();
            } else if (workerAttempt.getLaunchTime() != 0 && workerAttempt.getFinishTime() == 0) {
                elaspedTs = System.currentTimeMillis() - workerAttempt.getLaunchTime();
            }
            if (workerAttempt.getNodeHttpAddr() == null) {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td("N/A").td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td("N/A").td("N/A").td("N/A");
            } else {
                tr.td().a(url("angel/workerPage", workerAttempt.getId().toString()), workerAttempt.getId().toString())._().td(workerAttempt.getState().toString()).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr()), workerAttempt.getNodeHttpAddr())._().td((workerAttempt.getLaunchTime() == 0) ? "N/A" : new Date(workerAttempt.getLaunchTime()).toString()).td((workerAttempt.getFinishTime() == 0) ? "N/A" : new Date(workerAttempt.getFinishTime()).toString()).td((elaspedTs == 0) ? "N/A" : StringUtils.formatTime(elaspedTs)).td().a(url(MRWebAppUtil.getYARNWebappScheme(), workerAttempt.getNodeHttpAddr(), "node", "containerlogs", workerAttempt.getContainerIdStr(), amContext.getUser().toString()), "log")._().td().a(url("angel/workerThreadStackPage/", workerAttempt.getId().toString()), "workerthreadstack")._().td().a(url("angel/workerCounterPage/", workerAttempt.getId().toString()), "workercounter")._();
            }
            tr._();
        }
    }
    tbody._()._()._();
}
Also used : Hamlet(org.apache.hadoop.yarn.webapp.hamlet.Hamlet) AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) Date(java.util.Date) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) UnvalidIdStrException(com.tencent.angel.exception.UnvalidIdStrException) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt)

Example 32 with WorkerGroupId

use of com.tencent.angel.worker.WorkerGroupId in project angel by Tencent.

the class GetNodeFeatsTest2 method setup.

/**
 * Prepares the test environment: builds a LOCAL deploy-mode configuration,
 * obtains an Angel client, registers the {@code NODE} matrix, starts the PS
 * and the application, and creates the PS / worker ids used by the tests.
 *
 * @throws Exception if any step of the start-up fails
 */
@Before
public void setup() throws Exception {
    // set basic configuration keys
    Configuration conf = new Configuration();
    conf.setBoolean("mapred.mapper.new-api", true);
    conf.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
    conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());
    // use local deploy mode and dummy dataspliter
    conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
    conf.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
    conf.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
    // Keep the model path under the same temp root as the data and log paths
    // instead of a machine-specific drive letter, so the test is portable.
    conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/model_1");
    conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
    conf.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
    // one worker group, one PS, one task per worker
    conf.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
    conf.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1);
    // conf.setInt(AngelConf.ANGEL_MODEL_PARTITIONER_PARTITION_SIZE, 1000);
    conf.setInt(AngelConf.ANGEL_PSAGENT_CACHE_SYNC_TIMEINTERVAL_MS, 10);
    conf.setInt(AngelConf.ANGEL_WORKER_HEARTBEAT_INTERVAL_MS, 1000);
    conf.setInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, 1000);
    // a single attempt for workers and PS
    conf.setInt(AngelConf.ANGEL_WORKER_MAX_ATTEMPTS, 1);
    conf.setInt(AngelConf.ANGEL_PS_MAX_ATTEMPTS, 1);
    // get a angel client
    angelClient = AngelClientFactory.get(conf);
    // add a 1 x 10 long-key sparse matrix whose value type is Node
    MatrixContext siMat = new MatrixContext();
    siMat.setName(NODE);
    siMat.setRowType(RowType.T_ANY_LONGKEY_SPARSE);
    siMat.setRowNum(1);
    siMat.setColNum(10);
    siMat.setMaxColNumInBlock(5);
    siMat.setMaxRowNumInBlock(1);
    // siMat.setValidIndexNum(100);
    // siMat.setColNum(10000000000L);
    siMat.setValueType(Node.class);
    // siMat.setPartitionStorageClass(LongElementMapStorage.class);
    // siMat.setPartitionClass(CSRPartition.class);
    angelClient.addMatrix(siMat);
    // Start PS
    angelClient.startPSServer();
    // Start to run application
    angelClient.run();
    // NOTE(review): fixed 5 s pause, presumably to let the PS and worker come up
    // before the tests run - confirm, and prefer polling for readiness if possible.
    Thread.sleep(5000);
    // ids of the first PS / worker and their first attempts
    psId = new ParameterServerId(0);
    psAttempt0Id = new PSAttemptId(psId, 0);
    WorkerGroupId workerGroupId = new WorkerGroupId(0);
    workerId = new WorkerId(workerGroupId, 0);
    workerAttempt0Id = new WorkerAttemptId(workerId, 0);
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) MatrixContext(com.tencent.angel.ml.matrix.MatrixContext) Configuration(org.apache.hadoop.conf.Configuration) PSAttemptId(com.tencent.angel.ps.PSAttemptId) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ParameterServerId(com.tencent.angel.ps.ParameterServerId) WorkerId(com.tencent.angel.worker.WorkerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) Before(org.junit.Before)

Example 33 with WorkerGroupId

use of com.tencent.angel.worker.WorkerGroupId in project angel by Tencent.

the class WorkerManager method startAllWorker.

/**
 * Initializes all workers and sends a {@code SCHEDULE} event to every worker
 * of every worker group, then marks the manager as initialized.
 *
 * <p>The whole operation runs under the write lock. Groups are looked up by
 * {@code new WorkerGroupId(i)} for {@code i} in {@code [0, workerGroupMap.size())}.
 */
public void startAllWorker() {
    LOG.info("to start all workers.....");
    // Acquire the lock BEFORE entering try: if lock() were to fail inside the
    // try block, finally would call unlock() on a lock this thread does not hold.
    writeLock.lock();
    try {
        initWorkers();
        for (int i = 0; i < workerGroupMap.size(); i++) {
            AMWorkerGroup group = workerGroupMap.get(new WorkerGroupId(i));
            for (AMWorker worker : group.getWorkerSet()) {
                worker.handle(new AMWorkerEvent(AMWorkerEventType.SCHEDULE, worker.getId()));
            }
        }
        isInited = true;
    } finally {
        writeLock.unlock();
    }
}
Also used : AMWorkerGroup(com.tencent.angel.master.worker.workergroup.AMWorkerGroup) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) AMWorkerEvent(com.tencent.angel.master.worker.worker.AMWorkerEvent) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId)

Example 34 with WorkerGroupId

use of com.tencent.angel.worker.WorkerGroupId in project angel by Tencent.

the class WorkerManager method getDetailWorkerExitMessage.

/**
 * Builds the diagnostic message reported when too many worker groups were
 * killed or failed: a summary line with the tolerated count, followed by the
 * diagnostics of every failed group and every killed group (each section is
 * omitted when its collection is empty).
 *
 * @return the multi-line diagnostic message
 */
private String getDetailWorkerExitMessage() {
    StringBuilder sb = new StringBuilder();
    sb.append("killed and failed workergroup is over tolerate ").append(tolerateFailedGroup);
    // Separate the count from the next sentence; previously they ran together
    // (e.g. "...over tolerate 3There are some Workers failed").
    sb.append(". There are some Workers failed\n");
    if (!failedGroups.isEmpty()) {
        appendGroupDiagnostics(sb, "failed workergroups:", failedGroups);
    }
    if (!killedGroups.isEmpty()) {
        appendGroupDiagnostics(sb, "killed workergroups:", killedGroups);
    }
    return sb.toString();
}

/**
 * Appends one section to {@code sb}: the header, then for each group id a new
 * line with the id and that group's diagnostics joined by newlines, then a
 * trailing newline.
 */
private void appendGroupDiagnostics(StringBuilder sb, String header, Iterable<WorkerGroupId> groupIds) {
    sb.append(header);
    for (WorkerGroupId groupId : groupIds) {
        sb.append("\n");
        sb.append(groupId);
        sb.append(". ");
        sb.append(StringUtils.join("\n", workerGroupMap.get(groupId).getDiagnostics()));
    }
    sb.append("\n");
}
Also used : WorkerGroupId(com.tencent.angel.worker.WorkerGroupId)

Aggregations

WorkerGroupId (com.tencent.angel.worker.WorkerGroupId): 34; WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId): 31; WorkerId (com.tencent.angel.worker.WorkerId): 31; PSAttemptId (com.tencent.angel.ps.PSAttemptId): 28; ParameterServerId (com.tencent.angel.ps.ParameterServerId): 28; Configuration (org.apache.hadoop.conf.Configuration): 27; CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat): 27; MatrixContext (com.tencent.angel.ml.matrix.MatrixContext): 26; Before (org.junit.Before): 22; TaskId (com.tencent.angel.worker.task.TaskId): 10; Test (org.junit.Test): 4; AngelException (com.tencent.angel.exception.AngelException): 3; AMWorker (com.tencent.angel.master.worker.worker.AMWorker): 3; AMWorkerGroup (com.tencent.angel.master.worker.workergroup.AMWorkerGroup): 3; Worker (com.tencent.angel.worker.Worker): 3; BeforeClass (org.junit.BeforeClass): 3; WorkerAttempt (com.tencent.angel.master.worker.attempt.WorkerAttempt): 2; DenseIntVector (com.tencent.angel.ml.math.vector.DenseIntVector): 2; MatrixStorageManager (com.tencent.angel.ps.impl.MatrixStorageManager): 2; ParameterServer (com.tencent.angel.ps.impl.ParameterServer): 2