use of com.tencent.angel.master.ps.ps.AMParameterServer in project angel by Tencent.
the class AngelAppBlock method render.
@Override
protected void render(Block html) {
set(TITLE, join("Angel Application", amContext.getApplicationId()));
App app = amContext.getApp();
long elaspedTs = 0;
if (app.getLaunchTime() != 0 && app.getFinishTime() != 0) {
elaspedTs = app.getFinishTime() - app.getLaunchTime();
} else if (app.getLaunchTime() != 0 && app.getFinishTime() == 0) {
elaspedTs = System.currentTimeMillis() - app.getLaunchTime();
}
info("Job Overview")._("Job Name:", amContext.getApplicationName())._("State:", app.getExternAppState().toString())._("Started:", new Date(app.getLaunchTime()))._("Elapsed:", StringUtils.formatTime(elaspedTs))._("Environment:", "nomeaning" == null ? "#" : "angel/EnvironmentPage", "Runtime Information And Properties")._("Task Progress:", "nomeaning" == null ? "#" : "angel/ProgressPage", "progress")._("Master Threaddump:", "nomeaning" == null ? "#" : "angel/ExecutorsPage", "threaddump");
DIV<Hamlet> div = html._(InfoBlock.class).div(_INFO_WRAP);
TABLE<DIV<Hamlet>> table = div.table("#job");
table.tr().th(_TH, "module").th(_TH, "new").th(_TH, "running").th(_TH, "failed").th(_TH, "killed").th(_TH, "success")._();
int newGroupNum = 0;
int runningGroupNum = 0;
int failedGroupNum = 0;
int killedGroupNum = 0;
int successGroupNum = 0;
int newPSNum = 0;
int runningPSNum = 0;
int failedPSNum = 0;
int killedPSNum = 0;
int successPSNum = 0;
LOG.info("before compute worker state items");
if (amContext.getWorkerManager() != null) {
for (AMWorkerGroup group : amContext.getWorkerManager().getWorkerGroupMap().values()) {
switch(group.getState()) {
case NEW:
case INITED:
newGroupNum += 1;
break;
case RUNNING:
runningGroupNum += 1;
break;
case KILLED:
killedGroupNum += 1;
break;
case FAILED:
failedGroupNum += 1;
break;
case SUCCESS:
successGroupNum += 1;
break;
default:
break;
}
}
}
for (AMParameterServer ps : amContext.getParameterServerManager().getParameterServerMap().values()) {
for (PSAttempt psAttemp : ps.getPSAttempts().values()) {
switch(psAttemp.getInternalState()) {
case NEW:
case SCHEDULED:
case LAUNCHED:
newPSNum += 1;
break;
case RUNNING:
case COMMITTING:
runningPSNum += 1;
break;
case KILLED:
killedPSNum += 1;
break;
case FAILED:
failedPSNum += 1;
break;
case SUCCESS:
successPSNum += 1;
break;
default:
break;
}
}
}
table.tr().td("workergroups").td().a(url("angel/workerGroupsPage", "NEW"), String.valueOf(newGroupNum))._().td().a(url("angel/workerGroupsPage", "RUNNING"), String.valueOf(runningGroupNum))._().td().a(url("angel/workerGroupsPage", "FAILED"), String.valueOf(failedGroupNum))._().td().a(url("angel/workerGroupsPage", "KILLED"), String.valueOf(killedGroupNum))._().td().a(url("angel/workerGroupsPage", "SUCCESS"), String.valueOf(successGroupNum))._()._().tr().td("parameterservers").td().a(url("angel/parameterServersPage", "NEW"), String.valueOf(newPSNum))._().td().a(url("angel/parameterServersPage", "RUNNING"), String.valueOf(runningPSNum))._().td().a(url("angel/parameterServersPage", "FAILED"), String.valueOf(failedPSNum))._().td().a(url("angel/parameterServersPage", "KILLED"), String.valueOf(killedPSNum))._().td().a(url("angel/parameterServersPage", "SUCCESS"), String.valueOf(successPSNum))._()._();
table._();
div._();
}
use of com.tencent.angel.master.ps.ps.AMParameterServer in project angel by Tencent.
the class ParameterServersBlock method render.
/**
 * Renders a table of parameter-server attempts whose internal state belongs to the set
 * derived from the requested {@code PARAMETERSERVER_STATE} parameter. Each row shows the
 * attempt id, state, node address, start/end/elapsed time and links to the container log
 * and the thread-stack page. Attempts without a node HTTP address get "N/A" in place of
 * the address and the two links.
 *
 * @param html the block the page content is written to
 */
@Override
protected void render(Block html) {
    set(TITLE, join("Angel ParameterServers"));
    TABLE<Hamlet> table = html.table("#job");
    TR<THEAD<TABLE<Hamlet>>> headTr = table.thead().tr();
    headTr.th(_TH, "id")
        .th(_TH, "state")
        .th(_TH, "node address")
        .th(_TH, "start time")
        .th(_TH, "end time")
        .th(_TH, "elapsed time")
        .th(_TH, "log")
        .th(_TH, "threadstack");
    headTr._()._();

    Set<PSAttemptStateInternal> stateSet = transformToInternalState($(PARAMETERSERVER_STATE));
    TBODY<TABLE<Hamlet>> tbody = table.tbody();
    for (AMParameterServer ps : amContext.getParameterServerManager().getParameterServerMap().values()) {
        Map<PSAttemptId, PSAttempt> psAttempts = ps.getPSAttempts();
        for (PSAttempt psAttempt : psAttempts.values()) {
            if (!stateSet.contains(psAttempt.getInternalState())) {
                continue;
            }

            long launchTs = psAttempt.getLaunchTime();
            long finishTs = psAttempt.getFinishTime();
            // Elapsed time stays 0 until the attempt has a launch time; while it has no
            // finish time yet, measure against the current clock.
            long elapsedTs = 0;
            if (launchTs != 0) {
                elapsedTs = (finishTs != 0 ? finishTs : System.currentTimeMillis()) - launchTs;
            }

            String attemptIdStr = psAttempt.getId().toString();
            String launchStr = launchTs == 0 ? "N/A" : new Date(launchTs).toString();
            String finishStr = finishTs == 0 ? "N/A" : new Date(finishTs).toString();
            // A duration must be formatted as a duration in both branches; it was previously
            // rendered through new Date(elapsed) when the node address was missing, which
            // printed a 1970-epoch date instead of the elapsed time.
            String elapsedStr = elapsedTs == 0 ? "N/A" : StringUtils.formatTime(elapsedTs);
            // Read the address once so the null check and the later uses see the same value.
            String nodeHttpAddr = psAttempt.getNodeHttpAddr();

            TR<TBODY<TABLE<Hamlet>>> tr = tbody.tr();
            // The state column shows the requested state filter, not the attempt's
            // internal state.
            if (nodeHttpAddr == null) {
                tr.td(attemptIdStr)
                    .td($(PARAMETERSERVER_STATE))
                    .td("N/A")
                    .td(launchStr)
                    .td(finishStr)
                    .td(elapsedStr)
                    .td("N/A")
                    .td("N/A");
            } else {
                tr.td(attemptIdStr)
                    .td($(PARAMETERSERVER_STATE))
                    .td().a(url(MRWebAppUtil.getYARNWebappScheme(), nodeHttpAddr), nodeHttpAddr)._()
                    .td(launchStr)
                    .td(finishStr)
                    .td(elapsedStr)
                    .td().a(url(MRWebAppUtil.getYARNWebappScheme(), nodeHttpAddr, "node", "containerlogs", psAttempt.getContainerIdStr(), amContext.getUser().toString()), "log")._()
                    .td().a(url("/angel/parameterServerThreadStackPage/", attemptIdStr), "psthreadstack")._();
            }
            tr._();
        }
    }
    tbody._()._();
}
use of com.tencent.angel.master.ps.ps.AMParameterServer in project angel by Tencent.
the class AppStateStorage method writePSMeta.
/**
 * Write ps meta to file.
 *
 * <p>The meta is written to a temporary file first and then renamed to the final ps meta
 * file; afterwards the previously written ps meta file (if any) is deleted. The file
 * content is the number of parameter servers followed by, for each server, its index and
 * an attempt index: the index of the running attempt if there is one, otherwise the
 * server's next attempt number. The whole operation runs under {@code psMetaLock}.
 *
 * @param psManager parameter server manager whose parameter server map is written
 * @throws IOException if creating, writing, renaming or deleting a file fails
 */
public void writePSMeta(ParameterServerManager psManager) throws IOException {
    // Acquire the lock before entering the try block, so that the finally clause never
    // calls unlock() on a lock this thread failed to acquire.
    psMetaLock.lock();
    try {
        // generate a temporary file
        String psMetaFile = getPsMetaFile();
        String tmpFile = getPSMetaTmpeFile(psMetaFile);
        Path tmpPath = new Path(writeDir, tmpFile);
        Map<ParameterServerId, AMParameterServer> psMap = psManager.getParameterServerMap();

        // write ps meta to the temporary file first; try-with-resources closes the stream
        // even when one of the writes throws.
        try (FSDataOutputStream outputStream = fs.create(tmpPath)) {
            outputStream.writeInt(psMap.size());
            for (Entry<ParameterServerId, AMParameterServer> entry : psMap.entrySet()) {
                outputStream.writeInt(entry.getKey().getIndex());
                AMParameterServer server = entry.getValue();
                PSAttemptId attemptId = server.getRunningAttemptId();
                int nextAttemptIndex = server.getNextAttemptNumber();
                // a running attempt takes precedence over the next attempt number
                if (attemptId != null) {
                    nextAttemptIndex = attemptId.getIndex();
                }
                outputStream.writeInt(nextAttemptIndex);
            }
        }

        // rename the temporary file to the final file
        Path psMetaFilePath = new Path(writeDir, psMetaFile);
        HdfsUtil.rename(tmpPath, psMetaFilePath, fs);

        // if the old final file exist, just remove it
        if (lastPsMetaFilePath != null) {
            fs.delete(lastPsMetaFilePath, false);
        }
        lastPsMetaFilePath = psMetaFilePath;
    } finally {
        psMetaLock.unlock();
    }
}
use of com.tencent.angel.master.ps.ps.AMParameterServer in project angel by Tencent.
the class ParameterServerManager method init.
/**
 * Creates one {@code AMParameterServer} per index in {@code [0, psNumber)} and registers
 * it in {@code psMap} under its {@code ParameterServerId}. When {@code ips} is set, the
 * entry at the same index is passed to the server's constructor. When
 * {@code psIdToAttemptIndexMap} has an entry for the id, that value becomes the server's
 * next attempt number.
 */
public void init() {
    for (int psIndex = 0; psIndex < psNumber; psIndex++) {
        ParameterServerId serverId = new ParameterServerId(psIndex);

        AMParameterServer amServer = (ips == null)
            ? new AMParameterServer(serverId, context)
            : new AMParameterServer(ips[psIndex], serverId, context);

        boolean hasStoredAttemptIndex =
            psIdToAttemptIndexMap != null && psIdToAttemptIndexMap.containsKey(serverId);
        if (hasStoredAttemptIndex) {
            amServer.setNextAttemptNumber(psIdToAttemptIndexMap.get(serverId));
        }

        psMap.put(serverId, amServer);
    }
}
use of com.tencent.angel.master.ps.ps.AMParameterServer in project angel by Tencent.
the class PSManagerTest method testPSDone.
/**
 * Checks the master-side state of a parameter server after it reports "done": worker 0
 * attempt 0 reports done first, the application is moved to COMMIT, then PS attempt 0
 * reports done. Afterwards the attempt and the AM-side parameter server are expected to be
 * in SUCCESS state with exactly one attempt recorded and no running attempt.
 *
 * <p>Assertions pass the expected value first, assuming JUnit's
 * {@code assertEquals(expected, actual)} convention (TODO confirm the test framework), so
 * that failure messages report the expected and actual values the right way round.
 *
 * @throws Exception if any step of the scenario fails; the failure is logged and rethrown
 */
@SuppressWarnings("unchecked")
@Test
public void testPSDone() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Location masterLoc = ps.getMasterLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        // The worker reports done first and must be answered with W_SUCCESS.
        WorkerDoneRequest workerRequest = WorkerDoneRequest.newBuilder()
            .setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id))
            .build();
        WorkerDoneResponse workerResponse = master.workerDone(null, workerRequest);
        assertEquals(WorkerCommandProto.W_SUCCESS, workerResponse.getCommand());
        // NOTE(review): presumably waits for the master to process the event asynchronously.
        Thread.sleep(5000);

        angelAppMaster.getAppContext().getEventHandler().handle(new AppEvent(AppEventType.COMMIT));
        PSDoneRequest request = PSDoneRequest.newBuilder()
            .setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id))
            .build();
        master.psDone(null, request);
        Thread.sleep(5000);

        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
        assertEquals(PSAttemptStateInternal.SUCCESS, psAttempt.getInternalState());
        assertEquals(AMParameterServerState.SUCCESS, amPs.getState());
        assertEquals(1, amPs.getNextAttemptNumber());
        assertNull(amPs.getRunningAttemptId());
        assertEquals(psAttempt0Id, amPs.getSuccessAttemptId());
        assertEquals(1, amPs.getPSAttempts().size());
    } catch (Exception x) {
        LOG.error("run testPSDone failed ", x);
        throw x;
    }
}
Aggregations