use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class MasterService method psError.
/**
 * Handles a failure notification sent by a parameter server attempt.
 * The attempt is dropped from the heartbeat monitor map, then its diagnostics
 * message and a PA_FAILMSG event are dispatched through the event handler.
 *
 * @param controller rpc controller of protobuf
 * @param request carries the parameter server attempt id and the error message
 * @return an empty response
 * @throws ServiceException declared by the overridden method signature
 */
@SuppressWarnings("unchecked")
@Override
public PSErrorResponse psError(RpcController controller, PSErrorRequest request) throws ServiceException {
    final PSAttemptId failedAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    LOG.info("error happened in psAttempt " + failedAttemptId + " error msg=" + request.getMsg());

    // the failed attempt must no longer be tracked by the heartbeat monitor
    psLastHeartbeatTS.remove(failedAttemptId);

    // first record the diagnostics message for the attempt ...
    PSAttemptDiagnosticsUpdateEvent diagnosticsEvent =
        new PSAttemptDiagnosticsUpdateEvent(request.getMsg(), failedAttemptId);
    context.getEventHandler().handle(diagnosticsEvent);

    // ... then signal the failure itself
    PSAttemptEvent failEvent = new PSAttemptEvent(PSAttemptEventType.PA_FAILMSG, failedAttemptId);
    context.getEventHandler().handle(failEvent);

    return PSErrorResponse.newBuilder().build();
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class MasterService method psRegister.
/**
 * Answers a register request from a parameter server attempt.
 * An attempt present in the heartbeat monitor map gets a register event and
 * PSCOMMAND_OK; an attempt that is not present gets PSCOMMAND_SHUTDOWN.
 *
 * @param controller rpc controller of protobuf
 * @param request register request, contains the attempt id and its location
 * @return response holding the command for the parameter server
 * @throws ServiceException declared by the overridden method signature
 */
@SuppressWarnings("unchecked")
@Override
public PSRegisterResponse psRegister(RpcController controller, PSRegisterRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps register request. request=" + request);
    }

    final PSAttemptId attemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    final PSRegisterResponse.Builder response = PSRegisterResponse.newBuilder();

    if (psLastHeartbeatTS.containsKey(attemptId)) {
        // monitored attempt: forward its location to the state machine
        Location psLocation =
            new Location(request.getLocation().getIp(), request.getLocation().getPort());
        context.getEventHandler().handle(new PSAttemptRegisterEvent(attemptId, psLocation));
        LOG.info(attemptId + " is registered now!");
        response.setPsCommand(PSCommandProto.PSCOMMAND_OK);
    } else {
        // attempt is not in the monitor set: dump the known keys and tell it to shut down
        LOG.info(attemptId + " doesn't exists!");
        for (PSAttemptId knownId : psLastHeartbeatTS.keySet()) {
            LOG.info("contains psKey: " + knownId);
        }
        response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    }

    LOG.info(attemptId + " register finished!");
    return response.build();
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class MasterService method psReport.
/**
 * Answers a heartbeat from a parameter server attempt.
 * For a monitored attempt the heartbeat timestamp is refreshed, its metrics are
 * forwarded as a state update event and the reply command is either
 * PSCOMMAND_COMMIT (with the matrices to save) or PSCOMMAND_OK. An attempt that
 * is not monitored is answered with PSCOMMAND_SHUTDOWN. In both cases the failed
 * counters are updated and the matrix metadata is synchronized afterwards.
 *
 * @param controller rpc controller of protobuf
 * @param request heartbeat request
 * @return response holding the command and the matrix create/release/recover lists
 * @throws ServiceException declared by the overridden method signature
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps heartbeat request. request=" + request);
    }

    // collect the reported counters into a key -> value map
    Map<String, String> metrics = new HashMap<>();
    for (Pair metric : request.getMetricsList()) {
        metrics.put(metric.getKey(), metric.getValue());
    }

    final PSAttemptId attemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    final PSReportResponse.Builder response = PSReportResponse.newBuilder();

    if (psLastHeartbeatTS.containsKey(attemptId)) {
        // refresh last heartbeat timestamp
        psLastHeartbeatTS.put(attemptId, System.currentTimeMillis());

        // send a state update event to the specific PSAttempt
        context.getEventHandler().handle(new PSAttemptStateUpdateEvent(attemptId, metrics));

        // decide whether the parameter server should commit now
        if (!context.getParameterServerManager().psCanCommit()) {
            response.setPsCommand(PSCommandProto.PSCOMMAND_OK);
        } else {
            List<Integer> commitMatrixIds = context.getParameterServerManager().getNeedCommitMatrixIds();
            LOG.info("notify ps" + attemptId + " to commit now! commit matrices:" + StringUtils.joinInts(",", commitMatrixIds));
            response.setPsCommand(PSCommandProto.PSCOMMAND_COMMIT);

            // one builder is reused for every matrix and cleared after each entry
            NeedSaveMatrixProto.Builder saveMatrixBuilder = NeedSaveMatrixProto.newBuilder();
            for (int matrixId : commitMatrixIds) {
                saveMatrixBuilder.setMatrixId(matrixId);
                saveMatrixBuilder.addAllPartIds(
                    context.getMatrixMetaManager().getMasterPartsInPS(matrixId, attemptId.getPsId()));
                response.addNeedSaveMatrices(saveMatrixBuilder.build());
                saveMatrixBuilder.clear();
            }
        }
    } else {
        // if psAttemptId is not in monitor set, just return a PSCOMMAND_SHUTDOWN command.
        LOG.error("ps attempt " + attemptId + " is not in running ps attempt set");
        response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    }

    // Update PS failed counters
    context.getParameterServerManager().psFailedReports(ProtobufUtil.convert(request.getPsFailedReports()));

    // Check matrix metadata inconsistencies between master and parameter server:
    // a matrix known to the master but missing on the ps must be created on the ps,
    // a matrix present on the ps but unknown to the master must be removed from the ps.
    List<MatrixReportProto> reportProtos = request.getMatrixReportsList();
    List<Integer> toRelease = new ArrayList<>();
    List<MatrixMeta> toCreate = new ArrayList<>();
    List<RecoverPartKey> toRecover = new ArrayList<>();
    List<MatrixReport> reports = ProtobufUtil.convertToMatrixReports(reportProtos);
    context.getMatrixMetaManager().syncMatrixInfos(reports, toCreate, toRelease, toRecover, attemptId.getPsId());

    for (MatrixMeta meta : toCreate) {
        response.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(meta));
    }
    for (Integer releaseId : toRelease) {
        response.addNeedReleaseMatrixIds(releaseId);
    }
    for (RecoverPartKey partKey : toRecover) {
        response.addNeedRecoverParts(ProtobufUtil.convert(partKey));
    }

    return response.build();
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class LocalContainerAllocator method requestContainer.
/**
 * Creates a new LocalContainer and dispatches the matching "container assigned"
 * event for the id carried by the allocator event. Ids that are neither a
 * PSAttemptId, a PSAgentAttemptId nor a WorkerAttemptId produce no event.
 *
 * @param event allocator event holding the id that asks for a container
 */
@SuppressWarnings("unchecked")
private void requestContainer(ContainerAllocatorEvent event) {
    final LocalContainer container = new LocalContainer();
    final Id taskId = event.getTaskId();

    if (taskId instanceof PSAttemptId) {
        context.getEventHandler().handle(
            new PSAttemptContainerAssignedEvent((PSAttemptId) taskId, container));
        return;
    }

    if (taskId instanceof PSAgentAttemptId) {
        context.getEventHandler().handle(
            new PSAgentAttemptContainerAssignedEvent((PSAgentAttemptId) taskId, container));
        return;
    }

    if (taskId instanceof WorkerAttemptId) {
        context.getEventHandler().handle(
            new WorkerAttemptContainerAssignedEvent((WorkerAttemptId) taskId, container));
    }
}
use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.
the class LocalContainerLauncher method launch.
/**
 * Launches the local component that belongs to the id of the launcher event.
 * A PSAttemptId creates and starts a LocalPS, a WorkerAttemptId creates and
 * starts a LocalWorker; in both cases the "launched" event is dispatched before
 * start() is called and a "launch failed" event follows if starting throws.
 * A PSAgentAttemptId only dispatches its "launched" event.
 *
 * @param event launcher event holding the id to launch
 */
@SuppressWarnings("unchecked")
private void launch(ContainerLauncherEvent event) {
    final Id id = event.getId();

    if (id instanceof PSAttemptId) {
        final PSAttemptId psAttemptId = (PSAttemptId) id;
        LocalPS localPs =
            new LocalPS(psAttemptId, context.getMasterService().getLocation(), context.getConf());
        context.getEventHandler().handle(
            new PSAttemptEvent(PSAttemptEventType.PA_CONTAINER_LAUNCHED, psAttemptId));
        try {
            localPs.start();
            LocalClusterContext.get().addPS(psAttemptId, localPs);
        } catch (Exception e) {
            LOG.error("launch ps failed.", e);
            context.getEventHandler().handle(
                new PSAttemptEvent(PSAttemptEventType.PA_CONTAINER_LAUNCH_FAILED, psAttemptId));
        }
        return;
    }

    if (id instanceof PSAgentAttemptId) {
        context.getEventHandler().handle(
            new PSAgentAttemptEvent(
                PSAgentAttemptEventType.PSAGENT_ATTEMPT_CONTAINER_LAUNCHED, (PSAgentAttemptId) id));
        return;
    }

    if (id instanceof WorkerAttemptId) {
        final WorkerAttemptId workerAttemptId = (WorkerAttemptId) id;
        LocalWorker localWorker = new LocalWorker(
            context.getConf(),
            context.getApplicationId(),
            context.getUser(),
            workerAttemptId,
            context.getMasterService().getLocation(),
            0,
            false);
        context.getEventHandler().handle(
            new WorkerAttemptEvent(WorkerAttemptEventType.CONTAINER_LAUNCHED, workerAttemptId));
        try {
            localWorker.start();
            LocalClusterContext.get().addWorker(workerAttemptId, localWorker);
        } catch (Exception e) {
            LOG.error("launch worker failed.", e);
            context.getEventHandler().handle(
                new WorkerAttemptEvent(WorkerAttemptEventType.CONTAINER_LAUNCH_FAILED, workerAttemptId));
        }
    }
}
Aggregations