Use of com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse in project angel by Tencent.
Class MasterService, method psReport.
/**
 * Handles a heartbeat report sent by a parameter server attempt.
 *
 * <p>The reported metrics are collected into a key/value map. If the reporting attempt is not
 * considered alive by the parameter server manager, the response only carries
 * {@code PSCOMMAND_SHUTDOWN}. Otherwise the response carries {@code PSCOMMAND_OK} and, when
 * applicable, a pending save request, a pending load request, and the matrices/partitions the
 * parameter server has to create, release or recover.
 *
 * @param controller rpc controller of protobuf (not used by this method)
 * @param request heartbeat request
 * @return the heartbeat response built for the reporting attempt
 * @throws ServiceException declared by the rpc service signature
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("receive ps heartbeat request. request=" + request);
  }

  // Collect the reported counters into a plain key -> value map.
  Map<String, String> metrics = new HashMap<String, String>();
  for (Pair metric : request.getMetricsList()) {
    metrics.put(metric.getKey(), metric.getValue());
  }

  PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
  PSReportResponse.Builder response = PSReportResponse.newBuilder();

  // An attempt that is not in the monitored set only gets a shutdown command back.
  if (!context.getParameterServerManager().isAlive(psAttemptId)) {
    LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
    response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    return response.build();
  }

  response.setPsCommand(PSCommandProto.PSCOMMAND_OK);

  // Refresh the last heartbeat timestamp of this attempt.
  context.getParameterServerManager().alive(psAttemptId);

  // Forward the reported metrics to the attempt as a state update event.
  context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metrics));

  // Pending save request: context and result must refer to the same request and the
  // result must still be in INIT or SAVING state.
  PSMatricesSaveContext saveContext = context.getModelSaver().getSaveContext(psAttemptId.getPsId());
  PSMatricesSaveResult saveResult = context.getModelSaver().getSaveResult(psAttemptId.getPsId());
  boolean sameSaveRequest = saveContext != null
      && saveResult != null
      && (saveContext.getRequestId() == saveResult.getRequestId());
  if (sameSaveRequest
      && (saveResult.getState() == SaveState.INIT || saveResult.getState() == SaveState.SAVING)) {
    response.setNeedSaveMatrices(ProtobufUtil.convert(saveContext));
  }

  // Pending load request: same rule, with INIT or LOADING state.
  PSMatricesLoadContext loadContext = context.getModelLoader().getLoadContext(psAttemptId.getPsId());
  PSMatricesLoadResult loadResult = context.getModelLoader().getLoadResult(psAttemptId.getPsId());
  boolean sameLoadRequest = loadContext != null
      && loadResult != null
      && loadContext.getRequestId() == loadResult.getRequestId();
  if (sameLoadRequest
      && (loadResult.getState() == LoadState.INIT || loadResult.getState() == LoadState.LOADING)) {
    response.setNeedLoadMatrices(ProtobufUtil.convert(loadContext));
  }

  // Reconcile matrix metadata between master and parameter server:
  //  - a matrix known to the master but missing on the ps must be created on the ps
  //  - a matrix present on the ps but unknown to the master must be released on the ps
  List<Integer> toRelease = new ArrayList<>();
  List<MatrixMeta> toCreate = new ArrayList<>();
  List<RecoverPartKey> toRecover = new ArrayList<>();
  List<MatrixReport> reports = ProtobufUtil.convertToMatrixReports(request.getMatrixReportsList());
  context.getMatrixMetaManager().syncMatrixInfos(reports, toCreate, toRelease, toRecover, psAttemptId.getPsId());

  for (MatrixMeta meta : toCreate) {
    response.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(meta));
  }
  for (Integer matrixId : toRelease) {
    response.addNeedReleaseMatrixIds(matrixId);
  }
  for (RecoverPartKey partKey : toRecover) {
    response.addNeedRecoverParts(ProtobufUtil.convert(partKey));
  }

  return response.build();
}
Use of com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse in project angel by Tencent.
Class ParameterServer, method heartbeat.
/**
 * Sends one heartbeat report to the master and acts on the response.
 *
 * <p>The request always carries this attempt's id and the current matrix reports. On every
 * {@code dataCollectionInterval}-th invocation it additionally carries metrics: total RPC
 * count, request size and the summed data size of all partitions. The response command is
 * handled first (re-register or stop), then any save request, load request and matrix
 * create/release/recover instructions are applied.
 *
 * @throws Exception propagated from the master call or from the save/load/sync steps
 */
private void heartbeat() throws Exception {
  PSReportRequest.Builder builder = PSReportRequest.newBuilder();
  builder.setPsAttemptId(attemptIdProto);

  if (heartbeatCount.incrementAndGet() % dataCollectionInterval == 0) {
    // calculate data size of all partitions of ps
    Map<Integer, ServerMatrix> serverMatrixMap = matrixStorageManager.getMatrices();
    long dataSize = 0;
    for (ServerMatrix serverMatrix : serverMatrixMap.values()) {
      Map<Integer, ServerPartition> partitions = serverMatrix.getPartitions();
      for (ServerPartition partition : partitions.values()) {
        dataSize += partition.dataSize();
      }
    }

    // The same builder is reused for each pair; key and value are overwritten before every build().
    Pair.Builder pairBuilder = Pair.newBuilder();

    // NOTE(review): this literal "key"/"value" entry looks like a placeholder; it is kept
    // because it is part of the reported metrics - confirm whether it is still needed.
    pairBuilder.setKey("key");
    pairBuilder.setValue("value");
    builder.addMetrics(pairBuilder.build());

    // totalRPC
    pairBuilder.setKey("totalRPC");
    pairBuilder.setValue(WorkerPool.total.toString());
    builder.addMetrics(pairBuilder.build());

    // request size, divided by 1024 * 1024 (presumably bytes -> MB; confirm unit)
    pairBuilder.setKey("requestSize");
    pairBuilder.setValue(String.format("%.2f", WorkerPool.requestSize.longValue() * 1.0 / 1024 / 1024));
    builder.addMetrics(pairBuilder.build());

    // data size, divided by 1024 * 1024 (presumably bytes -> MB; confirm unit)
    pairBuilder.setKey("dataSize");
    pairBuilder.setValue(String.format("%.2f", dataSize * 1.0 / 1024 / 1024));
    builder.addMetrics(pairBuilder.build());
  }

  builder.addAllMatrixReports(buildMatrixReports());
  PSReportRequest request = builder.build();

  // Guarded so the request is not rendered to text on every heartbeat when debug is off.
  if (LOG.isDebugEnabled()) {
    LOG.debug("ps hb = " + request);
  }

  PSReportResponse ret = master.psReport(request);

  switch (ret.getPsCommand()) {
    case PSCOMMAND_REGISTER:
      try {
        register();
      } catch (Exception x) {
        LOG.error("register failed: ", x);
        stop(-1);
      }
      break;
    case PSCOMMAND_SHUTDOWN:
      LOG.error("shutdown command come from appmaster, exit now!!");
      stop(-1);
      break;
    default:
      break;
  }

  // Guarded for the same reason as the request log above.
  if (LOG.isDebugEnabled()) {
    LOG.debug("ps hb ret = " + ret);
  }

  // NOTE(review): the steps below also run after stop(-1) has been called above;
  // confirm that this is intended.
  if (ret.hasNeedSaveMatrices()) {
    saver.save(ProtobufUtil.convert(ret.getNeedSaveMatrices()));
  }
  if (ret.hasNeedLoadMatrices()) {
    loader.load(ProtobufUtil.convert(ret.getNeedLoadMatrices()));
  }
  syncMatrices(ret.getNeedCreateMatricesList(), ret.getNeedReleaseMatrixIdsList(), ret.getNeedRecoverPartsList());
}
Aggregations