Search in sources :

Example 1 with MatrixReport

Use of com.tencent.angel.ml.matrix.MatrixReport in the project angel by Tencent.

From the class MasterService, method psReport.

/**
 * Handles a heartbeat report from a parameter server (PS) attempt and builds the reply.
 *
 * <p>The reply carries one command for the reporting attempt:
 * <ul>
 *   <li>{@code PSCOMMAND_SHUTDOWN} when the attempt is not present in {@code psLastHeartbeatTS};</li>
 *   <li>{@code PSCOMMAND_COMMIT}, together with the matrices/partitions to save, when the
 *       parameter server manager reports that a commit can happen;</li>
 *   <li>{@code PSCOMMAND_OK} otherwise.</li>
 * </ul>
 * Regardless of the chosen command, the reported failure counters are forwarded to the
 * parameter server manager, and the reported matrices are synchronized against the master's
 * matrix metadata; the resulting create / release / recover lists are appended to the reply.
 *
 * @param controller rpc controller of protobuf (not used by this method body)
 * @param request heartbeat request holding metrics, attempt id, failure reports and matrix reports
 * @return the heartbeat response containing the command and any matrix synchronization work
 * @throws ServiceException declared by the RPC signature; this body contains no explicit throw
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps heartbeat request. request=" + request);
    }

    // Flatten the reported counters into a key -> value map (later duplicates overwrite earlier ones).
    Map<String, String> metrics = new HashMap<String, String>();
    for (Pair metric : request.getMetricsList()) {
        metrics.put(metric.getKey(), metric.getValue());
    }

    PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    PSReportResponse.Builder response = PSReportResponse.newBuilder();

    if (psLastHeartbeatTS.containsKey(psAttemptId)) {
        // Known attempt: record the heartbeat time and forward the counters as a state update event.
        psLastHeartbeatTS.put(psAttemptId, System.currentTimeMillis());
        context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metrics));

        if (!context.getParameterServerManager().psCanCommit()) {
            response.setPsCommand(PSCommandProto.PSCOMMAND_OK);
        } else {
            // Commit is possible: tell the PS which matrices (and which of its partitions) to save.
            List<Integer> commitMatrixIds = context.getParameterServerManager().getNeedCommitMatrixIds();
            LOG.info("notify ps" + psAttemptId + " to commit now! commit matrices:" + StringUtils.joinInts(",", commitMatrixIds));
            response.setPsCommand(PSCommandProto.PSCOMMAND_COMMIT);

            // A single builder is reused for every matrix and reset after each build.
            NeedSaveMatrixProto.Builder saveBuilder = NeedSaveMatrixProto.newBuilder();
            for (int matrixId : commitMatrixIds) {
                saveBuilder.setMatrixId(matrixId);
                saveBuilder.addAllPartIds(context.getMatrixMetaManager().getMasterPartsInPS(matrixId, psAttemptId.getPsId()));
                response.addNeedSaveMatrices(saveBuilder.build());
                saveBuilder.clear();
            }
        }
    } else {
        // Attempt is not being monitored: answer with a shutdown command.
        LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
        response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    }

    // Forward the PS failure counters carried by this heartbeat.
    context.getParameterServerManager().psFailedReports(ProtobufUtil.convert(request.getPsFailedReports()));

    // Reconcile matrix metadata between master and PS. Per the original notes: a matrix known to
    // the master but missing on the PS must be created there, and a matrix present on the PS but
    // unknown to the master must be released. syncMatrixInfos fills the three output lists below.
    List<MatrixReport> reportedMatrices = ProtobufUtil.convertToMatrixReports(request.getMatrixReportsList());
    List<MatrixMeta> matricesToCreate = new ArrayList<>();
    List<Integer> matricesToRelease = new ArrayList<>();
    List<RecoverPartKey> partsToRecover = new ArrayList<>();
    context.getMatrixMetaManager().syncMatrixInfos(reportedMatrices, matricesToCreate, matricesToRelease, partsToRecover, psAttemptId.getPsId());

    for (MatrixMeta meta : matricesToCreate) {
        response.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(meta));
    }
    for (Integer releaseId : matricesToRelease) {
        response.addNeedReleaseMatrixIds(releaseId);
    }
    for (RecoverPartKey partKey : partsToRecover) {
        response.addNeedRecoverParts(ProtobufUtil.convert(partKey));
    }
    return response.build();
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PSAttemptId(com.tencent.angel.ps.PSAttemptId) MatrixReport(com.tencent.angel.ml.matrix.MatrixReport)

Aggregations

MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)1 MatrixReport (com.tencent.angel.ml.matrix.MatrixReport)1 PSAttemptId (com.tencent.angel.ps.PSAttemptId)1 RecoverPartKey (com.tencent.angel.ps.recovery.ha.RecoverPartKey)1 Int2IntOpenHashMap (it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1