Search in sources :

Example 1 with PSReportResponse

use of com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse in project angel by Tencent.

In the class MasterService, method psReport:

/**
 * Handles a heartbeat report sent by a parameter server attempt.
 *
 * <p>If the reporting attempt is not considered alive by the parameter server manager, the
 * response only carries a {@code PSCOMMAND_SHUTDOWN} command. Otherwise the attempt's heartbeat
 * is refreshed, its reported metrics are forwarded as a state update event, and the response is
 * filled with any pending save/load request plus the matrices the parameter server has to
 * create, release or recover.
 *
 * @param controller rpc controller of protobuf
 * @param request heartbeat request
 * @return the response carrying the command and the pending work for the parameter server
 * @throws ServiceException declared by the RPC service interface
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps heartbeat request. request=" + request);
    }

    // Flatten the reported metric pairs into a key -> value map.
    Map<String, String> metrics = new HashMap<String, String>();
    List<Pair> reportedMetrics = request.getMetricsList();
    for (Pair metric : reportedMetrics) {
        metrics.put(metric.getKey(), metric.getValue());
    }

    PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    PSReportResponse.Builder resBuilder = PSReportResponse.newBuilder();

    // An attempt that is not in the monitored set only gets a shutdown command back.
    if (!context.getParameterServerManager().isAlive(psAttemptId)) {
        LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
        resBuilder.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
        return resBuilder.build();
    }

    resBuilder.setPsCommand(PSCommandProto.PSCOMMAND_OK);

    // Refresh the last heartbeat timestamp of this attempt.
    context.getParameterServerManager().alive(psAttemptId);

    // Forward the reported metrics to the PSAttempt as a state update event.
    context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metrics));

    // Pending save request: context and result must belong to the same request and the
    // save must not have finished yet.
    PSMatricesSaveContext saveContext = context.getModelSaver().getSaveContext(psAttemptId.getPsId());
    PSMatricesSaveResult saveResult = context.getModelSaver().getSaveResult(psAttemptId.getPsId());
    if (saveContext != null
        && saveResult != null
        && (saveContext.getRequestId() == saveResult.getRequestId())
        && (saveResult.getState() == SaveState.INIT || saveResult.getState() == SaveState.SAVING)) {
        resBuilder.setNeedSaveMatrices(ProtobufUtil.convert(saveContext));
    }

    // Pending load request: same matching rule as for the save request.
    PSMatricesLoadContext loadContext = context.getModelLoader().getLoadContext(psAttemptId.getPsId());
    PSMatricesLoadResult loadResult = context.getModelLoader().getLoadResult(psAttemptId.getPsId());
    if (loadContext != null
        && loadResult != null
        && loadContext.getRequestId() == loadResult.getRequestId()
        && (loadResult.getState() == LoadState.INIT || loadResult.getState() == LoadState.LOADING)) {
        resBuilder.setNeedLoadMatrices(ProtobufUtil.convert(loadContext));
    }

    // Reconcile matrix metadata between master and parameter server:
    // - a matrix known to the master but missing on the ps must be created on the ps;
    // - a matrix present on the ps but unknown to the master must be released on the ps.
    List<MatrixReportProto> reportProtos = request.getMatrixReportsList();
    List<Integer> matricesToRelease = new ArrayList<>();
    List<MatrixMeta> matricesToCreate = new ArrayList<>();
    List<RecoverPartKey> partsToRecover = new ArrayList<>();
    List<MatrixReport> reports = ProtobufUtil.convertToMatrixReports(reportProtos);
    context.getMatrixMetaManager().syncMatrixInfos(reports, matricesToCreate, matricesToRelease, partsToRecover, psAttemptId.getPsId());

    for (MatrixMeta meta : matricesToCreate) {
        resBuilder.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(meta));
    }
    for (Integer matrixId : matricesToRelease) {
        resBuilder.addNeedReleaseMatrixIds(matrixId);
    }
    for (RecoverPartKey part : partsToRecover) {
        resBuilder.addNeedRecoverParts(ProtobufUtil.convert(part));
    }
    return resBuilder.build();
}
Also used : MatrixReportProto(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.MatrixReportProto) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) ArrayList(java.util.ArrayList) PSAttemptId(com.tencent.angel.ps.PSAttemptId) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) RecoverPartKey(com.tencent.angel.ps.ha.RecoverPartKey) PSReportResponse(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse) PSAttemptStateUpdateEvent(com.tencent.angel.master.ps.attempt.PSAttemptStateUpdateEvent) PSMatricesLoadContext(com.tencent.angel.model.PSMatricesLoadContext) PSMatricesSaveContext(com.tencent.angel.model.PSMatricesSaveContext) PSMatricesSaveResult(com.tencent.angel.model.PSMatricesSaveResult) MatrixReport(com.tencent.angel.ml.matrix.MatrixReport) PSMatricesLoadResult(com.tencent.angel.model.PSMatricesLoadResult)

Example 2 with PSReportResponse

use of com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse in project angel by Tencent.

In the class ParameterServer, method heartbeat:

/**
 * Sends one heartbeat report to the master and acts on the response.
 *
 * <p>The report always carries this attempt's id and the current matrix reports. On every
 * {@code dataCollectionInterval}-th heartbeat it additionally carries metrics: the total RPC
 * count, the request size and the total partition data size (both sizes divided by 1024 * 1024
 * and formatted with two decimals).
 *
 * <p>After the master answers, the returned command is handled (re-register or stop), any
 * requested matrix save/load is started, and the local matrices are synchronized with the
 * create/release/recover lists from the response.
 *
 * @throws Exception if building the report, the call to the master, or handling the response fails
 */
private void heartbeat() throws Exception {
    PSReportRequest.Builder builder = PSReportRequest.newBuilder();
    builder.setPsAttemptId(attemptIdProto);
    if (heartbeatCount.incrementAndGet() % dataCollectionInterval == 0) {
        // calculate data size of all partitions of ps
        Map<Integer, ServerMatrix> serverMatrixMap = matrixStorageManager.getMatrices();
        long dataSize = 0;
        for (ServerMatrix serverMatrix : serverMatrixMap.values()) {
            Map<Integer, ServerPartition> partitions = serverMatrix.getPartitions();
            for (ServerPartition partition : partitions.values()) {
                dataSize += partition.dataSize();
            }
        }
        // The same Pair builder is reused; each build() snapshots the current key/value.
        Pair.Builder pairBuilder = Pair.newBuilder();
        // NOTE(review): this literal "key"/"value" pair looks like a placeholder metric;
        // kept as-is because the master side may rely on it - confirm before removing.
        pairBuilder.setKey("key");
        pairBuilder.setValue("value");
        builder.addMetrics(pairBuilder.build());
        // totalRPC
        pairBuilder.setKey("totalRPC");
        pairBuilder.setValue(WorkerPool.total.toString());
        builder.addMetrics(pairBuilder.build());
        // request size
        pairBuilder.setKey("requestSize");
        pairBuilder.setValue(String.format("%.2f", WorkerPool.requestSize.longValue() * 1.0 / 1024 / 1024));
        builder.addMetrics(pairBuilder.build());
        // data size
        pairBuilder.setKey("dataSize");
        pairBuilder.setValue(String.format("%.2f", dataSize * 1.0 / 1024 / 1024));
        builder.addMetrics(pairBuilder.build());
    }
    builder.addAllMatrixReports(buildMatrixReports());
    PSReportRequest request = builder.build();
    // Guard the debug output: rendering the whole request as text on every heartbeat is
    // wasted work when debug logging is disabled.
    if (LOG.isDebugEnabled()) {
        LOG.debug("ps hb = " + request);
    }
    PSReportResponse ret = master.psReport(request);
    switch(ret.getPsCommand()) {
        case PSCOMMAND_REGISTER:
            try {
                register();
            } catch (Exception x) {
                LOG.error("register failed: ", x);
                stop(-1);
            }
            break;
        case PSCOMMAND_SHUTDOWN:
            LOG.error("shutdown command come from appmaster, exit now!!");
            stop(-1);
            break;
        default:
            break;
    }
    // NOTE(review): if stop(-1) returns instead of terminating the process, the save/load/sync
    // handling below still runs after a shutdown command - confirm this is intended.
    if (LOG.isDebugEnabled()) {
        LOG.debug("ps hb ret = " + ret);
    }
    if (ret.hasNeedSaveMatrices()) {
        saver.save(ProtobufUtil.convert(ret.getNeedSaveMatrices()));
    }
    if (ret.hasNeedLoadMatrices()) {
        loader.load(ProtobufUtil.convert(ret.getNeedLoadMatrices()));
    }
    syncMatrices(ret.getNeedCreateMatricesList(), ret.getNeedReleaseMatrixIdsList(), ret.getNeedRecoverPartsList());
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ServerMatrix(com.tencent.angel.ps.storage.matrix.ServerMatrix) PSReportResponse(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse) ServerPartition(com.tencent.angel.ps.storage.partition.ServerPartition) ServiceException(com.google.protobuf.ServiceException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) PSReportRequest(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportRequest) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair)

Aggregations

Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair)2 PSReportResponse (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse)2 ServiceException (com.google.protobuf.ServiceException)1 PSAttemptStateUpdateEvent (com.tencent.angel.master.ps.attempt.PSAttemptStateUpdateEvent)1 MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)1 MatrixReport (com.tencent.angel.ml.matrix.MatrixReport)1 PSMatricesLoadContext (com.tencent.angel.model.PSMatricesLoadContext)1 PSMatricesLoadResult (com.tencent.angel.model.PSMatricesLoadResult)1 PSMatricesSaveContext (com.tencent.angel.model.PSMatricesSaveContext)1 PSMatricesSaveResult (com.tencent.angel.model.PSMatricesSaveResult)1 MatrixReportProto (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.MatrixReportProto)1 PSReportRequest (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportRequest)1 PSAttemptId (com.tencent.angel.ps.PSAttemptId)1 RecoverPartKey (com.tencent.angel.ps.ha.RecoverPartKey)1 ServerMatrix (com.tencent.angel.ps.storage.matrix.ServerMatrix)1 ServerPartition (com.tencent.angel.ps.storage.partition.ServerPartition)1 Int2IntOpenHashMap (it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap)1 IOException (java.io.IOException)1 UnknownHostException (java.net.UnknownHostException)1 ArrayList (java.util.ArrayList)1