Search in sources :

Example 1 with PSReportRequest

Use of com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportRequest in the Angel project by Tencent.

The heartbeat method of class ParameterServer.

/**
 * Sends one heartbeat report to the master and acts on the response.
 *
 * <p>The request always carries the PS attempt id and the matrix reports returned by
 * {@code buildMatrixReports()}. On every call where the incremented heartbeat counter is a
 * multiple of {@code dataCollectionInterval}, it additionally carries four metrics:
 * {@code "key"}, {@code "totalRPC"}, {@code "requestSize"} and {@code "dataSize"}. The last two
 * are divided by 1024 * 1024 and formatted with two decimals.
 *
 * <p>Response handling: a register command triggers {@code register()}, a shutdown command
 * triggers {@code stop(-1)}; afterwards any matrices flagged for saving or loading are handed to
 * {@code saver} / {@code loader}, and {@code syncMatrices} is called with the create / release /
 * recover lists from the response.
 *
 * @throws Exception propagated from any step except {@code register()}, whose failure is logged
 *     and followed by {@code stop(-1)}
 */
private void heartbeat() throws Exception {
    PSReportRequest.Builder builder = PSReportRequest.newBuilder();
    builder.setPsAttemptId(attemptIdProto);
    // Metrics are only attached on every dataCollectionInterval-th heartbeat.
    if (heartbeatCount.incrementAndGet() % dataCollectionInterval == 0) {
        // Sum the data size over every partition of every matrix held by this PS.
        Map<Integer, ServerMatrix> serverMatrixMap = matrixStorageManager.getMatrices();
        long dataSize = 0;
        for (ServerMatrix serverMatrix : serverMatrixMap.values()) {
            Map<Integer, ServerPartition> partitions = serverMatrix.getPartitions();
            for (ServerPartition partition : partitions.values()) {
                dataSize += partition.dataSize();
            }
        }
        // One builder is reused for all metrics; each build() snapshots the current key/value.
        Pair.Builder pairBuilder = Pair.newBuilder();
        // NOTE(review): "key"/"value" looks like a placeholder metric; kept because the
        // receiving side may rely on it - confirm before removing.
        pairBuilder.setKey("key");
        pairBuilder.setValue("value");
        builder.addMetrics(pairBuilder.build());
        // totalRPC
        pairBuilder.setKey("totalRPC");
        pairBuilder.setValue(WorkerPool.total.toString());
        builder.addMetrics(pairBuilder.build());
        // request size, divided by 1024 * 1024
        pairBuilder.setKey("requestSize");
        pairBuilder.setValue(String.format("%.2f", WorkerPool.requestSize.longValue() * 1.0 / 1024 / 1024));
        builder.addMetrics(pairBuilder.build());
        // data size, divided by 1024 * 1024
        pairBuilder.setKey("dataSize");
        pairBuilder.setValue(String.format("%.2f", dataSize * 1.0 / 1024 / 1024));
        builder.addMetrics(pairBuilder.build());
    }
    builder.addAllMatrixReports(buildMatrixReports());
    PSReportRequest request = builder.build();
    // Guarded so the request is not rendered to text on every heartbeat when debug is off.
    if (LOG.isDebugEnabled()) {
        LOG.debug("ps hb = " + request);
    }
    PSReportResponse ret = master.psReport(request);
    // NOTE(review): after stop(-1) execution falls through to the save/load/sync handling below
    // unless stop() terminates the process - confirm that this is intended.
    switch (ret.getPsCommand()) {
        case PSCOMMAND_REGISTER:
            try {
                register();
            } catch (Exception x) {
                LOG.error("register failed: ", x);
                stop(-1);
            }
            break;
        case PSCOMMAND_SHUTDOWN:
            LOG.error("shutdown command come from appmaster, exit now!!");
            stop(-1);
            break;
        default:
            break;
    }
    // Same guard as above: avoid rendering the response to text when debug is off.
    if (LOG.isDebugEnabled()) {
        LOG.debug("ps hb ret = " + ret);
    }
    if (ret.hasNeedSaveMatrices()) {
        saver.save(ProtobufUtil.convert(ret.getNeedSaveMatrices()));
    }
    if (ret.hasNeedLoadMatrices()) {
        loader.load(ProtobufUtil.convert(ret.getNeedLoadMatrices()));
    }
    syncMatrices(ret.getNeedCreateMatricesList(), ret.getNeedReleaseMatrixIdsList(), ret.getNeedRecoverPartsList());
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ServerMatrix(com.tencent.angel.ps.storage.matrix.ServerMatrix) PSReportResponse(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse) ServerPartition(com.tencent.angel.ps.storage.partition.ServerPartition) ServiceException(com.google.protobuf.ServiceException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) PSReportRequest(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportRequest) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair)

Aggregations

ServiceException (com.google.protobuf.ServiceException)1 Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair)1 PSReportRequest (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportRequest)1 PSReportResponse (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse)1 ServerMatrix (com.tencent.angel.ps.storage.matrix.ServerMatrix)1 ServerPartition (com.tencent.angel.ps.storage.partition.ServerPartition)1 IOException (java.io.IOException)1 UnknownHostException (java.net.UnknownHostException)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 YarnRuntimeException (org.apache.hadoop.yarn.exceptions.YarnRuntimeException)1