Search in sources :

Example 1 with PSMatricesSaveContext

use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.

the class MasterService method psReport.

/**
 * Handles a heartbeat report sent by a parameter server attempt.
 *
 * <p>The reported metrics are flattened into a key/value map. If the reporting attempt is not
 * considered alive by the parameter server manager, the response only carries a shutdown command.
 * Otherwise the heartbeat is recorded, a state update event is dispatched, and the response is
 * filled with any pending save/load request plus the matrices the ps should create, release or
 * recover, as computed by the matrix meta manager.
 *
 * @param controller rpc controller of protobuf
 * @param request heartbeat request
 * @return the heartbeat response carrying the command and pending work for the ps
 * @throws ServiceException declared by the service signature; no statement in this method
 *         throws it directly
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps heartbeat request. request=" + request);
    }

    // Flatten the reported counters into a simple key -> value map.
    Map<String, String> metrics = new HashMap<String, String>();
    for (Pair metric : request.getMetricsList()) {
        metrics.put(metric.getKey(), metric.getValue());
    }

    PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    PSReportResponse.Builder response = PSReportResponse.newBuilder();

    // An attempt that is not monitored any more is simply told to shut down.
    if (!context.getParameterServerManager().isAlive(psAttemptId)) {
        LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
        response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
        return response.build();
    }

    response.setPsCommand(PSCommandProto.PSCOMMAND_OK);

    // Record the heartbeat, then forward the metrics to the attempt's state machine.
    context.getParameterServerManager().alive(psAttemptId);
    context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metrics));

    // Pending save request: context and result must belong to the same request and the
    // result must still be in INIT or SAVING state.
    PSMatricesSaveContext pendingSave = context.getModelSaver().getSaveContext(psAttemptId.getPsId());
    PSMatricesSaveResult saveProgress = context.getModelSaver().getSaveResult(psAttemptId.getPsId());
    boolean sameSaveRequest = pendingSave != null
        && saveProgress != null
        && pendingSave.getRequestId() == saveProgress.getRequestId();
    if (sameSaveRequest
        && (saveProgress.getState() == SaveState.INIT || saveProgress.getState() == SaveState.SAVING)) {
        response.setNeedSaveMatrices(ProtobufUtil.convert(pendingSave));
    }

    // Pending load request: same matching rule, with INIT or LOADING state.
    PSMatricesLoadContext pendingLoad = context.getModelLoader().getLoadContext(psAttemptId.getPsId());
    PSMatricesLoadResult loadProgress = context.getModelLoader().getLoadResult(psAttemptId.getPsId());
    boolean sameLoadRequest = pendingLoad != null
        && loadProgress != null
        && pendingLoad.getRequestId() == loadProgress.getRequestId();
    if (sameLoadRequest
        && (loadProgress.getState() == LoadState.INIT || loadProgress.getState() == LoadState.LOADING)) {
        response.setNeedLoadMatrices(ProtobufUtil.convert(pendingLoad));
    }

    // Reconcile matrix metadata between master and ps:
    //  - a matrix known to the master but missing on the ps must be created on the ps
    //  - a matrix present on the ps but unknown to the master must be released on the ps
    List<MatrixReportProto> reportedProtos = request.getMatrixReportsList();
    List<Integer> toRelease = new ArrayList<>();
    List<MatrixMeta> toCreate = new ArrayList<>();
    List<RecoverPartKey> toRecover = new ArrayList<>();
    List<MatrixReport> reported = ProtobufUtil.convertToMatrixReports(reportedProtos);
    context.getMatrixMetaManager().syncMatrixInfos(reported, toCreate, toRelease, toRecover, psAttemptId.getPsId());

    for (MatrixMeta meta : toCreate) {
        response.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(meta));
    }
    for (Integer matrixId : toRelease) {
        response.addNeedReleaseMatrixIds(matrixId);
    }
    for (RecoverPartKey part : toRecover) {
        response.addNeedRecoverParts(ProtobufUtil.convert(part));
    }

    return response.build();
}
Also used : MatrixReportProto(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.MatrixReportProto) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) ArrayList(java.util.ArrayList) PSAttemptId(com.tencent.angel.ps.PSAttemptId) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) RecoverPartKey(com.tencent.angel.ps.ha.RecoverPartKey) PSReportResponse(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse) PSAttemptStateUpdateEvent(com.tencent.angel.master.ps.attempt.PSAttemptStateUpdateEvent) PSMatricesLoadContext(com.tencent.angel.model.PSMatricesLoadContext) PSMatricesSaveContext(com.tencent.angel.model.PSMatricesSaveContext) PSMatricesSaveResult(com.tencent.angel.model.PSMatricesSaveResult) MatrixReport(com.tencent.angel.ml.matrix.MatrixReport) PSMatricesLoadResult(com.tencent.angel.model.PSMatricesLoadResult)

Example 2 with PSMatricesSaveContext

use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.

the class AMModelSaver method save.

/**
 * Starts executing a model save request.
 *
 * <p>Under the lock, this records the request id as the current one, splits the user request
 * into one sub-request per parameter server, registers an {@code INIT} result for every
 * sub-request and resets the received-result counter.
 *
 * @param runningContext running context holding the save request and its request id
 */
private void save(ModelSaveRunningContext runningContext) {
    ModelSaveContext saveContext = runningContext.getSaveContext();
    // Acquire the lock BEFORE entering the try block: if acquisition itself fails, the
    // finally clause must not call unlock() on a lock this thread does not hold (that would
    // raise a secondary exception and hide the original failure).
    // NOTE(review): assumes `lock` is a java.util.concurrent.locks.Lock — confirm.
    lock.lock();
    try {
        currentRequestId = runningContext.getRequestId();
        LOG.info("Start to execute save request " + saveContext + " with request id=" + runningContext.getRequestId());
        // Split the user request to sub-requests to pss
        currentSubSaveContexts = split(currentRequestId, saveContext);
        subResults = new HashMap<>(currentSubSaveContexts.size());
        // Every sub-request starts in INIT state.
        for (Map.Entry<ParameterServerId, PSMatricesSaveContext> entry : currentSubSaveContexts.entrySet()) {
            PSMatricesSaveContext subContext = entry.getValue();
            subResults.put(entry.getKey(), new PSMatricesSaveResult(subContext.getRequestId(), subContext.getSubRequestId(), SaveState.INIT));
        }
        receivedSubResult = 0;
    } finally {
        lock.unlock();
    }
}
Also used : PSMatricesSaveContext(com.tencent.angel.model.PSMatricesSaveContext) PSMatricesSaveResult(com.tencent.angel.model.PSMatricesSaveResult) ParameterServerId(com.tencent.angel.ps.ParameterServerId) ModelSaveContext(com.tencent.angel.model.ModelSaveContext) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 3 with PSMatricesSaveContext

use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.

the class SnapshotDumper method checkpoint.

/**
 * Writes a checkpoint for the given matrices.
 *
 * <p>Each matrix is first saved to a temporary path derived from its checkpoint path; once the
 * save call returns, every temporary path is renamed to its final checkpoint path and the old
 * checkpoints of that same matrix are cleared.
 *
 * @param matrixIds ids of the matrices to checkpoint; nothing is done when null or empty
 * @param checkpointId id used to generate the checkpoint path of each matrix
 * @param cloneFirst forwarded unchanged to each {@code PSMatrixSaveContext}
 * @param sortFirst forwarded unchanged to each {@code PSMatrixSaveContext}
 * @throws IOException declared by this method; presumably raised by the save or file system
 *         calls — confirm against those helpers
 */
private void checkpoint(List<Integer> matrixIds, int checkpointId, boolean cloneFirst, boolean sortFirst) throws IOException {
    if (matrixIds == null || matrixIds.isEmpty()) {
        return;
    }
    List<PSMatrixSaveContext> saveContexts = new ArrayList<>(matrixIds.size());
    List<Path> checkpointItemPaths = new ArrayList<>(matrixIds.size());
    List<Path> tempPaths = new ArrayList<>(matrixIds.size());
    for (int matrixId : matrixIds) {
        Path checkpointItemPath = genCheckpointPath(matrixId, checkpointId);
        Path tempPath = genTmpCheckpointPath(checkpointItemPath);
        checkpointItemPaths.add(checkpointItemPath);
        tempPaths.add(tempPath);
        MatrixMeta meta = context.getMatrixMetaManager().getMatrixMeta(matrixId);
        // Save all partitions of the matrix in snapshot format to the temporary path.
        saveContexts.add(new PSMatrixSaveContext(matrixId, new ArrayList<>(meta.getPartitionMetas().keySet()), null, SnapshotFormat.class.getName(), tempPath.toString(), cloneFirst, sortFirst));
    }
    // Request id and sub-request id are both passed as -1 for this save.
    context.getIOExecutors().save(new PSMatricesSaveContext(-1, -1, saveContexts), dumpParallel);
    // Rename temp to item path
    FileSystem fs = baseDirPath.getFileSystem(context.getConf());
    for (int i = 0; i < matrixIds.size(); i++) {
        HdfsUtil.rename(tempPaths.get(i), checkpointItemPaths.get(i), fs);
        // Clear old checkpoints of the matrix that was just renamed (index i, not always the
        // first one), so that every matrix in the batch gets cleaned up.
        clearOldCheckpoint(fs, genMatrixPath(matrixIds.get(i)));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) PSMatricesSaveContext(com.tencent.angel.model.PSMatricesSaveContext) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) FileSystem(org.apache.hadoop.fs.FileSystem) PSMatrixSaveContext(com.tencent.angel.model.PSMatrixSaveContext) ArrayList(java.util.ArrayList)

Example 4 with PSMatricesSaveContext

use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.

the class AMModelSaver method split.

/**
 * Splits a model save request into one sub-request per parameter server.
 *
 * <p>Every matrix save context is first split per parameter server; the resulting per-matrix
 * contexts are grouped by parameter server. For each parameter server a save path is assigned
 * to each of its matrix contexts (tmp save path / result dir / ps id / matrix name), and the
 * group is wrapped into a {@code PSMatricesSaveContext} with a sub-request id that starts at 0
 * and increases by one per parameter server.
 *
 * @param requestId id of the user save request
 * @param saveContext the user save request
 * @return map from parameter server id to the sub-request for that parameter server
 */
private Map<ParameterServerId, PSMatricesSaveContext> split(int requestId, ModelSaveContext saveContext) {
    List<MatrixSaveContext> matrixSaveContexts = saveContext.getMatricesContext();

    // Step 1: group the per-matrix sub-contexts by the parameter server they belong to.
    Map<ParameterServerId, List<PSMatrixSaveContext>> groupedByPs = new HashMap<>();
    for (MatrixSaveContext matrixSaveContext : matrixSaveContexts) {
        Map<ParameterServerId, PSMatrixSaveContext> perPs = split(matrixSaveContext);
        for (Map.Entry<ParameterServerId, PSMatrixSaveContext> part : perPs.entrySet()) {
            ParameterServerId psId = part.getKey();
            List<PSMatrixSaveContext> bucket = groupedByPs.get(psId);
            if (bucket == null) {
                bucket = new ArrayList<>();
                groupedByPs.put(psId, bucket);
            }
            bucket.add(part.getValue());
        }
    }

    // Step 2: assign save paths and build one sub-request per parameter server.
    Map<ParameterServerId, PSMatricesSaveContext> subRequests = new HashMap<>(groupedByPs.size());
    int nextSubRequestId = 0;
    for (Map.Entry<ParameterServerId, List<PSMatrixSaveContext>> group : groupedByPs.entrySet()) {
        ParameterServerId psId = group.getKey();
        List<PSMatrixSaveContext> psContexts = group.getValue();

        Path tmpRoot = new Path(saveContext.getTmpSavePath());
        Path resultRoot = new Path(tmpRoot, ModelFilesConstent.resultDirName);
        Path psDir = new Path(resultRoot, psId.toString());

        for (PSMatrixSaveContext psContext : psContexts) {
            String matrixName = context.getMatrixMetaManager().getMatrix(psContext.getMatrixId()).getName();
            psContext.setSavePath(new Path(psDir, matrixName).toString());
        }

        subRequests.put(psId, new PSMatricesSaveContext(requestId, nextSubRequestId, psContexts));
        nextSubRequestId++;
    }
    return subRequests;
}
Also used : Path(org.apache.hadoop.fs.Path) HashMap(java.util.HashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) PSMatrixSaveContext(com.tencent.angel.model.PSMatrixSaveContext) PSMatrixSaveContext(com.tencent.angel.model.PSMatrixSaveContext) MatrixSaveContext(com.tencent.angel.model.MatrixSaveContext) PSMatricesSaveContext(com.tencent.angel.model.PSMatricesSaveContext) ArrayList(java.util.ArrayList) List(java.util.List) ParameterServerId(com.tencent.angel.ps.ParameterServerId) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Aggregations

PSMatricesSaveContext (com.tencent.angel.model.PSMatricesSaveContext)4 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)2 PSMatricesSaveResult (com.tencent.angel.model.PSMatricesSaveResult)2 PSMatrixSaveContext (com.tencent.angel.model.PSMatrixSaveContext)2 ParameterServerId (com.tencent.angel.ps.ParameterServerId)2 Map (java.util.Map)2 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)2 Path (org.apache.hadoop.fs.Path)2 PSAttemptStateUpdateEvent (com.tencent.angel.master.ps.attempt.PSAttemptStateUpdateEvent)1 MatrixReport (com.tencent.angel.ml.matrix.MatrixReport)1 MatrixSaveContext (com.tencent.angel.model.MatrixSaveContext)1 ModelSaveContext (com.tencent.angel.model.ModelSaveContext)1 PSMatricesLoadContext (com.tencent.angel.model.PSMatricesLoadContext)1 PSMatricesLoadResult (com.tencent.angel.model.PSMatricesLoadResult)1 Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair)1 MatrixReportProto (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.MatrixReportProto)1 PSReportResponse (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse)1 PSAttemptId (com.tencent.angel.ps.PSAttemptId)1