use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.
the class MasterService method psReport.
/**
 * Handles a heartbeat report from a parameter server (PS) attempt and builds the reply.
 *
 * <p>If the reporting attempt is not alive according to the parameter server manager, the reply
 * only carries {@code PSCOMMAND_SHUTDOWN}. Otherwise the reply carries {@code PSCOMMAND_OK} and,
 * where applicable, a pending save request, a pending load request, and the matrices/partitions
 * the PS has to create, release or recover.
 *
 * @param controller rpc controller of protobuf
 * @param request heartbeat request
 * @return the heartbeat response for the reporting PS attempt
 * @throws ServiceException declared by the RPC interface
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("receive ps heartbeat request. request=" + request);
  }

  // Turn the reported counters into a key -> value map
  Map<String, String> metrics = new HashMap<String, String>();
  for (Pair metric : request.getMetricsList()) {
    metrics.put(metric.getKey(), metric.getValue());
  }

  PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
  PSReportResponse.Builder response = PSReportResponse.newBuilder();

  // An attempt that is not monitored any more is simply told to shut down
  if (!context.getParameterServerManager().isAlive(psAttemptId)) {
    LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
    response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    return response.build();
  }

  response.setPsCommand(PSCommandProto.PSCOMMAND_OK);

  // Record the heartbeat, then forward the reported counters to the PS attempt
  context.getParameterServerManager().alive(psAttemptId);
  context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metrics));

  // Attach the save sub-request while its result is still INIT or SAVING for the same request id
  PSMatricesSaveContext pendingSave = context.getModelSaver().getSaveContext(psAttemptId.getPsId());
  PSMatricesSaveResult saveResult = context.getModelSaver().getSaveResult(psAttemptId.getPsId());
  if (pendingSave != null && saveResult != null
      && pendingSave.getRequestId() == saveResult.getRequestId()) {
    SaveState saveState = saveResult.getState();
    if (saveState == SaveState.INIT || saveState == SaveState.SAVING) {
      response.setNeedSaveMatrices(ProtobufUtil.convert(pendingSave));
    }
  }

  // Attach the load sub-request while its result is still INIT or LOADING for the same request id
  PSMatricesLoadContext pendingLoad = context.getModelLoader().getLoadContext(psAttemptId.getPsId());
  PSMatricesLoadResult loadResult = context.getModelLoader().getLoadResult(psAttemptId.getPsId());
  if (pendingLoad != null && loadResult != null
      && pendingLoad.getRequestId() == loadResult.getRequestId()) {
    LoadState loadState = loadResult.getState();
    if (loadState == LoadState.INIT || loadState == LoadState.LOADING) {
      response.setNeedLoadMatrices(ProtobufUtil.convert(pendingLoad));
    }
  }

  // Reconcile the matrix metadata reported by the PS with the master's view. The three lists
  // are handed to syncMatrixInfos empty and read back afterwards: matrices the PS must create,
  // matrix ids it must release, and partitions it must recover.
  List<MatrixReport> reportedMatrices = ProtobufUtil.convertToMatrixReports(request.getMatrixReportsList());
  List<MatrixMeta> toCreate = new ArrayList<>();
  List<Integer> toRelease = new ArrayList<>();
  List<RecoverPartKey> toRecover = new ArrayList<>();
  context.getMatrixMetaManager().syncMatrixInfos(reportedMatrices, toCreate, toRelease, toRecover, psAttemptId.getPsId());

  for (MatrixMeta matrixMeta : toCreate) {
    response.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(matrixMeta));
  }
  for (Integer matrixId : toRelease) {
    response.addNeedReleaseMatrixIds(matrixId);
  }
  for (RecoverPartKey partKey : toRecover) {
    response.addNeedRecoverParts(ProtobufUtil.convert(partKey));
  }

  return response.build();
}
use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.
the class AMModelSaver method save.
/**
 * Starts executing a model save request.
 *
 * <p>Under {@code lock}, records the request id as the current one, splits the user request
 * into one sub-request per parameter server, registers an {@code INIT} result for every
 * sub-request and resets the received-sub-result counter.
 *
 * @param runningContext running context holding the save request and its request id
 */
private void save(ModelSaveRunningContext runningContext) {
  ModelSaveContext saveContext = runningContext.getSaveContext();
  // Acquire the lock BEFORE entering the try block: if lock() itself fails, the finally block
  // must not call unlock() on a lock this thread does not hold (that would raise a secondary
  // exception and hide the original failure).
  lock.lock();
  try {
    currentRequestId = runningContext.getRequestId();
    LOG.info("Start to execute save request " + saveContext + " with request id=" + runningContext.getRequestId());

    // Split the user request to sub-requests to pss
    currentSubSaveContexts = split(currentRequestId, saveContext);

    // Every sub-request starts in INIT state
    subResults = new HashMap<>(currentSubSaveContexts.size());
    for (Map.Entry<ParameterServerId, PSMatricesSaveContext> entry : currentSubSaveContexts.entrySet()) {
      PSMatricesSaveContext subContext = entry.getValue();
      subResults.put(entry.getKey(), new PSMatricesSaveResult(subContext.getRequestId(), subContext.getSubRequestId(), SaveState.INIT));
    }
    receivedSubResult = 0;
  } finally {
    lock.unlock();
  }
}
use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.
the class SnapshotDumper method checkpoint.
/**
 * Writes a checkpoint for the given matrices.
 *
 * <p>Each matrix is first saved, using {@code SnapshotFormat}, to a temporary path derived from
 * its checkpoint path. After the save call returns, every temporary path is renamed to its final
 * checkpoint path and old checkpoints of that same matrix are cleared.
 *
 * @param matrixIds ids of the matrices to checkpoint; nothing is done when null or empty
 * @param checkpointId id of the checkpoint being written
 * @param cloneFirst passed through to each {@code PSMatrixSaveContext}
 * @param sortFirst passed through to each {@code PSMatrixSaveContext}
 * @throws IOException if saving, renaming or clearing old checkpoints fails
 */
private void checkpoint(List<Integer> matrixIds, int checkpointId, boolean cloneFirst, boolean sortFirst) throws IOException {
  if (matrixIds == null || matrixIds.isEmpty()) {
    return;
  }

  List<PSMatrixSaveContext> saveContexts = new ArrayList<>(matrixIds.size());
  List<Path> checkpointItemPaths = new ArrayList<>(matrixIds.size());
  List<Path> tempPaths = new ArrayList<>(matrixIds.size());
  for (int matrixId : matrixIds) {
    Path checkpointItemPath = genCheckpointPath(matrixId, checkpointId);
    Path tempPath = genTmpCheckpointPath(checkpointItemPath);
    checkpointItemPaths.add(checkpointItemPath);
    tempPaths.add(tempPath);

    // Save all partitions of the matrix to the temporary path
    MatrixMeta meta = context.getMatrixMetaManager().getMatrixMeta(matrixId);
    saveContexts.add(new PSMatrixSaveContext(matrixId, new ArrayList<>(meta.getPartitionMetas().keySet()), null, SnapshotFormat.class.getName(), tempPath.toString(), cloneFirst, sortFirst));
  }

  // -1/-1: this save is not tied to a master save request / sub-request id
  context.getIOExecutors().save(new PSMatricesSaveContext(-1, -1, saveContexts), dumpParallel);

  // Rename temp to item path
  FileSystem fs = baseDirPath.getFileSystem(context.getConf());
  for (int i = 0; i < matrixIds.size(); i++) {
    HdfsUtil.rename(tempPaths.get(i), checkpointItemPaths.get(i), fs);
    // Clear old checkpoints of the matrix that was just renamed (index i, not always the first one)
    clearOldCheckpoint(fs, genMatrixPath(matrixIds.get(i)));
  }
}
use of com.tencent.angel.model.PSMatricesSaveContext in project angel by Tencent.
the class AMModelSaver method split.
/**
 * Splits a model save request into one sub-request per parameter server.
 *
 * <p>Every matrix save context is split per parameter server first; the pieces are then grouped
 * by parameter server. For each parameter server the save path of every matrix piece is set to
 * {@code <tmpSavePath>/<resultDirName>/<psId>/<matrixName>}, and the group is wrapped into a
 * {@code PSMatricesSaveContext} with a sub-request id assigned in iteration order, starting at 0.
 *
 * @param requestId id of the save request being split
 * @param saveContext the user save request
 * @return the sub-request of every parameter server that holds at least one matrix piece
 */
private Map<ParameterServerId, PSMatricesSaveContext> split(int requestId, ModelSaveContext saveContext) {
  // Group the per-matrix pieces by the parameter server they belong to
  Map<ParameterServerId, List<PSMatrixSaveContext>> contextsByPs = new HashMap<>();
  for (MatrixSaveContext matrixSaveContext : saveContext.getMatricesContext()) {
    Map<ParameterServerId, PSMatrixSaveContext> pieces = split(matrixSaveContext);
    for (Map.Entry<ParameterServerId, PSMatrixSaveContext> piece : pieces.entrySet()) {
      ParameterServerId psId = piece.getKey();
      List<PSMatrixSaveContext> bucket = contextsByPs.get(psId);
      if (bucket == null) {
        bucket = new ArrayList<>();
        contextsByPs.put(psId, bucket);
      }
      bucket.add(piece.getValue());
    }
  }

  // Build one sub-request per parameter server
  Map<ParameterServerId, PSMatricesSaveContext> subRequests = new HashMap<>(contextsByPs.size());
  int nextSubRequestId = 0;
  for (Map.Entry<ParameterServerId, List<PSMatrixSaveContext>> group : contextsByPs.entrySet()) {
    ParameterServerId psId = group.getKey();
    List<PSMatrixSaveContext> psContexts = group.getValue();

    // <tmpSavePath>/<resultDirName>/<psId>
    Path resultDir = new Path(new Path(saveContext.getTmpSavePath()), ModelFilesConstent.resultDirName);
    Path psDir = new Path(resultDir, psId.toString());

    for (PSMatrixSaveContext psMatrixContext : psContexts) {
      String matrixName = context.getMatrixMetaManager().getMatrix(psMatrixContext.getMatrixId()).getName();
      psMatrixContext.setSavePath(new Path(psDir, matrixName).toString());
    }

    subRequests.put(psId, new PSMatricesSaveContext(requestId, nextSubRequestId, psContexts));
    nextSubRequestId++;
  }
  return subRequests;
}
Aggregations