Search in sources :

Example 1 with ModelSaveResult

use of com.tencent.angel.model.ModelSaveResult in project angel by Tencent.

The method psSaveFinish of the class AMModelSaver.

/**
 * Records the completion report of one parameter server for a save request.
 *
 * <p>The report is ignored when {@code subResults} is {@code null} or when the report's request
 * id differs from {@code currentRequestId}. Otherwise the received counter is incremented and the
 * sub result is stored under {@code psId}. Once the counter reaches {@code subResults.size()},
 * the request is finalised: if {@code canCombine()} returns true the state is set to
 * {@code COMBINING} and {@code combine} is invoked; otherwise the request is marked as failed
 * with the message produced by {@code combineFailedLogs()}. Any {@code Throwable} raised while
 * combining is logged and also turns the request into a failed one.
 *
 * @param psId parameter server id
 * @param subResult the result of sub save request
 */
public void psSaveFinish(ParameterServerId psId, PSMatricesSaveResult subResult) {
    // Acquire the lock BEFORE entering the try block: if lock() itself fails, the finally
    // clause must not call unlock() on a lock this thread does not hold.
    lock.lock();
    try {
        if (subResults == null || subResult.getRequestId() != currentRequestId) {
            return;
        }
        receivedSubResult++;
        subResults.put(psId, subResult);
        LOG.info("save subrequest, complete number=" + receivedSubResult + ", total number=" + subResults.size());
        if (receivedSubResult < subResults.size()) {
            // Still waiting for reports from other parameter servers.
            return;
        }

        ModelSaveResult result = results.get(subResult.getRequestId());
        if (!canCombine()) {
            String failedMsg = combineFailedLogs();
            LOG.error("PS save model failed. " + failedMsg);
            saveFailed(result, failedMsg);
            return;
        }

        ModelSaveContext saveContext = saveContexts.get(subResult.getRequestId());
        try {
            result.setState(SaveState.COMBINING);
            combine(saveContext, result);
        } catch (Throwable e) {
            // Catch everything so that a combine failure is reported through the save result
            // instead of propagating to the caller.
            LOG.error("Master combine model files failed ", e);
            saveFailed(result, StringUtils.stringifyException(e));
        }
    } finally {
        lock.unlock();
    }
}
Also used : ModelSaveResult(com.tencent.angel.model.ModelSaveResult) ModelSaveContext(com.tencent.angel.model.ModelSaveContext)

Example 2 with ModelSaveResult

use of com.tencent.angel.model.ModelSaveResult in project angel by Tencent.

The method checkModelSaved of the class MasterService.

/**
 * Reports the current state of a model save request.
 *
 * <p>The save result is looked up by the request id carried in {@code request}. The response
 * always carries the state id of that result; the log field is filled in only when the result
 * has a non-null message.
 *
 * @param controller rpc controller (not used by this method)
 * @param request request carrying the id of the save request to check
 * @return response holding the state id and, if present, the message of the save result
 * @throws ServiceException if no save result exists for the given request id
 */
@Override
public CheckModelSavedResponse checkModelSaved(RpcController controller, CheckModelSavedRequest request) throws ServiceException {
    LOG.info("check model saved=" + request);

    ModelSaveResult saveResult = context.getModelSaver().getModelSaveResult(request.getRequestId());
    if (saveResult == null) {
        // Unknown request id: nothing to report.
        throw new ServiceException("can not find save request " + request.getRequestId());
    }

    CheckModelSavedResponse.Builder response = CheckModelSavedResponse.newBuilder();
    response.setStatus(saveResult.getState().getStateId());
    if (saveResult.getMessage() != null) {
        response.setLog(saveResult.getMessage());
    }
    return response.build();
}
Also used : CheckModelSavedResponse(com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.CheckModelSavedResponse) ServiceException(com.google.protobuf.ServiceException) ModelSaveResult(com.tencent.angel.model.ModelSaveResult)

Example 3 with ModelSaveResult

use of com.tencent.angel.model.ModelSaveResult in project angel by Tencent.

The method save of the class AMModelSaver.

/**
 * Registers a new model save request.
 *
 * <p>A fresh request id is allocated, a temporary save path is generated and stored in
 * {@code saveContext}, and the context plus a new {@link ModelSaveResult} in state
 * {@code INIT} are recorded under that id. The request is then appended to
 * {@code waitingTasks}, except when it is epoch-triggered and another epoch-triggered request
 * is already waiting.
 *
 * @param saveContext save model context; its temporary save path is set by this method
 * @param triggerMode what triggered this save request
 * @return save request id
 */
public int save(ModelSaveContext saveContext, SaveTriggerMode triggerMode) {
    // Acquire the lock BEFORE entering the try block: if lock() itself fails, the finally
    // clause must not call unlock() on a lock this thread does not hold.
    lock.lock();
    try {
        int requestId = saveRequestIdGen++;
        saveContext.setTmpSavePath(HdfsUtil.generateTmpDirectory(context.getConf(), context.getApplicationId().toString(), new Path(saveContext.getSavePath())).toString());
        saveContexts.put(requestId, saveContext);

        // Set the state before storing the result so that it is never visible in the map
        // without its INIT state.
        ModelSaveResult result = new ModelSaveResult(requestId);
        result.setState(SaveState.INIT);
        results.put(requestId, result);

        // At most one epoch-triggered request may wait at a time: skip queuing this one if
        // another epoch-triggered request is already in the waiting list.
        // NOTE(review): a skipped request keeps its context/result registered (state INIT) and
        // its id is still returned - confirm callers cope with a request that is never queued.
        boolean needAdd = true;
        if (triggerMode == SaveTriggerMode.EPOCH_TRIGGER) {
            int size = waitingTasks.size();
            for (int i = 0; i < size; i++) {
                if (waitingTasks.get(i).getTriggerMode() == SaveTriggerMode.EPOCH_TRIGGER) {
                    LOG.info("there is another epoch trigger model save request waiting, just exit");
                    needAdd = false;
                    break;
                }
            }
        }
        if (needAdd) {
            waitingTasks.add(new ModelSaveRunningContext(requestId, triggerMode, saveContext));
        }
        return requestId;
    } finally {
        lock.unlock();
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ModelSaveRunningContext(com.tencent.angel.model.ModelSaveRunningContext) ModelSaveResult(com.tencent.angel.model.ModelSaveResult)

Aggregations

ModelSaveResult (com.tencent.angel.model.ModelSaveResult): 3; ServiceException (com.google.protobuf.ServiceException): 1; ModelSaveContext (com.tencent.angel.model.ModelSaveContext): 1; ModelSaveRunningContext (com.tencent.angel.model.ModelSaveRunningContext): 1; CheckModelSavedResponse (com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.CheckModelSavedResponse): 1; Path (org.apache.hadoop.fs.Path): 1