Search in sources :

Example 1 with SaveResult

use of com.tencent.angel.master.matrix.committer.SaveResult in project angel by Tencent.

the class MasterService method getSaveResults.

/**
 * Answers an RPC asking for the model save results of a single matrix.
 *
 * @param controller RPC controller; not read by this method
 * @param request request carrying the id of the matrix whose save results are wanted
 * @return a response with one converted entry per save result reported by the model saver,
 *         or the default response instance when the saver reports none (null or empty list)
 * @throws ServiceException declared in the signature; this body does not throw it explicitly
 */
@Override
public GetSaveResultsResponse getSaveResults(RpcController controller, GetSaveResultsRequest request) throws ServiceException {
    LOG.info("Get save result request=" + request);

    final List<SaveResult> saved = context.getModelSaver().getSaveResults(request.getMatrixId());
    final boolean hasResults = saved != null && !saved.isEmpty();
    if (hasResults) {
        // Convert every internal save result into its protobuf form.
        final GetSaveResultsResponse.Builder response = GetSaveResultsResponse.newBuilder();
        for (SaveResult item : saved) {
            response.addSaveResults(ProtobufUtil.convert(item));
        }
        return response.build();
    }

    // Nothing recorded for this matrix.
    return GetSaveResultsResponse.getDefaultInstance();
}
Also used : SaveResult(com.tencent.angel.master.matrix.committer.SaveResult) PSMatricesSaveResult(com.tencent.angel.model.PSMatricesSaveResult) ModelSaveResult(com.tencent.angel.model.ModelSaveResult) GetSaveResultsResponse(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.GetSaveResultsResponse)

Example 2 with SaveResult

use of com.tencent.angel.master.matrix.committer.SaveResult in project angel by Tencent.

the class MasterService method getCheckpointResults.

/**
 * Answers an RPC asking for the checkpoint results of a single matrix.
 *
 * @param controller RPC controller; not read by this method
 * @param request request carrying the id of the matrix whose checkpoint results are wanted
 * @return a response with one converted entry per checkpoint result reported by the model
 *         saver, or the default response instance when the saver reports none (null or empty
 *         list)
 * @throws ServiceException declared in the signature; this body does not throw it explicitly
 */
@Override
public GetCheckpointsResponse getCheckpointResults(RpcController controller, GetCheckpointsRequest request) throws ServiceException {
    LOG.info("Get checkpoint result request=" + request);

    final List<SaveResult> checkpoints = context.getModelSaver().getCheckpointResults(request.getMatrixId());
    final boolean hasCheckpoints = checkpoints != null && !checkpoints.isEmpty();
    if (hasCheckpoints) {
        // Convert every internal checkpoint result into its protobuf form.
        final GetCheckpointsResponse.Builder response = GetCheckpointsResponse.newBuilder();
        for (SaveResult item : checkpoints) {
            response.addSaveResults(ProtobufUtil.convert(item));
        }
        return response.build();
    }

    // Nothing recorded for this matrix.
    return GetCheckpointsResponse.getDefaultInstance();
}
Also used : GetCheckpointsResponse(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.GetCheckpointsResponse) SaveResult(com.tencent.angel.master.matrix.committer.SaveResult) PSMatricesSaveResult(com.tencent.angel.model.PSMatricesSaveResult) ModelSaveResult(com.tencent.angel.model.ModelSaveResult)

Example 3 with SaveResult

use of com.tencent.angel.master.matrix.committer.SaveResult in project angel by Tencent.

the class ParameterServer method initMatricesData.

/**
 * Restores the data of the given matrices when this PS runs as a restarted attempt.
 *
 * <p>Nothing is done when the PS attempt index is 0, nor when the attempt index is greater
 * than 0 and the partition replication number is greater than 1. Otherwise a recovery
 * directory is looked up for every matrix (see {@code findRecoverPath}). Matrices that have
 * one are loaded together in a single load request after the loop; a matrix without one is
 * re-initialized through its init function, if it has one, and is otherwise left untouched.
 *
 * @param matrixMetas metadata of the matrices hosted by this PS
 * @throws IOException declared in the signature; presumably raised by the load executor or
 *         by an init function (TODO confirm against those APIs)
 */
private void initMatricesData(final List<MatrixMeta> matrixMetas) throws IOException {
    if (context.getPSAttemptId().getIndex() <= 0) {
        // First attempt: there is nothing to recover.
        return;
    }
    if (context.getPartReplication() > 1) {
        // Restarted attempt with replicated partitions: this method does not restore anything.
        return;
    }

    // Recover PS from snapshot, checkpoint or old save result
    List<PSMatrixLoadContext> matrixLoadContexts = new ArrayList<>(matrixMetas.size());
    SnapshotRecover recover = new SnapshotRecover(context);
    for (MatrixMeta meta : matrixMetas) {
        Path inputPath = findRecoverPath(recover, meta);
        if (inputPath != null) {
            LOG.info("Load matrix " + meta.getName() + " from " + inputPath.toString());
            matrixLoadContexts.add(new PSMatrixLoadContext(meta.getId(), inputPath.toString(), new ArrayList<>(meta.getPartitionMetas().keySet()), SnapshotFormat.class.getName()));
        } else if (meta.getInitFunc() != null) {
            // No stored data was found: just init the matrix again
            LOG.info("Matrix " + meta + " has a init function " + meta.getInitFunc().getClass().getName() + ", use this function to reinit the matrix");
            long startTs = System.currentTimeMillis();
            meta.getInitFunc().init(context.getMatrixStorageManager().getMatrix(meta.getId()));
            LOG.info("Reinit the matrix use time " + (System.currentTimeMillis() - startTs));
        }
    }

    // All recoverable matrices are loaded in one batch.
    if (!matrixLoadContexts.isEmpty()) {
        context.getIOExecutors().load(new PSMatricesLoadContext(-1, -1, matrixLoadContexts));
    }
}

/**
 * Looks up the directory a matrix should be recovered from.
 *
 * <p>Sources are tried in this order, and the first one that yields a path wins: the old
 * snapshot, the last entry of the checkpoint results reported by the master, the last entry
 * of the save results reported by the master. A failure while querying one source is logged
 * and the next source is tried.
 *
 * @param recover helper used to look up the snapshot path
 * @param meta metadata of the matrix to recover
 * @return the directory to load the matrix from, or {@code null} if no source provided one
 */
private Path findRecoverPath(SnapshotRecover recover, MatrixMeta meta) {
    // 1. First check old snapshot
    try {
        Path snapshotPath = recover.getSnapshotPath(meta.getId());
        if (snapshotPath != null) {
            return snapshotPath;
        }
    } catch (IOException e) {
        LOG.error("Get snapshot path failed, ", e);
    }

    // 2. Check new checkpoints
    try {
        List<SaveResult> checkpoints = master.getCheckpoints(meta.getId());
        if (checkpoints == null || checkpoints.isEmpty()) {
            LOG.info("There is no checkpoint results for matrix " + meta.getName());
        } else {
            // The last entry is treated as the latest result.
            SaveResult latest = checkpoints.get(checkpoints.size() - 1);
            Path checkpointPath = new Path(latest.getMatrixPath());
            LOG.info("There is " + checkpoints.size() + " checkpoint results for matrix " + meta.getName() + ", we choose the latest result in dir " + latest.getMatrixPath());
            return checkpointPath;
        }
    } catch (ServiceException e) {
        LOG.error("Get checkpoint results for matrix " + meta.getName() + " failed ", e);
    }

    // 3. Check old save results
    try {
        List<SaveResult> saveResults = master.getSaveResult(meta.getId());
        if (saveResults == null || saveResults.isEmpty()) {
            LOG.info("There is no old save result for matrix " + meta.getName());
        } else {
            // The last entry is treated as the latest result.
            SaveResult latest = saveResults.get(saveResults.size() - 1);
            Path savePath = new Path(latest.getMatrixPath());
            LOG.info("There is " + saveResults.size() + " old save results for matrix " + meta.getName() + ", we choose the latest result in dir " + latest.getMatrixPath());
            return savePath;
        }
    } catch (ServiceException e) {
        LOG.error("Get save results for matrix " + meta.getName() + " failed ", e);
    }

    return null;
}
Also used : Path(org.apache.hadoop.fs.Path) SnapshotRecover(com.tencent.angel.ps.io.load.SnapshotRecover) ServiceException(com.google.protobuf.ServiceException) PSMatricesLoadContext(com.tencent.angel.model.PSMatricesLoadContext) ArrayList(java.util.ArrayList) SaveResult(com.tencent.angel.master.matrix.committer.SaveResult) IOException(java.io.IOException) PSMatrixLoadContext(com.tencent.angel.model.PSMatrixLoadContext)

Aggregations

SaveResult (com.tencent.angel.master.matrix.committer.SaveResult)3 ModelSaveResult (com.tencent.angel.model.ModelSaveResult)2 PSMatricesSaveResult (com.tencent.angel.model.PSMatricesSaveResult)2 ServiceException (com.google.protobuf.ServiceException)1 PSMatricesLoadContext (com.tencent.angel.model.PSMatricesLoadContext)1 PSMatrixLoadContext (com.tencent.angel.model.PSMatrixLoadContext)1 GetCheckpointsResponse (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.GetCheckpointsResponse)1 GetSaveResultsResponse (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.GetSaveResultsResponse)1 SnapshotRecover (com.tencent.angel.ps.io.load.SnapshotRecover)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 Path (org.apache.hadoop.fs.Path)1