Search in sources :

Example 11 with MatrixMeta

use of com.tencent.angel.ml.matrix.MatrixMeta in project angel by Tencent.

the class MasterService method psReport.

/**
 * Handles a heartbeat report from a parameter server attempt and builds the reply.
 *
 * <p>If the reporting attempt is not alive according to the parameter server manager, the reply
 * only carries a PSCOMMAND_SHUTDOWN command. Otherwise the heartbeat is recorded, the reported
 * metrics are forwarded as a state update event, and the reply is filled with pending
 * save/load requests plus the matrices the ps has to create, release or recover.
 *
 * @param controller rpc controller of protobuf (not used by this method body)
 * @param request heartbeat request
 * @return the heartbeat response carrying the command and any pending work for the ps
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps heartbeat request. request=" + request);
    }

    // Turn the reported counter pairs into a key -> value map.
    List<Pair> reportedMetrics = request.getMetricsList();
    Map<String, String> metricsByName = new HashMap<String, String>();
    for (Pair metric : reportedMetrics) {
        metricsByName.put(metric.getKey(), metric.getValue());
    }

    PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    PSReportResponse.Builder resBuilder = PSReportResponse.newBuilder();

    if (!context.getParameterServerManager().isAlive(psAttemptId)) {
        // The attempt is not in the monitored set: tell it to shut down and stop here.
        LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
        resBuilder.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
        return resBuilder.build();
    }

    resBuilder.setPsCommand(PSCommandProto.PSCOMMAND_OK);

    // Refresh the last heartbeat timestamp.
    context.getParameterServerManager().alive(psAttemptId);

    // Forward the reported metrics to the specific PSAttempt as a state update event.
    context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metricsByName));

    // Pending save request: context and result must belong to the same request, and the
    // result must still be in the INIT or SAVING state.
    PSMatricesSaveContext subSaveContext = context.getModelSaver().getSaveContext(psAttemptId.getPsId());
    PSMatricesSaveResult subSaveResult = context.getModelSaver().getSaveResult(psAttemptId.getPsId());
    if (subSaveContext != null && subSaveResult != null) {
        if (subSaveContext.getRequestId() == subSaveResult.getRequestId()) {
            if (subSaveResult.getState() == SaveState.INIT || subSaveResult.getState() == SaveState.SAVING) {
                resBuilder.setNeedSaveMatrices(ProtobufUtil.convert(subSaveContext));
            }
        }
    }

    // Pending load request: same rule as for saving, with the INIT or LOADING state.
    PSMatricesLoadContext subLoadContext = context.getModelLoader().getLoadContext(psAttemptId.getPsId());
    PSMatricesLoadResult subLoadResult = context.getModelLoader().getLoadResult(psAttemptId.getPsId());
    if (subLoadContext != null && subLoadResult != null) {
        if (subLoadContext.getRequestId() == subLoadResult.getRequestId()) {
            if (subLoadResult.getState() == LoadState.INIT || subLoadResult.getState() == LoadState.LOADING) {
                resBuilder.setNeedLoadMatrices(ProtobufUtil.convert(subLoadContext));
            }
        }
    }

    // Reconcile matrix metadata between master and parameter server:
    //  - a matrix known to the master but missing on the ps must be created on the ps;
    //  - a matrix present on the ps but unknown to the master must be released by the ps.
    List<MatrixReportProto> matrixReportsProto = request.getMatrixReportsList();
    List<MatrixReport> matrixReports = ProtobufUtil.convertToMatrixReports(matrixReportsProto);
    List<MatrixMeta> matricesToCreate = new ArrayList<>();
    List<Integer> matricesToRelease = new ArrayList<>();
    List<RecoverPartKey> partsToRecover = new ArrayList<>();
    // syncMatrixInfos fills the three output lists passed in.
    context.getMatrixMetaManager().syncMatrixInfos(matrixReports, matricesToCreate, matricesToRelease, partsToRecover, psAttemptId.getPsId());

    for (MatrixMeta matrixToCreate : matricesToCreate) {
        resBuilder.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(matrixToCreate));
    }
    for (Integer matrixIdToRelease : matricesToRelease) {
        resBuilder.addNeedReleaseMatrixIds(matrixIdToRelease);
    }
    for (RecoverPartKey partToRecover : partsToRecover) {
        resBuilder.addNeedRecoverParts(ProtobufUtil.convert(partToRecover));
    }
    return resBuilder.build();
}
Also used : MatrixReportProto(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.MatrixReportProto) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) ArrayList(java.util.ArrayList) PSAttemptId(com.tencent.angel.ps.PSAttemptId) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) RecoverPartKey(com.tencent.angel.ps.ha.RecoverPartKey) PSReportResponse(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSReportResponse) PSAttemptStateUpdateEvent(com.tencent.angel.master.ps.attempt.PSAttemptStateUpdateEvent) PSMatricesLoadContext(com.tencent.angel.model.PSMatricesLoadContext) PSMatricesSaveContext(com.tencent.angel.model.PSMatricesSaveContext) PSMatricesSaveResult(com.tencent.angel.model.PSMatricesSaveResult) MatrixReport(com.tencent.angel.ml.matrix.MatrixReport) PSMatricesLoadResult(com.tencent.angel.model.PSMatricesLoadResult)

Example 12 with MatrixMeta

use of com.tencent.angel.ml.matrix.MatrixMeta in project angel by Tencent.

the class AMModelSaver method epochUpdate.

/**
 * Epoch-driven model save trigger.
 *
 * <p>Does nothing unless epoch-triggered saving is enabled, the epoch index is positive and
 * it is a multiple of the configured save frequency. Otherwise every known matrix that has
 * a MATRIX_SAVE_PATH attribute is added to a save context targeting the job output path,
 * and a save is started in EPOCH_TRIGGER mode. A failing save is logged, not rethrown.
 *
 * @param epochIndex current epoch index
 */
public void epochUpdate(int epochIndex) {
    boolean saveDue = epochTrigSave && (epochIndex > 0) && (epochIndex % saveModelFrequency == 0);
    if (!saveDue) {
        return;
    }

    LOG.info("Epoch " + epochIndex + " over, start to save model");
    Map<Integer, MatrixMeta> matrixMetas = context.getMatrixMetaManager().getMatrixMetas();
    if (matrixMetas.isEmpty()) {
        LOG.info("There are no matrices need save, just return");
        return;
    }

    String outputPath = context.getConf().get(AngelConf.ANGEL_JOB_OUTPUT_PATH);
    ModelSaveContext modelSaveContext = new ModelSaveContext(outputPath);
    for (MatrixMeta matrixMeta : matrixMetas.values()) {
        // Only matrices that declare a save path take part in the save.
        String matrixSavePath = matrixMeta.getMatrixContext().getAttributes().get(MatrixConf.MATRIX_SAVE_PATH);
        if (matrixSavePath == null) {
            continue;
        }
        modelSaveContext.addMatrix(new MatrixSaveContext(matrixMeta.getName()));
    }

    try {
        save(modelSaveContext, SaveTriggerMode.EPOCH_TRIGGER);
    } catch (Throwable x) {
        // Best effort: a failed periodic save must not propagate to the caller.
        LOG.error("save model failed for epoch " + epochIndex, x);
    }
}
Also used : MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PSMatrixSaveContext(com.tencent.angel.model.PSMatrixSaveContext) MatrixSaveContext(com.tencent.angel.model.MatrixSaveContext) ModelSaveContext(com.tencent.angel.model.ModelSaveContext)

Example 13 with MatrixMeta

use of com.tencent.angel.ml.matrix.MatrixMeta in project angel by Tencent.

the class AMModelSaver method split.

/**
 * Splits a matrix save request into one save context per parameter server.
 *
 * <p>When the request names no row indexes, every partition of the matrix is selected;
 * otherwise only partitions that contain at least one of the requested rows are selected.
 * Selected partitions are grouped by their master ps, and each group's partition ids are
 * sorted in ascending order before being placed into the per-ps save context.
 *
 * @param matrixSaveContext the save request for a single matrix
 * @return map from parameter server id to the save context that ps has to execute
 * @throws IllegalStateException if the matrix cannot be found, or a selected partition has
 *     no master ps
 */
private Map<ParameterServerId, PSMatrixSaveContext> split(MatrixSaveContext matrixSaveContext) {
    AMMatrixMetaManager matrixMetaManager = context.getMatrixMetaManager();
    MatrixMeta meta = matrixMetaManager.getMatrix(matrixSaveContext.getMatrixName());
    if (meta == null) {
        throw new IllegalStateException("Can not find matrix " + matrixSaveContext.getMatrixName());
    }
    Map<Integer, PartitionMeta> partitions = meta.getPartitionMetas();
    List<Integer> rowIndexes = matrixSaveContext.getRowIndexes();
    Map<ParameterServerId, Set<Integer>> psIdToPartIdsMap = new HashMap<>();
    if (rowIndexes == null || rowIndexes.isEmpty()) {
        // No row filter: every partition of the matrix is saved.
        for (Map.Entry<Integer, PartitionMeta> partEntry : partitions.entrySet()) {
            ParameterServerId psId = partEntry.getValue().getMasterPs();
            if (psId == null) {
                throw new IllegalStateException("Can not get ps for partition " + partEntry.getKey());
            }
            psIdToPartIdsMap.computeIfAbsent(psId, key -> new HashSet<>()).add(partEntry.getKey());
        }
    } else {
        // Row filter: keep only partitions holding at least one requested row. The set
        // de-duplicates partitions matched by several rows.
        for (Integer rowIndex : rowIndexes) {
            for (Map.Entry<Integer, PartitionMeta> partEntry : partitions.entrySet()) {
                if (!partEntry.getValue().contain(rowIndex)) {
                    continue;
                }
                ParameterServerId psId = partEntry.getValue().getMasterPs();
                if (psId == null) {
                    throw new IllegalStateException("Can not get ps for partition " + partEntry.getKey());
                }
                psIdToPartIdsMap.computeIfAbsent(psId, key -> new HashSet<>()).add(partEntry.getKey());
            }
        }
    }
    int matrixId = meta.getId();
    Map<ParameterServerId, PSMatrixSaveContext> ret = new HashMap<>(psIdToPartIdsMap.size());
    for (Map.Entry<ParameterServerId, Set<Integer>> entry : psIdToPartIdsMap.entrySet()) {
        List<Integer> partIds = new ArrayList<>(entry.getValue());
        // Natural ordering avoids the overflow-prone "id1 - id2" comparator idiom.
        partIds.sort(Comparator.naturalOrder());
        PSMatrixSaveContext psMatrixSaveContext = new PSMatrixSaveContext(matrixId, partIds, matrixSaveContext.getRowIndexes(), matrixSaveContext.getFormatClassName(), null, false, true);
        ret.put(entry.getKey(), psMatrixSaveContext);
    }
    return ret;
}
Also used : HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PSMatrixSaveContext(com.tencent.angel.model.PSMatrixSaveContext) ArrayList(java.util.ArrayList) PartitionMeta(com.tencent.angel.ml.matrix.PartitionMeta) AMMatrixMetaManager(com.tencent.angel.master.matrixmeta.AMMatrixMetaManager) ParameterServerId(com.tencent.angel.ps.ParameterServerId) HashMap(java.util.HashMap) Map(java.util.Map) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashSet(java.util.HashSet)

Example 14 with MatrixMeta

use of com.tencent.angel.ml.matrix.MatrixMeta in project angel by Tencent.

the class ValuesCombineUtils method mergeSparseLongCompVector.

/**
 * Merges per-partition index-get results into one composite int-key/long-value vector.
 *
 * <p>One sparse sub-vector is built per partition of the requested row, in the order
 * returned by getSortedPartKeys. A partition with no requested indexes contributes an
 * empty sparse sub-vector.
 *
 * @param param the index-get request (matrix id, row id and the indexes asked per partition)
 * @param partResults the results returned by the individual partitions
 * @return the composite vector, tagged with the request's matrix id and row id
 */
public static CompIntLongVector mergeSparseLongCompVector(IndexGetParam param, List<PartitionGetResult> partResults) {
    Map<PartitionKey, PartitionGetResult> resultsByPart = mapPartKeyToResult(partResults);
    List<PartitionKey> sortedParts = getSortedPartKeys(param.matrixId, param.getRowId());
    MatrixMeta matrixMeta = PSAgentContext.get().getMatrixMetaManager().getMatrixMeta(param.matrixId);
    int totalDim = (int) matrixMeta.getColNum();
    int blockDim = (int) matrixMeta.getBlockColNum();

    IntLongVector[] blocks = new IntLongVector[sortedParts.size()];
    int slot = 0;
    for (PartitionKey partKey : sortedParts) {
        if (!param.getPartKeyToIndexesMap().containsKey(partKey)) {
            // Nothing was requested from this partition: use an empty sub-vector.
            blocks[slot++] = VFactory.sparseLongVector(blockDim, 0);
            continue;
        }
        long[] partValues = ((IndexPartGetLongResult) resultsByPart.get(partKey)).getValues();
        int[] partIndices = param.getPartKeyToIndexesMap().get(partKey);
        // NOTE(review): the return value is ignored, so transformIndices presumably rewrites
        // the array in place (likely to partition-local offsets) - confirm against its definition.
        transformIndices(partIndices, partKey);
        blocks[slot++] = VFactory.sparseLongVector(blockDim, partIndices, partValues);
    }

    CompIntLongVector merged = VFactory.compIntLongVector(totalDim, blocks, blockDim);
    merged.setMatrixId(param.getMatrixId());
    merged.setRowId(param.getRowId());
    return merged;
}
Also used : MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PartitionKey(com.tencent.angel.PartitionKey) PartitionGetResult(com.tencent.angel.ml.matrix.psf.get.base.PartitionGetResult)

Example 15 with MatrixMeta

use of com.tencent.angel.ml.matrix.MatrixMeta in project angel by Tencent.

the class ValuesCombineUtils method mergeSparseIntCompVector.

/**
 * Merges per-partition index-get results into one composite long-key/int-value vector.
 *
 * <p>One sparse sub-vector is built per partition of the requested row, in the order
 * returned by getSortedPartKeys. A partition with no requested indexes contributes an
 * empty sparse sub-vector.
 *
 * @param param the long-index get request (matrix id, row id and the indexes asked per partition)
 * @param partResults the results returned by the individual partitions
 * @return the composite vector, tagged with the request's matrix id and row id
 */
public static CompLongIntVector mergeSparseIntCompVector(LongIndexGetParam param, List<PartitionGetResult> partResults) {
    Map<PartitionKey, PartitionGetResult> resultsByPart = mapPartKeyToResult(partResults);
    List<PartitionKey> sortedParts = getSortedPartKeys(param.matrixId, param.getRowId());
    MatrixMeta matrixMeta = PSAgentContext.get().getMatrixMetaManager().getMatrixMeta(param.matrixId);
    long totalDim = matrixMeta.getColNum();
    long blockDim = matrixMeta.getBlockColNum();

    LongIntVector[] blocks = new LongIntVector[sortedParts.size()];
    int slot = 0;
    for (PartitionKey partKey : sortedParts) {
        if (!param.getPartKeyToIndexesMap().containsKey(partKey)) {
            // Nothing was requested from this partition: use an empty sub-vector.
            blocks[slot++] = VFactory.sparseLongKeyIntVector(blockDim, 0);
            continue;
        }
        int[] partValues = ((IndexPartGetIntResult) resultsByPart.get(partKey)).getValues();
        long[] partIndices = param.getPartKeyToIndexesMap().get(partKey);
        // NOTE(review): the return value is ignored, so transformIndices presumably rewrites
        // the array in place (likely to partition-local offsets) - confirm against its definition.
        transformIndices(partIndices, partKey);
        blocks[slot++] = VFactory.sparseLongKeyIntVector(blockDim, partIndices, partValues);
    }

    CompLongIntVector merged = VFactory.compLongIntVector(totalDim, blocks, blockDim);
    merged.setMatrixId(param.getMatrixId());
    merged.setRowId(param.getRowId());
    return merged;
}
Also used : MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PartitionKey(com.tencent.angel.PartitionKey) PartitionGetResult(com.tencent.angel.ml.matrix.psf.get.base.PartitionGetResult)

Aggregations

MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)78 PartitionKey (com.tencent.angel.PartitionKey)40 ArrayList (java.util.ArrayList)25 PartitionGetParam (com.tencent.angel.ml.matrix.psf.get.base.PartitionGetParam)13 KeyPart (com.tencent.angel.psagent.matrix.transport.router.KeyPart)13 PartitionGetResult (com.tencent.angel.ml.matrix.psf.get.base.PartitionGetResult)12 AngelException (com.tencent.angel.exception.AngelException)7 PartitionMeta (com.tencent.angel.ml.matrix.PartitionMeta)7 RowType (com.tencent.angel.ml.matrix.RowType)7 MatrixTransportClient (com.tencent.angel.psagent.matrix.transport.MatrixTransportClient)7 KeyValuePart (com.tencent.angel.psagent.matrix.transport.router.KeyValuePart)7 PartitionUpdateParam (com.tencent.angel.ml.matrix.psf.update.base.PartitionUpdateParam)6 Path (org.apache.hadoop.fs.Path)6 GeneralPartGetParam (com.tencent.angel.ml.matrix.psf.get.base.GeneralPartGetParam)5 ParameterServerId (com.tencent.angel.ps.ParameterServerId)5 FutureResult (com.tencent.angel.psagent.matrix.transport.FutureResult)5 MapResponseCache (com.tencent.angel.psagent.matrix.transport.response.MapResponseCache)5 ResponseCache (com.tencent.angel.psagent.matrix.transport.response.ResponseCache)5 AMMatrixMetaManager (com.tencent.angel.master.matrixmeta.AMMatrixMetaManager)4 Vector (com.tencent.angel.ml.math2.vector.Vector)4