Search in sources :

Example 1 with RecoverPartKey

use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.

the class PS2PSPusherImpl method start.

/**
 * Starts the PS client, creates the worker pool and launches the background
 * "Recover-checker" thread.
 *
 * <p>The checker thread wakes up every 30 seconds and, while holding the read lock,
 * walks {@code failedUpdateCounters}. For every partition that has at least one failure
 * counter it asks the master for the partition location; if the partition has more than
 * one location and this parameter server is the first one in the list, a recovery is
 * triggered for every other location whose failure counter is greater than zero. After
 * releasing the lock the thread waits for all triggered recoveries to finish.
 *
 * <p>The thread exits when {@code stopped} is set or when it is interrupted.
 */
public void start() {
    psClient.start();
    workerPool = Executors.newFixedThreadPool(16);
    recoverChecker = new Thread(() -> {
        while (!stopped.get() && !Thread.interrupted()) {
            try {
                Thread.sleep(30000);
                Map<RecoverPartKey, FutureResult> futures = new HashMap<>();
                // Acquire the lock BEFORE the try block: if acquisition itself fails, the
                // finally clause must not try to release a lock this thread never held.
                lock.readLock().lock();
                try {
                    for (Map.Entry<PartitionKey, Map<PSLocation, Integer>> partEntry : failedUpdateCounters.entrySet()) {
                        PartitionKey partKey = partEntry.getKey();
                        Map<PSLocation, Integer> failedCounters = partEntry.getValue();
                        if (failedCounters.isEmpty()) {
                            continue;
                        }
                        PartitionLocation partLoc = context.getMaster().getPartLocation(partKey.getMatrixId(), partKey.getPartitionId());
                        // Only the server listed first for the partition pushes to the others.
                        if (partLoc.psLocs.size() > 1 && partLoc.psLocs.get(0).psId.equals(context.getPSAttemptId().getPsId())) {
                            for (int i = 1; i < partLoc.psLocs.size(); i++) {
                                PSLocation psLoc = partLoc.psLocs.get(i);
                                // Single lookup: avoids the containsKey/get double access and
                                // an unboxing NullPointerException on a missing/null counter.
                                Integer failedCount = failedCounters.get(psLoc);
                                if (failedCount != null && failedCount > 0) {
                                    RecoverPartKey recoverPartKey = new RecoverPartKey(partKey, psLoc);
                                    futures.put(recoverPartKey, recover(recoverPartKey));
                                }
                            }
                        }
                    }
                } finally {
                    lock.readLock().unlock();
                }
                waitResults(futures);
            } catch (InterruptedException e) {
                // Catching the exception clears the interrupt flag; restore it and leave the
                // loop so that an interrupt actually terminates the checker thread.
                Thread.currentThread().interrupt();
                break;
            } catch (Throwable e) {
                if (!stopped.get()) {
                    // Keep the thread alive, but report what went wrong (with stack trace).
                    LOG.error("recover failed-update partitions failed ", e);
                }
            }
        }
    });
    recoverChecker.setName("Recover-checker");
    recoverChecker.start();
}
Also used : PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) PartitionKey(com.tencent.angel.PartitionKey) HashMap(java.util.HashMap) Map(java.util.Map) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 2 with RecoverPartKey

use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.

the class MasterService method psReport.

/**
 * Handles a heartbeat report sent by a parameter server attempt.
 *
 * <p>The reported metrics are flattened into a key/value map. If the reporting attempt is
 * not present in the heartbeat table it is told to shut down; otherwise its heartbeat
 * timestamp is refreshed, a state-update event is dispatched, and the attempt is told either
 * to commit (together with the matrices/partitions to save) or that everything is OK.
 * Regardless of that outcome, the failed-update reports are forwarded to the parameter
 * server manager and the matrix metadata is synchronized, and the resulting
 * create / release / recover lists are attached to the response.
 *
 * @param controller rpc controller of protobuf
 * @param request heartbeat request
 * @return the heartbeat response carrying the command and the synchronization lists
 * @throws ServiceException as declared by the overridden RPC service method
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps heartbeat request. request=" + request);
    }

    // Flatten the reported counters into a plain map.
    List<Pair> reportedMetrics = request.getMetricsList();
    Map<String, String> metricMap = new HashMap<String, String>();
    for (Pair metric : reportedMetrics) {
        metricMap.put(metric.getKey(), metric.getValue());
    }

    PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
    PSReportResponse.Builder resBuilder = PSReportResponse.newBuilder();

    if (psLastHeartbeatTS.containsKey(psAttemptId)) {
        // Known attempt: refresh its heartbeat timestamp and publish the new state.
        psLastHeartbeatTS.put(psAttemptId, System.currentTimeMillis());
        context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metricMap));

        if (!context.getParameterServerManager().psCanCommit()) {
            resBuilder.setPsCommand(PSCommandProto.PSCOMMAND_OK);
        } else {
            // The parameter server may commit now: tell it which matrices/partitions to save.
            List<Integer> ids = context.getParameterServerManager().getNeedCommitMatrixIds();
            LOG.info("notify ps" + psAttemptId + " to commit now! commit matrices:" + StringUtils.joinInts(",", ids));
            resBuilder.setPsCommand(PSCommandProto.PSCOMMAND_COMMIT);
            NeedSaveMatrixProto.Builder saveBuilder = NeedSaveMatrixProto.newBuilder();
            for (int matrixId : ids) {
                saveBuilder.setMatrixId(matrixId);
                saveBuilder.addAllPartIds(context.getMatrixMetaManager().getMasterPartsInPS(matrixId, psAttemptId.getPsId()));
                resBuilder.addNeedSaveMatrices(saveBuilder.build());
                // The builder is reused for the next matrix.
                saveBuilder.clear();
            }
        }
    } else {
        // Attempt is not in the monitored set: just answer with a shutdown command.
        LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
        resBuilder.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
    }

    // Update PS failed counters
    context.getParameterServerManager().psFailedReports(ProtobufUtil.convert(request.getPsFailedReports()));

    // Reconcile the matrix metadata of master and parameter server:
    //  - a matrix known to the master but missing on the ps must be created on the ps
    //  - a matrix present on the ps but unknown to the master must be removed from the ps
    List<MatrixReportProto> matrixReportsProto = request.getMatrixReportsList();
    List<Integer> needReleaseMatrices = new ArrayList<>();
    List<MatrixMeta> needCreateMatrices = new ArrayList<>();
    List<RecoverPartKey> needRecoverParts = new ArrayList<>();
    List<MatrixReport> matrixReports = ProtobufUtil.convertToMatrixReports(matrixReportsProto);
    context.getMatrixMetaManager().syncMatrixInfos(matrixReports, needCreateMatrices, needReleaseMatrices, needRecoverParts, psAttemptId.getPsId());

    for (MatrixMeta toCreate : needCreateMatrices) {
        resBuilder.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(toCreate));
    }
    for (Integer toRelease : needReleaseMatrices) {
        resBuilder.addNeedReleaseMatrixIds(toRelease);
    }
    for (RecoverPartKey toRecover : needRecoverParts) {
        resBuilder.addNeedRecoverParts(ProtobufUtil.convert(toRecover));
    }
    return resBuilder.build();
}
Also used : ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PSAttemptId(com.tencent.angel.ps.PSAttemptId) MatrixReport(com.tencent.angel.ml.matrix.MatrixReport)

Example 3 with RecoverPartKey

use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.

the class AMMatrixMetaManager method syncMatrixInfos.

/**
 * Compares the matrix metadata held by the master with the matrices reported by a parameter
 * server and fills the given output lists with the resulting work items.
 *
 * @param matrixReports matrix reports of the parameter server, including the ids of the matrices it holds
 * @param needCreateMatrixes output: matrices the parameter server needs to create
 * @param needReleaseMatrixes output: ids of the matrices the parameter server needs to remove
 * @param needRecoverParts output: partitions that need to be recovered
 * @param psId parameter server id
 */
public void syncMatrixInfos(List<MatrixReport> matrixReports, List<MatrixMeta> needCreateMatrixes, List<Integer> needReleaseMatrixes, List<RecoverPartKey> needRecoverParts, ParameterServerId psId) {
    // Ids of all matrices mentioned in the parameter server's report.
    IntOpenHashSet reportedMatrixIds = new IntOpenHashSet();
    for (MatrixReport report : matrixReports) {
        reportedMatrixIds.add(report.matrixId);
    }

    handleMatrixReports(psId, matrixReports);

    // Hand over (and clear) the recover requests queued for this parameter server, if any.
    Set<RecoverPartKey> pendingRecoverParts = getAndRemoveNeedRecoverParts(psId);
    if (pendingRecoverParts != null) {
        needRecoverParts.addAll(pendingRecoverParts);
    }

    // Work out which matrices the parameter server has to create and which to delete.
    getPSNeedUpdateMatrix(reportedMatrixIds, needCreateMatrixes, needReleaseMatrixes, psId);
    psMatricesUpdate(psId, matrixReports);
}
Also used : IntOpenHashSet(it.unimi.dsi.fastutil.ints.IntOpenHashSet) RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey)

Example 4 with RecoverPartKey

use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.

the class PeriodPusher method start.

/**
 * Starts the pusher and launches the background "psha-push-dispatcher" thread.
 *
 * <p>The dispatcher sleeps for {@code pushIntervalMs}, then takes (and clears) the set of
 * partitions that need recovery. For each such partition it asks the master for the
 * partition location; if the partition has more than one location and this parameter server
 * is the first one in the list, the partition is pushed to every other location. The thread
 * then waits for all pushes of the round to finish before sleeping again.
 *
 * <p>The thread exits when {@code stopped} is set or when it is interrupted.
 */
public void start() {
    super.start();
    dispatcher = new Thread(() -> {
        ParameterServerId psId = context.getPSAttemptId().getPsId();
        while (!stopped.get() && !Thread.interrupted()) {
            try {
                Thread.sleep(pushIntervalMs);
                Map<PartitionKey, Integer> parts = getAndClearAllNeedRecoverParts();
                Map<RecoverPartKey, FutureResult> futures = new HashMap<>(parts.size());
                for (PartitionKey part : parts.keySet()) {
                    PartitionLocation partLoc = context.getMaster().getPartLocation(part.getMatrixId(), part.getPartitionId());
                    // Only the server listed first for the partition pushes to the others.
                    if ((partLoc.psLocs.size() > 1) && psId.equals(partLoc.psLocs.get(0).psId)) {
                        int size = partLoc.psLocs.size();
                        for (int i = 1; i < size; i++) {
                            RecoverPartKey partKey = new RecoverPartKey(part, partLoc.psLocs.get(i));
                            LOG.info("Start to backup partition " + partKey.partKey + " to " + partKey.psLoc);
                            futures.put(partKey, recover(partKey));
                        }
                    }
                }
                waitResults(futures);
            } catch (InterruptedException e) {
                // Catching the exception clears the interrupt flag; restore it and leave the
                // loop so that an interrupt actually terminates the dispatcher instead of
                // being logged as a recovery failure and ignored.
                Thread.currentThread().interrupt();
                break;
            } catch (Exception e) {
                if (!stopped.get()) {
                    LOG.error("recover parts failed ", e);
                }
            }
        }
    });
    dispatcher.setName("psha-push-dispatcher");
    dispatcher.start();
}
Also used : RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) PartitionKey(com.tencent.angel.PartitionKey) ParameterServerId(com.tencent.angel.ps.ParameterServerId) HashMap(java.util.HashMap) Map(java.util.Map) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 5 with RecoverPartKey

use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.

the class AMMatrixMetaManager method handlePartReport.

/**
 * Processes the report of a single partition sent by a parameter server.
 *
 * <p>Nothing is done when the reporting server is the master of the partition or when the
 * matrix metadata is unknown. Otherwise the reporting server is registered as a replication
 * holder of the partition and, depending on the reported state:
 * <ul>
 *   <li>{@code INITIALIZING}: a recover request towards the reporting server is queued for
 *       the master of the partition;</li>
 *   <li>{@code READ_AND_WRITE}: if the partition metadata kept for the reporting server
 *       names it as master, it is made master of the partition.</li>
 * </ul>
 *
 * @param psId id of the reporting parameter server
 * @param matrixId id of the matrix the partition belongs to
 * @param partReport the partition report
 */
private void handlePartReport(ParameterServerId psId, int matrixId, PartReport partReport) {
    ParameterServerId masterPs = matrixMetaManager.getMasterPs(matrixId, partReport.partId);
    if (psId.equals(masterPs)) {
        // The master's own report needs no handling here.
        return;
    }

    MatrixMeta meta = matrixMetaManager.getMatrixMeta(matrixId);
    if (meta == null) {
        return;
    }

    meta.getPartitionMeta(partReport.partId).addReplicationPS(psId);

    if (partReport.state == PartitionState.INITIALIZING) {
        PartitionKey reportedPart = new PartitionKey(matrixId, partReport.partId);
        PSLocation reporterLoc = new PSLocation(psId, context.getLocationManager().getPsLocation(psId));
        addNeedRecoverPart(masterPs, new RecoverPartKey(reportedPart, reporterLoc));
    } else if (partReport.state == PartitionState.READ_AND_WRITE) {
        ParameterServerId recordedMaster = matrixPartitionsOnPS.get(psId).get(matrixId).getPartitionMeta(partReport.partId).getMasterPs();
        if (recordedMaster.equals(psId)) {
            matrixMetaManager.getMatrixMeta(matrixId).getPartitionMeta(partReport.partId).makePsToMaster(psId);
        }
    }
}
Also used : RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) PartitionKey(com.tencent.angel.PartitionKey) ParameterServerId(com.tencent.angel.ps.ParameterServerId)

Aggregations

RecoverPartKey (com.tencent.angel.ps.recovery.ha.RecoverPartKey)5 PartitionKey (com.tencent.angel.PartitionKey)3 PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)2 PSLocation (com.tencent.angel.ml.matrix.transport.PSLocation)2 ParameterServerId (com.tencent.angel.ps.ParameterServerId)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)1 MatrixReport (com.tencent.angel.ml.matrix.MatrixReport)1 PSAttemptId (com.tencent.angel.ps.PSAttemptId)1 Int2IntOpenHashMap (it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap)1 IntOpenHashSet (it.unimi.dsi.fastutil.ints.IntOpenHashSet)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1