use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.
the class PS2PSPusherImpl method start.
/**
 * Start the PS client, create the worker pool and launch the "Recover-checker" thread.
 *
 * <p>The checker wakes up every 30 seconds. While holding the read lock it walks
 * {@code failedUpdateCounters}; for each partition with a non-empty counter map it asks the
 * master for the partition location. If the partition has more than one location and the first
 * location's PS id equals this PS's id (presumably meaning this PS is the master replica —
 * confirm), it calls {@code recover} for every other location that has a positive failed-update
 * counter. After releasing the lock it waits for all submitted recoveries via
 * {@code waitResults}.
 *
 * <p>An interrupt ends the checker loop; any other failure is logged (unless the pusher has
 * been stopped) and the loop continues with the next round.
 */
public void start() {
  psClient.start();
  workerPool = Executors.newFixedThreadPool(16);
  recoverChecker = new Thread(() -> {
    while (!stopped.get() && !Thread.interrupted()) {
      try {
        Thread.sleep(30000);
        Map<RecoverPartKey, FutureResult> futures = new HashMap<>();
        // Acquire the lock before entering the try block so that the finally clause only
        // unlocks a lock that was actually obtained.
        lock.readLock().lock();
        try {
          for (Map.Entry<PartitionKey, Map<PSLocation, Integer>> partEntry : failedUpdateCounters.entrySet()) {
            PartitionKey partKey = partEntry.getKey();
            Map<PSLocation, Integer> failedCounters = partEntry.getValue();
            if (failedCounters.isEmpty()) {
              continue;
            }
            PartitionLocation partLoc = context.getMaster().getPartLocation(partKey.getMatrixId(), partKey.getPartitionId());
            if (partLoc.psLocs.size() > 1 && partLoc.psLocs.get(0).psId.equals(context.getPSAttemptId().getPsId())) {
              // Index 0 is this PS itself; only the remaining locations are recovery targets.
              for (int i = 1; i < partLoc.psLocs.size(); i++) {
                PSLocation psLoc = partLoc.psLocs.get(i);
                // Single lookup; a missing entry is treated the same as a zero counter.
                Integer failedCount = failedCounters.get(psLoc);
                if (failedCount != null && failedCount > 0) {
                  RecoverPartKey recoverPartKey = new RecoverPartKey(partKey, psLoc);
                  futures.put(recoverPartKey, recover(recoverPartKey));
                }
              }
            }
          }
        } finally {
          lock.readLock().unlock();
        }
        waitResults(futures);
      } catch (InterruptedException e) {
        // Restore the interrupt flag so the loop condition observes it and the thread exits
        // instead of silently continuing for another round.
        Thread.currentThread().interrupt();
      } catch (Throwable e) {
        if (!stopped.get()) {
          LOG.error("check and recover failed partitions failed ", e);
        }
      }
    }
  });
  recoverChecker.setName("Recover-checker");
  recoverChecker.start();
}
use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.
the class MasterService method psReport.
/**
 * Handle a heartbeat report from a parameter server attempt and build the reply.
 *
 * <p>The reply carries a PS command (SHUTDOWN when the attempt is not in the heartbeat monitor
 * map, COMMIT with the matrices/partitions to save when the PS manager says commit is possible,
 * OK otherwise) plus the matrices the PS should create, the matrix ids it should release and
 * the partitions it should recover, as computed by the matrix meta manager.
 *
 * @param controller rpc controller of protobuf
 * @param request heartbeat request
 * @return the heartbeat response for the reporting PS attempt
 * @throws ServiceException declared by the RPC interface
 */
@SuppressWarnings("unchecked")
@Override
public PSReportResponse psReport(RpcController controller, PSReportRequest request) throws ServiceException {
  if (LOG.isDebugEnabled()) {
    LOG.debug("receive ps heartbeat request. request=" + request);
  }

  // Flatten the reported metric pairs into a key -> value map.
  List<Pair> metricPairs = request.getMetricsList();
  Map<String, String> metrics = new HashMap<String, String>();
  for (Pair metric : metricPairs) {
    metrics.put(metric.getKey(), metric.getValue());
  }

  PSAttemptId psAttemptId = ProtobufUtil.convertToId(request.getPsAttemptId());
  PSReportResponse.Builder response = PSReportResponse.newBuilder();

  if (psLastHeartbeatTS.containsKey(psAttemptId)) {
    // Known attempt: refresh its heartbeat timestamp and forward the metrics to it.
    psLastHeartbeatTS.put(psAttemptId, System.currentTimeMillis());
    context.getEventHandler().handle(new PSAttemptStateUpdateEvent(psAttemptId, metrics));

    if (context.getParameterServerManager().psCanCommit()) {
      List<Integer> commitMatrixIds = context.getParameterServerManager().getNeedCommitMatrixIds();
      LOG.info("notify ps" + psAttemptId + " to commit now! commit matrices:" + StringUtils.joinInts(",", commitMatrixIds));
      response.setPsCommand(PSCommandProto.PSCOMMAND_COMMIT);

      // One builder is reused for every matrix and reset after each build.
      NeedSaveMatrixProto.Builder saveBuilder = NeedSaveMatrixProto.newBuilder();
      for (int matrixId : commitMatrixIds) {
        saveBuilder.setMatrixId(matrixId);
        saveBuilder.addAllPartIds(context.getMatrixMetaManager().getMasterPartsInPS(matrixId, psAttemptId.getPsId()));
        response.addNeedSaveMatrices(saveBuilder.build());
        saveBuilder.clear();
      }
    } else {
      response.setPsCommand(PSCommandProto.PSCOMMAND_OK);
    }
  } else {
    // Attempt is not monitored: tell it to shut down.
    LOG.error("ps attempt " + psAttemptId + " is not in running ps attempt set");
    response.setPsCommand(PSCommandProto.PSCOMMAND_SHUTDOWN);
  }

  // Record the failed-update counters reported by this PS.
  context.getParameterServerManager().psFailedReports(ProtobufUtil.convert(request.getPsFailedReports()));

  // Reconcile matrix metadata between master and PS: the meta manager fills the three output
  // lists with what the PS must create, release and recover.
  List<MatrixReportProto> reportProtos = request.getMatrixReportsList();
  List<Integer> releaseIds = new ArrayList<>();
  List<MatrixMeta> createMetas = new ArrayList<>();
  List<RecoverPartKey> recoverParts = new ArrayList<>();
  List<MatrixReport> matrixReports = ProtobufUtil.convertToMatrixReports(reportProtos);
  context.getMatrixMetaManager().syncMatrixInfos(matrixReports, createMetas, releaseIds, recoverParts, psAttemptId.getPsId());

  for (MatrixMeta meta : createMetas) {
    response.addNeedCreateMatrices(ProtobufUtil.convertToMatrixMetaProto(meta));
  }
  for (Integer releaseId : releaseIds) {
    response.addNeedReleaseMatrixIds(releaseId);
  }
  for (RecoverPartKey recoverPart : recoverParts) {
    response.addNeedRecoverParts(ProtobufUtil.convert(recoverPart));
  }
  return response.build();
}
use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.
the class AMMatrixMetaManager method syncMatrixInfos.
/**
 * Reconcile the matrices reported by a parameter server with the master's view.
 *
 * <p>Steps, in order: collect the reported matrix ids, pass the reports to
 * {@code handleMatrixReports}, move any pending recover partitions registered for this PS into
 * {@code needRecoverParts}, compute the create/release lists via
 * {@code getPSNeedUpdateMatrix}, and finally call {@code psMatricesUpdate} with the reports.
 *
 * @param matrixReports parameter server matrix reports, including the matrix ids the PS holds
 * @param needCreateMatrixes output: matrices this parameter server needs to build
 * @param needReleaseMatrixes output: matrix ids this parameter server needs to remove
 * @param needRecoverParts output: partitions that need to be recovered
 * @param psId parameter server id
 */
public void syncMatrixInfos(List<MatrixReport> matrixReports, List<MatrixMeta> needCreateMatrixes,
    List<Integer> needReleaseMatrixes, List<RecoverPartKey> needRecoverParts, ParameterServerId psId) {
  // Matrix ids present in the parameter server report.
  IntOpenHashSet reportedMatrixIds = new IntOpenHashSet();
  for (MatrixReport report : matrixReports) {
    reportedMatrixIds.add(report.matrixId);
  }

  handleMatrixReports(psId, matrixReports);

  // Hand over (and clear) whatever recover requests are queued for this PS; may be null.
  Set<RecoverPartKey> pendingRecoverParts = getAndRemoveNeedRecoverParts(psId);
  if (pendingRecoverParts != null) {
    needRecoverParts.addAll(pendingRecoverParts);
  }

  // Work out which matrices the parameter server has to create and which to delete.
  getPSNeedUpdateMatrix(reportedMatrixIds, needCreateMatrixes, needReleaseMatrixes, psId);
  psMatricesUpdate(psId, matrixReports);
}
use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.
the class PeriodPusher method start.
/**
 * Start the base pusher, then launch the "psha-push-dispatcher" thread.
 *
 * <p>Every {@code pushIntervalMs} milliseconds the dispatcher takes (and clears) the set of
 * partitions that need recovery. For each partition it asks the master for the partition
 * location; if there is more than one location and the first location's PS id equals this
 * PS's id, it calls {@code recover} for every other location and then waits for all results
 * via {@code waitResults}.
 *
 * <p>An interrupt ends the dispatcher loop; any other exception is logged (unless the pusher
 * has been stopped) and the loop continues with the next round.
 */
public void start() {
  super.start();
  dispatcher = new Thread(() -> {
    ParameterServerId psId = context.getPSAttemptId().getPsId();
    while (!stopped.get() && !Thread.interrupted()) {
      try {
        Thread.sleep(pushIntervalMs);
        Map<PartitionKey, Integer> parts = getAndClearAllNeedRecoverParts();
        Map<RecoverPartKey, FutureResult> futures = new HashMap<>(parts.size());
        for (PartitionKey part : parts.keySet()) {
          PartitionLocation partLoc = context.getMaster().getPartLocation(part.getMatrixId(), part.getPartitionId());
          if ((partLoc.psLocs.size() > 1) && psId.equals(partLoc.psLocs.get(0).psId)) {
            int size = partLoc.psLocs.size();
            // Index 0 is this PS itself; push to every remaining location.
            for (int i = 1; i < size; i++) {
              RecoverPartKey partKey = new RecoverPartKey(part, partLoc.psLocs.get(i));
              LOG.info("Start to backup partition " + partKey.partKey + " to " + partKey.psLoc);
              futures.put(partKey, recover(partKey));
            }
          }
        }
        waitResults(futures);
      } catch (InterruptedException e) {
        // Restore the interrupt flag so the loop condition observes it and the thread exits,
        // rather than reporting the interrupt as a recovery failure and looping again.
        Thread.currentThread().interrupt();
      } catch (Exception e) {
        if (!stopped.get()) {
          LOG.error("recover parts failed ", e);
        }
      }
    }
  });
  dispatcher.setName("psha-push-dispatcher");
  dispatcher.start();
}
use of com.tencent.angel.ps.recovery.ha.RecoverPartKey in project angel by Tencent.
the class AMMatrixMetaManager method handlePartReport.
/**
 * Process the report of one partition sent by a parameter server.
 *
 * <p>Nothing happens when the reporting PS is already the master PS of the partition, or when
 * the matrix meta is unknown. Otherwise the PS is added as a replication PS of the partition,
 * and then:
 * <ul>
 *   <li>state INITIALIZING: a recover request targeting the reporting PS is registered for the
 *       current master PS;</li>
 *   <li>state READ_AND_WRITE: if the partition meta stored in {@code matrixPartitionsOnPS} for
 *       this PS names this PS as master, the PS is made master of the partition again.</li>
 * </ul>
 *
 * @param psId id of the reporting parameter server
 * @param matrixId id of the matrix the partition belongs to
 * @param partReport the partition report (partition id and state)
 */
private void handlePartReport(ParameterServerId psId, int matrixId, PartReport partReport) {
  ParameterServerId master = matrixMetaManager.getMasterPs(matrixId, partReport.partId);
  if (psId.equals(master)) {
    return;
  }

  MatrixMeta matrixMeta = matrixMetaManager.getMatrixMeta(matrixId);
  if (matrixMeta == null) {
    return;
  }
  matrixMeta.getPartitionMeta(partReport.partId).addReplicationPS(psId);

  if (partReport.state == PartitionState.INITIALIZING) {
    PartitionKey reportedPart = new PartitionKey(matrixId, partReport.partId);
    PSLocation reporterLoc = new PSLocation(psId, context.getLocationManager().getPsLocation(psId));
    addNeedRecoverPart(master, new RecoverPartKey(reportedPart, reporterLoc));
    return;
  }

  if (partReport.state == PartitionState.READ_AND_WRITE) {
    ParameterServerId recordedMaster = matrixPartitionsOnPS.get(psId)
        .get(matrixId)
        .getPartitionMeta(partReport.partId)
        .getMasterPs();
    if (recordedMaster.equals(psId)) {
      matrixMetaManager.getMatrixMeta(matrixId).getPartitionMeta(partReport.partId).makePsToMaster(psId);
    }
  }
}
Aggregations