use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.
the class MasterClient method getPartLocation.
/**
 * Ask the master which pss store a matrix partition and where those pss are located.
 *
 * @param matrixId matrix id
 * @param partId partition id
 * @return the pss that store the partition, together with their locations
 * @throws ServiceException propagated from the rpc call to the master
 */
public PartitionLocation getPartLocation(int matrixId, int partId) throws ServiceException {
  GetPartLocationResponse response = master.getPartLocation(null,
      GetPartLocationRequest.newBuilder()
          .setMatrixId(matrixId)
          .setPartId(partId)
          .build());

  // Convert every protobuf ps location to its in-memory counterpart, keeping the order
  List<PSLocationProto> protoLocs = response.getLocationsList();
  List<PSLocation> locations = new ArrayList<>(protoLocs.size());
  for (PSLocationProto protoLoc : protoLocs) {
    locations.add(new PSLocation(
        ProtobufUtil.convertToId(protoLoc.getPsId()),
        ProtobufUtil.convertToLocation(protoLoc)));
  }
  return new PartitionLocation(locations);
}
use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.
the class PSAgentMatrixMetaManager method getPartLocation.
/**
 * Get the location of a partition: the pss that store it and the location of each of them.
 * TODO: cache
 *
 * @param matrixId matrix id
 * @param partId partition id
 * @return partition location; it wraps an empty ps list when {@code getPss} returns null
 */
public PartitionLocation getPartLocation(int matrixId, int partId) {
  List<ParameterServerId> storedPss = getPss(matrixId, partId);
  if (storedPss == null) {
    return new PartitionLocation(new ArrayList<>());
  }

  // Pair every ps id with the location reported by the location manager, keeping the order
  List<PSLocation> locations = new ArrayList<>(storedPss.size());
  for (ParameterServerId psId : storedPss) {
    locations.add(
        new PSLocation(psId, PSAgentContext.get().getLocationManager().getPsLocation(psId)));
  }
  return new PartitionLocation(locations);
}
use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.
the class PeriodPusher method start.
/**
 * Start the pusher: calls {@code super.start()} and launches the "psha-push-dispatcher" thread.
 *
 * <p>Until the pusher is stopped or the thread is interrupted, the dispatcher repeats:
 * sleep {@code pushIntervalMs}, take (and clear) all partitions that need recover, and for
 * every partition whose first ps location is this ps, recover the partition to each of the
 * other listed pss, then wait for all of these recover operations.
 */
public void start() {
  super.start();
  dispatcher = new Thread(() -> {
    ParameterServerId psId = context.getPSAttemptId().getPsId();
    while (!stopped.get() && !Thread.interrupted()) {
      try {
        Thread.sleep(pushIntervalMs);
        Map<PartitionKey, Integer> parts = getAndClearAllNeedRecoverParts();
        Map<RecoverPartKey, FutureResult> futures = new HashMap<>(parts.size());
        for (PartitionKey part : parts.keySet()) {
          PartitionLocation partLoc = context.getMaster().getPartLocation(part.getMatrixId(), part.getPartitionId());
          // Only the ps listed first for the partition pushes it to the remaining pss
          if ((partLoc.psLocs.size() > 1) && psId.equals(partLoc.psLocs.get(0).psId)) {
            int size = partLoc.psLocs.size();
            for (int i = 1; i < size; i++) {
              RecoverPartKey partKey = new RecoverPartKey(part, partLoc.psLocs.get(i));
              LOG.info("Start to backup partition " + partKey.partKey + " to " + partKey.psLoc);
              futures.put(partKey, recover(partKey));
            }
          }
        }
        waitResults(futures);
      } catch (InterruptedException e) {
        // An interrupt is a request to exit, not a failed recover. Catching it clears the
        // interrupt flag, so restore the flag and leave the loop explicitly instead of
        // falling through to the next round.
        if (!stopped.get()) {
          LOG.info("psha-push-dispatcher is interrupted, exit");
        }
        Thread.currentThread().interrupt();
        break;
      } catch (Exception e) {
        if (!stopped.get()) {
          LOG.error("recover parts failed ", e);
        }
      }
    }
  });
  dispatcher.setName("psha-push-dispatcher");
  dispatcher.start();
}
use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.
the class WorkerPool method putPartUpdate.
/**
 * Handle a "put partition update" rpc: apply the serialized update to the local partition and,
 * when the partition location lists more than one ps, hand the update to the PS2PS pusher.
 *
 * @param seqId rpc request id, written at the head of the returned buffer
 * @param request rpc request
 * @param in serialized request
 * @return serialized rpc response
 */
private ByteBuf putPartUpdate(int seqId, PutPartitionUpdateRequest request, ByteBuf in) {
  if (LOG.isDebugEnabled()) {
    LOG.debug("put update request=" + request + " with seqId=" + seqId);
  }
  long beginTs = System.currentTimeMillis();

  // Every response starts with the request id
  ByteBuf out = ByteBufUtils.newByteBuf(8 + 4);
  out.writeInt(seqId);

  // The partition must exist on this ps
  PartitionKey partKey = request.getPartKey();
  ServerPartition part = context.getMatrixStorageManager().getPart(partKey.getMatrixId(), partKey.getPartitionId());
  if (part == null) {
    String errorMsg = "update " + request + " failed. The partition " + partKey + " does not exist";
    LOG.fatal(errorMsg);
    new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FATAL, errorMsg).serialize(out);
    return out;
  }

  // Requests that do not come from a ps are rejected unless the partition is writable
  PartitionState state = part.getState();
  if (!request.isComeFromPs() && state != PartitionState.READ_AND_WRITE) {
    String errorMsg = "update " + request + " failed. The partition " + partKey + " state is " + state;
    LOG.error(errorMsg);
    new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, errorMsg).serialize(out);
    return out;
  }

  // Find the pss that store this partition
  PartitionLocation partLoc;
  try {
    partLoc = context.getMatrixMetaManager().getPartLocation(request.getPartKey(), disableRouterCache);
  } catch (Throwable t) {
    String errorMsg = "update " + request + " failed, get partition location from master failed " + t.getMessage();
    LOG.error(errorMsg, t);
    new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, errorMsg).serialize(out);
    return out;
  }

  PutPartitionUpdateResponse response;
  if (request.isComeFromPs() || isPartMasterPs(partLoc)) {
    // Accepted: the request comes from a ps, or this ps is the master ps of the partition
    int clock = request.getClock();
    partKey = request.getPartKey();
    int taskIndex = request.getTaskIndex();
    boolean updateClock = request.isUpdateClock();
    if (LOG.isDebugEnabled()) {
      LOG.debug("seqId = " + seqId + " update split request matrixId = " + partKey.getMatrixId() + ", partId = " + partKey.getPartitionId() + " clock = " + clock + ", taskIndex=" + taskIndex + ", updateClock = " + updateClock);
    }

    try {
      // Re-check the state right before the update is applied
      state = part.getState();
      if (state != PartitionState.READ_AND_WRITE) {
        String errorMsg = "update " + request + " failed. The partition " + partKey + " state is " + state;
        LOG.error(errorMsg);
        new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, errorMsg).serialize(out);
        return out;
      }

      part.update(in, rowUpdater);
      if (updateClock) {
        context.getClockVectorManager().updateClock(partKey.getMatrixId(), partKey.getPartitionId(), taskIndex, clock);
      }
      response = new PutPartitionUpdateResponse(ResponseType.SUCCESS);

      // Start to put the update to the slave pss
      if (partLoc.psLocs.size() > 1) {
        context.getPS2PSPusher().put(request, in, partLoc);
        if (updateClock) {
          context.getPS2PSPusher().updateClock(request.getPartKey(), taskIndex, clock, partLoc);
        }
      }
    } catch (Throwable t) {
      String errorMsg = "update " + request + " failed " + t.getMessage();
      LOG.fatal(errorMsg, t);
      response = new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FATAL, errorMsg);
    }
  } else {
    // Rejected: the request does not come from a ps and this ps is not the master ps
    String errorMsg = "local ps is " + context.getPSAttemptId().getPsId() + " update " + request + " failed, update to slave ps for partition " + request.getPartKey();
    LOG.error(errorMsg);
    response = new PutPartitionUpdateResponse(ResponseType.SERVER_HANDLE_FAILED, errorMsg);
  }

  response.serialize(out);
  if (LOG.isDebugEnabled()) {
    LOG.debug("update partition for request " + request + " use time=" + (System.currentTimeMillis() - beginTs) + ", response buf=" + out);
  }
  return out;
}
use of com.tencent.angel.ml.matrix.PartitionLocation in project angel by Tencent.
the class WorkerPool method update.
/**
 * Update a partition use PSF: instantiate the update function named by the request through its
 * no-argument constructor, apply it to the local partition and, when the partition location
 * lists more than one ps, hand the update to the PS2PS pusher.
 *
 * <p>The returned buffer always holds the request id followed by the serialized response, on
 * the failure paths as well as on the success path.
 *
 * @param seqId rpc request id, written at the head of the returned buffer
 * @param request rpc request
 * @param in serialized rpc request
 * @return serialized rpc response
 */
private ByteBuf update(int seqId, UpdaterRequest request, ByteBuf in) {
  UpdaterResponse response = null;
  ByteBuf buf = ByteBufUtils.newByteBuf(4 + 8, useDirectorBuffer);
  // Write the request id up front so that the early failure returns below produce the same
  // "seqId + response" layout as the normal path
  buf.writeInt(seqId);

  // Get partition and check the partition state
  PartitionKey partKey = request.getPartKey();
  ServerPartition part = context.getMatrixStorageManager().getPart(partKey.getMatrixId(), partKey.getPartitionId());
  if (part == null) {
    String log = "update " + request + " failed. The partition " + partKey + " does not exist";
    LOG.fatal(log);
    response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FATAL, log);
    response.serialize(buf);
    return buf;
  }

  PartitionState state = part.getState();
  if (state != PartitionState.READ_AND_WRITE) {
    String log = "update " + request + " failed. The partition " + partKey + " state is " + state;
    LOG.error(log);
    response = new UpdaterResponse(ResponseType.PARTITION_READ_ONLY, log);
    response.serialize(buf);
    return buf;
  }

  // Get the stored pss for this partition
  PartitionLocation partLoc = null;
  try {
    partLoc = context.getMatrixMetaManager().getPartLocation(request.getPartKey(), disableRouterCache);
  } catch (Throwable x) {
    String log = "update " + request + " failed, get partition location from master failed " + x.getMessage();
    LOG.error(log, x);
    response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FAILED, log);
    response.serialize(buf);
    return buf;
  }

  // Check this ps is the master ps for this location, only master ps can accept the update
  if (!request.isComeFromPs() && !isPartMasterPs(partLoc)) {
    String log = "update " + request + " failed, update to slave ps for partition " + request.getPartKey();
    LOG.error(log);
    response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FAILED, log);
  } else {
    try {
      Class<? extends UpdateFunc> funcClass = (Class<? extends UpdateFunc>) Class.forName(request.getUpdaterFuncClass());
      Constructor<? extends UpdateFunc> constructor = funcClass.getConstructor();
      constructor.setAccessible(true);
      UpdateFunc func = constructor.newInstance();
      func.setPsContext(context);

      // Check the partition state again
      state = part.getState();
      if (state != PartitionState.READ_AND_WRITE) {
        String log = "update " + request + " failed. The partition " + partKey + " state is " + state;
        LOG.error(log);
        response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FAILED, log);
        response.serialize(buf);
        return buf;
      }

      part.update(func, request.getPartParam());
      response = new UpdaterResponse();
      response.setResponseType(ResponseType.SUCCESS);
      if (partLoc.psLocs.size() > 1) {
        // Start to put the update to the slave pss
        context.getPS2PSPusher().put(request, in, partLoc);
      }
    } catch (Throwable e) {
      String log = "update " + request + " failed " + e.getMessage();
      LOG.fatal(log, e);
      response = new UpdaterResponse(ResponseType.SERVER_HANDLE_FATAL, log);
    }
  }

  response.serialize(buf);
  return buf;
}
Aggregations