use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.
the class PSFailedReportTest method testPSFailedReport.
/**
 * Exercises the "PS failed" report path: pushes updates to matrix "w1", reports ps2 as failed
 * to the master, and checks the per-partition sums of row 0 on the parameter servers before
 * and after the report.
 *
 * <p>Phases:
 * <ol>
 *   <li>{@code iterNum} increments, verifying both ps1 (attempt 0) and ps2 (attempt 0);</li>
 *   <li>report ps2 as failed via the worker's master client and wait 20 seconds;</li>
 *   <li>{@code iterNum} more increments, verifying ps1 only;</li>
 *   <li>{@code iterNum} more increments, verifying ps1 and ps2 attempt 1.</li>
 * </ol>
 *
 * @throws Exception if any cluster call, clock wait or sleep fails
 */
@Test
public void testPSFailedReport() throws Exception {
  ParameterServerId ps1Id = new ParameterServerId(0);
  final ParameterServerId ps2Id = new ParameterServerId(1);
  PSAttemptId ps1Attempt0Id = new PSAttemptId(ps1Id, 0);
  PSAttemptId ps2Attempt0Id = new PSAttemptId(ps2Id, 0);
  PSAttemptId ps2Attempt1Id = new PSAttemptId(ps2Id, 1);
  ParameterServer ps1Attempt0 = LocalClusterContext.get().getPS(ps1Attempt0Id).getPS();
  ParameterServer ps2Attempt0 = LocalClusterContext.get().getPS(ps2Attempt0Id).getPS();

  WorkerId worker0Id = new WorkerId(new WorkerGroupId(0), 0);
  WorkerAttemptId worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
  Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
  TaskContext task0Context = worker0.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
  MatrixClient matrixClient = task0Context.getMatrix("w1");

  int iterNum = 20;

  // Phase 1: both original PS attempts must reflect every increment.
  for (int i = 0; i < iterNum; i++) {
    pushOnesToRow0AndClock(matrixClient);
    assertRow0PartSumsAfterIteration(ps1Attempt0, matrixClient, i);
    assertRow0PartSumsAfterIteration(ps2Attempt0, matrixClient, i);
  }

  // Phase 2: report ps2 (at its current location) as failed to the master.
  LOG.info("===================================================================ps2 failed");
  HashMap<PSLocation, Integer> failedCounters = new HashMap<>();
  PSLocation psLoc = new PSLocation(ps2Id, ps2Attempt0.getLocationManager().getPsLocation(ps2Id));
  failedCounters.put(psLoc, 10000);
  worker0.getPSAgent().getMasterClient().psFailedReport(failedCounters);
  // NOTE(review): presumably gives the master time to react to the report — confirm.
  Thread.sleep(20000);

  // Phase 3: keep updating; only ps1 is verified here.
  for (int i = iterNum; i < 2 * iterNum; i++) {
    pushOnesToRow0AndClock(matrixClient);
    assertRow0PartSumsAfterIteration(ps1Attempt0, matrixClient, i);
  }

  // Phase 4: look up ps2's attempt 1 and verify it alongside ps1.
  ParameterServer ps2Attempt = LocalClusterContext.get().getPS(ps2Attempt1Id).getPS();
  for (int i = iterNum * 2; i < 3 * iterNum; i++) {
    pushOnesToRow0AndClock(matrixClient);
    assertRow0PartSumsAfterIteration(ps1Attempt0, matrixClient, i);
    assertRow0PartSumsAfterIteration(ps2Attempt, matrixClient, i);
  }
}

/**
 * Increments row 0 of the client's matrix by a vector of {@code dim} ones, waits for the
 * clock operation to complete, then sleeps one second before the caller verifies.
 */
private void pushOnesToRow0AndClock(MatrixClient matrixClient) throws Exception {
  DenseIntVector update = new DenseIntVector(dim);
  for (int j = 0; j < dim; j++) {
    update.set(j, 1);
  }
  update.setMatrixId(matrixClient.getMatrixId());
  update.setRowId(0);
  matrixClient.increment(update);
  matrixClient.clock().get();
  Thread.sleep(1000);
}

/**
 * Asserts that, on the given parameter server, partitions 0 and 1 of the client's matrix
 * exist and that each of the two row slices read via {@code getRow(0, 0)} and
 * {@code getRow(1, 0)} sums to {@code (i + 1) * dim / 2}.
 *
 * @param ps parameter server whose storage is inspected
 * @param matrixClient client used only to obtain the matrix id
 * @param i zero-based index of the iteration that has just completed
 */
private void assertRow0PartSumsAfterIteration(ParameterServer ps, MatrixClient matrixClient, int i)
    throws Exception {
  MatrixStorageManager storage = ps.getMatrixStorageManager();
  ServerMatrix w1 = storage.getMatrix(matrixClient.getMatrixId());
  assertNotNull(w1.getPartition(0));
  assertNotNull(w1.getPartition(1));

  IntBuffer row0Part0 = ((ServerDenseIntRow) w1.getRow(0, 0)).getData();
  int part0Size = w1.getRow(0, 0).size();
  IntBuffer row0Part1 = ((ServerDenseIntRow) w1.getRow(1, 0)).getData();
  int part1Size = w1.getRow(1, 0).size();

  // JUnit convention: expected value first, actual value second.
  assertEquals((i + 1) * dim / 2, sum(row0Part0, part0Size));
  assertEquals((i + 1) * dim / 2, sum(row0Part1, part1Size));
}
use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.
the class PSMatrixMetaManager method getPartLocation.
/**
 * Resolve where a partition is stored: the parameter servers that hold it, each paired
 * with the location currently recorded for that server by the location manager.
 *
 * @param partitionKey partition information
 * @return the partition location; it wraps an empty list when no parameter servers are
 *         returned for the partition
 * @throws ServiceException declared by the signature; presumably raised by the underlying
 *         lookups (not visible here)
 */
public PartitionLocation getPartLocation(PartitionKey partitionKey) throws ServiceException {
  List<ParameterServerId> storedPss = getPss(partitionKey);
  List<PSLocation> locations;
  if (storedPss != null) {
    int psCount = storedPss.size();
    locations = new ArrayList<>(psCount);
    for (int idx = 0; idx < psCount; idx++) {
      ParameterServerId psId = storedPss.get(idx);
      locations.add(new PSLocation(psId, context.getLocationManager().getPsLocation(psId)));
    }
  } else {
    locations = new ArrayList<>();
  }
  return new PartitionLocation(locations);
}
use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.
the class MasterService method getPartLocation.
/**
 * Get locations for a partition.
 *
 * <p>Looks up the parameter servers recorded for the requested matrix partition and adds one
 * location entry per server to the response. A server with no known location is reported
 * with status {@code PS_NOTREADY}. If no servers are recorded, the response is empty.
 *
 * @param controller rpc controller of protobuf
 * @param request carries the matrix id and partition id to look up
 * @return response holding one location entry per parameter server
 */
@Override
public GetPartLocationResponse getPartLocation(RpcController controller, GetPartLocationRequest request) throws ServiceException {
  GetPartLocationResponse.Builder response = GetPartLocationResponse.newBuilder();
  List<ParameterServerId> holders = context.getMatrixMetaManager().getPss(request.getMatrixId(), request.getPartId());
  if (holders == null) {
    return response.build();
  }

  int holderCount = holders.size();
  for (int idx = 0; idx < holderCount; idx++) {
    ParameterServerId psId = holders.get(idx);
    Location loc = context.getLocationManager().getPsLocation(psId);
    if (loc != null) {
      response.addLocations(ProtobufUtil.convertToPSLocProto(psId, loc));
    } else {
      // No location recorded for this server: report it as not ready.
      response.addLocations((PSLocationProto.newBuilder().setPsId(ProtobufUtil.convertToIdProto(psId)).setPsStatus(PSStatus.PS_NOTREADY).build()));
    }
  }
  return response.build();
}
use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.
the class MasterService method getPSLocation.
/**
 * Get the location of a specific parameter server.
 *
 * <p>When the location manager has no location for the requested server, the response
 * carries the requested id with status {@code PS_NOTREADY}.
 *
 * @param controller rpc controller of protobuf
 * @param request parameter server id
 * @return response holding the location entry for the requested parameter server
 */
@Override
public GetPSLocationReponse getPSLocation(RpcController controller, GetPSLocationRequest request) throws ServiceException {
  GetPSLocationReponse.Builder response = GetPSLocationReponse.newBuilder();
  ParameterServerId requestedPs = ProtobufUtil.convertToId(request.getPsId());
  Location knownLocation = context.getLocationManager().getPsLocation(requestedPs);
  if (knownLocation != null) {
    response.setPsLocation(ProtobufUtil.convertToPSLocProto(requestedPs, knownLocation));
  } else {
    // Unknown location: echo the requested id back with a not-ready status.
    response.setPsLocation(PSLocationProto.newBuilder().setPsId(request.getPsId()).setPsStatus(PSStatus.PS_NOTREADY).build());
  }
  return response.build();
}
use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.
the class AngelApplicationMaster method initAndStart.
/**
 * Initializes and starts all service modules of the Angel application master.
 *
 * <p>In order: builds the app state storage, event dispatcher, location manager, container
 * allocator/launcher (local or YARN depending on deploy mode) and the master RPC service;
 * runs the recovery steps for matrix meta, PS attempt indexes and task state; creates the
 * client, PS-agent, parameter-server and (in {@code ANGEL_PS_WORKER} mode) worker managers;
 * registers the event handlers; then initializes and starts the services and all parameter
 * servers.
 *
 * @throws Exception if any module fails to initialize or start
 */
public void initAndStart() throws Exception {
  addIfService(angelApp);

  // init app state storage
  String tmpOutPath = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
  Path appStatePath = new Path(tmpOutPath, "app");
  LOG.info("app state output path = " + appStatePath.toUri().toString());
  FileSystem fs = appStatePath.getFileSystem(conf);
  appStateStorage = new AppStateStorage(appContext, appStatePath.toUri().toString(), fs);
  addIfService(appStateStorage);
  LOG.info("build app state storage success");

  // init event dispatcher
  dispatcher = new AsyncDispatcher();
  addIfService(dispatcher);
  LOG.info("build event dispacher");

  // init location manager
  locationManager = new LocationManager();

  // init container allocator and launcher according to the deploy mode
  AngelDeployMode deployMode = appContext.getDeployMode();
  LOG.info("deploy mode=" + deployMode);
  if (deployMode == AngelDeployMode.LOCAL) {
    containerAllocator = new LocalContainerAllocator(appContext);
    containerLauncher = new LocalContainerLauncher(appContext);
  } else {
    containerAllocator = new YarnContainerAllocator(appContext);
    containerLauncher = new YarnContainerLauncher(appContext);
  }
  addIfService(containerAllocator);
  dispatcher.register(ContainerAllocatorEventType.class, containerAllocator);
  LOG.info("build containerAllocator success");
  addIfService(containerLauncher);
  dispatcher.register(ContainerLauncherEventType.class, containerLauncher);
  LOG.info("build containerLauncher success");

  // init a rpc service
  masterService = new MasterService(appContext);
  LOG.info("build master service success");

  // recover matrix meta if needed
  recoverMatrixMeta();

  // recover ps attempt information if needed
  Map<ParameterServerId, Integer> psIdToAttemptIndexMap = recoverPSAttemptIndex();
  if (psIdToAttemptIndexMap == null) {
    LOG.info("recoverPSAttemptIndex return is null");
  } else {
    for (Entry<ParameterServerId, Integer> entry : psIdToAttemptIndexMap.entrySet()) {
      LOG.info("psId=" + entry.getKey() + ",attemptIndex=" + entry.getValue());
    }
  }

  // Init Client manager
  clientManager = new ClientManager(appContext);
  addIfService(clientManager);

  // Init PS Client manager
  psAgentManager = new PSAgentManager(appContext);
  addIfService(psAgentManager);

  // init parameter server manager
  psManager = new ParameterServerManager(appContext, psIdToAttemptIndexMap);
  addIfService(psManager);
  psManager.init();

  // Hand the PS ids to the location manager sorted by ascending index.
  List<ParameterServerId> psIds = new ArrayList<>(psManager.getParameterServerMap().keySet());
  Collections.sort(psIds, new Comparator<ParameterServerId>() {
    @Override
    public int compare(ParameterServerId s1, ParameterServerId s2) {
      // Integer.compare avoids the overflow that subtracting the two indexes can produce.
      return Integer.compare(s1.getIndex(), s2.getIndex());
    }
  });
  locationManager.setPsIds(psIds.toArray(new ParameterServerId[0]));

  dispatcher.register(ParameterServerManagerEventType.class, psManager);
  dispatcher.register(AMParameterServerEventType.class, new ParameterServerEventHandler());
  dispatcher.register(PSAttemptEventType.class, new PSAttemptEventDispatcher());
  LOG.info("build PSManager success");

  // recover task information if needed
  recoverTaskState();

  RunningMode mode = appContext.getRunningMode();
  LOG.info("running mode=" + mode);
  switch(mode) {
    case ANGEL_PS_WORKER:
      {
        // a dummy data splitter is just for test now
        boolean useDummyDataSpliter = conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER);
        if (useDummyDataSpliter) {
          dataSpliter = new DummyDataSpliter(appContext);
        } else {
          // recover data splits information if needed
          // NOTE(review): dataSpliter is dereferenced below; presumably recoveryDataSplits()
          // assigns it — confirm.
          recoveryDataSplits();
        }

        // init worker manager and register worker manager event
        workerManager = new WorkerManager(appContext);
        workerManager.adjustTaskNumber(dataSpliter.getSplitNum());
        addIfService(workerManager);
        dispatcher.register(WorkerManagerEventType.class, workerManager);
        dispatcher.register(AMWorkerGroupEventType.class, new WorkerGroupEventHandler());
        dispatcher.register(AMWorkerEventType.class, new WorkerEventHandler());
        dispatcher.register(WorkerAttemptEventType.class, new WorkerAttemptEventHandler());
        LOG.info("build WorkerManager success");
        break;
      }
    case ANGEL_PS:
      // no worker-side modules are created in this mode
      break;
  }

  // register slow worker/ps checker
  addIfService(new SlowChecker(appContext));

  algoMetricsService = new MetricsService(appContext);
  addIfService(algoMetricsService);
  dispatcher.register(MetricsEventType.class, algoMetricsService);

  // register app manager event and finish event
  dispatcher.register(AppEventType.class, angelApp);
  dispatcher.register(AppFinishEventType.class, new AppFinishEventHandler());

  // Init model saver & loader
  modelSaver = new AMModelSaver(appContext);
  addIfService(modelSaver);
  modelLoader = new AMModelLoader(appContext);
  addIfService(modelLoader);

  hbMonitor = new HeartbeatMonitor(appContext);
  addIfService(hbMonitor);

  masterService.init(conf);
  super.init(conf);

  // start a web service if use yarn deploy mode
  if (deployMode == AngelDeployMode.YARN) {
    try {
      webApp = WebApps.$for("angel", AMContext.class, appContext).with(conf).start(new AngelWebApp());
      LOG.info("start webapp server success");
      LOG.info("webApp.port()=" + webApp.port());
    } catch (Exception e) {
      // A web UI failure is logged and deliberately not propagated.
      LOG.error("Webapps failed to start. Ignoring for now:", e);
    }
  }

  // The master location is published only after the RPC service has been started.
  masterService.start();
  locationManager.setMasterLocation(masterService.getLocation());

  super.serviceStart();
  psManager.startAllPS();
  AngelServiceLoader.startServiceIfNeed(this, getConfig());

  LOG.info("appAttemptId.getAttemptId()=" + appAttemptId.getAttemptId());
  // For attempt ids above 1, wait for PS registration and metrics before starting execution.
  if (appAttemptId.getAttemptId() > 1) {
    waitForAllPsRegisted();
    waitForAllMetricsInited();
    angelApp.startExecute();
  }
}
Aggregations