use of com.tencent.angel.master.ps.ParameterServerManager in project angel by Tencent.
the class AngelApplicationMaster method initAndStart.
/**
 * Initializes and starts all service modules of the Angel application master.
 *
 * <p>The modules are built and registered in this order: app state storage, event
 * dispatcher, location manager, container allocator and launcher (local or YARN flavour,
 * chosen by the deploy mode), master RPC service, parameter server manager, the
 * running-mode specific manager (PS agent manager or worker manager), slow checker and
 * metrics service. Matrix meta, PS attempt indexes, task state and (for the worker mode
 * without the dummy splitter) data splits are recovered along the way through the
 * {@code recover*} helpers. After everything is initialized, the web app (YARN deploy
 * mode only), the master service, the registered services and all parameter servers are
 * started. When the application attempt id is greater than 1, the method additionally
 * waits for the PS registrations and metrics and then starts the execution.
 *
 * @throws Exception propagated from any of the build, recovery, init or start steps
 */
public void initAndStart() throws Exception {
  addIfService(angelApp);

  // init app state storage
  String tmpOutPath = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
  Path appStatePath = new Path(tmpOutPath, "app");
  LOG.info("app state output path = " + appStatePath.toUri().toString());
  FileSystem fs = appStatePath.getFileSystem(conf);
  appStateStorage = new AppStateStorage(appContext, appStatePath.toUri().toString(), fs);
  addIfService(appStateStorage);
  LOG.info("build app state storage success");

  // init event dispatcher
  dispatcher = new AsyncDispatcher();
  addIfService(dispatcher);
  LOG.info("build event dispacher");

  // init location manager
  locationManager = new LocationManager();

  // init container allocator and launcher; the implementation depends on the deploy mode
  AngelDeployMode deployMode = appContext.getDeployMode();
  LOG.info("deploy mode=" + deployMode);
  if (deployMode == AngelDeployMode.LOCAL) {
    containerAllocator = new LocalContainerAllocator(appContext);
    containerLauncher = new LocalContainerLauncher(appContext);
  } else {
    containerAllocator = new YarnContainerAllocator(appContext);
    containerLauncher = new YarnContainerLauncher(appContext);
  }
  addIfService(containerAllocator);
  dispatcher.register(ContainerAllocatorEventType.class, containerAllocator);
  LOG.info("build containerAllocator success");
  addIfService(containerLauncher);
  dispatcher.register(ContainerLauncherEventType.class, containerLauncher);
  LOG.info("build containerLauncher success");

  // init a rpc service
  masterService = new MasterService(appContext);
  LOG.info("build master service success");

  // recover matrix meta if needed
  recoverMatrixMeta();

  // recover ps attempt information if needed
  Map<ParameterServerId, Integer> psIdToAttemptIndexMap = recoverPSAttemptIndex();
  if (psIdToAttemptIndexMap == null) {
    LOG.info("recoverPSAttemptIndex return is null");
  } else {
    for (Entry<ParameterServerId, Integer> entry : psIdToAttemptIndexMap.entrySet()) {
      LOG.info("psId=" + entry.getKey() + ",attemptIndex=" + entry.getValue());
    }
  }

  // init parameter server manager
  psManager = new ParameterServerManager(appContext, psIdToAttemptIndexMap);
  addIfService(psManager);
  psManager.init();

  // hand the PS ids, sorted by index, to the location manager
  List<ParameterServerId> psIds = new ArrayList<>(psManager.getParameterServerMap().keySet());
  Collections.sort(psIds, new Comparator<ParameterServerId>() {
    @Override
    public int compare(ParameterServerId s1, ParameterServerId s2) {
      // Integer.compare cannot overflow, unlike subtracting one index from the other.
      return Integer.compare(s1.getIndex(), s2.getIndex());
    }
  });
  locationManager.setPsIds(psIds.toArray(new ParameterServerId[0]));
  dispatcher.register(ParameterServerManagerEventType.class, psManager);
  dispatcher.register(AMParameterServerEventType.class, new ParameterServerEventHandler());
  dispatcher.register(PSAttemptEventType.class, new PSAttemptEventDispatcher());
  LOG.info("build PSManager success");

  // recover task information if needed
  recoverTaskState();

  RunningMode mode = appContext.getRunningMode();
  LOG.info("running mode=" + mode);
  switch (mode) {
    case ANGEL_PS_PSAGENT: {
      // init psagent manager and register psagent manager event
      psAgentManager = new PSAgentManager(appContext);
      addIfService(psAgentManager);
      dispatcher.register(PSAgentManagerEventType.class, psAgentManager);
      dispatcher.register(AMPSAgentEventType.class, new PSAgentEventHandler());
      dispatcher.register(PSAgentAttemptEventType.class, new PSAgentAttemptEventHandler());
      LOG.info("build PSAgentManager success");
      break;
    }
    case ANGEL_PS_WORKER: {
      // a dummy data splitter is just for test now
      boolean useDummyDataSpliter = conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER,
          AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER);
      if (useDummyDataSpliter) {
        dataSpliter = new DummyDataSpliter(appContext);
      } else {
        // recover data splits information if needed
        // NOTE(review): dataSpliter is read right below, so recoveryDataSplits() is
        // presumably responsible for assigning it -- confirm.
        recoveryDataSplits();
      }

      // init worker manager and register worker manager event
      workerManager = new WorkerManager(appContext);
      workerManager.adjustTaskNumber(dataSpliter.getSplitNum());
      addIfService(workerManager);
      dispatcher.register(WorkerManagerEventType.class, workerManager);
      dispatcher.register(AMWorkerGroupEventType.class, new WorkerGroupEventHandler());
      dispatcher.register(AMWorkerEventType.class, new WorkerEventHandler());
      dispatcher.register(WorkerAttemptEventType.class, new WorkerAttemptEventHandler());
      LOG.info("build WorkerManager success");
      break;
    }
    case ANGEL_PS:
      // no additional manager is built for this mode
      break;
  }

  // register slow worker/ps checker
  addIfService(new SlowChecker(appContext));

  // metrics service
  algoMetricsService = new MetricsService(appContext);
  addIfService(algoMetricsService);
  dispatcher.register(MetricsEventType.class, algoMetricsService);

  // register app manager event and finish event
  dispatcher.register(AppEventType.class, angelApp);
  dispatcher.register(AppFinishEventType.class, new AppFinishEventHandler());

  // initialize the master service first, then this service via super.init
  masterService.init(conf);
  super.init(conf);

  // start a web service if use yarn deploy mode; a start failure is logged and ignored
  if (deployMode == AngelDeployMode.YARN) {
    try {
      webApp = WebApps.$for("angel", AMContext.class, appContext).with(conf)
          .start(new AngelWebApp());
      LOG.info("start webapp server success");
      LOG.info("webApp.port()=" + webApp.port());
    } catch (Exception e) {
      LOG.error("Webapps failed to start. Ignoring for now:", e);
    }
  }

  // the master location is only read after the master service has been started
  masterService.start();
  locationManager.setMasterLocation(masterService.getLocation());
  super.serviceStart();
  psManager.startAllPS();
  AngelServiceLoader.startServiceIfNeed(this, getConfig());

  // for an application attempt after the first one: wait for the PS registrations and
  // metrics, then start the execution from here
  LOG.info("appAttemptId.getAttemptId()=" + appAttemptId.getAttemptId());
  if (appAttemptId.getAttemptId() > 1) {
    waitForAllPsRegisted();
    waitForAllMetricsInited();
    angelApp.startExecute();
  }
}
use of com.tencent.angel.master.ps.ParameterServerManager in project angel by Tencent.
the class PSManagerTest method testPSDone.
/**
 * Reports a finished worker attempt and then a finished PS attempt to the master over
 * RPC, and checks that the master marks both the PS attempt and the PS itself as
 * successful.
 */
@SuppressWarnings("unchecked")
@Test
public void testPSDone() throws Exception {
  try {
    // handles to the master and to the parameter server of attempt 0
    AngelApplicationMaster appMaster = LocalClusterContext.get().getMaster().getAppMaster();
    ParameterServer server = LocalClusterContext.get().getPS(psAttempt0Id).getPS();

    // open a client to the master at the location known by the parameter server
    Location masterLocation = server.getMasterLocation();
    TConnection conn = TConnectionManager.getConnection(server.getConf());
    MasterProtocol masterProxy =
        conn.getMasterService(masterLocation.getIp(), masterLocation.getPort());

    // report the worker attempt as done
    WorkerDoneRequest.Builder workerDoneBuilder = WorkerDoneRequest.newBuilder();
    workerDoneBuilder.setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id));
    WorkerDoneResponse workerDoneReply = masterProxy.workerDone(null, workerDoneBuilder.build());
    assertEquals(workerDoneReply.getCommand(), WorkerCommandProto.W_SUCCESS);
    Thread.sleep(5000);

    // send the COMMIT app event, then report the PS attempt as done
    AppEvent commitEvent = new AppEvent(AppEventType.COMMIT);
    appMaster.getAppContext().getEventHandler().handle(commitEvent);
    PSDoneRequest.Builder psDoneBuilder = PSDoneRequest.newBuilder();
    psDoneBuilder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id));
    masterProxy.psDone(null, psDoneBuilder.build());
    Thread.sleep(5000);

    // the master-side view of the PS must now show a single, successful attempt
    ParameterServerManager manager = appMaster.getAppContext().getParameterServerManager();
    AMParameterServer masterSidePs = manager.getParameterServer(psId);
    PSAttempt attempt0 = masterSidePs.getPSAttempt(psAttempt0Id);
    assertEquals(attempt0.getInternalState(), PSAttemptStateInternal.SUCCESS);
    assertTrue(masterSidePs.getState() == AMParameterServerState.SUCCESS);
    assertEquals(masterSidePs.getNextAttemptNumber(), 1);
    assertNull(masterSidePs.getRunningAttemptId());
    assertEquals(masterSidePs.getSuccessAttemptId(), psAttempt0Id);
    assertEquals(masterSidePs.getPSAttempts().size(), 1);
  } catch (Exception failure) {
    // log for diagnosis, then let the test fail with the original exception
    LOG.error("run testPSDone failed ", failure);
    throw failure;
  }
}
use of com.tencent.angel.master.ps.ParameterServerManager in project angel by Tencent.
the class PSManagerTest method testPSReport.
/**
 * Sends a PS report (two metrics plus one matrix report per matrix held by the PS) for
 * attempt 0 and checks the master's answer and the metrics stored on the attempt; then
 * sends the same report under attempt index 1 and expects a shutdown command.
 */
@Test
public void testPSReport() throws Exception {
  try {
    // open a client to the master at the location known by the parameter server
    ParameterServer server = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    Location masterLocation = server.getMasterLocation();
    TConnection conn = TConnectionManager.getConnection(server.getConf());
    MasterProtocol masterProxy =
        conn.getMasterService(masterLocation.getIp(), masterLocation.getPort());

    // build the report: attempt id, two metric pairs, default failed-reports section
    PSReportRequest.Builder report = PSReportRequest.newBuilder();
    report.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id));
    Pair.Builder metric = Pair.newBuilder();
    report.addMetrics(metric.setKey("ps_key1").setValue("100").build());
    report.addMetrics(metric.setKey("ps_key2").setValue("200").build());
    report.setPsFailedReports(MLProtos.PSFailedReportsProto.getDefaultInstance());

    // one matrix report (id and name) per matrix currently held by the parameter server
    MatrixReportProto.Builder matrixReport = MatrixReportProto.newBuilder();
    ConcurrentHashMap<Integer, ServerMatrix> heldMatrices =
        server.getMatrixStorageManager().getMatrices();
    for (Entry<Integer, ServerMatrix> held : heldMatrices.entrySet()) {
      matrixReport.setMatrixId(held.getKey());
      matrixReport.setMatrixName(held.getValue().getName());
      report.addMatrixReports(matrixReport);
    }

    // report for the known attempt: OK, nothing to create or release
    PSReportResponse reply = masterProxy.psReport(null, report.build());
    assertEquals(reply.getPsCommand(), PSCommandProto.PSCOMMAND_OK);
    assertEquals(reply.getNeedCreateMatricesCount(), 0);
    assertEquals(reply.getNeedReleaseMatrixIdsCount(), 0);

    // the reported metrics must be readable from the master-side attempt
    AngelApplicationMaster appMaster = LocalClusterContext.get().getMaster().getAppMaster();
    ParameterServerManager manager = appMaster.getAppContext().getParameterServerManager();
    AMParameterServer masterSidePs = manager.getParameterServer(psId);
    PSAttempt attempt0 = masterSidePs.getPSAttempt(psAttempt0Id);
    Map<String, String> storedMetrics = attempt0.getMetrices();
    assertTrue(storedMetrics.get("ps_key1").equals("100"));
    assertTrue(storedMetrics.get("ps_key2").equals("200"));

    // the same report sent under attempt index 1 is answered with a shutdown command
    PSAttemptId attempt1Id = new PSAttemptId(psId, 1);
    report.setPsAttemptId(ProtobufUtil.convertToIdProto(attempt1Id));
    reply = masterProxy.psReport(null, report.build());
    assertEquals(reply.getPsCommand(), PSCommandProto.PSCOMMAND_SHUTDOWN);
  } catch (Exception failure) {
    // log for diagnosis, then let the test fail with the original exception
    LOG.error("run testPSReport failed ", failure);
    throw failure;
  }
}
use of com.tencent.angel.master.ps.ParameterServerManager in project angel by Tencent.
the class PSManagerTest method testPSError.
/**
 * Fails the same parameter server four times in a row (stop the PS, then report a
 * "out of memory" error to the master) and checks the master-side state after each
 * failure: after failures of attempts 0, 1 and 2 a new attempt must be RUNNING; after the
 * failure of attempt 3 the PS and the application must be FAILED. Along the way it also
 * checks the matrix info seen by the restarted PS and the content of row 0 of matrix "w1".
 */
@Test
public void testPSError() throws Exception {
try {
int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
AMParameterServer amPs = psManager.getParameterServer(psId);
PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
// client to the master, addressed by the location of the master's own RPC service
Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
TConnection connection = TConnectionManager.getConnection(ps.getConf());
MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
int task0Iteration = 2;
int task1Iteration = 1;
int task0w1Clock = 10;
int task0w2Clock = 20;
int task1w1Clock = 9;
int task1w2Clock = 19;
// per-matrix minimum of the two task clocks
int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
// both task contexts get the per-matrix minimum clock
task0Context.setMatrixClock(w1Id, w1Clock);
task1Context.setMatrixClock(w1Id, w1Clock);
task0Context.setMatrixClock(w2Id, w2Clock);
task1Context.setMatrixClock(w2Id, w2Clock);
// report the iteration and the per-matrix clocks of both tasks to the master
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
// the scenario below relies on exactly four attempts (indexes 0..3) being allowed
assertEquals(amPs.getMaxAttempts(), 4);
PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);
// attempt 0
ps.stop(-1);
PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
master.psError(null, request);
// wait two heartbeat intervals before inspecting the master-side state
// (presumably enough for the next attempt to be launched and registered -- confirm)
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
assertTrue(psAttempt1 != null);
assertEquals(psAttempt0.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 2);
assertEquals(amPs.getRunningAttemptId(), psAttempt1Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 2);
List<String> diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 1);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
// NOTE(review): clockVectorManager is not read anywhere else in this method
ClockVectorManager clockVectorManager = ps.getClockVectorManager();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
// increment row 0 of matrix "w1" through both task clients, each time by a vector of
// 100000 twos, then advance the clock of both clients
MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
int matrixW1Id = w1Task0Client.getMatrixId();
int[] delta = new int[100000];
for (int i = 0; i < 100000; i++) {
delta[i] = 2;
}
DenseIntVector deltaVec = new DenseIntVector(100000, delta);
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task0Client.increment(deltaVec);
deltaVec = new DenseIntVector(100000, delta);
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task1Client.increment(deltaVec);
w1Task0Client.clock().get();
w1Task1Client.clock().get();
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
// wait two PS backup intervals before failing attempt 1
// (presumably so that the update above is covered by a backup -- confirm)
int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
Thread.sleep(snapshotInterval * 2);
// attempt 1
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
assertTrue(psAttempt2 != null);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 3);
assertEquals(amPs.getRunningAttemptId(), psAttempt2Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 3);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 2);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
// "w1" is expected one clock further than before (the clock() calls above), "w2" unchanged
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
// 2 increments x 100000 elements x 2 = 400000 (assumes the row held only zeros before -- confirm)
assertEquals(sum((DenseIntVector) w1Task0Client.getRow(0)), 400000);
// attempt 2
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
assertTrue(psAttempt3 != null);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertEquals(amPs.getRunningAttemptId(), psAttempt3Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 3);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
// attempt 3: the last allowed attempt; no further attempt is expected after it fails
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(amPs.getState(), AMParameterServerState.FAILED);
assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertNull(amPs.getRunningAttemptId());
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 4);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
assertEquals(diagnostics.get(3), psAttempt3Id + " failed due to: out of memory");
} catch (Exception x) {
// log for diagnosis, then rethrow so the test still fails
LOG.error("run testPSError failed ", x);
throw x;
}
}
use of com.tencent.angel.master.ps.ParameterServerManager in project angel by Tencent.
the class MasterRecoverTest method testMasterRecover.
/**
 * Sends an internal error event to the master twice. After the first event the test
 * expects a master with application attempt id 2 in RUNNING state, with the task
 * iterations, task matrix clocks, matrix metadata and PS matrix info still matching what
 * was reported before the error. After the second event the attempt id is expected to
 * remain 2 and the application state to be FAILED.
 */
@SuppressWarnings("unchecked")
@Test
public void testMasterRecover() throws Exception {
try {
ApplicationAttemptId appAttempt1Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 1);
ApplicationAttemptId appAttempt2Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 2);
AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
// the test starts on the first application attempt
assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt1Id);
// NOTE(review): psManager is not read anywhere else in this method
ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
// client to the master, addressed by the location of the master's own RPC service
Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
TConnection connection = TConnectionManager.getConnection(ps.getConf());
MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
int task0Iteration = 2;
int task1Iteration = 1;
int task0w1Clock = 10;
int task0w2Clock = 20;
int task1w1Clock = 9;
int task1w2Clock = 19;
// per-matrix minimum of the two task clocks
int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
// report the iteration and the per-matrix clocks of both tasks to the master
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
// wait two AM state-write intervals before triggering the error
// (presumably so the reported state has been written out -- confirm)
int writeIntervalMS = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_AM_WRITE_STATE_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_AM_WRITE_STATE_INTERVAL_MS);
Thread.sleep(writeIntervalMS * 2);
// first internal error
// NOTE(review): the meaning of the trailing `true` argument is not visible here -- confirm
angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
Thread.sleep(15000);
// re-fetch the master: a new application attempt (id 2) is expected to be running
angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.RUNNING);
LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt2Id);
// partition keys of "w1" and "w2" used for the PS lookup assertions below
PartitionKey w1Part0Key = new PartitionKey(0, w1Id, 0, 0, 1, 50000);
PartitionKey w1Part1Key = new PartitionKey(1, w1Id, 0, 50000, 1, 100000);
PartitionKey w2Part0Key = new PartitionKey(0, w2Id, 0, 0, 1, 50000);
PartitionKey w2Part1Key = new PartitionKey(1, w2Id, 0, 50000, 1, 100000);
Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
LOG.info("worker=" + worker);
LOG.info("worker.getTaskManager()=" + worker.getTaskManager());
LOG.info("worker.getTaskManager().getRunningTask()=" + worker.getTaskManager().getRunningTask().size());
// task epochs and matrix clocks on the worker must equal the values reported before the error
TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext();
assertEquals(task0Context.getEpoch(), task0Iteration);
assertEquals(task1Context.getEpoch(), task1Iteration);
assertEquals(task0Context.getMatrixClock(w1Id), task0w1Clock);
assertEquals(task0Context.getMatrixClock(w2Id), task0w2Clock);
assertEquals(task1Context.getMatrixClock(w1Id), task1w1Clock);
assertEquals(task1Context.getMatrixClock(w2Id), task1w2Clock);
LOG.info("===============worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size()=" + worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size());
// both matrices must be known to the worker's PS agent, with psId first in each partition's PS list
assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w1Id));
assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w2Id));
assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w1Part0Key).get(0), psId);
assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w1Part1Key).get(0), psId);
assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w2Part0Key).get(0), psId);
assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(w2Part1Key).get(0), psId);
ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
// second internal error: the attempt id is expected to stay at 2 and the app to end up FAILED
angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
Thread.sleep(15000);
angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt2Id);
assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
} catch (Exception x) {
// log for diagnosis, then rethrow so the test still fails
LOG.error("run testMasterRecover failed ", x);
throw x;
}
}
Aggregations