use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class PSManagerTest method testPSError.
/**
 * Repeatedly stops the running parameter server attempt and reports the failure to the master
 * through {@code psError}, checking the attempt states, the accumulated diagnostics and the matrix
 * clocks after every failure. The maximum attempt count is asserted to be 4; after the fourth
 * failure both the parameter server and the application are expected to be in the FAILED state.
 *
 * @throws Exception if any cluster call fails; the exception is logged and rethrown
 */
@Test
public void testPSError() throws Exception {
try {
int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
AMParameterServer amPs = psManager.getParameterServer(psId);
PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
// Open an RPC client to the master, using the parameter server's configuration.
Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
TConnection connection = TConnectionManager.getConnection(ps.getConf());
MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
int task0Iteration = 2;
int task1Iteration = 1;
int task0w1Clock = 10;
int task0w2Clock = 20;
int task1w1Clock = 9;
int task1w2Clock = 19;
// w1Clock / w2Clock are the smaller of the two tasks' clocks for each matrix; they are the
// values passed to checkMatrixInfo below.
int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
// NOTE(review): task0Context and task1Context are not read anywhere else in this method.
TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
// Report the iteration and the per-matrix clocks of both tasks to the master.
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
assertEquals(amPs.getMaxAttempts(), 4);
PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);
// attempt 0: stop the PS and report the failure to the master
ps.stop(-1);
PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
master.psError(null, request);
// Wait two heartbeat intervals before inspecting state (presumably so the master has time to
// start the next attempt -- confirm).
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
assertTrue(psAttempt1 != null);
assertEquals(psAttempt0.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 2);
assertEquals(amPs.getRunningAttemptId(), psAttempt1Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 2);
List<String> diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 1);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
// Both task clients increment row 0 of w1 with a 100000-element vector of 2s and then clock;
// the row sum asserted after the next restart (400000) is 2 clients * 100000 * 2.
MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
int matrixW1Id = w1Task0Client.getMatrixId();
int[] delta = new int[100000];
for (int i = 0; i < 100000; i++) {
delta[i] = 2;
}
IntIntVector deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task0Client.increment(deltaVec);
deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
deltaVec.setMatrixId(matrixW1Id);
deltaVec.setRowId(0);
w1Task1Client.increment(deltaVec);
w1Task0Client.clock().get();
w1Task1Client.clock().get();
ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
// Wait two backup intervals before killing the attempt (presumably so the increments above are
// included in a PS backup -- confirm).
int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
Thread.sleep(snapshotInterval * 2);
// attempt 1: stop the PS and report the failure to the master
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
assertTrue(psAttempt2 != null);
assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 3);
assertEquals(amPs.getRunningAttemptId(), psAttempt2Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 3);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 2);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
// The w1 clock is expected to be one higher than before, following the clock() calls above.
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
assertEquals(sum((IntIntVector) w1Task0Client.getRow(0)), 400000);
// attempt 2: stop the PS and report the failure to the master
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
assertTrue(psAttempt3 != null);
assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.RUNNING);
assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertEquals(amPs.getRunningAttemptId(), psAttempt3Id);
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 3);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
// attempt 3: the last allowed attempt; after this failure no new attempt is expected and the
// parameter server and the application should both be FAILED.
ps.stop(-1);
request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
master.psError(null, request);
Thread.sleep(heartbeatInterval * 2);
assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.FAILED);
assertEquals(amPs.getState(), AMParameterServerState.FAILED);
assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
assertEquals(amPs.getNextAttemptNumber(), 4);
assertNull(amPs.getRunningAttemptId());
assertNull(amPs.getSuccessAttemptId());
assertEquals(amPs.getPSAttempts().size(), 4);
diagnostics = amPs.getDiagnostics();
assertEquals(diagnostics.size(), 4);
assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
assertEquals(diagnostics.get(3), psAttempt3Id + " failed due to: out of memory");
} catch (Exception x) {
// Log the failure before rethrowing it.
LOG.error("run testPSError failed ", x);
throw x;
}
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class PSManagerTest method testPSReport.
/**
 * Sends a PS report for attempt 0 to the master and verifies the response (OK command, no matrices
 * to create or release) and that the reported metrics are stored on the attempt. The same report is
 * then sent with attempt id 1, for which the master is expected to answer with a shutdown command.
 *
 * @throws Exception if any cluster call fails; the exception is logged and rethrown
 */
@Test
public void testPSReport() throws Exception {
  try {
    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    Location masterLoc = ps.getMasterLocation();
    TConnection connection = TConnectionManager.getConnection(ps.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

    // Build the report: attempt id, two metrics and one matrix report per stored matrix.
    PSReportRequest.Builder builder = PSReportRequest.newBuilder();
    builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id));
    builder.addMetrics(Pair.newBuilder().setKey("ps_key1").setValue("100").build());
    builder.addMetrics(Pair.newBuilder().setKey("ps_key2").setValue("200").build());

    MatrixReportProto.Builder matrixBuilder = MatrixReportProto.newBuilder();
    for (Entry<Integer, ServerMatrix> matrixEntry : ps.getMatrixStorageManager().getMatrices().entrySet()) {
      builder.addMatrixReports(
          matrixBuilder.setMatrixId(matrixEntry.getKey()).setMatrixName(matrixEntry.getValue().getName()));
    }

    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure messages
    // read correctly.
    PSReportResponse response = master.psReport(null, builder.build());
    assertEquals(PSCommandProto.PSCOMMAND_OK, response.getPsCommand());
    assertEquals(0, response.getNeedCreateMatricesCount());
    assertEquals(0, response.getNeedReleaseMatrixIdsCount());

    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
    AMParameterServer amPs = psManager.getParameterServer(psId);
    PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
    Map<String, String> metrices = psAttempt.getMetrices();
    // assertEquals reports a missing metric as an assertion failure showing both values, instead
    // of the NullPointerException that metrices.get(key).equals(...) would throw.
    assertEquals("100", metrices.get("ps_key1"));
    assertEquals("200", metrices.get("ps_key2"));

    // Reporting with attempt id 1 is expected to be answered with a shutdown command.
    PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
    builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id));
    response = master.psReport(null, builder.build());
    assertEquals(PSCommandProto.PSCOMMAND_SHUTDOWN, response.getPsCommand());
  } catch (Exception x) {
    // Log the failure before rethrowing it.
    LOG.error("run testPSReport failed ", x);
    throw x;
  }
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class AngelLocalClient method updateMaster.
/**
 * Looks up the local master's location and creates the RPC client to it, retrying once per second.
 * On success the {@code masterLocation} and {@code master} fields are set and the heartbeat is
 * started. If no connection could be established within the retry budget the method returns
 * without throwing (as before), but now logs a warning.
 *
 * @param maxWaitSeconds maximum number of attempts; each failed attempt is followed by a 1000 ms sleep
 * @throws Exception if the sleep is interrupted or an error other than {@code ServiceException}
 *     occurs while connecting or starting the heartbeat
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
  TConnection connection = TConnectionManager.getConnection(conf);
  int tryTime = 0;
  while (tryTime < maxWaitSeconds) {
    LocalMaster localMaster = LocalClusterContext.get().getMaster();
    // Only attempt to connect once the local master and its master service exist.
    if (localMaster != null && localMaster.getAppMaster().getAppContext().getMasterService() != null) {
      masterLocation = localMaster.getAppMaster().getAppContext().getMasterService().getLocation();
      if (masterLocation != null) {
        try {
          LOG.info("start to create rpc client to am");
          master = connection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
          startHeartbeat();
          return;
        } catch (ServiceException e) {
          // Previously swallowed silently; keep retrying but leave a trace of the cause.
          LOG.warn("create rpc client to am failed, will retry", e);
        }
      }
    }
    // Single retry path: wait one second and count the attempt.
    Thread.sleep(1000);
    tryTime++;
  }
  LOG.warn("can not connect to master after " + tryTime + " attempts");
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class PSAgentTest method testPSClient.
/**
 * Checks the basic state exposed by the PSAgent of worker 0: its configuration, the master
 * location, its id, its connection and master client, and its own ip / location.
 *
 * @throws Exception if any lookup fails; the exception is logged and rethrown
 */
@Test
public void testPSClient() throws Exception {
  try {
    AngelApplicationMaster appMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertTrue(appMaster != null);
    AMTaskManager amTaskManager = appMaster.getAppContext().getTaskManager();
    assertTrue(amTaskManager != null);
    WorkerManager amWorkerManager = appMaster.getAppContext().getWorkerManager();
    assertTrue(amWorkerManager != null);
    Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    assertTrue(worker0 != null);
    PSAgent psAgent = worker0.getPSAgent();
    assertTrue(psAgent != null);

    // configuration
    Configuration agentConf = psAgent.getConf();
    assertTrue(agentConf != null);
    assertEquals(agentConf.get(AngelConf.ANGEL_DEPLOY_MODE), "LOCAL");

    // master location: dotted-quad ip and a port in [1, 65535]
    Location masterLoc = psAgent.getMasterLocation();
    Pattern ipv4 = Pattern.compile(
        "(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})");
    assertTrue(ipv4.matcher(masterLoc.getIp()).matches());
    int masterPort = masterLoc.getPort();
    assertTrue(1 <= masterPort && masterPort <= 65535);

    // application id and user are only fetched; nothing is asserted on them
    ApplicationId appId = psAgent.getAppId();
    String user = psAgent.getUser();

    // ps agent id
    int agentId = psAgent.getId();
    assertEquals(agentId, 1);

    // connection
    TConnection agentConnection = psAgent.getConnection();
    assertTrue(agentConnection != null);

    // master client
    MasterClient masterClient = psAgent.getMasterClient();
    assertTrue(masterClient != null);

    // own ip
    String agentIp = psAgent.getIp();
    assertTrue(ipv4.matcher(agentIp).matches());

    // own location
    Location agentLoc = psAgent.getLocation();
    assertTrue(agentLoc != null);
    assertTrue(ipv4.matcher(agentLoc.getIp()).matches());
    int agentPort = agentLoc.getPort();
    assertTrue(1 <= agentPort && agentPort <= 65535);
  } catch (Exception x) {
    LOG.error("run testPSClient failed ", x);
    throw x;
  }
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class AppTest method testGetJobReport.
/**
 * Reports task iterations to the master, checks that the job report is RUNNING with the current
 * iteration equal to the smaller of the two task iterations, then injects an internal error and
 * checks that the job report becomes FAILED with the diagnostics "failed". The final check falls
 * back to {@code tryGetResponseFromFile} when the RPC to the master fails.
 *
 * @throws Exception if any cluster call fails; the exception is logged and rethrown
 */
@SuppressWarnings("unchecked")
@Test
public void testGetJobReport() throws Exception {
  try {
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    Location masterLoc = angelAppMaster.getAppContext().getMasterService().getLocation();
    TConnection connection = TConnectionManager.getConnection(angelAppMaster.getConfig());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

    int task0Iteration = 2;
    int task1Iteration = 1;
    int jobIteration = Math.min(task0Iteration, task1Iteration);
    master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration)
        .setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
    master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration)
        .setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
    Thread.sleep(1000);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure messages
    // read correctly.
    GetJobReportRequest request = GetJobReportRequest.newBuilder()
        .setAppId(LocalClusterContext.get().getAppId().toString()).build();
    GetJobReportResponse response = master.getJobReport(null, request);
    assertEquals(JobStateProto.J_RUNNING, response.getJobReport().getJobState());
    assertEquals(jobIteration, response.getJobReport().getCurIteration());

    // Inject an internal error and give the master time to process it.
    angelAppMaster.getAppContext().getEventHandler()
        .handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed"));
    Thread.sleep(5000);
    response = master.getJobReport(null, request);
    assertEquals(JobStateProto.J_FAILED, response.getJobReport().getJobState());
    assertEquals(jobIteration, response.getJobReport().getCurIteration());
    assertEquals("failed", response.getJobReport().getDiagnostics());

    Thread.sleep(10000);
    try {
      response = master.getJobReport(null, request);
    } catch (Exception rpcFailure) {
      // The RPC may fail at this point; fall back to the report file, but record why.
      LOG.warn("get job report from master failed, try to get it from file", rpcFailure);
      response = tryGetResponseFromFile(true);
    }
    assertEquals(JobStateProto.J_FAILED, response.getJobReport().getJobState());
    assertEquals(jobIteration, response.getJobReport().getCurIteration());
    assertEquals("failed", response.getJobReport().getDiagnostics());
  } catch (Exception x) {
    // Log the failure before rethrowing it.
    LOG.error("run testGetJobReport failed ", x);
    throw x;
  }
}
Aggregations