Use of com.tencent.angel.common.location.Location in project angel by Tencent.
Class PSAgentTest, method testPSClient.
/**
 * Checks the basic state exposed by the PSAgent of the worker registered under
 * {@code worker0Attempt0Id} in the local cluster: configuration, master location,
 * agent id, connection, master client, and the agent's own ip and location.
 *
 * @throws Exception any exception is logged and rethrown so the test fails
 */
@Test
public void testPSClient() throws Exception {
  try {
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertTrue(angelAppMaster != null);
    AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
    assertTrue(taskManager != null);
    WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
    assertTrue(workerManager != null);
    Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    assertTrue(worker != null);
    PSAgent psAgent = worker.getPSAgent();
    assertTrue(psAgent != null);

    // test conf
    Configuration conf = psAgent.getConf();
    assertTrue(conf != null);
    assertEquals(conf.get(AngelConf.ANGEL_DEPLOY_MODE), "LOCAL");

    // test master location: must be a dotted-quad IPv4 address and a valid TCP port
    Location masterLoc = psAgent.getMasterLocation();
    // Fail with an assertion rather than a NullPointerException when the location is missing.
    assertTrue(masterLoc != null);
    String ipRegex = "(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})";
    Pattern pattern = Pattern.compile(ipRegex);
    Matcher matcher = pattern.matcher(masterLoc.getIp());
    assertTrue(matcher.matches());
    assertTrue(masterLoc.getPort() >= 1 && masterLoc.getPort() <= 65535);

    // test app id and user: no expected value is available here, so these only
    // verify that the getters can be called without throwing
    psAgent.getAppId();
    psAgent.getUser();

    // test ps agent attempt id
    int psAgentId = psAgent.getId();
    assertEquals(psAgentId, 1);

    // test connection
    TConnection conn = psAgent.getConnection();
    assertTrue(conn != null);

    // test master client
    MasterClient masterClient = psAgent.getMasterClient();
    assertTrue(masterClient != null);

    // test ip
    String ip = psAgent.getIp();
    // Pattern.matcher(null) would throw; report a missing ip as a test failure instead.
    assertTrue(ip != null);
    matcher = pattern.matcher(ip);
    assertTrue(matcher.matches());

    // test loc
    Location loc = psAgent.getLocation();
    assertTrue(loc != null);
    matcher = pattern.matcher(loc.getIp());
    assertTrue(matcher.matches());
    assertTrue(loc.getPort() >= 1 && loc.getPort() <= 65535);
  } catch (Exception x) {
    LOG.error("run testPSClient failed ", x);
    throw x;
  }
}
Use of com.tencent.angel.common.location.Location in project angel by Tencent.
Class PSAgentTest, method testMatrixLocationManager.
/**
 * Checks the PS location and the partition metadata that the PSAgent of the worker
 * registered under {@code worker0Attempt0Id} holds for matrices "w1" and "w2".
 *
 * @throws Exception any exception is logged and rethrown so the test fails
 */
@Test
public void testMatrixLocationManager() throws Exception {
  try {
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertTrue(angelAppMaster != null);
    AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
    assertTrue(taskManager != null);
    WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
    assertTrue(workerManager != null);
    Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    assertTrue(worker != null);
    PSAgent psAgent = worker.getPSAgent();
    assertTrue(psAgent != null);
    PSAgentMatrixMetaManager matrixPartitionRouter = psAgent.getMatrixMetaManager();
    assertTrue(matrixPartitionRouter != null);
    PSAgentLocationManager locationCache = psAgent.getLocationManager();
    assertTrue(locationCache != null);

    // test ps location: must be a dotted-quad IPv4 address and a valid TCP port
    Location psLoc = locationCache.getPsLocation(psId);
    // Fail with an assertion rather than a NullPointerException when the location is missing.
    assertTrue(psLoc != null);
    String ipRegex = "(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})";
    Pattern pattern = Pattern.compile(ipRegex);
    Matcher matcher = pattern.matcher(psLoc.getIp());
    assertTrue(matcher.matches());
    assertTrue(psLoc.getPort() >= 1 && psLoc.getPort() <= 65535);

    int matrix1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
    int matrix2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();

    // test partitions of each matrix
    // NOTE(review): the expected count of 2 for "w2" assumes it is created with the same
    // partitioning as "w1" -- confirm against the test setup.
    List<PartitionKey> partition1Keys = matrixPartitionRouter.getPartitions(matrix1Id);
    assertEquals(partition1Keys.size(), 2);
    List<PartitionKey> partition2Keys = matrixPartitionRouter.getPartitions(matrix2Id);
    assertEquals(partition2Keys.size(), 2);

    // test partitions selected by the second argument 0 (presumably a row index)
    partition1Keys = matrixPartitionRouter.getPartitions(matrix1Id, 0);
    assertEquals(partition1Keys.size(), 2);
    partition2Keys = matrixPartitionRouter.getPartitions(matrix2Id, 0);
    assertEquals(partition2Keys.size(), 2);

    // test row partition size for both matrices
    int rowPartSize = matrixPartitionRouter.getRowPartitionSize(matrix1Id, 0);
    assertEquals(rowPartSize, 2);
    rowPartSize = matrixPartitionRouter.getRowPartitionSize(matrix2Id, 0);
    assertEquals(rowPartSize, 2);
  } catch (Exception x) {
    LOG.error("run testMatrixLocationManager failed ", x);
    throw x;
  }
}
Use of com.tencent.angel.common.location.Location in project angel by Tencent.
Class MasterService, method getWorkerLogDir.
/**
 * Build the log url of a worker's running attempt.
 * <p>
 * The url has the form
 * {@code http://<attempt ip>:<yarnNMWebPort>/node/containerlogs/<container id>/angel/syslog/?start=0}.
 * An empty log dir is returned when the worker has no running attempt, or when that
 * attempt has no location or no container.
 *
 * @param controller rpc controller
 * @param request rpc request contains worker id
 * @return response carrying the worker log url, or an empty string if it cannot be built
 * @throws ServiceException worker does not exist
 */
@Override
public GetWorkerLogDirResponse getWorkerLogDir(RpcController controller, GetWorkerLogDirRequest request) throws ServiceException {
  WorkerId id = ProtobufUtil.convertToId(request.getWorkerId());
  AMWorker amWorker = context.getWorkerManager().getWorker(id);
  if (amWorker == null) {
    throw new ServiceException("can not find worker " + id);
  }

  // Stays empty unless a running attempt with both a location and a container is found.
  String logDir = "";
  WorkerAttempt runningAttempt = amWorker.getRunningAttempt();
  if (runningAttempt != null) {
    Location attemptLoc = runningAttempt.getLocation();
    Container attemptContainer = runningAttempt.getContainer();
    if (attemptLoc != null && attemptContainer != null) {
      logDir = "http://" + attemptLoc.getIp() + ":" + yarnNMWebPort + "/node/containerlogs/" + attemptContainer.getId() + "/angel/syslog/?start=0";
    }
  }
  return GetWorkerLogDirResponse.newBuilder().setLogDir(logDir).build();
}
Use of com.tencent.angel.common.location.Location in project angel by Tencent.
Class AppTest, method testGetJobReport.
/**
 * Checks the job report returned by the master before and after an internal error.
 * <p>
 * Two task iterations (2 and 1) are reported; the job report is expected to be
 * {@code J_RUNNING} with the smaller iteration. An {@code InternalErrorEvent} with
 * message "failed" is then raised and the report is expected to be {@code J_FAILED}
 * with diagnostics "failed", both 5 seconds later and again after a further 10 seconds.
 * If the last rpc throws, the report is read via {@code tryGetResponseFromFile(true)}.
 *
 * @throws Exception any exception is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testGetJobReport() throws Exception {
  try {
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    Location masterLoc =
        LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
    TConnection connection = TConnectionManager.getConnection(angelAppMaster.getConfig());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

    int task0Iteration = 2;
    int task1Iteration = 1;
    // The job-level iteration is expected to be the slowest task's iteration.
    int jobIteration = Math.min(task0Iteration, task1Iteration);

    TaskIterationRequest task0Report = TaskIterationRequest.newBuilder()
        .setIteration(task0Iteration)
        .setTaskId(ProtobufUtil.convertToIdProto(task0Id))
        .build();
    master.taskIteration(null, task0Report);
    TaskIterationRequest task1Report = TaskIterationRequest.newBuilder()
        .setIteration(task1Iteration)
        .setTaskId(ProtobufUtil.convertToIdProto(task1Id))
        .build();
    master.taskIteration(null, task1Report);
    Thread.sleep(1000);

    // While the job is running
    GetJobReportRequest request = GetJobReportRequest.newBuilder()
        .setAppId(LocalClusterContext.get().getAppId().toString())
        .build();
    GetJobReportResponse response = master.getJobReport(null, request);
    assertEquals(response.getJobReport().getJobState(), JobStateProto.J_RUNNING);
    assertEquals(response.getJobReport().getCurIteration(), jobIteration);

    // Inject an internal error and give the master time to process it
    InternalErrorEvent failure =
        new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed");
    angelAppMaster.getAppContext().getEventHandler().handle(failure);
    Thread.sleep(5000);
    response = master.getJobReport(null, request);
    assertEquals(response.getJobReport().getJobState(), JobStateProto.J_FAILED);
    assertEquals(response.getJobReport().getCurIteration(), jobIteration);
    assertEquals(response.getJobReport().getDiagnostics(), "failed");

    // Query once more later; if the rpc fails, fall back to the report read from file
    Thread.sleep(10000);
    try {
      response = master.getJobReport(null, request);
    } catch (Exception x) {
      response = tryGetResponseFromFile(true);
    }
    assertEquals(response.getJobReport().getJobState(), JobStateProto.J_FAILED);
    assertEquals(response.getJobReport().getCurIteration(), jobIteration);
    assertEquals(response.getJobReport().getDiagnostics(), "failed");
  } catch (Exception x) {
    LOG.error("run testGetJobReport failed ", x);
    throw x;
  }
}
Use of com.tencent.angel.common.location.Location in project angel by Tencent.
Class MasterRecoverTest, method testMasterRecover.
/**
 * Checks that the application master is restarted as attempt 2 after an internal error
 * and that task iterations, matrix metadata and matrix clocks are still in place.
 * <p>
 * Task iterations and per-matrix clocks are reported to the master of attempt 1, the
 * test waits two state-write intervals, then raises an {@code InternalErrorEvent}.
 * After 10 seconds the app is expected to be {@code RUNNING} under attempt 2 with the
 * worker's task epochs, matrix metas and partition-to-PS mapping as checked below.
 * A second error is expected to leave the app {@code FAILED}, still under attempt 2.
 *
 * @throws Exception any exception is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testMasterRecover() throws Exception {
  try {
    ApplicationAttemptId appAttempt1Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 1);
    ApplicationAttemptId appAttempt2Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 2);

    // The master starts out as attempt 1
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
    assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt1Id);

    ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
    int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();

    Location masterLoc =
        LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
    TConnection connection = TConnectionManager.getConnection(ps.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

    int task0Iteration = 2;
    int task1Iteration = 1;
    int task0w1Clock = 10;
    int task0w2Clock = 20;
    int task1w1Clock = 9;
    int task1w2Clock = 19;
    // The clock checked for each matrix later is the smaller of the two task clocks.
    int w1Clock = Math.min(task0w1Clock, task1w1Clock);
    int w2Clock = Math.min(task0w2Clock, task1w2Clock);

    // Report the iteration of each task
    TaskIterationRequest task0IterMsg = TaskIterationRequest.newBuilder()
        .setIteration(task0Iteration)
        .setTaskId(ProtobufUtil.convertToIdProto(task0Id))
        .build();
    master.taskIteration(null, task0IterMsg);
    TaskIterationRequest task1IterMsg = TaskIterationRequest.newBuilder()
        .setIteration(task1Iteration)
        .setTaskId(ProtobufUtil.convertToIdProto(task1Id))
        .build();
    master.taskIteration(null, task1IterMsg);

    // Report the clock of each (task, matrix) pair
    MatrixClock task0W1 = MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build();
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(task0W1).build());
    MatrixClock task0W2 = MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build();
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(task0W2).build());
    MatrixClock task1W1 = MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build();
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(task1W1).build());
    MatrixClock task1W2 = MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build();
    master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(task1W2).build());

    // Wait two state-write intervals before injecting the error
    int writeIntervalMS = LocalClusterContext.get().getConf().getInt(
        AngelConf.ANGEL_AM_WRITE_STATE_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_AM_WRITE_STATE_INTERVAL_MS);
    Thread.sleep(writeIntervalMS * 2);

    // First error: the app is expected to be running again under attempt 2
    InternalErrorEvent firstFailure =
        new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true);
    angelAppMaster.getAppContext().getEventHandler().handle(firstFailure);
    Thread.sleep(10000);
    angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.RUNNING);
    LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
    assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt2Id);

    PartitionKey w1Part0Key = new PartitionKey(0, w1Id, 0, 0, 1, 50000);
    PartitionKey w1Part1Key = new PartitionKey(1, w1Id, 0, 50000, 1, 100000);
    PartitionKey w2Part0Key = new PartitionKey(0, w2Id, 0, 0, 1, 50000);
    PartitionKey w2Part1Key = new PartitionKey(1, w2Id, 0, 50000, 1, 100000);

    // Worker side: task epochs match the reported iterations
    Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    LOG.info("worker=" + worker);
    LOG.info("worker.getTaskManager()=" + worker.getTaskManager());
    LOG.info("worker.getTaskManager().getRunningTask()=" + worker.getTaskManager().getRunningTask().size());
    TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
    TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext();
    assertEquals(task0Context.getEpoch(), task0Iteration);
    assertEquals(task1Context.getEpoch(), task1Iteration);

    // Worker side: both matrices are known and the first PS of every partition is psId
    LOG.info("===============worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size()=" + worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size());
    assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w1Id));
    assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w2Id));
    PartitionKey[] partitionKeys = {w1Part0Key, w1Part1Key, w2Part0Key, w2Part1Key};
    for (PartitionKey partitionKey : partitionKeys) {
      assertEquals(worker.getPSAgent().getMatrixMetaManager().getPss(partitionKey).get(0), psId);
    }

    // PS side
    ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);

    // Second error: the app is expected to end up failed, still under attempt 2
    InternalErrorEvent secondFailure =
        new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true);
    angelAppMaster.getAppContext().getEventHandler().handle(secondFailure);
    Thread.sleep(10000);
    angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    assertEquals(angelAppMaster.getAppContext().getApplicationAttemptId(), appAttempt2Id);
    assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
  } catch (Exception x) {
    LOG.error("run testMasterRecover failed ", x);
    throw x;
  }
}
Aggregations