use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class MasterClient method init.
/**
 * Creates the protobuf RPC client used to talk to the master.
 *
 * <p>The connection is obtained from {@code TConnectionManager} with the PSAgent configuration,
 * and the master address is taken from the PSAgent's master location.
 *
 * @throws IOException connect to master failed
 */
public void init() throws IOException {
  // Obtain the connection first, then resolve the master address (same order as before).
  TConnection rpcConnection = TConnectionManager.getConnection(PSAgentContext.get().getConf());
  Location masterAddress = PSAgentContext.get().getPsAgent().getMasterLocation();

  String masterIp = masterAddress.getIp();
  int masterPort = masterAddress.getPort();
  master = rpcConnection.getMasterService(masterIp, masterPort);
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class PSManagerTest method testPSDone.
/**
 * Checks the master-side state of a parameter server after it reports completion.
 *
 * <p>The test reports worker 0 as done, dispatches a COMMIT application event, sends a
 * PS-done request for attempt 0 and then asserts that the PS and its attempt are recorded
 * as SUCCESS on the application master.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testPSDone() throws Exception {
  try {
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    Location masterLoc = ps.getMasterLocation();
    TConnection connection = TConnectionManager.getConnection(ps.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

    // Report the worker as done first.
    WorkerDoneRequest workerRequest = WorkerDoneRequest.newBuilder()
        .setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id))
        .build();
    WorkerDoneResponse workerResponse = master.workerDone(null, workerRequest);
    // JUnit convention: expected value first, so failure messages read correctly.
    assertEquals(WorkerCommandProto.W_SUCCESS, workerResponse.getCommand());

    // NOTE(review): presumably waits for the master to process the event asynchronously - confirm.
    Thread.sleep(5000);
    angelAppMaster.getAppContext().getEventHandler().handle(new AppEvent(AppEventType.COMMIT));

    PSDoneRequest request = PSDoneRequest.newBuilder()
        .setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id))
        .build();
    master.psDone(null, request);
    // NOTE(review): same assumption as above - gives the master time to update PS state.
    Thread.sleep(5000);

    ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
    AMParameterServer amPs = psManager.getParameterServer(psId);
    PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);

    assertEquals(PSAttemptStateInternal.SUCCESS, psAttempt.getInternalState());
    // assertEquals instead of assertTrue(a == b) so the actual state shows up on failure.
    assertEquals(AMParameterServerState.SUCCESS, amPs.getState());
    assertEquals(1, amPs.getNextAttemptNumber());
    assertNull(amPs.getRunningAttemptId());
    assertEquals(psAttempt0Id, amPs.getSuccessAttemptId());
    assertEquals(1, amPs.getPSAttempts().size());
  } catch (Exception x) {
    LOG.error("run testPSDone failed ", x);
    throw x;
  }
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class AngelKubernetesClient method updateMaster.
/**
 * Waits for the Angel master pod to be assigned an IP, then creates the RPC client to the
 * master and starts the heartbeat.
 *
 * <p>Each poll that finds no pod IP, and each failed connection attempt, sleeps one second and
 * consumes one unit of the {@code maxWaitSeconds} retry budget.
 *
 * @param maxWaitSeconds maximum number of one-second retries before giving up
 * @throws IOException if no connection to the master was established within the retry budget
 * @throws InterruptedException if the calling thread is interrupted while waiting
 * @throws Exception any other failure raised while polling for the master pod
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
  TConnection connection = TConnectionManager.getConnection(conf);
  // The port comes only from configuration, so it is read once instead of on every poll.
  int port = conf.getInt(AngelConf.ANGEL_KUBERNETES_MASTER_PORT,
      AngelConf.DEFAULT_ANGEL_KUBERNETES_MASTER_PORT);

  // Tracks real success; masterLocation alone is not enough because it is assigned
  // before the connection attempt and stays set when that attempt fails.
  boolean connected = false;
  int tryTime = 0;
  while (tryTime < maxWaitSeconds) {
    String masterPodIp = k8sClientApp.getAngelMasterPodIp();
    if (masterPodIp == null || "".equals(masterPodIp)) {
      LOG.info("AM not assigned to Job. Waiting to get the AM ...");
      Thread.sleep(1000);
      tryTime++;
      continue;
    }

    try {
      masterLocation = new Location(masterPodIp, port);
      LOG.info("master host=" + masterLocation.getIp() + ", port=" + masterLocation.getPort());
      LOG.info("start to create rpc client to am");
      // NOTE(review): presumably gives the master time to bring up its RPC server - confirm.
      Thread.sleep(5000);
      master = connection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
      startHeartbeat();
      connected = true;
      break;
    } catch (InterruptedException e) {
      // Interruption is a request to stop waiting; do not treat it as a retryable failure.
      throw e;
    } catch (Exception e) {
      LOG.error("Register to Master failed, ", e);
      Thread.sleep(1000);
      tryTime++;
    }
  }

  if (!connected) {
    throw new IOException("wait for master location timeout");
  }
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class AngelYarnClient method updateMaster.
/**
 * Polls YARN for the application master address, then creates the RPC client to the master
 * and starts the heartbeat.
 *
 * <p>Each poll that finds no usable host, and each failed connection attempt, sleeps one second
 * and consumes one unit of the {@code maxWaitSeconds} retry budget. If the application is
 * already FINISHED, {@code master} is set to {@code null} and the method returns.
 *
 * @param maxWaitSeconds maximum number of one-second retries before giving up
 * @throws IOException if the application report is missing, the application FAILED or was
 *     KILLED, or no connection to the master was established within the retry budget
 * @throws Exception any other failure raised while querying YARN or waiting
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
  TConnection connection = TConnectionManager.getConnection(conf);

  // Tracks real success; masterLocation alone is not enough because it is assigned
  // before the connection attempt and stays set when that attempt fails.
  boolean connected = false;
  int tryTime = 0;
  while (tryTime < maxWaitSeconds) {
    ApplicationReport appMaster = yarnClient.getApplicationReport(appId);
    String diagnostics =
        (appMaster == null ? "application report is null" : appMaster.getDiagnostics());
    if (appMaster == null
        || appMaster.getYarnApplicationState() == YarnApplicationState.FAILED
        || appMaster.getYarnApplicationState() == YarnApplicationState.KILLED) {
      throw new IOException("Failed to run job : " + diagnostics);
    }

    if (appMaster.getYarnApplicationState() == YarnApplicationState.FINISHED) {
      LOG.info("application is finished!!");
      master = null;
      return;
    }

    String host = appMaster.getHost();
    int port = appMaster.getRpcPort();
    if (host == null || "".equals(host)) {
      LOG.info("AM not assigned to Job. Waiting to get the AM ...");
      Thread.sleep(1000);
      tryTime++;
      continue;
    }
    if (UNAVAILABLE.equals(host)) {
      Thread.sleep(1000);
      tryTime++;
      continue;
    }

    String appMasterurl = "appMaster getTrackingUrl = "
        + appMaster.getTrackingUrl().replace("proxy", "cluster/app");
    LOG.info(appMasterurl);
    // Also echoed to stdout in addition to the log.
    System.out.println(appMasterurl);
    LOG.info("master host=" + host + ", port=" + port);
    try {
      masterLocation = new Location(host, port);
      LOG.info("start to create rpc client to am");
      master = connection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
      startHeartbeat();
      connected = true;
      break;
    } catch (ServiceException e) {
      LOG.error("Register to Master failed, ", e);
      Thread.sleep(1000);
      tryTime++;
    }
  }

  if (!connected) {
    throw new IOException("wait for master location timeout");
  }
}
use of com.tencent.angel.ipc.TConnection in project angel by Tencent.
the class MasterServiceTest method testMasterService.
/**
 * Exercises the master RPC service for worker registration and worker reporting.
 *
 * <p>The test requests a PSAgent id, registers a second worker attempt (expecting a
 * W_SHUTDOWN command), then reports metrics and task states for worker 0 and checks that the
 * application master recorded the worker metrics, task metrics and task progress.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testMasterService() throws Exception {
  try {
    LOG.info("===========================testMasterService===============================");
    Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext()
        .getMasterService().getLocation();
    TConnection connection = TConnectionManager.getConnection(worker.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
    int psAgentId = master.getPSAgentId(null,
        PSAgentMasterServiceProtos.GetPSAgentIdRequest.getDefaultInstance()).getPsAgentId();

    // worker register
    WorkerAttemptId worker1Attempt0Id =
        new WorkerAttemptId(new WorkerId(new WorkerGroupId(1), 0), 0);
    WorkerRegisterRequest registeRequest = WorkerRegisterRequest.newBuilder()
        .setPsAgentId(psAgentId)
        .setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker1Attempt0Id))
        .setLocation(LocationProto.newBuilder().setIp("0.0.0.0").setPort(10000).build())
        .build();
    WorkerRegisterResponse registerResponse = master.workerRegister(null, registeRequest);
    // assertEquals (expected first) reports the actual command on failure.
    assertEquals(WorkerCommandProto.W_SHUTDOWN, registerResponse.getCommand());

    // worker report: two task reports plus two worker-level metric pairs
    WorkerReportRequest.Builder reportBuilder = WorkerReportRequest.newBuilder();
    Pair.Builder kvBuilder = Pair.newBuilder();
    TaskStateProto.Builder taskBuilder = TaskStateProto.newBuilder();
    reportBuilder.setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id));

    // report for task 0
    taskBuilder.setProgress(0.20f);
    taskBuilder.setState("RUNNING");
    taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task0Id));
    kvBuilder.setKey("task_key1");
    kvBuilder.setValue("100");
    taskBuilder.addCounters(kvBuilder.build());
    kvBuilder.setKey("task_key2");
    kvBuilder.setValue("200");
    taskBuilder.addCounters(kvBuilder.build());
    reportBuilder.addTaskReports(taskBuilder.build());

    // report for task 1
    // NOTE(review): taskBuilder is reused without clearing its counters, so this report
    // presumably also carries task 0's counters ahead of its own - confirm this is intended.
    taskBuilder.setProgress(0.30f);
    taskBuilder.setState("RUNNING");
    taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task1Id));
    kvBuilder.setKey("task_key1");
    kvBuilder.setValue("1000");
    taskBuilder.addCounters(kvBuilder.build());
    kvBuilder.setKey("task_key2");
    kvBuilder.setValue("2000");
    taskBuilder.addCounters(kvBuilder.build());
    reportBuilder.addTaskReports(taskBuilder.build());

    // worker-level metrics
    kvBuilder.setKey("worker_key1");
    kvBuilder.setValue("100");
    reportBuilder.addPairs(kvBuilder.build());
    kvBuilder.setKey("worker_key2");
    kvBuilder.setValue("200");
    reportBuilder.addPairs(kvBuilder.build());

    WorkerReportResponse reportResponse = master.workerReport(null, reportBuilder.build());
    assertEquals(WorkerCommandProto.W_SUCCESS, reportResponse.getCommand());
    assertEquals(2, reportResponse.getActiveTaskNum());

    // worker metrics as recorded on the application master
    AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
    WorkerAttempt worker0Attempt = angelAppMaster.getAppContext().getWorkerManager()
        .getWorker(worker0Attempt0Id.getWorkerId()).getWorkerAttempt(worker0Attempt0Id);
    assertNotNull(worker0Attempt);
    Map<String, String> workerMetrics = worker0Attempt.getMetrics();
    String valueForWorkerKey1 = workerMetrics.get("worker_key1");
    String valueForWorkerKey2 = workerMetrics.get("worker_key2");
    assertNotNull(valueForWorkerKey1);
    assertNotNull(valueForWorkerKey2);
    assertEquals("100", valueForWorkerKey1);
    assertEquals("200", valueForWorkerKey2);

    // task metrics and progress as recorded on the application master
    AMTaskManager amTaskManager = angelAppMaster.getAppContext().getTaskManager();
    AMTask task0 = amTaskManager.getTask(task0Id);
    AMTask task1 = amTaskManager.getTask(task1Id);
    assertNotNull(task0);
    assertNotNull(task1);
    Map<String, String> task0Metrics = task0.getMetrics();
    Map<String, String> task1Metrics = task1.getMetrics();
    String valueForTask0Key1 = task0Metrics.get("task_key1");
    String valueForTask0Key2 = task0Metrics.get("task_key2");
    String valueForTask1Key1 = task1Metrics.get("task_key1");
    String valueForTask1Key2 = task1Metrics.get("task_key2");
    assertNotNull(valueForTask0Key1);
    assertNotNull(valueForTask0Key2);
    assertNotNull(valueForTask1Key1);
    assertNotNull(valueForTask1Key2);
    assertEquals("100", valueForTask0Key1);
    assertEquals("200", valueForTask0Key2);
    assertEquals("1000", valueForTask1Key1);
    assertEquals("2000", valueForTask1Key2);
    assertEquals(0.20f, task0.getProgress(), 0.000001);
    assertEquals(0.30f, task1.getProgress(), 0.000001);
  } catch (Exception x) {
    LOG.error("run testMasterService failed ", x);
    throw x;
  }
}
Aggregations