Search in sources :

Example 1 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class MasterClient method init.

/**
 * Builds the protobuf RPC client that this object uses to talk to the master.
 *
 * <p>The connection is obtained from {@link TConnectionManager} using the PS agent's
 * configuration, and the master address is taken from the PS agent itself.
 *
 * @throws IOException connect to master failed
 */
public void init() throws IOException {
    // Connection for the agent's configuration, as handed out by the connection manager.
    TConnection rpcConnection =
        TConnectionManager.getConnection(PSAgentContext.get().getConf());

    // Address the master is currently reachable at, as known by the PS agent.
    Location masterAddress =
        PSAgentContext.get().getPsAgent().getMasterLocation();

    this.master =
        rpcConnection.getMasterService(masterAddress.getIp(), masterAddress.getPort());
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation) Location(com.tencent.angel.common.location.Location)

Example 2 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class PSManagerTest method testPSDone.

/**
 * Reports "worker done" and then "PS done" to the master over RPC and checks that the
 * master-side bookkeeping for the parameter server ends up in the SUCCESS state with exactly
 * one (successful) attempt.
 *
 * @throws Exception if any RPC call or assertion fails; the failure is logged and rethrown
 */
@SuppressWarnings("unchecked")
@Test
public void testPSDone() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();

        // Open an RPC client to the master using the PS's own configuration and master address.
        Location masterLoc = ps.getMasterLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        // Step 1: the worker reports completion.
        WorkerDoneRequest workerRequest = WorkerDoneRequest.newBuilder()
            .setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id))
            .build();
        WorkerDoneResponse workerResponse = master.workerDone(null, workerRequest);
        assertEquals(WorkerCommandProto.W_SUCCESS, workerResponse.getCommand());

        // NOTE(review): presumably waits for the master to process the event asynchronously.
        Thread.sleep(5000);

        // Step 2: trigger the commit phase, then report that the PS attempt is done.
        angelAppMaster.getAppContext().getEventHandler().handle(new AppEvent(AppEventType.COMMIT));
        PSDoneRequest request = PSDoneRequest.newBuilder()
            .setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id))
            .build();
        master.psDone(null, request);
        Thread.sleep(5000);

        // Step 3: verify the master-side state of the PS and its single attempt.
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);

        // JUnit's contract is assertEquals(expected, actual); keeping that order makes
        // failure messages report the right value as "expected".
        assertEquals(PSAttemptStateInternal.SUCCESS, psAttempt.getInternalState());
        assertEquals(AMParameterServerState.SUCCESS, amPs.getState());
        assertEquals(1, amPs.getNextAttemptNumber());
        assertNull(amPs.getRunningAttemptId());
        assertEquals(psAttempt0Id, amPs.getSuccessAttemptId());
        assertEquals(1, amPs.getPSAttempts().size());
    } catch (Exception x) {
        LOG.error("run testPSDone failed ", x);
        throw x;
    }
}
Also used : WorkerDoneRequest(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneRequest) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) AngelException(com.tencent.angel.exception.AngelException) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) AppEvent(com.tencent.angel.master.app.AppEvent) TConnection(com.tencent.angel.ipc.TConnection) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) WorkerDoneResponse(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneResponse) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 3 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class AngelKubernetesClient method updateMaster.

/**
 * Looks up the Angel master pod and opens an RPC client to it, retrying until the wait budget
 * is used up.
 *
 * <p>Every unsuccessful attempt (pod IP not published yet, or connecting / starting the
 * heartbeat failed) sleeps one second and consumes one unit of {@code maxWaitSeconds}. The
 * fixed five-second pause before each connect attempt is not counted against the budget.
 *
 * <p>On success {@code masterLocation} and {@code master} are updated and the heartbeat is
 * started.
 *
 * @param maxWaitSeconds maximum number of failed attempts (one second apart) to tolerate
 * @throws IOException if no client could be connected to the master within the budget
 * @throws InterruptedException if the calling thread is interrupted while waiting
 * @throws Exception declared by the overridden method
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
    TConnection connection = TConnectionManager.getConnection(conf);
    // The port comes from configuration only, so read it once instead of on every retry.
    final int port = conf.getInt(AngelConf.ANGEL_KUBERNETES_MASTER_PORT, AngelConf.DEFAULT_ANGEL_KUBERNETES_MASTER_PORT);

    boolean locationFound = false;
    boolean connected = false;
    int tryTime = 0;
    while (tryTime < maxWaitSeconds) {
        String masterPodIp = k8sClientApp.getAngelMasterPodIp();
        if (masterPodIp == null || masterPodIp.isEmpty()) {
            LOG.info("AM not assigned to Job. Waiting to get the AM ...");
            Thread.sleep(1000);
            tryTime++;
            continue;
        }

        try {
            masterLocation = new Location(masterPodIp, port);
            locationFound = true;
            LOG.info("master host=" + masterLocation.getIp() + ", port=" + masterLocation.getPort());
            LOG.info("start to create rpc client to am");
            Thread.sleep(5000);
            master = connection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
            startHeartbeat();
            connected = true;
            break;
        } catch (InterruptedException e) {
            // An interrupt is a request to stop waiting, not a connection failure: do not
            // let the broad catch below swallow it and keep retrying.
            throw e;
        } catch (Exception e) {
            LOG.error("Register to Master failed, ", e);
            Thread.sleep(1000);
            tryTime++;
        }
    }

    // Decide on an explicit flag rather than on masterLocation == null: masterLocation is
    // assigned before the connect attempt, so it is non-null even when every attempt failed.
    if (!connected) {
        if (!locationFound) {
            throw new IOException("wait for master location timeout");
        }
        throw new IOException("connect to master " + masterLocation.getIp() + ":" + masterLocation.getPort() + " timeout");
    }
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) IOException(java.io.IOException) AngelException(com.tencent.angel.exception.AngelException) Location(com.tencent.angel.common.location.Location)

Example 4 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class AngelYarnClient method updateMaster.

/**
 * Polls YARN for the application master's address and opens an RPC client to it, retrying
 * until the wait budget is used up.
 *
 * <p>Every unsuccessful attempt (host not assigned yet, host reported as {@code UNAVAILABLE},
 * or connecting / starting the heartbeat failed with a {@link ServiceException}) sleeps one
 * second and consumes one unit of {@code maxWaitSeconds}.
 *
 * <p>If the application is already FINISHED, {@code master} is set to {@code null} and the
 * method returns normally.
 *
 * @param maxWaitSeconds maximum number of failed attempts (one second apart) to tolerate
 * @throws IOException if the application report is missing, the application FAILED or was
 *         KILLED, or no client could be connected to the master within the budget
 * @throws InterruptedException if the calling thread is interrupted while waiting
 * @throws Exception declared by the overridden method
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
    TConnection connection = TConnectionManager.getConnection(conf);

    boolean locationFound = false;
    boolean connected = false;
    int tryTime = 0;
    while (tryTime < maxWaitSeconds) {
        ApplicationReport appMaster = yarnClient.getApplicationReport(appId);
        String diagnostics = (appMaster == null ? "application report is null" : appMaster.getDiagnostics());
        if (appMaster == null || appMaster.getYarnApplicationState() == YarnApplicationState.FAILED || appMaster.getYarnApplicationState() == YarnApplicationState.KILLED) {
            throw new IOException("Failed to run job : " + diagnostics);
        }
        if (appMaster.getYarnApplicationState() == YarnApplicationState.FINISHED) {
            // Nothing left to connect to.
            LOG.info("application is finished!!");
            master = null;
            return;
        }

        String host = appMaster.getHost();
        int port = appMaster.getRpcPort();
        if (host == null || host.isEmpty()) {
            LOG.info("AM not assigned to Job. Waiting to get the AM ...");
            Thread.sleep(1000);
            tryTime++;
            continue;
        }
        if (UNAVAILABLE.equals(host)) {
            Thread.sleep(1000);
            tryTime++;
            continue;
        }

        String appMasterurl = "appMaster getTrackingUrl = " + appMaster.getTrackingUrl().replace("proxy", "cluster/app");
        LOG.info(appMasterurl);
        System.out.println(appMasterurl);
        LOG.info("master host=" + host + ", port=" + port);
        try {
            masterLocation = new Location(host, port);
            locationFound = true;
            LOG.info("start to create rpc client to am");
            master = connection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
            startHeartbeat();
            connected = true;
            break;
        } catch (ServiceException e) {
            LOG.error("Register to Master failed, ", e);
            Thread.sleep(1000);
            tryTime++;
        }
    }

    // Decide on an explicit flag rather than on masterLocation == null: masterLocation is
    // assigned before the connect attempt, so it is non-null even when every attempt failed.
    if (!connected) {
        if (!locationFound) {
            throw new IOException("wait for master location timeout");
        }
        throw new IOException("connect to master " + masterLocation.getIp() + ":" + masterLocation.getPort() + " timeout");
    }
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) ServiceException(com.google.protobuf.ServiceException) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location)

Example 5 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class MasterServiceTest method testMasterService.

/**
 * Exercises the master RPC service end to end: fetches a PS agent id, registers an unknown
 * worker attempt (expected to be told to shut down), sends a worker report carrying two task
 * reports plus worker-level metrics, and then checks that the master recorded those metrics
 * and task progress values.
 *
 * @throws Exception if any RPC call or assertion fails; the failure is logged and rethrown
 */
@Test
public void testMasterService() throws Exception {
    try {
        LOG.info("===========================testMasterService===============================");
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(worker.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int psAgentId = master.getPSAgentId(null, PSAgentMasterServiceProtos.GetPSAgentIdRequest.getDefaultInstance()).getPsAgentId();

        // worker register
        WorkerAttemptId worker1Attempt0Id = new WorkerAttemptId(new WorkerId(new WorkerGroupId(1), 0), 0);
        WorkerRegisterRequest registeRequest = WorkerRegisterRequest.newBuilder()
            .setPsAgentId(psAgentId)
            .setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker1Attempt0Id))
            .setLocation(LocationProto.newBuilder().setIp("0.0.0.0").setPort(10000).build())
            .build();
        WorkerRegisterResponse registerResponse = master.workerRegister(null, registeRequest);
        // JUnit's contract is assertEquals(expected, actual); it also prints both values on failure.
        assertEquals(WorkerCommandProto.W_SHUTDOWN, registerResponse.getCommand());

        // worker report: two task reports followed by two worker-level key/value pairs
        WorkerReportRequest.Builder reportBuilder = WorkerReportRequest.newBuilder();
        Pair.Builder kvBuilder = Pair.newBuilder();
        TaskStateProto.Builder taskBuilder = TaskStateProto.newBuilder();
        reportBuilder.setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id));

        taskBuilder.setProgress(0.20f);
        taskBuilder.setState("RUNNING");
        taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task0Id));
        kvBuilder.setKey("task_key1");
        kvBuilder.setValue("100");
        taskBuilder.addCounters(kvBuilder.build());
        kvBuilder.setKey("task_key2");
        kvBuilder.setValue("200");
        taskBuilder.addCounters(kvBuilder.build());
        reportBuilder.addTaskReports(taskBuilder.build());

        // NOTE(review): taskBuilder is reused without being cleared, so the second task report
        // presumably also carries the first task's counters ahead of its own - confirm intended.
        taskBuilder.setProgress(0.30f);
        taskBuilder.setState("RUNNING");
        taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task1Id));
        kvBuilder.setKey("task_key1");
        kvBuilder.setValue("1000");
        taskBuilder.addCounters(kvBuilder.build());
        kvBuilder.setKey("task_key2");
        kvBuilder.setValue("2000");
        taskBuilder.addCounters(kvBuilder.build());
        reportBuilder.addTaskReports(taskBuilder.build());

        kvBuilder.setKey("worker_key1");
        kvBuilder.setValue("100");
        reportBuilder.addPairs(kvBuilder.build());
        kvBuilder.setKey("worker_key2");
        kvBuilder.setValue("200");
        reportBuilder.addPairs(kvBuilder.build());

        WorkerReportResponse reportResponse = master.workerReport(null, reportBuilder.build());
        assertEquals(WorkerCommandProto.W_SUCCESS, reportResponse.getCommand());
        assertEquals(2, reportResponse.getActiveTaskNum());

        // worker-level metrics recorded by the master
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        WorkerAttempt worker0Attempt = angelAppMaster.getAppContext().getWorkerManager().getWorker(worker0Attempt0Id.getWorkerId()).getWorkerAttempt(worker0Attempt0Id);
        assertNotNull(worker0Attempt);
        Map<String, String> workerMetrics = worker0Attempt.getMetrics();
        String valueForWorkerKey1 = workerMetrics.get("worker_key1");
        String valueForWorkerKey2 = workerMetrics.get("worker_key2");
        assertNotNull(valueForWorkerKey1);
        assertNotNull(valueForWorkerKey2);
        assertEquals("100", valueForWorkerKey1);
        assertEquals("200", valueForWorkerKey2);

        // task-level metrics and progress recorded by the master
        AMTaskManager amTaskManager = angelAppMaster.getAppContext().getTaskManager();
        AMTask task0 = amTaskManager.getTask(task0Id);
        AMTask task1 = amTaskManager.getTask(task1Id);
        assertNotNull(task0);
        assertNotNull(task1);
        Map<String, String> task0Metrics = task0.getMetrics();
        Map<String, String> task1Metrics = task1.getMetrics();
        String valueForTask0Key1 = task0Metrics.get("task_key1");
        String valueForTask0Key2 = task0Metrics.get("task_key2");
        String valueForTask1Key1 = task1Metrics.get("task_key1");
        String valueForTask1Key2 = task1Metrics.get("task_key2");
        assertNotNull(valueForTask0Key1);
        assertNotNull(valueForTask0Key2);
        assertNotNull(valueForTask1Key1);
        assertNotNull(valueForTask1Key2);
        assertEquals("100", valueForTask0Key1);
        assertEquals("200", valueForTask0Key2);
        assertEquals("1000", valueForTask1Key1);
        assertEquals("2000", valueForTask1Key2);
        assertEquals(0.20f, task0.getProgress(), 0.000001);
        assertEquals(0.30f, task1.getProgress(), 0.000001);
    } catch (Exception x) {
        LOG.error("run testMasterService failed ", x);
        throw x;
    }
}
Also used : WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) WorkerId(com.tencent.angel.worker.WorkerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) TConnection(com.tencent.angel.ipc.TConnection) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) Worker(com.tencent.angel.worker.Worker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMTask(com.tencent.angel.master.task.AMTask) Location(com.tencent.angel.common.location.Location) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) Test(org.junit.Test)

Aggregations

TConnection (com.tencent.angel.ipc.TConnection): 13; Location (com.tencent.angel.common.location.Location): 12; Test (org.junit.Test): 9; Worker (com.tencent.angel.worker.Worker): 6; ParameterServer (com.tencent.angel.ps.ParameterServer): 5; AngelException (com.tencent.angel.exception.AngelException): 4; ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager): 4; PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt): 3; AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer): 3; MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient): 3; IOException (java.io.IOException): 3; ServiceException (com.google.protobuf.ServiceException): 2; MasterProtocol (com.tencent.angel.master.MasterProtocol): 2; InternalErrorEvent (com.tencent.angel.master.app.InternalErrorEvent): 2; AMTaskManager (com.tencent.angel.master.task.AMTaskManager): 2; Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair): 2; PSErrorRequest (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSErrorRequest): 2; PSAttemptId (com.tencent.angel.ps.PSAttemptId): 2; PartitionKey (com.tencent.angel.PartitionKey): 1; InitNeighbor (com.tencent.angel.graph.client.initneighbor.InitNeighbor): 1