Search in sources :

Example 6 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MasterService method psAgentRegister.

/**
 * Handles a registration request sent by a psagent attempt.
 *
 * <p>If the attempt id is not a key of {@code psAgentLastHeartbeatTS}, the agent is told to shut
 * down. Otherwise the attempt is registered, a {@code PSAgentRegisterEvent} carrying the location
 * (ip, port) taken from the request is handed to the event handler, and a success command is
 * returned.
 *
 * @param controller rpc controller of protobuf
 * @param request contains the psagent attempt id and the location reported by the agent
 * @return response carrying the command the psagent should act on
 * @throws ServiceException as declared by the overridden RPC method
 */
@SuppressWarnings("unchecked")
@Override
public PSAgentRegisterResponse psAgentRegister(RpcController controller, PSAgentRegisterRequest request) throws ServiceException {
    if (LOG.isDebugEnabled()) {
        LOG.debug("receive ps agent register, request=" + request);
    }

    final PSAgentAttemptId attemptId = ProtobufUtil.convertToId(request.getPsAgentAttemptId());
    final PSAgentCommandProto command;

    if (psAgentLastHeartbeatTS.containsKey(attemptId)) {
        // Known attempt: record it and publish its reported address to the event handler.
        registerPSAgentAttemptId(attemptId);
        final Location agentLocation =
            new Location(request.getLocation().getIp(), request.getLocation().getPort());
        context.getEventHandler().handle(new PSAgentRegisterEvent(attemptId, agentLocation));
        command = PSAgentCommandProto.PSAGENT_SUCCESS;
    } else {
        // Unknown attempt: instruct the agent to stop.
        LOG.error("psagent attempt " + attemptId + " is not in running worker attempt set now, shutdown it");
        command = PSAgentCommandProto.PSAGENT_SHUTDOWN;
    }

    // Logged for both outcomes, exactly as before.
    LOG.info("psagent " + attemptId + " register finished!");
    return PSAgentRegisterResponse.newBuilder().setCommand(command).build();
}
Also used : PSAgentAttemptId(com.tencent.angel.psagent.PSAgentAttemptId) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 7 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class PSAgentManager method initPSAgents.

/**
 * Creates one {@code AMPSAgent} for every index in {@code [0, psAgentNumber)} and stores it in
 * {@code psAgentMap} under its {@code PSAgentId}. The agent with index {@code i} is given a
 * {@code Location} built from {@code ips[i]} and port 0.
 *
 * @throws InvalidParameterException declared in the signature; this body has no explicit throw
 */
private void initPSAgents() throws InvalidParameterException {
    int index = 0;
    while (index < psAgentNumber) {
        final PSAgentId agentId = new PSAgentId(index);
        final Location initialLocation = new Location(ips[index], 0);
        psAgentMap.put(agentId, new AMPSAgent(context, agentId, initialLocation));
        index++;
    }
}
Also used : PSAgentId(com.tencent.angel.psagent.PSAgentId) Location(com.tencent.angel.common.location.Location)

Example 8 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class PSManagerTest method testPSDone.

/**
 * Checks the master-side parameter-server state after worker attempt 0 reports done, the
 * application receives a COMMIT event, and PS attempt 0 reports done.
 *
 * <p>Assertions use JUnit's {@code (expected, actual)} argument order so that a failure message
 * labels the two values correctly.
 *
 * @throws Exception any failure is logged and then rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testPSDone() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();

        // Open an RPC stub to the master using the location known to the PS.
        Location masterLoc = ps.getMasterLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        // Step 1: worker attempt 0 reports done.
        WorkerDoneRequest workerRequest = WorkerDoneRequest.newBuilder().setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id)).build();
        WorkerDoneResponse workerResponse = master.workerDone(null, workerRequest);
        assertEquals(WorkerCommandProto.W_SUCCESS, workerResponse.getCommand());
        // Presumably gives the master time to process the event before continuing.
        Thread.sleep(5000);

        // Step 2: trigger commit, then PS attempt 0 reports done.
        angelAppMaster.getAppContext().getEventHandler().handle(new AppEvent(AppEventType.COMMIT));
        PSDoneRequest request = PSDoneRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).build();
        master.psDone(null, request);
        Thread.sleep(5000);

        // Step 3: inspect the state the master holds for the PS and its attempt.
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
        assertEquals(PSAttemptStateInternal.SUCCESS, psAttempt.getInternalState());
        // assertEquals (instead of assertTrue on ==) reports both states when they differ.
        assertEquals(AMParameterServerState.SUCCESS, amPs.getState());
        assertEquals(1, amPs.getNextAttemptNumber());
        assertNull(amPs.getRunningAttemptId());
        assertEquals(psAttempt0Id, amPs.getSuccessAttemptId());
        assertEquals(1, amPs.getPSAttempts().size());
    } catch (Exception x) {
        LOG.error("run testPSDone failed ", x);
        throw x;
    }
}
Also used : WorkerDoneRequest(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneRequest) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) AngelException(com.tencent.angel.exception.AngelException) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) AppEvent(com.tencent.angel.master.app.AppEvent) TConnection(com.tencent.angel.ipc.TConnection) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) WorkerDoneResponse(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneResponse) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 9 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class AngelKubernetesClient method updateMaster.

/**
 * Looks up the Angel master pod, opens an RPC client to it and starts the heartbeat.
 *
 * <p>The pod ip is polled from {@code k8sClientApp}; the port is read from the configuration.
 * Each unsuccessful poll or failed connection attempt sleeps one second and consumes one unit of
 * {@code maxWaitSeconds}. A connection attempt additionally sleeps five seconds before the RPC
 * client is created; that delay is not counted against the budget.
 *
 * @param maxWaitSeconds maximum number of unsuccessful attempts before giving up
 * @throws IOException if the attempts are exhausted and {@code masterLocation} is still null
 * @throws InterruptedException if the calling thread is interrupted while sleeping
 * @throws Exception as declared by the overridden method
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
    int tryTime = 0;
    TConnection connection = TConnectionManager.getConnection(conf);
    // The configured port does not depend on the loop, so it is read once up front.
    final int port = conf.getInt(AngelConf.ANGEL_KUBERNETES_MASTER_PORT, AngelConf.DEFAULT_ANGEL_KUBERNETES_MASTER_PORT);
    while (tryTime < maxWaitSeconds) {
        String masterPodIp = k8sClientApp.getAngelMasterPodIp();
        if (masterPodIp == null || "".equals(masterPodIp)) {
            LOG.info("AM not assigned to Job. Waiting to get the AM ...");
            Thread.sleep(1000);
            tryTime++;
        } else {
            try {
                masterLocation = new Location(masterPodIp, port);
                LOG.info("master host=" + masterLocation.getIp() + ", port=" + masterLocation.getPort());
                LOG.info("start to create rpc client to am");
                Thread.sleep(5000);
                master = connection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
                startHeartbeat();
            } catch (InterruptedException e) {
                // An interrupt is a request to stop waiting, not a failed registration:
                // propagate it instead of logging it as a connection error and retrying.
                throw e;
            } catch (Exception e) {
                LOG.error("Register to Master failed, ", e);
                Thread.sleep(1000);
                tryTime++;
                continue;
            }
            break;
        }
    }
    if (tryTime >= maxWaitSeconds && masterLocation == null) {
        throw new IOException("wait for master location timeout");
    }
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) IOException(java.io.IOException) AngelException(com.tencent.angel.exception.AngelException) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location)

Example 10 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class AngelYarnClient method updateMaster.

/**
 * Polls the YARN application report until the application master's host is known, then opens an
 * RPC client to it and starts the heartbeat.
 *
 * <p>Outcomes per poll:
 * <ul>
 *   <li>report missing, or application FAILED/KILLED: an {@code IOException} is thrown;</li>
 *   <li>application FINISHED: {@code master} is set to null and the method returns;</li>
 *   <li>host null, empty or equal to {@code UNAVAILABLE}: sleep one second and retry;</li>
 *   <li>otherwise: record {@code masterLocation}, connect and start the heartbeat; a
 *       {@code ServiceException} during this step costs one retry.</li>
 * </ul>
 *
 * @param maxWaitSeconds maximum number of unsuccessful attempts before giving up
 * @throws IOException if the job failed, or the attempts are exhausted and
 *     {@code masterLocation} is still null
 * @throws Exception as declared by the overridden method
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
    final TConnection rpcConnection = TConnectionManager.getConnection(conf);
    int attempts = 0;

    while (attempts < maxWaitSeconds) {
        final ApplicationReport report = yarnClient.getApplicationReport(appId);
        if (report == null) {
            throw new IOException("Failed to run job : " + "application report is null");
        }

        final String diagnostics = report.getDiagnostics();
        final YarnApplicationState state = report.getYarnApplicationState();
        if (state == YarnApplicationState.FAILED || state == YarnApplicationState.KILLED) {
            throw new IOException("Failed to run job : " + diagnostics);
        }
        if (state == YarnApplicationState.FINISHED) {
            LOG.info("application is finished!!");
            master = null;
            return;
        }

        final String amHost = report.getHost();
        final int amPort = report.getRpcPort();

        // No host published yet: wait and poll again.
        if (amHost == null || amHost.isEmpty()) {
            LOG.info("AM not assigned to Job. Waiting to get the AM ...");
            Thread.sleep(1000);
            attempts++;
            continue;
        }
        // Host is reported as the UNAVAILABLE marker: wait and poll again (no log line).
        if (UNAVAILABLE.equals(amHost)) {
            Thread.sleep(1000);
            attempts++;
            continue;
        }

        final String trackingMessage =
            "appMaster getTrackingUrl = " + report.getTrackingUrl().replace("proxy", "cluster/app");
        LOG.info(trackingMessage);
        System.out.println(trackingMessage);
        LOG.info("master host=" + amHost + ", port=" + amPort);

        try {
            masterLocation = new Location(amHost, amPort);
            LOG.info("start to create rpc client to am");
            master = rpcConnection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
            startHeartbeat();
        } catch (ServiceException e) {
            LOG.error("Register to Master failed, ", e);
            Thread.sleep(1000);
            attempts++;
            continue;
        }
        break;
    }

    if (masterLocation == null && attempts >= maxWaitSeconds) {
        throw new IOException("wait for master location timeout");
    }
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) ServiceException(com.google.protobuf.ServiceException) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location)

Aggregations

Location (com.tencent.angel.common.location.Location)38 TConnection (com.tencent.angel.ipc.TConnection)12 Test (org.junit.Test)12 PSLocation (com.tencent.angel.ps.server.data.PSLocation)10 IOException (java.io.IOException)10 Worker (com.tencent.angel.worker.Worker)9 PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)6 ServiceException (com.google.protobuf.ServiceException)5 AngelException (com.tencent.angel.exception.AngelException)5 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)5 ParameterServer (com.tencent.angel.ps.ParameterServer)5 ParameterServerId (com.tencent.angel.ps.ParameterServerId)5 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)5 AngelApplicationMaster (com.tencent.angel.master.AngelApplicationMaster)4 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)4 WorkerManager (com.tencent.angel.master.worker.WorkerManager)4 MasterClient (com.tencent.angel.psagent.client.MasterClient)4 Matcher (java.util.regex.Matcher)4 Pattern (java.util.regex.Pattern)4 MasterServiceTest (com.tencent.angel.master.MasterServiceTest)3