Search in sources :

Example 1 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class PSAgent method main.

/**
 * Entry point of the PSAgent process.
 *
 * <p>Loads the Angel job configuration, reads the container, user, local-dir, PSAgent and
 * master settings from environment variables, then builds a {@code PSAgent} and starts it.
 * A failure during startup is logged and not rethrown.
 *
 * @param args command line arguments (not used)
 * @throws NumberFormatException if the master port, PSAgent index or PSAgent attempt index
 *         environment variable is not a valid integer
 */
public static void main(String[] args) {
    // get configuration from config file
    Configuration conf = new Configuration();
    conf.addResource(AngelConf.ANGEL_JOB_CONF_FILE);
    String containerIdStr = System.getenv(Environment.CONTAINER_ID.name());
    ContainerId containerId = ConverterUtils.toContainerId(containerIdStr);
    ApplicationAttemptId applicationAttemptId = containerId.getApplicationAttemptId();
    ApplicationId appId = applicationAttemptId.getApplicationId();
    String user = System.getenv(Environment.USER.name());
    // set localDir with environment set by nm.
    String[] localSysDirs = StringUtils.getTrimmedStrings(System.getenv(Environment.LOCAL_DIRS.name()));
    conf.setStrings(AngelConf.LOCAL_DIR, localSysDirs);
    LOG.info(AngelConf.LOCAL_DIR + " for child: " + conf.get(AngelConf.LOCAL_DIR));
    String psAgentindex = System.getenv(AngelEnvironment.PSAGENT_ID.name());
    String psAgentAttemptIndex = System.getenv(AngelEnvironment.PSAGENT_ATTEMPT_ID.name());
    String masterAddr = System.getenv(AngelEnvironment.LISTEN_ADDR.name());
    String portStr = System.getenv(AngelEnvironment.LISTEN_PORT.name());
    // Parse the port once and reuse the primitive value for both the log-only location
    // and the PSAgent constructor.
    int masterPort = Integer.parseInt(portStr);
    Location masterLocation = new Location(masterAddr, masterPort);
    LOG.info("psAgentindex=" + psAgentindex);
    LOG.info("psAgentAttemptIndex=" + psAgentAttemptIndex);
    LOG.info("masterLocation=" + masterLocation);
    LOG.info("user=" + user);
    LOG.info("appId=" + appId);
    PSAgentId psAgentId = new PSAgentId(Integer.parseInt(psAgentindex));
    PSAgentAttemptId psAgentAttemptId = new PSAgentAttemptId(psAgentId, Integer.parseInt(psAgentAttemptIndex));
    try {
        PSAgent psAgent = new PSAgent(conf, appId, user, psAgentAttemptId, masterAddr, masterPort, true, null);
        psAgent.initAndStart();
    } catch (Exception e) {
        // This is the PSAgent entry point, so name the right component in the log.
        LOG.fatal("Failed to start PSAgent.", e);
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) ServiceException(com.google.protobuf.ServiceException) InvalidParameterException(com.tencent.angel.exception.InvalidParameterException) TimeOutException(com.tencent.angel.exception.TimeOutException) AngelException(com.tencent.angel.exception.AngelException) IOException(java.io.IOException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) Location(com.tencent.angel.common.location.Location)

Example 2 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MasterClient method init.

/**
 * Creates the protobuf rpc client used to talk to the master.
 *
 * <p>A connection is obtained from the PSAgent configuration, then {@code master} is bound
 * to the master service at the PSAgent's master location.
 *
 * @throws IOException connect to master failed
 */
public void init() throws IOException {
    final TConnection rpcConnection =
        TConnectionManager.getConnection(PSAgentContext.get().getConf());
    final Location target = PSAgentContext.get().getPsAgent().getMasterLocation();
    final String masterIp = target.getIp();
    final int masterPort = target.getPort();
    this.master = rpcConnection.getMasterService(masterIp, masterPort);
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation) Location(com.tencent.angel.common.location.Location)

Example 3 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class WorkerTest method testWorkerContext.

/**
 * Checks that {@code WorkerContext} exposes the same application, location, worker-group,
 * worker and task information as the local-cluster worker it was created for.
 *
 * <p>Any failure is logged and rethrown so the test still fails.
 *
 * @throws IOException if an I/O error is raised while running the checks
 */
@Test
public void testWorkerContext() throws IOException {
    try {
        localWorker = LocalClusterContext.get().getWorker(worker0Attempt0Id);
        worker = localWorker.getWorker();
        WorkerContext context = WorkerContext.get();
        assertTrue(context != null);
        // application
        ApplicationId appid = context.getAppId();
        assertTrue(appid != null);
        assertEquals(LocalClusterContext.get().getAppId(), appid);
        assertEquals(worker.getUser(), context.getUser());
        assertEquals(AngelDeployMode.LOCAL, context.getDeployMode());
        assertEquals(conf, context.getConf());
        assertEquals(0, context.getInitMinClock());
        // location
        String localIp = NetUtils.getRealLocalIP();
        Location location = context.getLocation();
        assertEquals(localIp, location.getIp());
        int port = location.getPort();
        // Valid ports are 1..65535; the previous bound (655355) was a typo that made
        // the upper-limit check ineffective.
        assertTrue(port > 0 && port <= 65535);
        // workerGroup info
        assertEquals(group0Id, context.getWorkerGroupId());
        // worker info
        Worker w = context.getWorker();
        assertTrue(w != null);
        assertTrue(w.equals(worker));
        WorkerId wid = context.getWorkerId();
        assertEquals(worker0Id, wid);
        assertEquals(worker0Attempt0Id, context.getWorkerAttemptId());
        assertEquals(ProtobufUtil.convertToIdProto(worker0Attempt0Id), context.getWorkerAttemptIdProto());
        Map<String, String> workerMetrics = context.getWorkerMetrics();
        assertTrue(workerMetrics != null);
        assertEquals(worker, context.getWorker());
        assertEquals(worker.getDataBlockManager(), context.getDataBlockManager());
        assertEquals(worker.getPSAgent(), context.getPSAgent());
        // task
        assertEquals(2, context.getActiveTaskNum());
        assertEquals(worker.getTaskManager(), context.getTaskManager());
    } catch (Exception x) {
        LOG.error("run testWorkerContext failed ", x);
        throw x;
    }
}
Also used : LocalWorker(com.tencent.angel.localcluster.LocalWorker) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location) MasterServiceTest(com.tencent.angel.master.MasterServiceTest)

Example 4 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class WorkerTest method testMaster.

/**
 * Checks the worker's view of the master: the master location known to the worker matches
 * the master service location, and registering the worker through the master client
 * returns {@code W_SUCCESS}.
 *
 * <p>Any failure is logged and rethrown so the test still fails.
 *
 * @throws Exception if any step of the test fails
 */
@Test
public void testMaster() throws Exception {
    try {
        localWorker = LocalClusterContext.get().getWorker(worker0Attempt0Id);
        worker = localWorker.getWorker();
        localMaster = LocalClusterContext.get().getMaster();
        master = localMaster.getAppMaster();
        assertTrue(master != null);

        // the worker must know the location the master service reports
        Location expectedMasterLocation = LocalClusterContext.get()
            .getMaster()
            .getAppMaster()
            .getAppContext()
            .getMasterService()
            .getLocation();
        assertEquals(expectedMasterLocation, worker.getMasterLocation());

        // registering through the master client must succeed
        MasterClient client = worker.getPSAgent().getMasterClient();
        WorkerMasterServiceProtos.WorkerRegisterResponse registerReply = client.workerRegister();
        assertTrue(registerReply != null);
        assertEquals(
            WorkerMasterServiceProtos.WorkerCommandProto.W_SUCCESS,
            registerReply.getCommand());
    } catch (Exception failure) {
        LOG.error("run testMaster failed ", failure);
        throw failure;
    }
}
Also used : MasterClient(com.tencent.angel.psagent.client.MasterClient) WorkerMasterServiceProtos(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location) MasterServiceTest(com.tencent.angel.master.MasterServiceTest)

Example 5 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MatrixTransportClient method refreshServerLocation.

/**
 * Refreshes the location of a parameter server asynchronously.
 *
 * <p>Starts a thread named {@code ps-location-getter} that repeatedly sleeps for the
 * configured request sleep time and asks the master for the server location until a
 * non-null location is returned. The new location is stored in the location manager and
 * {@code refreshServerLocationSuccess} is called with a flag telling whether the location
 * differs from the previously stored one. If anything fails, the cause is logged and
 * {@code refreshServerLocationFailed} is called.
 *
 * @param serverId server id
 */
private void refreshServerLocation(final ParameterServerId serverId) {
    Thread psLocRefresher = new Thread() {

        @Override
        public void run() {
            try {
                Location location = null;
                // Poll the master until it reports a location for this server.
                while (location == null) {
                    Thread.sleep(PSAgentContext.get().getRequestSleepTimeMS());
                    location = PSAgentContext.get().getMasterClient().getPSLocation(serverId);
                    LOG.info("Get PS " + serverId + " location = " + location);
                }
                Location oldLocation = PSAgentContext.get().getLocationManager().getPsLocation(serverId);
                PSAgentContext.get().getLocationManager().setPsLocation(serverId, location);
                // The location counts as updated unless it equals the previously stored one.
                boolean updated = oldLocation == null || !location.equals(oldLocation);
                refreshServerLocationSuccess(serverId, updated);
            } catch (InterruptedException x) {
                LOG.warn("Interrupted while refreshing location of PS " + serverId, x);
                refreshServerLocationFailed(serverId);
                // Restore the interrupt status only after the failure callback has run,
                // so the callback itself is not affected by the flag.
                Thread.currentThread().interrupt();
            } catch (Exception x) {
                // Log the cause instead of dropping it silently.
                LOG.error("Refresh location of PS " + serverId + " failed", x);
                refreshServerLocationFailed(serverId);
            }
        }
    };
    psLocRefresher.setName("ps-location-getter");
    psLocRefresher.start();
}
Also used : ServiceException(com.google.protobuf.ServiceException) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation) Location(com.tencent.angel.common.location.Location)

Aggregations

Location (com.tencent.angel.common.location.Location) 38, TConnection (com.tencent.angel.ipc.TConnection) 12, Test (org.junit.Test) 12, PSLocation (com.tencent.angel.ps.server.data.PSLocation) 10, IOException (java.io.IOException) 10, Worker (com.tencent.angel.worker.Worker) 9, PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation) 6, ServiceException (com.google.protobuf.ServiceException) 5, AngelException (com.tencent.angel.exception.AngelException) 5, AMTaskManager (com.tencent.angel.master.task.AMTaskManager) 5, ParameterServer (com.tencent.angel.ps.ParameterServer) 5, ParameterServerId (com.tencent.angel.ps.ParameterServerId) 5, ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId) 5, AngelApplicationMaster (com.tencent.angel.master.AngelApplicationMaster) 4, ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager) 4, WorkerManager (com.tencent.angel.master.worker.WorkerManager) 4, MasterClient (com.tencent.angel.psagent.client.MasterClient) 4, Matcher (java.util.regex.Matcher) 4, Pattern (java.util.regex.Pattern) 4, MasterServiceTest (com.tencent.angel.master.MasterServiceTest) 3