Search in sources :

Example 31 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MasterClient method workerRegister.

/**
 * Registers this worker with the master.
 *
 * <p>The request is filled from {@code WorkerContext}: the worker attempt id, the ip and
 * port of the worker's {@link Location}, and the id of the worker's PS agent.
 *
 * @return WorkerRegisterResponse the response the master returned for the register request
 * @throws ServiceException rpc failed
 */
public WorkerRegisterResponse workerRegister() throws ServiceException {
    Location workerLocation = WorkerContext.get().getLocation();

    WorkerRegisterRequest.Builder registerBuilder = WorkerRegisterRequest.newBuilder();
    registerBuilder.setWorkerAttemptId(WorkerContext.get().getWorkerAttemptIdProto());

    // Describe where this worker can be reached.
    LocationProto.Builder locationBuilder = LocationProto.newBuilder();
    locationBuilder.setIp(workerLocation.getIp());
    locationBuilder.setPort(workerLocation.getPort());
    registerBuilder.setLocation(locationBuilder.build());

    registerBuilder.setPsAgentId(WorkerContext.get().getPSAgent().getId());

    // The first argument of the master call is passed as null.
    return master.workerRegister(null, registerBuilder.build());
}
Also used : PSLocation(com.tencent.angel.ps.server.data.PSLocation) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation) Location(com.tencent.angel.common.location.Location)

Example 32 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class PSAgentLocationManager method getPsLocation.

/**
 * Looks up the location of a parameter server.
 *
 * @param psId ps id
 * @param sync when true, the location is requested from the Master and stored through
 *             {@code setPsLocation} before being returned; when false, the value held by
 *             the local location manager is returned and the Master is not contacted
 * @return ps location
 * @throws ServiceException declared for the Master query performed when {@code sync} is true
 */
public Location getPsLocation(ParameterServerId psId, boolean sync) throws ServiceException {
    if (sync) {
        // Ask the Master, then record the answer locally.
        Location fetched = context.getMasterClient().getPSLocation(psId);
        setPsLocation(psId, fetched);
        return fetched;
    }
    // Local lookup only.
    return locationManager.getPsLocation(psId);
}
Also used : Location(com.tencent.angel.common.location.Location)

Example 33 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class KubernetesWorkerApp method main.

/**
 * Entry point of the Kubernetes worker application.
 *
 * <p>Loads the Angel configuration from {@code Constants.ANGEL_CONF_PATH()}, derives the
 * application id and the worker attempt id from configuration entries and environment
 * variables, copies the actual worker-group number, task number and user task class from the
 * environment into the configuration, reads the master address from the environment, and then
 * creates and starts a {@code Worker}.
 *
 * <p>NOTE(review): every {@code System.getenv} value is used without a null check, so an unset
 * variable fails at the first parse or {@code conf.set} call that receives it.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared by the signature; NOTE(review): presumably raised while loading
 *         the configuration resource - confirm
 */
public static void main(String[] args) throws IOException {
    LOG.info("Starting worker...");
    // get configuration from envs
    Configuration conf = new Configuration();
    ConfUtils.addResourceProperties(conf, Constants.ANGEL_CONF_PATH());
    // The application id is rebuilt from two configuration entries.
    long clusterTimestamp = Long.parseLong(conf.get(AngelConf.ANGEL_KUBERNETES_APP_CLUSTERTIMESTAMP));
    int randomId = Integer.parseInt(conf.get(AngelConf.ANGEL_KUBERNETES_APP_RANDOMID));
    ApplicationId appId = ApplicationId.newInstance(clusterTimestamp, randomId);
    String user = System.getenv(ApplicationConstants.Environment.USER.name());
    // The executor id from the environment is used both as worker-group index and worker index.
    int workerGroupIndex = Integer.parseInt(System.getenv(Constants.ENV_EXECUTOR_ID()));
    int workerIndex = workerGroupIndex;
    int attemptIndex = Integer.parseInt(System.getenv(Constants.ENV_EXECUTOR_ATTEMPT_ID()));
    WorkerGroupId workerGroupId = new WorkerGroupId(workerGroupIndex);
    WorkerId workerId = new WorkerId(workerGroupId, workerIndex);
    WorkerAttemptId workerAttemptId = new WorkerAttemptId(workerId, attemptIndex);
    // Copy run-time values from the environment into the configuration.
    conf.set(AngelConf.ANGEL_WORKERGROUP_ACTUAL_NUM, System.getenv(Constants.ENV_ANGEL_WORKERGROUP_NUMBER()));
    conf.set(AngelConf.ANGEL_TASK_ACTUAL_NUM, System.getenv(Constants.ENV_ANGEL_TASK_NUMBER()));
    conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, System.getenv(Constants.ENV_ANGEL_USER_TASK()));
    LOG.info("actual workergroup number:" + conf.get(AngelConf.ANGEL_WORKERGROUP_ACTUAL_NUM));
    LOG.info("actual task number:" + conf.get(AngelConf.ANGEL_TASK_ACTUAL_NUM));
    // get master location
    String appMasterHost = System.getenv(Constants.ENV_MASTER_BIND_ADDRESS());
    // parseInt yields the primitive directly, matching the other numeric reads above.
    int appMasterPort = Integer.parseInt(System.getenv(Constants.ENV_MASTER_BIND_PORT()));
    Location masterLocation = new Location(appMasterHost, appMasterPort);
    LOG.info("appMasterHost is " + appMasterHost + ", appMasterPort is " + appMasterPort);
    conf.setBoolean("mapred.mapper.new-api", true);
    // The worker receives a clone of the configuration; the last two arguments are 0 and false.
    Worker worker = new Worker(AngelConf.clone(conf), appId, user, workerAttemptId, masterLocation, 0, false);
    try {
        worker.initAndStart();
    } catch (Exception e) {
        // Start-up failure: log it with the stack trace, then hand the message to the worker.
        LOG.fatal("Failed to start worker.", e);
        worker.error(e.getMessage());
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location)

Example 34 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class Worker method main.

/**
 * Entry point of the worker process.
 *
 * <p>Adds the job configuration file to a fresh {@code Configuration}, derives the application
 * id from the container id found in the environment, sets the local directories, builds the
 * worker attempt id from environment variables, copies the actual worker-group number, task
 * number and user task class into the configuration, reads the master address and the initial
 * clock from the environment, and then creates and starts a {@code Worker}.
 *
 * <p>NOTE(review): every {@code System.getenv} value is used without a null check, so an unset
 * variable fails at the first parse or {@code conf.set} call that receives it.
 *
 * @param args command-line arguments; not read by this method
 */
public static void main(String[] args) {
    // get configuration from config file
    Configuration conf = new Configuration();
    conf.addResource(AngelConf.ANGEL_JOB_CONF_FILE);
    // The application id is reached through container id -> application attempt id.
    String containerIdStr = System.getenv(Environment.CONTAINER_ID.name());
    ContainerId containerId = ConverterUtils.toContainerId(containerIdStr);
    ApplicationAttemptId applicationAttemptId = containerId.getApplicationAttemptId();
    ApplicationId appId = applicationAttemptId.getApplicationId();
    String user = System.getenv(Environment.USER.name());
    // set localDir with environment set by nm.
    String[] localSysDirs = StringUtils.getTrimmedStrings(System.getenv(Environment.LOCAL_DIRS.name()));
    conf.setStrings(AngelConf.LOCAL_DIR, localSysDirs);
    LOG.info(AngelConf.LOCAL_DIR + " for child: " + conf.get(AngelConf.LOCAL_DIR));
    // Worker group, worker and attempt indices all come from the environment.
    int workerGroupIndex = Integer.parseInt(System.getenv(AngelEnvironment.WORKER_GROUP_ID.name()));
    int workerIndex = Integer.parseInt(System.getenv(AngelEnvironment.WORKER_ID.name()));
    int attemptIndex = Integer.parseInt(System.getenv(AngelEnvironment.WORKER_ATTEMPT_ID.name()));
    WorkerGroupId workerGroupId = new WorkerGroupId(workerGroupIndex);
    WorkerId workerId = new WorkerId(workerGroupId, workerIndex);
    WorkerAttemptId workerAttemptId = new WorkerAttemptId(workerId, attemptIndex);
    // Copy run-time values from the environment into the configuration.
    conf.set(AngelConf.ANGEL_WORKERGROUP_ACTUAL_NUM, System.getenv(AngelEnvironment.WORKERGROUP_NUMBER.name()));
    conf.set(AngelConf.ANGEL_TASK_ACTUAL_NUM, System.getenv(AngelEnvironment.TASK_NUMBER.name()));
    conf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, System.getenv(AngelEnvironment.ANGEL_USER_TASK.name()));
    LOG.info("actual workergroup number:" + conf.get(AngelConf.ANGEL_WORKERGROUP_ACTUAL_NUM));
    LOG.info("actual task number:" + conf.get(AngelConf.ANGEL_TASK_ACTUAL_NUM));
    // get master location
    String masterAddr = System.getenv(AngelEnvironment.LISTEN_ADDR.name());
    String portStr = System.getenv(AngelEnvironment.LISTEN_PORT.name());
    // parseInt yields the primitive directly, matching the index reads above.
    Location masterLocation = new Location(masterAddr, Integer.parseInt(portStr));
    String startClock = System.getenv(AngelEnvironment.INIT_MIN_CLOCK.name());
    // The worker receives a clone of the configuration and the parsed initial clock.
    Worker worker = new Worker(AngelConf.clone(conf), appId, user, workerAttemptId, masterLocation, Integer.parseInt(startClock), false);
    try {
        worker.initAndStart();
    } catch (Exception e) {
        // Start-up failure: log it with the stack trace, then hand the message to the worker.
        LOG.fatal("Failed to start worker.", e);
        worker.error(e.getMessage());
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ServiceException(com.google.protobuf.ServiceException) IOException(java.io.IOException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Location(com.tencent.angel.common.location.Location)

Example 35 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class InitNeighborTest method testCSR.

/**
 * Exercises neighbor initialization and sampling on the local cluster, then repeats the
 * sampling after the parameter server has been stopped and reported as failed.
 *
 * <p>Steps: push three batches of node-to-neighbor lists with {@code InitNeighbor}, finish with
 * {@code InitNeighborOver}, sample neighbors for node ids 1..8 and log them; then checkpoint,
 * stop the PS, send a PS error to the master, wait 10 seconds, sample again and log the result.
 *
 * <p>The method contains no assertions: it only logs the sampled neighbors, so it fails only
 * if one of the calls throws.
 *
 * @throws Exception if any client, PS or master call fails
 */
@Test
public void testCSR() throws Exception {
    // Obtain the worker-side matrix client, the PS instance and a connection to the master.
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client = worker.getPSAgent().getMatrixClient(SPARSE_INT_MAT, 0);
    int matrixId = client.getMatrixId();
    ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
    Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
    TConnection connection = TConnectionManager.getConnection(ps.getConf());
    MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
    // Init node neighbors
    // Batch 1: nodes 1 and 2. The same map is cleared and reused for each batch.
    Int2ObjectOpenHashMap<int[]> nodeIdToNeighbors = new Int2ObjectOpenHashMap<>();
    nodeIdToNeighbors.put(1, new int[] { 2, 3 });
    nodeIdToNeighbors.put(2, new int[] { 4 });
    InitNeighbor func = new InitNeighbor(new InitNeighborParam(matrixId, nodeIdToNeighbors));
    // get() blocks until the asynchronous update has completed.
    client.asyncUpdate(func).get();
    nodeIdToNeighbors.clear();
    // Batch 2: nodes 1 and 2 again (with different neighbors) plus node 4.
    nodeIdToNeighbors.put(1, new int[] { 4, 5, 6 });
    nodeIdToNeighbors.put(2, new int[] { 5 });
    nodeIdToNeighbors.put(4, new int[] { 5, 6 });
    func = new InitNeighbor(new InitNeighborParam(matrixId, nodeIdToNeighbors));
    client.asyncUpdate(func).get();
    nodeIdToNeighbors.clear();
    // Batch 3: nodes 3, 5 and 8.
    nodeIdToNeighbors.put(3, new int[] { 4, 5, 6 });
    nodeIdToNeighbors.put(5, new int[] { 6 });
    nodeIdToNeighbors.put(8, new int[] { 3, 4 });
    func = new InitNeighbor(new InitNeighborParam(matrixId, nodeIdToNeighbors));
    client.asyncUpdate(func).get();
    nodeIdToNeighbors.clear();
    // Signal that neighbor initialization is over.
    client.asyncUpdate(new InitNeighborOver(new InitNeighborOverParam(matrixId))).get();
    // Sample the neighbors
    // Node ids 6 and 7 were never given neighbors above.
    int[] nodeIds = new int[] { 1, 2, 3, 4, 5, 6, 7, 8 };
    // NOTE(review): the third argument -1 presumably means "no limit on sampled neighbors" - confirm.
    SampleNeighborParam param = new SampleNeighborParam(matrixId, nodeIds, -1);
    Int2ObjectOpenHashMap<int[]> result = ((SampleNeighborResult) (client.get(new SampleNeighbor(param)))).getNodeIdToNeighbors();
    ObjectIterator<Entry<int[]>> iter = result.int2ObjectEntrySet().fastIterator();
    LOG.info("==============================sample neighbors result============================");
    Entry<int[]> entry;
    while (iter.hasNext()) {
        entry = iter.next();
        LOG.info("node id = " + entry.getIntKey() + ", neighbors = " + Arrays.toString(entry.getValue()));
    }
    // Checkpoint, then stop the PS and report an error for it to the master.
    // NOTE(review): presumably this drives a PS restart that reloads the checkpoint - confirm.
    client.checkpoint(0);
    ps.stop(-1);
    PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
    master.psError(null, request);
    // Fixed 10 second wait before sampling again.
    Thread.sleep(10000);
    // Repeat the same sampling request and log what comes back.
    result = ((SampleNeighborResult) (client.get(new SampleNeighbor(param)))).getNodeIdToNeighbors();
    iter = result.int2ObjectEntrySet().fastIterator();
    LOG.info("==============================sample neighbors result============================");
    while (iter.hasNext()) {
        entry = iter.next();
        LOG.info("node id = " + entry.getIntKey() + ", neighbors = " + Arrays.toString(entry.getValue()));
    }
}
Also used : Int2ObjectOpenHashMap(it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap) InitNeighborParam(com.tencent.angel.graph.client.initneighbor.InitNeighborParam) SampleNeighborResult(com.tencent.angel.graph.client.sampleneighbor.SampleNeighborResult) SampleNeighbor(com.tencent.angel.graph.client.sampleneighbor.SampleNeighbor) InitNeighbor(com.tencent.angel.graph.client.initneighbor.InitNeighbor) ParameterServer(com.tencent.angel.ps.ParameterServer) Entry(it.unimi.dsi.fastutil.ints.Int2ObjectMap.Entry) TConnection(com.tencent.angel.ipc.TConnection) InitNeighborOverParam(com.tencent.angel.graph.client.initneighbor.InitNeighborOverParam) SampleNeighborParam(com.tencent.angel.graph.client.sampleneighbor.SampleNeighborParam) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) InitNeighborOver(com.tencent.angel.graph.client.initneighbor.InitNeighborOver) MasterProtocol(com.tencent.angel.master.MasterProtocol) PSErrorRequest(com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSErrorRequest) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Aggregations

Location (com.tencent.angel.common.location.Location)38 TConnection (com.tencent.angel.ipc.TConnection)12 Test (org.junit.Test)12 PSLocation (com.tencent.angel.ps.server.data.PSLocation)10 IOException (java.io.IOException)10 Worker (com.tencent.angel.worker.Worker)9 PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)6 ServiceException (com.google.protobuf.ServiceException)5 AngelException (com.tencent.angel.exception.AngelException)5 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)5 ParameterServer (com.tencent.angel.ps.ParameterServer)5 ParameterServerId (com.tencent.angel.ps.ParameterServerId)5 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)5 AngelApplicationMaster (com.tencent.angel.master.AngelApplicationMaster)4 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)4 WorkerManager (com.tencent.angel.master.worker.WorkerManager)4 MasterClient (com.tencent.angel.psagent.client.MasterClient)4 Matcher (java.util.regex.Matcher)4 Pattern (java.util.regex.Pattern)4 MasterServiceTest (com.tencent.angel.master.MasterServiceTest)3