Search in sources :

Example 16 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MasterService method workerRegister.

/**
 * worker register to master
 *
 * @param controller rpc controller of protobuf
 * @param request contains worker attempt id, worker location
 */
@SuppressWarnings("unchecked")
@Override
public WorkerRegisterResponse workerRegister(RpcController controller, WorkerRegisterRequest request) throws ServiceException {
    WorkerRegisterResponse.Builder registerResponseBuilder = WorkerRegisterResponse.newBuilder();
    WorkerAttemptId workerAttemptId = ProtobufUtil.convertToId(request.getWorkerAttemptId());
    LOG.info("Worker " + workerAttemptId + " register, location=" + request.getLocation() + ", psagent id=" + request.getPsAgentId());
    // if worker attempt id is not in monitor set, we should shutdown it
    if (!context.getWorkerManager().isAlive(workerAttemptId)) {
        LOG.error("worker attempt " + workerAttemptId + " is not in running worker attempt set now, shutdown it");
        registerResponseBuilder.setCommand(WorkerCommandProto.W_SHUTDOWN);
    } else {
        context.getWorkerManager().alive(workerAttemptId);
        Location location = new Location(request.getLocation().getIp(), request.getLocation().getPort());
        context.getEventHandler().handle(new WorkerAttemptRegisterEvent(workerAttemptId, location));
        registerResponseBuilder.setCommand(WorkerCommandProto.W_SUCCESS);
        LOG.info("worker attempt " + workerAttemptId + " register finished!");
    }
    return registerResponseBuilder.build();
}
Also used : WorkerRegisterResponse(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerRegisterResponse) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) WorkerAttemptRegisterEvent(com.tencent.angel.master.worker.attempt.WorkerAttemptRegisterEvent) PSLocation(com.tencent.angel.ps.server.data.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 17 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class ProtobufUtil method buildWorkerMetaProto.

private static WorkerMetaInfoProto buildWorkerMetaProto(AMWorker worker) {
    WorkerMetaInfoProto.Builder builder = WorkerMetaInfoProto.newBuilder();
    WorkerAttempt attempt = worker.getRunningAttempt();
    WorkerAttemptIdProto workerAttemptIdProto = convertToIdProto(attempt.getId());
    Location location = attempt.getLocation();
    WorkerLocationProto.Builder locBuilder = WorkerLocationProto.newBuilder();
    locBuilder.setWorkerAttemptId(workerAttemptIdProto);
    if (location != null) {
        locBuilder.setLocation(buildLocation(location));
    }
    builder.setWorkerLocation(locBuilder.build());
    TaskMetaInfoProto.Builder taskMetaBuilder = TaskMetaInfoProto.newBuilder();
    MatrixClock.Builder clockBuilder = MatrixClock.newBuilder();
    for (Entry<TaskId, AMTask> taskEntry : attempt.getTaskMap().entrySet()) {
        AMTask task = taskEntry.getValue();
        taskMetaBuilder.setTaskId(convertToIdProto(taskEntry.getKey()));
        taskMetaBuilder.setIteration(task.getIteration());
        Int2IntOpenHashMap matrixClocks = task.getMatrixClocks();
        for (it.unimi.dsi.fastutil.ints.Int2IntMap.Entry clockEntry : matrixClocks.int2IntEntrySet()) {
            taskMetaBuilder.addMatrixClock(clockBuilder.setMatrixId(clockEntry.getIntKey()).setClock(clockEntry.getIntValue()).build());
        }
        builder.addTasks(taskMetaBuilder.build());
        LOG.debug("task meta=" + taskMetaBuilder.build());
    }
    return builder.build();
}
Also used : WorkerMetaInfoProto(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerMetaInfoProto) TaskId(com.tencent.angel.worker.task.TaskId) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) TaskMetaInfoProto(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.TaskMetaInfoProto) MatrixClock(com.tencent.angel.protobuf.generated.MLProtos.MatrixClock) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) WorkerAttemptIdProto(com.tencent.angel.protobuf.generated.MLProtos.WorkerAttemptIdProto) WorkerLocationProto(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerLocationProto) AMTask(com.tencent.angel.master.task.AMTask) PSLocation(com.tencent.angel.ps.server.data.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 18 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class PSAgent method initAndStart.

public void initAndStart() throws Exception {
    // Init control connection manager
    controlConnectManager = TConnectionManager.getConnection(conf);
    // Get ps locations from master and put them to the location cache.
    locationManager = new PSAgentLocationManager(PSAgentContext.get());
    locationManager.setMasterLocation(masterLocation);
    // Build and initialize rpc client to master
    masterClient = new MasterClient();
    masterClient.init();
    // Get psagent id
    id = masterClient.getPSAgentId();
    // Build PS control rpc client manager
    psControlClientManager = new PSControlClientManager();
    // Build local location
    String localIp = NetUtils.getRealLocalIP();
    int port = NetUtils.chooseAListenPort(conf);
    location = new Location(localIp, port);
    register();
    // Initialize matrix meta information
    // clockCache = new ClockCache();
    List<MatrixMeta> matrixMetas = masterClient.getMatrices();
    LOG.info("PSAgent get matrices from master," + matrixMetas.size());
    this.matrixMetaManager = new PSAgentMatrixMetaManager();
    matrixMetaManager.addMatrices(matrixMetas);
    Map<ParameterServerId, Location> psIdToLocMap = masterClient.getPSLocations();
    List<ParameterServerId> psIds = new ArrayList<>(psIdToLocMap.keySet());
    Collections.sort(psIds, new Comparator<ParameterServerId>() {

        @Override
        public int compare(ParameterServerId s1, ParameterServerId s2) {
            return s1.getIndex() - s2.getIndex();
        }
    });
    int size = psIds.size();
    locationManager.setPsIds(psIds.toArray(new ParameterServerId[0]));
    for (int i = 0; i < size; i++) {
        if (psIdToLocMap.containsKey(psIds.get(i))) {
            locationManager.setPsLocation(psIds.get(i), psIdToLocMap.get(psIds.get(i)));
        }
    }
    matrixTransClient = new MatrixTransportClient();
    userRequestAdapter = new UserRequestAdapter();
    if (runningMode == RunningMode.ANGEL_PS_WORKER) {
        // opLogCache = new MatrixOpLogCache();
        matrixStorageManager = new MatrixStorageManager();
    // int staleness = conf.getInt(AngelConf.ANGEL_STALENESS, AngelConf.DEFAULT_ANGEL_STALENESS);
    // consistencyController = new ConsistencyController(staleness);
    // consistencyController.init();
    }
    psAgentInitFinishedFlag.set(true);
    // Start all services
    matrixTransClient.start();
    userRequestAdapter.start();
    if (runningMode == RunningMode.ANGEL_PS_WORKER) {
    // clockCache.start();
    // opLogCache.start();
    }
}
Also used : MasterClient(com.tencent.angel.psagent.client.MasterClient) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta) PSAgentMatrixMetaManager(com.tencent.angel.psagent.matrix.PSAgentMatrixMetaManager) PSControlClientManager(com.tencent.angel.psagent.client.PSControlClientManager) MatrixTransportClient(com.tencent.angel.psagent.matrix.transport.MatrixTransportClient) PSAgentLocationManager(com.tencent.angel.psagent.matrix.PSAgentLocationManager) MatrixStorageManager(com.tencent.angel.psagent.matrix.storage.MatrixStorageManager) UserRequestAdapter(com.tencent.angel.psagent.matrix.transport.adapter.UserRequestAdapter) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Location(com.tencent.angel.common.location.Location)

Example 19 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class WorkerService method start.

public void start() throws IOException {
    int workerServerPort = NetUtils.chooseAListenPort(WorkerContext.get().getConf());
    String workerServerHost = InetAddress.getLocalHost().getHostAddress();
    location = new Location(workerServerHost, workerServerPort);
    rpcServer = MLRPC.getServer(WorkerService.class, this, new Class<?>[] { WorkerProtocol.class }, workerServerHost, workerServerPort, WorkerContext.get().getConf());
    LOG.info("Starting workerserver service at " + workerServerHost + ":" + workerServerPort);
    rpcServer.openServer();
}
Also used : Location(com.tencent.angel.common.location.Location)

Example 20 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MasterServiceTest method testMasterService.

@Test
public void testMasterService() throws Exception {
    try {
        LOG.info("===========================testMasterService===============================");
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(worker.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int psAgentId = master.getPSAgentId(null, PSAgentMasterServiceProtos.GetPSAgentIdRequest.getDefaultInstance()).getPsAgentId();
        // worker register
        WorkerAttemptId worker1Attempt0Id = new WorkerAttemptId(new WorkerId(new WorkerGroupId(1), 0), 0);
        WorkerRegisterRequest registeRequest = WorkerRegisterRequest.newBuilder().setPsAgentId(psAgentId).setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker1Attempt0Id)).setLocation(LocationProto.newBuilder().setIp("0.0.0.0").setPort(10000).build()).build();
        WorkerRegisterResponse registerResponse = master.workerRegister(null, registeRequest);
        assertTrue(registerResponse.getCommand() == WorkerCommandProto.W_SHUTDOWN);
        WorkerReportRequest.Builder reportBuilder = WorkerReportRequest.newBuilder();
        Pair.Builder kvBuilder = Pair.newBuilder();
        TaskStateProto.Builder taskBuilder = TaskStateProto.newBuilder();
        reportBuilder.setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id));
        taskBuilder.setProgress(0.20f);
        taskBuilder.setState("RUNNING");
        taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task0Id));
        kvBuilder.setKey("task_key1");
        kvBuilder.setValue("100");
        taskBuilder.addCounters(kvBuilder.build());
        kvBuilder.setKey("task_key2");
        kvBuilder.setValue("200");
        taskBuilder.addCounters(kvBuilder.build());
        reportBuilder.addTaskReports(taskBuilder.build());
        taskBuilder.setProgress(0.30f);
        taskBuilder.setState("RUNNING");
        taskBuilder.setTaskId(ProtobufUtil.convertToIdProto(task1Id));
        kvBuilder.setKey("task_key1");
        kvBuilder.setValue("1000");
        taskBuilder.addCounters(kvBuilder.build());
        kvBuilder.setKey("task_key2");
        kvBuilder.setValue("2000");
        taskBuilder.addCounters(kvBuilder.build());
        reportBuilder.addTaskReports(taskBuilder.build());
        kvBuilder.setKey("worker_key1");
        kvBuilder.setValue("100");
        reportBuilder.addPairs(kvBuilder.build());
        kvBuilder.setKey("worker_key2");
        kvBuilder.setValue("200");
        reportBuilder.addPairs(kvBuilder.build());
        WorkerReportResponse reportResponse = master.workerReport(null, reportBuilder.build());
        assertTrue(reportResponse.getCommand() == WorkerCommandProto.W_SUCCESS);
        assertEquals(reportResponse.getActiveTaskNum(), 2);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        WorkerAttempt worker0Attempt = angelAppMaster.getAppContext().getWorkerManager().getWorker(worker0Attempt0Id.getWorkerId()).getWorkerAttempt(worker0Attempt0Id);
        assertTrue(worker0Attempt != null);
        Map<String, String> workerMetrics = worker0Attempt.getMetrics();
        String valueForWorkerKey1 = workerMetrics.get("worker_key1");
        String valueForWorkerKey2 = workerMetrics.get("worker_key2");
        assertNotNull(valueForWorkerKey1);
        assertNotNull(valueForWorkerKey2);
        assertEquals(valueForWorkerKey1, "100");
        assertEquals(valueForWorkerKey2, "200");
        AMTaskManager amTaskManager = angelAppMaster.getAppContext().getTaskManager();
        AMTask task0 = amTaskManager.getTask(task0Id);
        AMTask task1 = amTaskManager.getTask(task1Id);
        assertTrue(task0 != null);
        assertTrue(task1 != null);
        Map<String, String> task0Metrics = task0.getMetrics();
        Map<String, String> task1Metrics = task1.getMetrics();
        String valueForTask0Key1 = task0Metrics.get("task_key1");
        String valueForTask0Key2 = task0Metrics.get("task_key2");
        String valueForTask1Key1 = task1Metrics.get("task_key1");
        String valueForTask1Key2 = task1Metrics.get("task_key2");
        assertTrue(valueForTask0Key1 != null);
        assertTrue(valueForTask0Key2 != null);
        assertTrue(valueForTask1Key1 != null);
        assertTrue(valueForTask1Key2 != null);
        assertEquals(valueForTask0Key1, "100");
        assertEquals(valueForTask0Key2, "200");
        assertEquals(valueForTask1Key1, "1000");
        assertEquals(valueForTask1Key2, "2000");
        assertEquals(task0.getProgress(), 0.20f, 0.000001);
        assertEquals(task1.getProgress(), 0.30f, 0.000001);
    } catch (Exception x) {
        LOG.error("run testMasterService failed ", x);
        throw x;
    }
}
Also used : WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) WorkerId(com.tencent.angel.worker.WorkerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) TConnection(com.tencent.angel.ipc.TConnection) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) Worker(com.tencent.angel.worker.Worker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) AMTask(com.tencent.angel.master.task.AMTask) Location(com.tencent.angel.common.location.Location) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) Test(org.junit.Test)

Aggregations

Location (com.tencent.angel.common.location.Location)38 TConnection (com.tencent.angel.ipc.TConnection)12 Test (org.junit.Test)12 PSLocation (com.tencent.angel.ps.server.data.PSLocation)10 IOException (java.io.IOException)10 Worker (com.tencent.angel.worker.Worker)9 PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)6 ServiceException (com.google.protobuf.ServiceException)5 AngelException (com.tencent.angel.exception.AngelException)5 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)5 ParameterServer (com.tencent.angel.ps.ParameterServer)5 ParameterServerId (com.tencent.angel.ps.ParameterServerId)5 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)5 AngelApplicationMaster (com.tencent.angel.master.AngelApplicationMaster)4 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)4 WorkerManager (com.tencent.angel.master.worker.WorkerManager)4 MasterClient (com.tencent.angel.psagent.client.MasterClient)4 Matcher (java.util.regex.Matcher)4 Pattern (java.util.regex.Pattern)4 MasterServiceTest (com.tencent.angel.master.MasterServiceTest)3