Search in sources :

Example 26 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class PSAgentTest method testPSClient.

/**
 * Checks the basic state exposed by the PSAgent of worker attempt {@code worker0Attempt0Id}
 * in the local cluster: configuration, master location, agent id, connection,
 * master client and the agent's own ip/location.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testPSClient() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        assertTrue(taskManager != null);
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        assertTrue(worker != null);
        PSAgent psAgent = worker.getPSAgent();
        assertTrue(psAgent != null);
        // test conf
        Configuration conf = psAgent.getConf();
        assertTrue(conf != null);
        // JUnit convention is (expected, actual); keeps failure messages truthful
        assertEquals("LOCAL", conf.get(AngelConf.ANGEL_DEPLOY_MODE));
        // test master location
        Location masterLoc = psAgent.getMasterLocation();
        // fail with an assertion instead of a NullPointerException on the next lines
        assertTrue(masterLoc != null);
        // dotted-quad IPv4 pattern, each octet limited to 0-255
        String ipRegex = "(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})";
        Pattern pattern = Pattern.compile(ipRegex);
        Matcher matcher = pattern.matcher(masterLoc.getIp());
        assertTrue(matcher.matches());
        assertTrue(masterLoc.getPort() >= 1 && masterLoc.getPort() <= 65535);
        // test app id
        // NOTE(review): the value is fetched but never asserted on — confirm what should be checked
        ApplicationId appId = psAgent.getAppId();
        // test user
        // NOTE(review): the value is fetched but never asserted on — confirm what should be checked
        String user = psAgent.getUser();
        // test ps agent attempt id
        int psAgentId = psAgent.getId();
        assertEquals(1, psAgentId);
        // test connection
        TConnection conn = psAgent.getConnection();
        assertTrue(conn != null);
        // test master client
        MasterClient masterClient = psAgent.getMasterClient();
        assertTrue(masterClient != null);
        // test ip
        String ip = psAgent.getIp();
        matcher = pattern.matcher(ip);
        assertTrue(matcher.matches());
        // test loc
        Location loc = psAgent.getLocation();
        assertTrue(loc != null);
        matcher = pattern.matcher(loc.getIp());
        assertTrue(matcher.matches());
        assertTrue(loc.getPort() >= 1 && loc.getPort() <= 65535);
    } catch (Exception x) {
        LOG.error("run testPSClient failed ", x);
        throw x;
    }
}
Also used : Pattern(java.util.regex.Pattern) Configuration(org.apache.hadoop.conf.Configuration) Matcher(java.util.regex.Matcher) MasterClient(com.tencent.angel.psagent.client.MasterClient) WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) TConnection(com.tencent.angel.ipc.TConnection) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) Worker(com.tencent.angel.worker.Worker) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 27 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class PSAgentTest method testMatrixLocationManager.

/**
 * Checks the PSAgent's location manager and matrix meta manager: the location of
 * parameter server {@code psId}, and the partition lists / row partition sizes of
 * matrices "w1" and "w2".
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testMatrixLocationManager() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        assertTrue(taskManager != null);
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        assertTrue(worker != null);
        PSAgent psAgent = worker.getPSAgent();
        assertTrue(psAgent != null);
        PSAgentMatrixMetaManager matrixPartitionRouter = psAgent.getMatrixMetaManager();
        PSAgentLocationManager locationCache = psAgent.getLocationManager();
        assertTrue(matrixPartitionRouter != null);
        // fail with an assertion instead of a NullPointerException on first use
        assertTrue(locationCache != null);
        // test ps location
        Location psLoc = locationCache.getPsLocation(psId);
        assertTrue(psLoc != null);
        // dotted-quad IPv4 pattern, each octet limited to 0-255
        String ipRegex = "(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})";
        Pattern pattern = Pattern.compile(ipRegex);
        Matcher matcher = pattern.matcher(psLoc.getIp());
        assertTrue(matcher.matches());
        assertTrue(psLoc.getPort() >= 1 && psLoc.getPort() <= 65535);
        int matrix1Id = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int matrix2Id = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        // test partitions
        // The "2" checks previously re-queried matrix1Id, leaving matrix2Id unused and
        // matrix "w2" untested. The expected values for w2 are kept at 2.
        // NOTE(review): assumes w2 is partitioned like w1 — confirm against the test setup.
        List<PartitionKey> partition1Keys = matrixPartitionRouter.getPartitions(matrix1Id);
        assertEquals(2, partition1Keys.size());
        List<PartitionKey> partition2Keys = matrixPartitionRouter.getPartitions(matrix2Id);
        assertEquals(2, partition2Keys.size());
        // partitions for row 0 of each matrix
        partition1Keys.clear();
        partition1Keys = matrixPartitionRouter.getPartitions(matrix1Id, 0);
        assertEquals(2, partition1Keys.size());
        partition2Keys.clear();
        partition2Keys = matrixPartitionRouter.getPartitions(matrix2Id, 0);
        assertEquals(2, partition2Keys.size());
        // row partition size for row 0 of each matrix
        int rowPartSize = matrixPartitionRouter.getRowPartitionSize(matrix1Id, 0);
        assertEquals(2, rowPartSize);
        rowPartSize = matrixPartitionRouter.getRowPartitionSize(matrix2Id, 0);
        assertEquals(2, rowPartSize);
    } catch (Exception x) {
        LOG.error("run testMatrixLocationManager failed ", x);
        throw x;
    }
}
Also used : Pattern(java.util.regex.Pattern) Matcher(java.util.regex.Matcher) PSAgentMatrixMetaManager(com.tencent.angel.psagent.matrix.PSAgentMatrixMetaManager) WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) PSAgentLocationManager(com.tencent.angel.psagent.matrix.PSAgentLocationManager) Worker(com.tencent.angel.worker.Worker) PartitionKey(com.tencent.angel.PartitionKey) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 28 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MasterService method getWorkerLogDir.

/**
 * Builds the log URL for the running attempt of a worker.
 *
 * <p>The response carries an empty log dir when the worker has no running attempt,
 * or when that attempt has no location or no container.
 *
 * @param controller rpc controller
 * @param request rpc request that carries the worker id
 * @return response holding the worker log url, or an empty string as described above
 * @throws ServiceException if no worker is registered under the requested id
 */
@Override
public GetWorkerLogDirResponse getWorkerLogDir(RpcController controller, GetWorkerLogDirRequest request) throws ServiceException {
    WorkerId id = ProtobufUtil.convertToId(request.getWorkerId());
    AMWorker amWorker = context.getWorkerManager().getWorker(id);
    if (amWorker == null) {
        throw new ServiceException("can not find worker " + id);
    }

    // Stays empty unless a running attempt with both a location and a container exists.
    String logDir = "";
    WorkerAttempt attempt = amWorker.getRunningAttempt();
    if (attempt != null) {
        Location location = attempt.getLocation();
        Container yarnContainer = attempt.getContainer();
        if (location != null && yarnContainer != null) {
            logDir = "http://" + location.getIp() + ":" + yarnNMWebPort
                + "/node/containerlogs/" + yarnContainer.getId() + "/angel/syslog/?start=0";
        }
    }
    return GetWorkerLogDirResponse.newBuilder().setLogDir(logDir).build();
}
Also used : Container(org.apache.hadoop.yarn.api.records.Container) ServiceException(com.google.protobuf.ServiceException) AMWorker(com.tencent.angel.master.worker.worker.AMWorker) WorkerAttempt(com.tencent.angel.master.worker.attempt.WorkerAttempt) WorkerId(com.tencent.angel.worker.WorkerId) PSLocation(com.tencent.angel.ps.server.data.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 29 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class AppTest method testGetJobReport.

/**
 * Checks the job report returned by the master service.
 *
 * <p>Reports iteration 2 for {@code task0Id} and 1 for {@code task1Id}, then expects the
 * job report to be {@code J_RUNNING} with the smaller of the two iterations. After that an
 * {@link InternalErrorEvent} with message "failed" is dispatched, and the report is
 * expected to be {@code J_FAILED} with diagnostics "failed". The report is read once more
 * after a further wait; if that RPC throws, the response is obtained via
 * {@code tryGetResponseFromFile(true)} instead.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testGetJobReport() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        Location masterLoc = angelAppMaster.getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(angelAppMaster.getConfig());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int task0Iteration = 2;
        int task1Iteration = 1;
        // the job iteration is expected to be the minimum over the task iterations
        int jobIteration = Math.min(task0Iteration, task1Iteration);
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        Thread.sleep(1000);
        GetJobReportRequest request = GetJobReportRequest.newBuilder().setAppId(LocalClusterContext.get().getAppId().toString()).build();
        GetJobReportResponse response = master.getJobReport(null, request);
        // JUnit convention is (expected, actual); keeps failure messages truthful
        assertEquals(JobStateProto.J_RUNNING, response.getJobReport().getJobState());
        assertEquals(jobIteration, response.getJobReport().getCurIteration());
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed"));
        Thread.sleep(5000);
        response = master.getJobReport(null, request);
        assertEquals(JobStateProto.J_FAILED, response.getJobReport().getJobState());
        assertEquals(jobIteration, response.getJobReport().getCurIteration());
        assertEquals("failed", response.getJobReport().getDiagnostics());
        Thread.sleep(10000);
        try {
            response = master.getJobReport(null, request);
        } catch (Exception x) {
            // NOTE(review): presumably the master may already be unreachable here — confirm.
            // Log instead of silently swallowing so the fallback path is visible in test output.
            LOG.warn("get job report from master failed, try to get it from file ", x);
            response = tryGetResponseFromFile(true);
        }
        assertEquals(JobStateProto.J_FAILED, response.getJobReport().getJobState());
        assertEquals(jobIteration, response.getJobReport().getCurIteration());
        assertEquals("failed", response.getJobReport().getDiagnostics());
    } catch (Exception x) {
        LOG.error("run testGetJobReport failed ", x);
        throw x;
    }
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) GetJobReportRequest(com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportRequest) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location) GetJobReportResponse(com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportResponse) Test(org.junit.Test)

Example 30 with Location

use of com.tencent.angel.common.location.Location in project angel by Tencent.

the class MasterRecoverTest method testMasterRecover.

/**
 * Checks that state reported to the master survives a master restart.
 *
 * <p>Task iterations and per-matrix task clocks are reported to the master, then the test
 * waits two state-write intervals and dispatches an {@link InternalErrorEvent}. The
 * application is then expected to be running under attempt id 2, with the task epochs,
 * matrix metadata, partition-to-PS routing and matrix clocks (via {@code checkMatrixInfo})
 * matching what was reported. A second {@link InternalErrorEvent} is expected to leave the
 * attempt id at 2 and the application state {@code FAILED}.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testMasterRecover() throws Exception {
    try {
        ApplicationAttemptId appAttempt1Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 1);
        ApplicationAttemptId appAttempt2Id = ApplicationAttemptId.newInstance(LocalClusterContext.get().getAppId(), 2);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        // JUnit convention is (expected, actual); keeps failure messages truthful
        assertEquals(appAttempt1Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // the clock expected for each matrix is the minimum over the task clocks
        int w1Clock = Math.min(task0w1Clock, task1w1Clock);
        int w2Clock = Math.min(task0w2Clock, task1w2Clock);
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
        int writeIntervalMS = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_AM_WRITE_STATE_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_AM_WRITE_STATE_INTERVAL_MS);
        // wait two write intervals before triggering the failure; multiply as long to avoid int overflow
        Thread.sleep(2L * writeIntervalMS);
        // NOTE(review): the third constructor argument presumably allows the master to be retried — confirm
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(10000);
        // re-fetch the master: a new attempt is expected after the error
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(AppState.RUNNING, angelAppMaster.getAppContext().getApp().getExternAppState());
        LOG.info("angelAppMaster.getAppContext().getApplicationAttemptId()=" + angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        PartitionKey w1Part0Key = new PartitionKey(0, w1Id, 0, 0, 1, 50000);
        PartitionKey w1Part1Key = new PartitionKey(1, w1Id, 0, 50000, 1, 100000);
        PartitionKey w2Part0Key = new PartitionKey(0, w2Id, 0, 0, 1, 50000);
        PartitionKey w2Part1Key = new PartitionKey(1, w2Id, 0, 50000, 1, 100000);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        LOG.info("worker=" + worker);
        LOG.info("worker.getTaskManager()=" + worker.getTaskManager());
        LOG.info("worker.getTaskManager().getRunningTask()=" + worker.getTaskManager().getRunningTask().size());
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext();
        assertEquals(task0Iteration, task0Context.getEpoch());
        assertEquals(task1Iteration, task1Context.getEpoch());
        LOG.info("===============worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size()=" + worker.getPSAgent().getMatrixMetaManager().getMatrixMetas().size());
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w1Id));
        assertTrue(worker.getPSAgent().getMatrixMetaManager().exist(w2Id));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w1Part1Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part0Key).get(0));
        assertEquals(psId, worker.getPSAgent().getMatrixMetaManager().getPss(w2Part1Key).get(0));
        ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
        // a second error: the attempt id is expected to stay at 2 and the app to end up FAILED
        angelAppMaster.getAppContext().getEventHandler().handle(new InternalErrorEvent(angelAppMaster.getAppContext().getApplicationId(), "failed", true));
        Thread.sleep(10000);
        angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertEquals(appAttempt2Id, angelAppMaster.getAppContext().getApplicationAttemptId());
        assertEquals(AppState.FAILED, angelAppMaster.getAppContext().getApp().getExternAppState());
    } catch (Exception x) {
        LOG.error("run testMasterRecover failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ParameterServer(com.tencent.angel.ps.ParameterServer) TConnection(com.tencent.angel.ipc.TConnection) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) PartitionKey(com.tencent.angel.PartitionKey) Worker(com.tencent.angel.worker.Worker) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Aggregations

Location (com.tencent.angel.common.location.Location): 38, TConnection (com.tencent.angel.ipc.TConnection): 12, Test (org.junit.Test): 12, PSLocation (com.tencent.angel.ps.server.data.PSLocation): 10, IOException (java.io.IOException): 10, Worker (com.tencent.angel.worker.Worker): 9, PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation): 6, ServiceException (com.google.protobuf.ServiceException): 5, AngelException (com.tencent.angel.exception.AngelException): 5, AMTaskManager (com.tencent.angel.master.task.AMTaskManager): 5, ParameterServer (com.tencent.angel.ps.ParameterServer): 5, ParameterServerId (com.tencent.angel.ps.ParameterServerId): 5, ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 5, AngelApplicationMaster (com.tencent.angel.master.AngelApplicationMaster): 4, ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager): 4, WorkerManager (com.tencent.angel.master.worker.WorkerManager): 4, MasterClient (com.tencent.angel.psagent.client.MasterClient): 4, Matcher (java.util.regex.Matcher): 4, Pattern (java.util.regex.Pattern): 4, MasterServiceTest (com.tencent.angel.master.MasterServiceTest): 3