Search in sources :

Example 6 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class PSManagerTest method testPSError.

/**
 * Exercises parameter-server (PS) failure handling on the local cluster.
 *
 * <p>The test first reports task iterations and matrix clocks to the master. It then, four times
 * in a row, stops the current PS attempt and sends a {@code psError} request for it. After each of
 * the first three failures it asserts that the failed attempt is FAILED, that the next attempt is
 * RUNNING and that the diagnostics list has grown by one entry. After the fourth failure (the test
 * asserts a max-attempts value of 4) it asserts that the PS and the application are FAILED and
 * that no attempt is running.
 *
 * <p>Relies on members declared outside this method: {@code psId}, {@code psAttempt0Id},
 * {@code worker0Attempt0Id}, {@code task0Id}, {@code task1Id}, {@code LOG},
 * {@code checkMatrixInfo} and {@code sum}.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testPSError() throws Exception {
    try {
        int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        // Open an RPC client to the master, using the PS configuration.
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        // Iterations and per-matrix clocks that will be reported for the two tasks.
        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // For each matrix, take the smaller of the two task clocks; these values are handed to
        // checkMatrixInfo below (presumably as the expected matrix clocks - helper defined elsewhere).
        int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
        int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
        // NOTE(review): task0Context and task1Context are not used again in this method.
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
        // Report both tasks' iterations and their clocks for w1 and w2 to the master.
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
        // The rest of the test walks through attempt ids 0..3, i.e. exactly four attempts.
        assertEquals(amPs.getMaxAttempts(), 4);
        PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
        PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
        PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);
        // attempt 0: stop the PS and report its failure to the master
        ps.stop(-1);
        PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
        master.psError(null, request);
        // Wait two heartbeat intervals before checking that attempt 1 has taken over.
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
        assertTrue(psAttempt1 != null);
        assertEquals(psAttempt0.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 2);
        assertEquals(amPs.getRunningAttemptId(), psAttempt1Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 2);
        List<String> diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 1);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
        // Increment row 0 of w1 from both task clients with a 100000-element vector of 2s,
        // then advance both clients' clocks.
        MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
        MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
        int matrixW1Id = w1Task0Client.getMatrixId();
        int[] delta = new int[100000];
        for (int i = 0; i < 100000; i++) {
            delta[i] = 2;
        }
        IntIntVector deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task0Client.increment(deltaVec);
        deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task1Client.increment(deltaVec);
        w1Task0Client.clock().get();
        w1Task1Client.clock().get();
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        // Sleep for two backup intervals - presumably so the updates above are backed up before
        // the PS is stopped (TODO confirm against the PS backup implementation).
        int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
        Thread.sleep(snapshotInterval * 2);
        // attempt 1: stop the PS and report its failure to the master
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
        assertTrue(psAttempt2 != null);
        assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 3);
        assertEquals(amPs.getRunningAttemptId(), psAttempt2Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 3);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 2);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
        // w1 is now checked against w1Clock + 1 (after the clock() calls above); w2 is unchanged.
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        // Two increments of 100000 elements, each valued 2, give 400000 - assuming sum() adds up
        // all elements of the row (helper defined elsewhere).
        assertEquals(sum((IntIntVector) w1Task0Client.getRow(0)), 400000);
        // attempt 2: stop the PS and report its failure to the master
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
        assertTrue(psAttempt3 != null);
        assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 4);
        assertEquals(amPs.getRunningAttemptId(), psAttempt3Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 4);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 3);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        // attempt 3: the fourth failure; the assertions below expect no further attempt to be
        // started and both the PS and the application to end up FAILED.
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(amPs.getState(), AMParameterServerState.FAILED);
        assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
        assertEquals(amPs.getNextAttemptNumber(), 4);
        assertNull(amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 4);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 4);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(3), psAttempt3Id + " failed due to: out of memory");
    } catch (Exception x) {
        // Log with the stack trace before rethrowing so the failure is visible in the test log.
        LOG.error("run testPSError failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.psagent.task.TaskContext) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) IntIntVector(com.tencent.angel.ml.math2.vector.IntIntVector) AngelException(com.tencent.angel.exception.AngelException) ParameterServer(com.tencent.angel.ps.ParameterServer) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) TConnection(com.tencent.angel.ipc.TConnection) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) IntIntDenseVectorStorage(com.tencent.angel.ml.math2.storage.IntIntDenseVectorStorage) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 7 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class PSManagerTest method testPSReport.

/**
 * Sends a PS report (two metrics plus one matrix report per locally stored matrix) to the master
 * on behalf of attempt 0 and checks the response and the metrics recorded on the master side.
 * It then sends the same report under attempt index 1 and expects a shutdown command in return.
 *
 * <p>Relies on members declared outside this method: {@code psId}, {@code psAttempt0Id} and
 * {@code LOG}.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testPSReport() throws Exception {
    try {
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Location masterLoc = ps.getMasterLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        // Build the report for attempt 0: two key/value metrics ...
        PSReportRequest.Builder builder = PSReportRequest.newBuilder();
        builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id));
        Pair.Builder pairBuilder = Pair.newBuilder();
        pairBuilder.setKey("ps_key1");
        pairBuilder.setValue("100");
        builder.addMetrics(pairBuilder.build());
        pairBuilder.setKey("ps_key2");
        pairBuilder.setValue("200");
        builder.addMetrics(pairBuilder.build());

        // ... plus one matrix report (id and name) for every matrix held by this PS.
        MatrixReportProto.Builder matrixBuilder = MatrixReportProto.newBuilder();
        Map<Integer, ServerMatrix> matrixIdMap = ps.getMatrixStorageManager().getMatrices();
        for (Entry<Integer, ServerMatrix> matrixEntry : matrixIdMap.entrySet()) {
            builder.addMatrixReports(matrixBuilder.setMatrixId(matrixEntry.getKey()).setMatrixName(matrixEntry.getValue().getName()));
        }

        PSReportResponse response = master.psReport(null, builder.build());
        assertEquals(PSCommandProto.PSCOMMAND_OK, response.getPsCommand());
        assertEquals(0, response.getNeedCreateMatricesCount());
        assertEquals(0, response.getNeedReleaseMatrixIdsCount());

        // The reported metrics must be visible on the master-side attempt object.
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
        Map<String, String> metrices = psAttempt.getMetrices();
        // assertEquals reports both values on mismatch and fails cleanly (instead of throwing a
        // NullPointerException) when a metric is missing.
        assertEquals("100", metrices.get("ps_key1"));
        assertEquals("200", metrices.get("ps_key2"));

        // The same report sent under attempt index 1 is expected to be answered with SHUTDOWN.
        PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
        builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id));
        response = master.psReport(null, builder.build());
        assertEquals(PSCommandProto.PSCOMMAND_SHUTDOWN, response.getPsCommand());
    } catch (Exception x) {
        // Log with the stack trace before rethrowing so the failure is visible in the test log.
        LOG.error("run testPSReport failed ", x);
        throw x;
    }
}
Also used : AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ServerMatrix(com.tencent.angel.ps.storage.matrix.ServerMatrix) AngelException(com.tencent.angel.exception.AngelException) ParameterServer(com.tencent.angel.ps.ParameterServer) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) TConnection(com.tencent.angel.ipc.TConnection) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) Location(com.tencent.angel.common.location.Location) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) Test(org.junit.Test)

Example 8 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class AngelLocalClient method updateMaster.

/**
 * Waits for the local master to publish its RPC location, then creates the master RPC client and
 * starts the heartbeat.
 *
 * <p>The master is polled once per second. An attempt is retried when the local master or its
 * master service is not available yet, when the location is still {@code null}, or when creating
 * the RPC client throws a {@link ServiceException}. If no connection has been established after
 * {@code maxWaitSeconds} attempts the method returns without setting up the client; a warning is
 * logged in that case.
 *
 * @param maxWaitSeconds maximum number of one-second attempts to make
 * @throws Exception if the wait is interrupted or a non-retriable error occurs
 */
@Override
protected void updateMaster(int maxWaitSeconds) throws Exception {
    TConnection connection = TConnectionManager.getConnection(conf);
    for (int tryTime = 0; tryTime < maxWaitSeconds; tryTime++) {
        LocalMaster localMaster = LocalClusterContext.get().getMaster();
        if (localMaster != null && localMaster.getAppMaster().getAppContext().getMasterService() != null) {
            masterLocation = localMaster.getAppMaster().getAppContext().getMasterService().getLocation();
            if (masterLocation != null) {
                try {
                    LOG.info("start to create rpc client to am");
                    master = connection.getMasterService(masterLocation.getIp(), masterLocation.getPort());
                    startHeartbeat();
                    return;
                } catch (ServiceException e) {
                    // Retriable: keep the cause in the log instead of dropping it silently.
                    LOG.warn("create rpc client to am failed, will retry", e);
                }
            }
        }
        // Master not ready (or connect failed): wait one second before the next attempt.
        Thread.sleep(1000);
    }
    LOG.warn("can not connect to master after " + maxWaitSeconds + " attempts");
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) LocalMaster(com.tencent.angel.localcluster.LocalMaster) ServiceException(com.google.protobuf.ServiceException)

Example 9 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class PSAgentTest method testPSClient.

/**
 * Checks the basic state exposed by the PS agent of worker 0: configuration, master location,
 * agent id, connection, master client, and the agent's own ip and location.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testPSClient() throws Exception {
    try {
        // Master-side components and the worker's PS agent must all be present.
        AngelApplicationMaster appMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(appMaster != null);
        AMTaskManager amTaskManager = appMaster.getAppContext().getTaskManager();
        assertTrue(amTaskManager != null);
        WorkerManager amWorkerManager = appMaster.getAppContext().getWorkerManager();
        assertTrue(amWorkerManager != null);
        Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        assertTrue(worker0 != null);
        PSAgent agent = worker0.getPSAgent();
        assertTrue(agent != null);

        // Configuration: the deploy mode must be LOCAL.
        Configuration agentConf = agent.getConf();
        assertTrue(agentConf != null);
        assertEquals(agentConf.get(AngelConf.ANGEL_DEPLOY_MODE), "LOCAL");

        // Master location: dotted-quad ip and a port within 1..65535.
        Location masterAddress = agent.getMasterLocation();
        Pattern ipv4 = Pattern.compile("(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})");
        assertTrue(ipv4.matcher(masterAddress.getIp()).matches());
        int masterPort = masterAddress.getPort();
        assertTrue(masterPort >= 1 && masterPort <= 65535);

        // Application id and user are only read here; nothing is asserted about their values.
        ApplicationId applicationId = agent.getAppId();
        String userName = agent.getUser();

        // Agent id.
        int agentId = agent.getId();
        assertEquals(agentId, 1);

        // Connection and master client.
        TConnection agentConnection = agent.getConnection();
        assertTrue(agentConnection != null);
        MasterClient agentMasterClient = agent.getMasterClient();
        assertTrue(agentMasterClient != null);

        // The agent's own ip.
        String agentIp = agent.getIp();
        assertTrue(ipv4.matcher(agentIp).matches());

        // The agent's own location: dotted-quad ip and a port within 1..65535.
        Location agentAddress = agent.getLocation();
        assertTrue(agentAddress != null);
        assertTrue(ipv4.matcher(agentAddress.getIp()).matches());
        int agentPort = agentAddress.getPort();
        assertTrue(agentPort >= 1 && agentPort <= 65535);
    } catch (Exception x) {
        LOG.error("run testPSClient failed ", x);
        throw x;
    }
}
Also used : Pattern(java.util.regex.Pattern) Configuration(org.apache.hadoop.conf.Configuration) Matcher(java.util.regex.Matcher) MasterClient(com.tencent.angel.psagent.client.MasterClient) WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) TConnection(com.tencent.angel.ipc.TConnection) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) Worker(com.tencent.angel.worker.Worker) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 10 with TConnection

use of com.tencent.angel.ipc.TConnection in project angel by Tencent.

the class AppTest method testGetJobReport.

/**
 * Checks the job report returned by the master while the job is running and after an internal
 * error event has been raised.
 *
 * <p>The expected current iteration is the smaller of the two reported task iterations. After the
 * error event the report is expected to show state J_FAILED with diagnostics "failed"; the last
 * check falls back to {@code tryGetResponseFromFile(true)} if the RPC call itself fails.
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@SuppressWarnings("unchecked")
@Test
public void testGetJobReport() throws Exception {
    try {
        AngelApplicationMaster appMaster = LocalClusterContext.get().getMaster().getAppMaster();
        Location amLocation = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection rpcConnection = TConnectionManager.getConnection(appMaster.getConfig());
        MasterProtocol master = rpcConnection.getMasterService(amLocation.getIp(), amLocation.getPort());

        int task0Iteration = 2;
        int task1Iteration = 1;
        int jobIteration = Math.min(task0Iteration, task1Iteration);

        // Report the iteration of each task to the master.
        TaskIterationRequest task0Report = TaskIterationRequest.newBuilder()
            .setIteration(task0Iteration)
            .setTaskId(ProtobufUtil.convertToIdProto(task0Id))
            .build();
        master.taskIteration(null, task0Report);
        TaskIterationRequest task1Report = TaskIterationRequest.newBuilder()
            .setIteration(task1Iteration)
            .setTaskId(ProtobufUtil.convertToIdProto(task1Id))
            .build();
        master.taskIteration(null, task1Report);
        Thread.sleep(1000);

        // While running, the job reports the expected current iteration.
        GetJobReportRequest reportRequest = GetJobReportRequest.newBuilder()
            .setAppId(LocalClusterContext.get().getAppId().toString())
            .build();
        GetJobReportResponse report = master.getJobReport(null, reportRequest);
        assertEquals(report.getJobReport().getJobState(), JobStateProto.J_RUNNING);
        assertEquals(report.getJobReport().getCurIteration(), jobIteration);

        // Raise an internal error and give the master time to process it.
        InternalErrorEvent errorEvent = new InternalErrorEvent(appMaster.getAppContext().getApplicationId(), "failed");
        appMaster.getAppContext().getEventHandler().handle(errorEvent);
        Thread.sleep(5000);
        report = master.getJobReport(null, reportRequest);
        assertEquals(report.getJobReport().getJobState(), JobStateProto.J_FAILED);
        assertEquals(report.getJobReport().getCurIteration(), jobIteration);
        assertEquals(report.getJobReport().getDiagnostics(), "failed");

        // Check again later; if the RPC call fails, read the report from file instead.
        Thread.sleep(10000);
        try {
            report = master.getJobReport(null, reportRequest);
        } catch (Exception rpcFailure) {
            report = tryGetResponseFromFile(true);
        }
        assertEquals(report.getJobReport().getJobState(), JobStateProto.J_FAILED);
        assertEquals(report.getJobReport().getCurIteration(), jobIteration);
        assertEquals(report.getJobReport().getDiagnostics(), "failed");
    } catch (Exception x) {
        LOG.error("run testGetJobReport failed ", x);
        throw x;
    }
}
Also used : TConnection(com.tencent.angel.ipc.TConnection) InternalErrorEvent(com.tencent.angel.master.app.InternalErrorEvent) GetJobReportRequest(com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportRequest) IOException(java.io.IOException) Location(com.tencent.angel.common.location.Location) GetJobReportResponse(com.tencent.angel.protobuf.generated.ClientMasterServiceProtos.GetJobReportResponse) Test(org.junit.Test)

Aggregations

TConnection (com.tencent.angel.ipc.TConnection): 13, Location (com.tencent.angel.common.location.Location): 12, Test (org.junit.Test): 9, Worker (com.tencent.angel.worker.Worker): 6, ParameterServer (com.tencent.angel.ps.ParameterServer): 5, AngelException (com.tencent.angel.exception.AngelException): 4, ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager): 4, PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt): 3, AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer): 3, MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient): 3, IOException (java.io.IOException): 3, ServiceException (com.google.protobuf.ServiceException): 2, MasterProtocol (com.tencent.angel.master.MasterProtocol): 2, InternalErrorEvent (com.tencent.angel.master.app.InternalErrorEvent): 2, AMTaskManager (com.tencent.angel.master.task.AMTaskManager): 2, Pair (com.tencent.angel.protobuf.generated.MLProtos.Pair): 2, PSErrorRequest (com.tencent.angel.protobuf.generated.PSMasterServiceProtos.PSErrorRequest): 2, PSAttemptId (com.tencent.angel.ps.PSAttemptId): 2, PartitionKey (com.tencent.angel.PartitionKey): 1, InitNeighbor (com.tencent.angel.graph.client.initneighbor.InitNeighbor): 1