Search in sources :

Example 1 with ClockVectorManager

use of com.tencent.angel.ps.impl.ClockVectorManager in project angel by Tencent.

the class PSManagerTest method testPSError.

/**
 * Exercises parameter-server (PS) failure handling on the master, one attempt at a time.
 *
 * <p>The test first reports task iterations and matrix clocks to the master, then repeatedly
 * stops the running PS attempt and reports a "out of memory" error for it:
 * <ul>
 *   <li>attempts 0, 1 and 2: after waiting two heartbeat intervals a new attempt must be
 *       RUNNING, the failed attempt must be FAILED, and the diagnostics list must grow by one
 *       entry; the new PS is checked with {@code checkMatrixInfo};</li>
 *   <li>between failing attempt 0 and attempt 1, row 0 of matrix "w1" is incremented by two
 *       task clients (100000 elements of value 2 each) and clocked, so the next attempt is
 *       expected to see clock {@code w1Clock + 1} and a row sum of 400000;</li>
 *   <li>attempt 3: the PS and the application are expected to end up FAILED, with no running
 *       attempt (the test asserts {@code getMaxAttempts()} is 4).</li>
 * </ul>
 *
 * @throws Exception any failure is logged and rethrown so the test fails
 */
@Test
public void testPSError() throws Exception {
    try {
        int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // The expected matrix clock is the smaller of the two task clocks.
        int w1Clock = Math.min(task0w1Clock, task1w1Clock);
        int w2Clock = Math.min(task0w2Clock, task1w2Clock);
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
        task0Context.setMatrixClock(w1Id, w1Clock);
        task1Context.setMatrixClock(w1Id, w1Clock);
        task0Context.setMatrixClock(w2Id, w2Clock);
        task1Context.setMatrixClock(w2Id, w2Clock);
        // Report iteration and per-matrix clock of both tasks to the master.
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
        assertEquals(4, amPs.getMaxAttempts());
        PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
        PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
        PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);

        // attempt 0: stop it, report the error, then expect attempt 1 to be running.
        ps.stop(-1);
        PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
        assertTrue(psAttempt1 != null);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt0.getInternalState());
        assertEquals(PSAttemptStateInternal.RUNNING, psAttempt1.getInternalState());
        assertEquals(AMParameterServerState.RUNNING, amPs.getState());
        assertEquals(2, amPs.getNextAttemptNumber());
        assertEquals(psAttempt1Id, amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(2, amPs.getPSAttempts().size());
        List<String> diagnostics = amPs.getDiagnostics();
        assertEquals(1, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);

        // Push an update of 2 per element to row 0 of w1 from each task, then clock both clients.
        MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
        MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
        int matrixW1Id = w1Task0Client.getMatrixId();
        int[] delta = new int[100000];
        for (int i = 0; i < 100000; i++) {
            delta[i] = 2;
        }
        DenseIntVector deltaVec = new DenseIntVector(100000, delta);
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task0Client.increment(deltaVec);
        deltaVec = new DenseIntVector(100000, delta);
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task1Client.increment(deltaVec);
        w1Task0Client.clock().get();
        w1Task1Client.clock().get();
        // Re-fetch the PS of attempt 1 before stopping it (same attempt id as the lookup above).
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
        // Wait two backup intervals before killing the PS.
        // NOTE(review): presumably this lets a snapshot of the update be written - confirm.
        Thread.sleep(snapshotInterval * 2);

        // attempt 1: stop it, report the error, then expect attempt 2 to be running.
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
        assertTrue(psAttempt2 != null);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt1.getInternalState());
        assertEquals(PSAttemptStateInternal.RUNNING, psAttempt2.getInternalState());
        assertEquals(AMParameterServerState.RUNNING, amPs.getState());
        assertEquals(3, amPs.getNextAttemptNumber());
        assertEquals(psAttempt2Id, amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(3, amPs.getPSAttempts().size());
        diagnostics = amPs.getDiagnostics();
        assertEquals(2, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        assertEquals(psAttempt1Id + " failed due to: out of memory", diagnostics.get(1));
        ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
        // w1 was clocked once more after attempt 0 failed; w2 was not touched.
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        // 2 clients * 100000 elements * 2 per element.
        assertEquals(400000, sum((DenseIntVector) w1Task0Client.getRow(0)));

        // attempt 2: stop it, report the error, then expect attempt 3 to be running.
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
        assertTrue(psAttempt3 != null);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt2.getInternalState());
        assertEquals(PSAttemptStateInternal.RUNNING, psAttempt3.getInternalState());
        assertEquals(AMParameterServerState.RUNNING, amPs.getState());
        assertEquals(4, amPs.getNextAttemptNumber());
        assertEquals(psAttempt3Id, amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(4, amPs.getPSAttempts().size());
        diagnostics = amPs.getDiagnostics();
        assertEquals(3, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        assertEquals(psAttempt1Id + " failed due to: out of memory", diagnostics.get(1));
        assertEquals(psAttempt2Id + " failed due to: out of memory", diagnostics.get(2));
        ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);

        // attempt 3: the last allowed attempt; its failure must fail the PS and the application.
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt3.getInternalState());
        assertEquals(AMParameterServerState.FAILED, amPs.getState());
        assertEquals(AppState.FAILED, angelAppMaster.getAppContext().getApp().getExternAppState());
        assertEquals(4, amPs.getNextAttemptNumber());
        assertNull(amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(4, amPs.getPSAttempts().size());
        diagnostics = amPs.getDiagnostics();
        assertEquals(4, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        assertEquals(psAttempt1Id + " failed due to: out of memory", diagnostics.get(1));
        assertEquals(psAttempt2Id + " failed due to: out of memory", diagnostics.get(2));
        assertEquals(psAttempt3Id + " failed due to: out of memory", diagnostics.get(3));
    } catch (Exception x) {
        // Log with context before rethrowing so the failure is visible in the test log.
        LOG.error("run testPSError failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.psagent.task.TaskContext) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) AngelException(com.tencent.angel.exception.AngelException) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) DenseIntVector(com.tencent.angel.ml.math.vector.DenseIntVector) TConnection(com.tencent.angel.ipc.TConnection) PSAttemptId(com.tencent.angel.ps.PSAttemptId) ClockVectorManager(com.tencent.angel.ps.impl.ClockVectorManager) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 2 with ClockVectorManager

use of com.tencent.angel.ps.impl.ClockVectorManager in project angel by Tencent.

the class MasterRecoverTest method checkMatrixInfo.

/**
 * Asserts that the given parameter server stores matrices w1 and w2 with the expected
 * partition layout and partition clocks.
 *
 * <p>Each matrix is expected to have two partitions covering row range [0, 1): partition 0
 * with columns [0, 50000) and partition 1 with columns [50000, 100000).
 *
 * @param ps      parameter server whose storage and clock vectors are inspected
 * @param w1Id    matrix id of w1
 * @param w2Id    matrix id of w2
 * @param w1Clock expected clock of every w1 partition
 * @param w2Clock expected clock of every w2 partition
 */
private void checkMatrixInfo(ParameterServer ps, int w1Id, int w2Id, int w1Clock, int w2Clock) {
    MatrixStorageManager matrixPartManager = ps.getMatrixStorageManager();
    ClockVectorManager clockVectorManager = ps.getClockVectorManager();
    ConcurrentHashMap<Integer, ServerMatrix> matrixIdMap = matrixPartManager.getMatrices();
    ServerMatrix sw1 = matrixIdMap.get(w1Id);
    ServerMatrix sw2 = matrixIdMap.get(w2Id);
    assertTrue(sw1 != null);
    assertTrue(sw2 != null);
    LOG.info("======================partition key is " + sw1.getPartition(0).getPartitionKey());
    LOG.info("======================partition key is " + sw1.getPartition(1).getPartitionKey());
    assertServerPartition(sw1, clockVectorManager, 0, w1Id, 0, 50000, w1Clock);
    assertServerPartition(sw1, clockVectorManager, 1, w1Id, 50000, 100000, w1Clock);
    assertServerPartition(sw2, clockVectorManager, 0, w2Id, 0, 50000, w2Clock);
    assertServerPartition(sw2, clockVectorManager, 1, w2Id, 50000, 100000, w2Clock);
}

/**
 * Asserts the partition key fields and the clock of one partition of a server matrix.
 * Arguments to {@code assertEquals} are passed as (expected, actual).
 */
private void assertServerPartition(ServerMatrix matrix, ClockVectorManager clockVectorManager, int partId, int matrixId, int startCol, int endCol, int clock) {
    assertEquals(0, matrix.getPartition(partId).getPartitionKey().getStartRow());
    assertEquals(1, matrix.getPartition(partId).getPartitionKey().getEndRow());
    assertEquals(startCol, matrix.getPartition(partId).getPartitionKey().getStartCol());
    assertEquals(endCol, matrix.getPartition(partId).getPartitionKey().getEndCol());
    assertEquals(matrixId, matrix.getPartition(partId).getPartitionKey().getMatrixId());
    assertEquals(partId, matrix.getPartition(partId).getPartitionKey().getPartitionId());
    assertEquals(clock, clockVectorManager.getPartClock(matrix.getId(), partId));
}
Also used : ClockVectorManager(com.tencent.angel.ps.impl.ClockVectorManager) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) MatrixStorageManager(com.tencent.angel.ps.impl.MatrixStorageManager)

Example 3 with ClockVectorManager

use of com.tencent.angel.ps.impl.ClockVectorManager in project angel by Tencent.

the class PSManagerTest method checkMatrixInfo.

/**
 * Asserts that the given parameter server knows the metadata of matrices w1 and w2 with the
 * expected partition layout and partition clocks.
 *
 * <p>Each matrix is expected to have two partitions covering row range [0, 1): partition 0
 * with columns [0, 50000) and partition 1 with columns [50000, 100000).
 *
 * @param ps      parameter server whose matrix metadata and clock vectors are inspected
 * @param w1Id    matrix id of w1
 * @param w2Id    matrix id of w2
 * @param w1Clock expected clock of every w1 partition
 * @param w2Clock expected clock of every w2 partition
 */
private void checkMatrixInfo(ParameterServer ps, int w1Id, int w2Id, int w1Clock, int w2Clock) {
    PSMatrixMetaManager matrixPartManager = ps.getMatrixMetaManager();
    ClockVectorManager clockVectorManager = ps.getClockVectorManager();
    Map<Integer, MatrixMeta> matrixIdMap = matrixPartManager.getMatrixMetas();
    MatrixMeta sw1 = matrixIdMap.get(w1Id);
    MatrixMeta sw2 = matrixIdMap.get(w2Id);
    assertNotNull(sw1);
    assertNotNull(sw2);
    assertPartitionMeta(sw1, clockVectorManager, 0, w1Id, 0, 50000, w1Clock);
    assertPartitionMeta(sw1, clockVectorManager, 1, w1Id, 50000, 100000, w1Clock);
    assertPartitionMeta(sw2, clockVectorManager, 0, w2Id, 0, 50000, w2Clock);
    assertPartitionMeta(sw2, clockVectorManager, 1, w2Id, 50000, 100000, w2Clock);
}

/**
 * Asserts the partition key fields and the clock of one partition described by a matrix meta.
 * Arguments to {@code assertEquals} are passed as (expected, actual).
 */
private void assertPartitionMeta(MatrixMeta meta, ClockVectorManager clockVectorManager, int partId, int matrixId, int startCol, int endCol, int clock) {
    assertEquals(0, meta.getPartitionMeta(partId).getPartitionKey().getStartRow());
    assertEquals(1, meta.getPartitionMeta(partId).getPartitionKey().getEndRow());
    assertEquals(startCol, meta.getPartitionMeta(partId).getPartitionKey().getStartCol());
    assertEquals(endCol, meta.getPartitionMeta(partId).getPartitionKey().getEndCol());
    assertEquals(matrixId, meta.getPartitionMeta(partId).getPartitionKey().getMatrixId());
    assertEquals(partId, meta.getPartitionMeta(partId).getPartitionKey().getPartitionId());
    assertEquals(clock, clockVectorManager.getPartClock(meta.getId(), partId));
}
Also used : PSMatrixMetaManager(com.tencent.angel.ps.impl.PSMatrixMetaManager) ClockVectorManager(com.tencent.angel.ps.impl.ClockVectorManager) MatrixMeta(com.tencent.angel.ml.matrix.MatrixMeta)

Aggregations

ClockVectorManager (com.tencent.angel.ps.impl.ClockVectorManager)3 Location (com.tencent.angel.common.location.Location)1 AngelException (com.tencent.angel.exception.AngelException)1 TConnection (com.tencent.angel.ipc.TConnection)1 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)1 PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt)1 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)1 DenseIntVector (com.tencent.angel.ml.math.vector.DenseIntVector)1 MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)1 PSAttemptId (com.tencent.angel.ps.PSAttemptId)1 MatrixStorageManager (com.tencent.angel.ps.impl.MatrixStorageManager)1 PSMatrixMetaManager (com.tencent.angel.ps.impl.PSMatrixMetaManager)1 ParameterServer (com.tencent.angel.ps.impl.ParameterServer)1 ServerMatrix (com.tencent.angel.ps.impl.matrix.ServerMatrix)1 MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient)1 TaskContext (com.tencent.angel.psagent.task.TaskContext)1 Worker (com.tencent.angel.worker.Worker)1 Test (org.junit.Test)1