Search in sources :

Example 46 with Worker

use of com.tencent.angel.worker.Worker in project angel by Tencent.

the class PSManagerTest method testPSError.

@Test
public void testPSError() throws Exception {
    try {
        int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
        int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
        assertEquals(amPs.getMaxAttempts(), 4);
        PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
        PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
        PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);
        // attempt 0
        ps.stop(-1);
        PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
        assertTrue(psAttempt1 != null);
        assertEquals(psAttempt0.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 2);
        assertEquals(amPs.getRunningAttemptId(), psAttempt1Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 2);
        List<String> diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 1);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
        MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
        MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
        int matrixW1Id = w1Task0Client.getMatrixId();
        int[] delta = new int[100000];
        for (int i = 0; i < 100000; i++) {
            delta[i] = 2;
        }
        IntIntVector deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task0Client.increment(deltaVec);
        deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task1Client.increment(deltaVec);
        w1Task0Client.clock().get();
        w1Task1Client.clock().get();
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
        Thread.sleep(snapshotInterval * 2);
        // attempt1
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
        assertTrue(psAttempt2 != null);
        assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 3);
        assertEquals(amPs.getRunningAttemptId(), psAttempt2Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 3);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 2);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        assertEquals(sum((IntIntVector) w1Task0Client.getRow(0)), 400000);
        // attempt1
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
        assertTrue(psAttempt3 != null);
        assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 4);
        assertEquals(amPs.getRunningAttemptId(), psAttempt3Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 4);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 3);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(amPs.getState(), AMParameterServerState.FAILED);
        assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
        assertEquals(amPs.getNextAttemptNumber(), 4);
        assertNull(amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 4);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 4);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(3), psAttempt3Id + " failed due to: out of memory");
    } catch (Exception x) {
        LOG.error("run testPSError failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.psagent.task.TaskContext) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) IntIntVector(com.tencent.angel.ml.math2.vector.IntIntVector) AngelException(com.tencent.angel.exception.AngelException) ParameterServer(com.tencent.angel.ps.ParameterServer) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) TConnection(com.tencent.angel.ipc.TConnection) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) IntIntDenseVectorStorage(com.tencent.angel.ml.math2.storage.IntIntDenseVectorStorage) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 47 with Worker

use of com.tencent.angel.worker.Worker in project angel by Tencent.

the class GetRowsTest method testSparseFloatUDF.

public void testSparseFloatUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(SPARSE_FLOAT_MAT, 0);
    int[] index = genIndexs(feaNum, nnz);
    IntFloatVector deltaVec = null;
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = new IntFloatVector(feaNum, new IntFloatSparseVectorStorage(feaNum));
        for (int i = 0; i < index.length; i++) {
            deltaVec.set(index[i], index[i]);
        }
        client1.increment(rowId, deltaVec, true);
    }
    int[] rowIds = new int[rowNum];
    for (int i = 0; i < rowNum; i++) {
        rowIds[i] = i;
    }
    Vector[] rows = client1.getRows(rowIds);
    for (int i = 0; i < rowNum; i++) {
        for (int id : index) {
            Assert.assertEquals(((IntFloatVector) rows[i]).get(id), deltaVec.get(id), zero);
        }
        Assert.assertEquals(index.length, ((IntFloatVector) rows[i]).size());
    }
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Example 48 with Worker

use of com.tencent.angel.worker.Worker in project angel by Tencent.

the class GetRowsTest method testSparseLongUDF.

public void testSparseLongUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(SPARSE_LONG_MAT, 0);
    int matrixW1Id = client1.getMatrixId();
    int[] index = genIndexs(feaNum, nnz);
    IntLongVector deltaVec = null;
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = new IntLongVector(feaNum, new IntLongSparseVectorStorage(feaNum));
        for (int i = 0; i < feaNum; i++) deltaVec.set(i, i);
        client1.increment(rowId, deltaVec, true);
    }
    int[] rowIds = new int[rowNum];
    for (int i = 0; i < rowNum; i++) {
        rowIds[i] = i;
    }
    Vector[] rows = client1.getRows(rowIds);
    for (int i = 0; i < rowNum; i++) {
        for (int id : index) {
            Assert.assertEquals(((IntLongVector) rows[i]).get(id), deltaVec.get(id), zero);
        }
    }
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Example 49 with Worker

use of com.tencent.angel.worker.Worker in project angel by Tencent.

the class IncrementRowTest method testSparseLongLongKeyUDF.

public void testSparseLongLongKeyUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(SPARSE_LONG_LONG_MAT, 0);
    long[] index = genLongIndexs(feaNum, nnz);
    LongLongVector deltaVec = new LongLongVector(feaNum, new LongLongSparseVectorStorage(feaNum, nnz));
    for (int i = 0; i < nnz; i++) deltaVec.set(index[i], index[i]);
    deltaVec.setRowId(0);
    client1.increment(deltaVec, true);
    LongLongVector row = (LongLongVector) client1.getRow(0);
    for (long id : index) {
        // System.out.println("id=" + id + ", value=" + row.get(id));
        Assert.assertEquals(row.get(id), deltaVec.get(id));
    }
// Assert.assertTrue(index.length == row.size());
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Example 50 with Worker

use of com.tencent.angel.worker.Worker in project angel by Tencent.

the class IncrementRowTest method testDenseDoubleUDF.

public void testDenseDoubleUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(DENSE_DOUBLE_MAT, 0);
    int[] index = genIndexs(feaNum, nnz);
    IntDoubleVector deltaVec = new IntDoubleVector(feaNum, new IntDoubleDenseVectorStorage(feaNum));
    for (int i = 0; i < feaNum; i++) deltaVec.set(i, i);
    deltaVec.setRowId(0);
    client1.increment(deltaVec, true);
    IntDoubleVector row = (IntDoubleVector) client1.getRow(0);
    for (int id : index) {
        Assert.assertTrue(row.get(id) == deltaVec.get(id));
    }
    Assert.assertEquals(feaNum, row.size());
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Aggregations

Worker (com.tencent.angel.worker.Worker)196 MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient)183 Test (org.junit.Test)21 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)13 IntDoubleVector (com.tencent.angel.ml.math2.vector.IntDoubleVector)13 IntFloatVector (com.tencent.angel.ml.math2.vector.IntFloatVector)13 IntIntVector (com.tencent.angel.ml.math2.vector.IntIntVector)13 IntLongVector (com.tencent.angel.ml.math2.vector.IntLongVector)12 LongDoubleVector (com.tencent.angel.ml.math2.vector.LongDoubleVector)12 LongFloatVector (com.tencent.angel.ml.math2.vector.LongFloatVector)12 LongIntVector (com.tencent.angel.ml.math2.vector.LongIntVector)12 LongLongVector (com.tencent.angel.ml.math2.vector.LongLongVector)12 Vector (com.tencent.angel.ml.math2.vector.Vector)12 IncrementRows (com.tencent.angel.ml.matrix.psf.update.update.IncrementRows)12 IncrementRowsParam (com.tencent.angel.ml.matrix.psf.update.update.IncrementRowsParam)12 WorkerManager (com.tencent.angel.master.worker.WorkerManager)10 Location (com.tencent.angel.common.location.Location)9 AngelApplicationMaster (com.tencent.angel.master.AngelApplicationMaster)9 LongIndexGet (com.tencent.angel.ml.matrix.psf.get.indexed.LongIndexGet)8 LongIndexGetParam (com.tencent.angel.ml.matrix.psf.get.indexed.LongIndexGetParam)8