Search in sources :

Example 46 with MatrixClient

use of com.tencent.angel.psagent.matrix.MatrixClient in project angel by Tencent.

the class UpdateRowsTest method testSparseIntLongKeyUDF.

/**
 * Checks that rows written to the {@code SPARSE_INT_LONG_MAT} matrix can be read back unchanged.
 *
 * <p>Three write paths are exercised in turn, with the matrix zeroed between them:
 * <ol>
 *   <li>one {@code update(rowId, vector)} call per row,</li>
 *   <li>a single batched {@code update(rowIds, vectors)} call,</li>
 *   <li>a single batched {@code increment(rowIds, vectors, true)} call.</li>
 * </ol>
 * After each write the rows are fetched with {@code get(rowIds, index)} and every indexed
 * element is compared with the value that was written.
 *
 * @throws Exception if any cluster, update or get operation fails
 */
public void testSparseIntLongKeyUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(SPARSE_INT_LONG_MAT, 0);
    long[] index = genLongIndexs(feaNum, nnz);

    // Phase 1: push the same sparse vector to every row, one update call per row.
    LongIntVector deltaVec = null;
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = new LongIntVector(feaNum, new LongIntSparseVectorStorage(feaNum));
        for (long key : index) {
            deltaVec.set(key, (int) key);
        }
        client1.update(rowId, deltaVec);
    }

    int[] rowIds = new int[rowNum];
    for (int i = 0; i < rowNum; i++) {
        rowIds[i] = i;
    }

    // Every row received identical content, so the last vector built is the expected value.
    Vector[] rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        LongIntVector fetched = (LongIntVector) rows[i];
        for (long id : index) {
            Assert.assertEquals(deltaVec.get(id), fetched.get(id), zero);
        }
    }

    // Phase 2: reset the matrix, then push all rows with one batched update.
    client1.zero();
    LongIntVector[] deltaVecs = new LongIntVector[rowNum];
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = VFactory.sparseLongKeyIntVector(feaNum, index.length);
        for (long key : index) {
            deltaVec.set(key, (int) key);
        }
        deltaVecs[rowId] = deltaVec;
    }
    client1.update(rowIds, deltaVecs);
    rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        LongIntVector fetched = (LongIntVector) rows[i];
        for (long id : index) {
            Assert.assertEquals(deltaVecs[i].get(id), fetched.get(id), zero);
        }
    }

    // Phase 3: reset again; incrementing a zeroed matrix must yield exactly the deltas.
    client1.zero();
    deltaVecs = new LongIntVector[rowNum];
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = VFactory.sparseLongKeyIntVector(feaNum, index.length);
        for (long key : index) {
            deltaVec.set(key, (int) key);
        }
        deltaVecs[rowId] = deltaVec;
    }
    client1.increment(rowIds, deltaVecs, true);
    rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        LongIntVector fetched = (LongIntVector) rows[i];
        for (long id : index) {
            Assert.assertEquals(deltaVecs[i].get(id), fetched.get(id), zero);
        }
    }
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Example 47 with MatrixClient

use of com.tencent.angel.psagent.matrix.MatrixClient in project angel by Tencent.

the class UpdateRowsTest method testDenseIntUDF.

/**
 * Checks that rows written to the {@code DENSE_INT_MAT} matrix can be read back unchanged.
 *
 * <p>Three write paths are exercised in turn, with the matrix zeroed between them:
 * <ol>
 *   <li>one {@code update(rowId, vector)} call per row using a fully populated dense vector,</li>
 *   <li>a single batched {@code update(rowIds, vectors)} call using sparse vectors,</li>
 *   <li>a single batched {@code increment(rowIds, vectors, true)} call using sparse vectors.</li>
 * </ol>
 * After each write the rows are fetched with {@code get(rowIds, index)} and every indexed
 * element is compared with the value that was written.
 *
 * @throws Exception if any cluster, update or get operation fails
 */
public void testDenseIntUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(DENSE_INT_MAT, 0);
    int[] index = genIndexs(feaNum, nnz);

    // Phase 1: push a dense vector whose element i holds the value i, one update call per row.
    IntIntVector deltaVec = null;
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = new IntIntVector(feaNum, new IntIntDenseVectorStorage(feaNum));
        for (int i = 0; i < feaNum; i++) {
            deltaVec.set(i, i);
        }
        client1.update(rowId, deltaVec);
    }

    int[] rowIds = new int[rowNum];
    for (int i = 0; i < rowNum; i++) {
        rowIds[i] = i;
    }

    // Every row received identical content, so the last vector built is the expected value.
    Vector[] rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        IntIntVector fetched = (IntIntVector) rows[i];
        for (int id : index) {
            Assert.assertEquals(deltaVec.get(id), fetched.get(id), zero);
        }
    }

    // Phase 2: reset the matrix, then push all rows with one batched update.
    client1.zero();
    IntIntVector[] deltaVecs = new IntIntVector[rowNum];
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = VFactory.sparseIntVector(feaNum, index.length);
        for (int key : index) {
            deltaVec.set(key, key);
        }
        deltaVecs[rowId] = deltaVec;
    }
    client1.update(rowIds, deltaVecs);
    rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        IntIntVector fetched = (IntIntVector) rows[i];
        for (int id : index) {
            Assert.assertEquals(deltaVecs[i].get(id), fetched.get(id), zero);
        }
    }

    // Phase 3: reset again; incrementing a zeroed matrix must yield exactly the deltas.
    client1.zero();
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = VFactory.sparseIntVector(feaNum, index.length);
        for (int key : index) {
            deltaVec.set(key, key);
        }
        deltaVecs[rowId] = deltaVec;
    }
    client1.increment(rowIds, deltaVecs, true);
    rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        IntIntVector fetched = (IntIntVector) rows[i];
        for (int id : index) {
            Assert.assertEquals(deltaVecs[i].get(id), fetched.get(id), zero);
        }
    }
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Example 48 with MatrixClient

use of com.tencent.angel.psagent.matrix.MatrixClient in project angel by Tencent.

the class UpdateRowsTest method testDenseLongUDF.

/**
 * Checks that rows written to the {@code DENSE_LONG_MAT} matrix can be read back unchanged.
 *
 * <p>Three write paths are exercised in turn, with the matrix zeroed between them:
 * <ol>
 *   <li>one {@code update(rowId, vector)} call per row using a fully populated dense vector,</li>
 *   <li>a single batched {@code update(rowIds, vectors)} call using sparse vectors,</li>
 *   <li>a single batched {@code increment(rowIds, vectors, true)} call using sparse vectors.</li>
 * </ol>
 * After each write the rows are fetched with {@code get(rowIds, index)} and every indexed
 * element is compared with the value that was written.
 *
 * @throws Exception if any cluster, update or get operation fails
 */
public void testDenseLongUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(DENSE_LONG_MAT, 0);
    int[] index = genIndexs(feaNum, nnz);

    // Phase 1: push a dense vector whose element i holds the value i, one update call per row.
    IntLongVector deltaVec = null;
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = new IntLongVector(feaNum, new IntLongDenseVectorStorage(feaNum));
        for (int i = 0; i < feaNum; i++) {
            deltaVec.set(i, i);
        }
        client1.update(rowId, deltaVec);
    }

    int[] rowIds = new int[rowNum];
    for (int i = 0; i < rowNum; i++) {
        rowIds[i] = i;
    }

    // Every row received identical content, so the last vector built is the expected value.
    Vector[] rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        IntLongVector fetched = (IntLongVector) rows[i];
        for (int id : index) {
            Assert.assertEquals(deltaVec.get(id), fetched.get(id), zero);
        }
    }

    // Phase 2: reset the matrix, then push all rows with one batched update.
    client1.zero();
    IntLongVector[] deltaVecs = new IntLongVector[rowNum];
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = VFactory.sparseLongVector(feaNum, index.length);
        for (int key : index) {
            deltaVec.set(key, key);
        }
        deltaVecs[rowId] = deltaVec;
    }
    client1.update(rowIds, deltaVecs);
    rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        IntLongVector fetched = (IntLongVector) rows[i];
        for (int id : index) {
            Assert.assertEquals(deltaVecs[i].get(id), fetched.get(id), zero);
        }
    }

    // Phase 3: reset again; incrementing a zeroed matrix must yield exactly the deltas.
    client1.zero();
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = VFactory.sparseLongVector(feaNum, index.length);
        for (int key : index) {
            deltaVec.set(key, key);
        }
        deltaVecs[rowId] = deltaVec;
    }
    client1.increment(rowIds, deltaVecs, true);
    rows = client1.get(rowIds, index);
    for (int i = 0; i < rowNum; i++) {
        IntLongVector fetched = (IntLongVector) rows[i];
        for (int id : index) {
            Assert.assertEquals(deltaVecs[i].get(id), fetched.get(id), zero);
        }
    }
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Example 49 with MatrixClient

use of com.tencent.angel.psagent.matrix.MatrixClient in project angel by Tencent.

the class PSManagerTest method testPSError.

/**
 * Exercises the master's handling of repeated parameter-server (PS) failures.
 *
 * <p>The test stops the currently running PS attempt and reports a {@code psError} to the master
 * four times in a row. After each of the first three failures it asserts that the failed attempt
 * is FAILED, that a new attempt is RUNNING, and that the attempt counter and diagnostics list
 * grew by one. After the fourth failure it asserts that the PS and the application are FAILED
 * and that no further attempt was created.
 *
 * <p>Between the first and second failure it also increments row 0 of matrix "w1" from two task
 * clients and later asserts that the row sums to 400000 when read through the third attempt.
 *
 * @throws Exception any failure is logged and then rethrown so the test fails
 */
@Test
public void testPSError() throws Exception {
    try {
        int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        // Open an RPC connection to the master using the PS configuration.
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());
        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // Expected per-matrix clock: the smaller of the two task clocks for that matrix.
        int w1Clock = (task0w1Clock < task1w1Clock) ? task0w1Clock : task1w1Clock;
        int w2Clock = (task0w2Clock < task1w2Clock) ? task0w2Clock : task1w2Clock;
        // NOTE(review): these two contexts are not used again in this method.
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
        // Report the iteration and the per-matrix clocks of both tasks to the master.
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
        // The test relies on exactly four attempts (ids 0..3) being allowed.
        assertEquals(amPs.getMaxAttempts(), 4);
        PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
        PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
        PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);
        // attempt 0: stop the PS and report an error for it to the master
        ps.stop(-1);
        PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
        master.psError(null, request);
        // Wait two heartbeat intervals before inspecting the master state
        // (presumably long enough for the next attempt to start — TODO confirm).
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
        assertTrue(psAttempt1 != null);
        assertEquals(psAttempt0.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 2);
        assertEquals(amPs.getRunningAttemptId(), psAttempt1Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 2);
        List<String> diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 1);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        // checkMatrixInfo is defined elsewhere; it receives the new PS and the expected clocks.
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);
        // Both task clients add a vector of 100000 twos to row 0 of w1, then advance the clock.
        MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
        MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
        int matrixW1Id = w1Task0Client.getMatrixId();
        int[] delta = new int[100000];
        for (int i = 0; i < 100000; i++) {
            delta[i] = 2;
        }
        IntIntVector deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task0Client.increment(deltaVec);
        deltaVec = new IntIntVector(100000, new IntIntDenseVectorStorage(delta));
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task1Client.increment(deltaVec);
        w1Task0Client.clock().get();
        w1Task1Client.clock().get();
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
        // Wait two backup intervals (presumably so a backup containing the increments
        // exists before the PS is stopped — TODO confirm).
        Thread.sleep(snapshotInterval * 2);
        // attempt 1: stop the PS and report an error for it to the master
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
        assertTrue(psAttempt2 != null);
        assertEquals(psAttempt1.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 3);
        assertEquals(amPs.getRunningAttemptId(), psAttempt2Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 3);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 2);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
        // The w1 clock is expected to be one higher after the clock() calls above.
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        // 2 clients * 100000 elements * value 2 = 400000.
        assertEquals(sum((IntIntVector) w1Task0Client.getRow(0)), 400000);
        // attempt 2: stop the PS and report an error for it to the master
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
        assertTrue(psAttempt3 != null);
        assertEquals(psAttempt2.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.RUNNING);
        assertEquals(amPs.getState(), AMParameterServerState.RUNNING);
        assertEquals(amPs.getNextAttemptNumber(), 4);
        assertEquals(amPs.getRunningAttemptId(), psAttempt3Id);
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 4);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 3);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
        ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        // attempt 3: the last allowed attempt; after it fails the PS and the app must be FAILED
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        assertEquals(psAttempt3.getInternalState(), PSAttemptStateInternal.FAILED);
        assertEquals(amPs.getState(), AMParameterServerState.FAILED);
        assertEquals(angelAppMaster.getAppContext().getApp().getExternAppState(), AppState.FAILED);
        // No fifth attempt is created: the counter stays at 4 and nothing is running.
        assertEquals(amPs.getNextAttemptNumber(), 4);
        assertNull(amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(amPs.getPSAttempts().size(), 4);
        diagnostics = amPs.getDiagnostics();
        assertEquals(diagnostics.size(), 4);
        assertEquals(diagnostics.get(0), psAttempt0Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(1), psAttempt1Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(2), psAttempt2Id + " failed due to: out of memory");
        assertEquals(diagnostics.get(3), psAttempt3Id + " failed due to: out of memory");
    } catch (Exception x) {
        // Log for easier diagnosis, then rethrow so the test is still reported as failed.
        LOG.error("run testPSError failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.psagent.task.TaskContext) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) IntIntVector(com.tencent.angel.ml.math2.vector.IntIntVector) AngelException(com.tencent.angel.exception.AngelException) ParameterServer(com.tencent.angel.ps.ParameterServer) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) TConnection(com.tencent.angel.ipc.TConnection) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) IntIntDenseVectorStorage(com.tencent.angel.ml.math2.storage.IntIntDenseVectorStorage) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 50 with MatrixClient

use of com.tencent.angel.psagent.matrix.MatrixClient in project angel by Tencent.

the class GetRowsTest method testSparseFloatUDF.

/**
 * Checks that whole rows of the {@code SPARSE_FLOAT_MAT} matrix can be fetched with
 * {@code getRows(rowIds)} after being populated through per-row increments.
 *
 * <p>Each row is incremented with a sparse vector that maps every generated index to its own
 * value. The fetched rows must contain those values at those indices, and each row's
 * {@code size()} must equal the number of generated indices.
 *
 * @throws Exception if any cluster, increment or get operation fails
 */
public void testSparseFloatUDF() throws Exception {
    Worker worker = LocalClusterContext.get().getWorker(workerAttempt0Id).getWorker();
    MatrixClient client1 = worker.getPSAgent().getMatrixClient(SPARSE_FLOAT_MAT, 0);
    int[] index = genIndexs(feaNum, nnz);

    // Increment every row with the same sparse content, one call per row.
    IntFloatVector deltaVec = null;
    for (int rowId = 0; rowId < rowNum; rowId++) {
        deltaVec = new IntFloatVector(feaNum, new IntFloatSparseVectorStorage(feaNum));
        for (int key : index) {
            deltaVec.set(key, key);
        }
        client1.increment(rowId, deltaVec, true);
    }

    int[] rowIds = new int[rowNum];
    for (int i = 0; i < rowNum; i++) {
        rowIds[i] = i;
    }

    // Every row received identical content, so the last vector built is the expected value.
    Vector[] rows = client1.getRows(rowIds);
    for (int i = 0; i < rowNum; i++) {
        IntFloatVector fetched = (IntFloatVector) rows[i];
        for (int id : index) {
            Assert.assertEquals(deltaVec.get(id), fetched.get(id), zero);
        }
        Assert.assertEquals(index.length, fetched.size());
    }
}
Also used : Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient)

Aggregations

MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient)198 Worker (com.tencent.angel.worker.Worker)183 IntDoubleVector (com.tencent.angel.ml.math2.vector.IntDoubleVector)14 IntFloatVector (com.tencent.angel.ml.math2.vector.IntFloatVector)14 LongFloatVector (com.tencent.angel.ml.math2.vector.LongFloatVector)14 IntIntVector (com.tencent.angel.ml.math2.vector.IntIntVector)13 Vector (com.tencent.angel.ml.math2.vector.Vector)13 IncrementRows (com.tencent.angel.ml.matrix.psf.update.update.IncrementRows)13 IncrementRowsParam (com.tencent.angel.ml.matrix.psf.update.update.IncrementRowsParam)13 Test (org.junit.Test)13 IntLongVector (com.tencent.angel.ml.math2.vector.IntLongVector)12 LongDoubleVector (com.tencent.angel.ml.math2.vector.LongDoubleVector)12 LongIntVector (com.tencent.angel.ml.math2.vector.LongIntVector)12 LongLongVector (com.tencent.angel.ml.math2.vector.LongLongVector)12 AngelException (com.tencent.angel.exception.AngelException)10 LongIndexGet (com.tencent.angel.ml.matrix.psf.get.indexed.LongIndexGet)9 LongIndexGetParam (com.tencent.angel.ml.matrix.psf.get.indexed.LongIndexGetParam)9 TVector (com.tencent.angel.ml.math.TVector)6 MasterServiceTest (com.tencent.angel.master.MasterServiceTest)5 DenseDoubleVector (com.tencent.angel.ml.math.vector.DenseDoubleVector)5