Search in sources:

Example 1 with ParameterServer

use of com.tencent.angel.ps.impl.ParameterServer in project angel by Tencent.

The method testCreateMatrix of the class MatrixMetaManagerTest.

/**
 * Creates matrices "w3" and "w4" through the master client and checks that their metadata is
 * reported consistently by the worker's PS agent, the application master and the parameter
 * server. It then pushes updates to "w4" from two task clients, checks the iterations and
 * clocks recorded on the master, releases "w3" and finally checks that "w4" still returns the
 * same data.
 *
 * @throws Exception if any cluster call or assertion helper fails; the failure is logged and
 *     rethrown so the test is reported as failed
 */
@Test
public void testCreateMatrix() throws Exception {
    try {
        LOG.info("===========================testCreateMatrix===============================");
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        MasterClient masterClient = worker.getPSAgent().getMasterClient();

        // Create matrix "w3": 1 row x 100000 columns, at most 1 row x 50000 columns per block.
        MatrixContext mMatrix = new MatrixContext();
        mMatrix.setName("w3");
        mMatrix.setRowNum(1);
        mMatrix.setColNum(100000);
        mMatrix.setMaxRowNumInBlock(1);
        mMatrix.setMaxColNumInBlock(50000);
        mMatrix.setRowType(RowType.T_DOUBLE_DENSE);
        mMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        mMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
        mMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
        mMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_DOUBLE");
        masterClient.createMatrix(mMatrix, 10000);

        // Create matrix "w4" with the same settings, reusing the same context object.
        mMatrix.setName("w4");
        mMatrix.setRowNum(1);
        mMatrix.setColNum(100000);
        mMatrix.setMaxRowNumInBlock(1);
        mMatrix.setMaxColNumInBlock(50000);
        mMatrix.setRowType(RowType.T_DOUBLE_DENSE);
        mMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        mMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
        mMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
        mMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_DOUBLE");
        masterClient.createMatrix(mMatrix, 10000);

        // Metadata as seen by the worker's PS agent.
        MatrixMeta w3Meta = worker.getPSAgent().getMatrixMetaManager().getMatrixMeta("w3");
        MatrixMeta w4Meta = worker.getPSAgent().getMatrixMetaManager().getMatrixMeta("w4");
        assertEquals(1, w3Meta.getRowNum());
        assertEquals(100000, w3Meta.getColNum());
        assertEquals(RowType.T_DOUBLE_DENSE, w3Meta.getRowType());
        assertEquals(1, w4Meta.getRowNum());
        assertEquals(100000, w4Meta.getColNum());
        assertEquals(RowType.T_DOUBLE_DENSE, w4Meta.getRowType());
        int w3Id = w3Meta.getId();
        int w4Id = w4Meta.getId();

        // Metadata as seen by the application master.
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertNotNull(angelAppMaster);
        AMMatrixMetaManager matrixMetaManager = angelAppMaster.getAppContext().getMatrixMetaManager();
        MatrixMeta matrixw3Proto = matrixMetaManager.getMatrix("w3");
        MatrixMeta matrixw4Proto = matrixMetaManager.getMatrix("w4");
        assertNotNull(matrixw3Proto);
        assertNotNull(matrixw4Proto);
        assertEquals(1, matrixw3Proto.getRowNum());
        assertEquals(100000, matrixw3Proto.getColNum());
        assertEquals(2, matrixw3Proto.getPartitionMetas().size());
        Map<Integer, PartitionMeta> w3Parts = matrixw3Proto.getPartitionMetas();
        assertEquals(psId, w3Parts.get(0).getPss().get(0));
        assertEquals(0, w3Parts.get(0).getPartId());
        assertEquals(0, w3Parts.get(0).getStartRow());
        assertEquals(1, w3Parts.get(0).getEndRow());
        assertEquals(0, w3Parts.get(0).getStartCol());
        assertEquals(50000, w3Parts.get(0).getEndCol());
        assertEquals(1, w3Parts.get(1).getPartId());
        assertEquals(0, w3Parts.get(1).getStartRow());
        assertEquals(1, w3Parts.get(1).getEndRow());
        assertEquals(50000, w3Parts.get(1).getStartCol());
        assertEquals(100000, w3Parts.get(1).getEndCol());
        Map<Integer, PartitionMeta> w4Parts = matrixw4Proto.getPartitionMetas();
        assertEquals(psId, w4Parts.get(0).getPss().get(0));
        assertEquals(0, w4Parts.get(0).getPartId());
        assertEquals(0, w4Parts.get(0).getStartRow());
        assertEquals(1, w4Parts.get(0).getEndRow());
        assertEquals(0, w4Parts.get(0).getStartCol());
        assertEquals(50000, w4Parts.get(0).getEndCol());
        assertEquals(1, w4Parts.get(1).getPartId());
        assertEquals(0, w4Parts.get(1).getStartRow());
        assertEquals(1, w4Parts.get(1).getEndRow());
        assertEquals(50000, w4Parts.get(1).getStartCol());
        assertEquals(100000, w4Parts.get(1).getEndCol());

        // Partition metadata as seen by the parameter server.
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        PSMatrixMetaManager matrixPartManager = ps.getMatrixMetaManager();
        PartitionMeta w3Part0 = matrixPartManager.getPartMeta(w3Id, 0);
        PartitionMeta w3Part1 = matrixPartManager.getPartMeta(w3Id, 1);
        assertNotNull(w3Part0);
        assertNotNull(w3Part1);
        assertEquals(0, w3Part0.getPartitionKey().getStartRow());
        assertEquals(1, w3Part0.getPartitionKey().getEndRow());
        assertEquals(0, w3Part0.getPartitionKey().getStartCol());
        assertEquals(50000, w3Part0.getPartitionKey().getEndCol());
        assertEquals(0, w3Part1.getPartitionKey().getStartRow());
        assertEquals(1, w3Part1.getPartitionKey().getEndRow());
        assertEquals(50000, w3Part1.getPartitionKey().getStartCol());
        assertEquals(100000, w3Part1.getPartitionKey().getEndCol());
        PartitionMeta w4Part0 = matrixPartManager.getPartMeta(w4Id, 0);
        PartitionMeta w4Part1 = matrixPartManager.getPartMeta(w4Id, 1);
        assertNotNull(w4Part0);
        assertNotNull(w4Part1);
        assertEquals(0, w4Part0.getPartitionKey().getStartRow());
        assertEquals(1, w4Part0.getPartitionKey().getEndRow());
        assertEquals(0, w4Part0.getPartitionKey().getStartCol());
        assertEquals(50000, w4Part0.getPartitionKey().getEndCol());
        assertEquals(0, w4Part1.getPartitionKey().getStartRow());
        assertEquals(1, w4Part1.getPartitionKey().getEndRow());
        assertEquals(50000, w4Part1.getPartitionKey().getStartCol());
        assertEquals(100000, w4Part1.getPartitionKey().getEndCol());

        // Push five rounds of all-ones updates to row 0 of "w4" from each of the two task clients.
        MatrixClient w4ClientForTask0 = worker.getPSAgent().getMatrixClient("w4", 0);
        MatrixClient w4ClientForTask1 = worker.getPSAgent().getMatrixClient("w4", 1);
        TaskContext task0Context = w4ClientForTask0.getTaskContext();
        TaskContext task1Context = w4ClientForTask1.getTaskContext();
        double[] delta = new double[100000];
        for (int i = 0; i < delta.length; i++) {
            delta[i] = 1.0;
        }
        for (int iterIndex = 0; iterIndex < 5; iterIndex++) {
            DenseDoubleVector row1 = (DenseDoubleVector) w4ClientForTask0.getRow(0);
            double sum1 = sum(row1.getValues());
            LOG.info("taskid=" + task0Context.getIndex() + ", matrixId=" + w4ClientForTask0.getMatrixId() + ", rowIndex=0, local row sum=" + sum1);
            DenseDoubleVector deltaRow1 = new DenseDoubleVector(delta.length, delta);
            deltaRow1.setMatrixId(w4ClientForTask0.getMatrixId());
            deltaRow1.setRowId(0);
            w4ClientForTask0.increment(deltaRow1);
            w4ClientForTask0.clock().get();
            task0Context.increaseEpoch();
            DenseDoubleVector row2 = (DenseDoubleVector) w4ClientForTask1.getRow(0);
            double sum2 = sum(row2.getValues());
            // Task 1 also reads row 0, so the log reports rowIndex=0 (it previously said 1).
            LOG.info("taskid=" + task1Context.getIndex() + ", matrixId=" + w4ClientForTask1.getMatrixId() + ", rowIndex=0, local row sum=" + sum2);
            DenseDoubleVector deltaRow2 = new DenseDoubleVector(delta.length, delta);
            deltaRow2.setMatrixId(w4ClientForTask1.getMatrixId());
            deltaRow2.setRowId(0);
            w4ClientForTask1.increment(deltaRow2);
            w4ClientForTask1.clock().get();
            task1Context.increaseEpoch();
        }

        // Iterations and matrix clocks recorded on the master for both tasks.
        AMTaskManager amTaskManager = angelAppMaster.getAppContext().getTaskManager();
        AMTask amTask0 = amTaskManager.getTask(task0Id);
        AMTask amTask1 = amTaskManager.getTask(task1Id);
        assertEquals(5, amTask0.getIteration());
        assertEquals(5, amTask1.getIteration());
        Int2IntOpenHashMap task0MatrixClocks = amTask0.getMatrixClocks();
        assertEquals(1, task0MatrixClocks.size());
        assertEquals(5, task0MatrixClocks.get(w4Id));
        Int2IntOpenHashMap task1MatrixClocks = amTask1.getMatrixClocks();
        assertEquals(1, task1MatrixClocks.size());
        assertEquals(5, task1MatrixClocks.get(w4Id));

        // 2 tasks x 5 rounds x 100000 entries of 1.0 = 1000000.
        DenseDoubleVector row1 = (DenseDoubleVector) w4ClientForTask0.getRow(0);
        double sum1 = sum(row1.getValues());
        assertEquals(1000000.0, sum1, 0.000001);
        DenseDoubleVector row2 = (DenseDoubleVector) w4ClientForTask1.getRow(0);
        double sum2 = sum(row2.getValues());
        assertEquals(1000000.0, sum2, 0.000001);

        // Release "w3" and wait before checking that master and PS no longer know it.
        masterClient.releaseMatrix(w3Meta.getName());
        Thread.sleep(10000);
        matrixw3Proto = matrixMetaManager.getMatrix("w3");
        assertTrue(matrixw3Proto == null);
        MatrixStorageManager matrixStorageManager = LocalClusterContext.get().getPS(psAttempt0Id).getPS().getMatrixStorageManager();
        ServerMatrix sw3 = matrixStorageManager.getMatrix(w3Id);
        assertTrue(sw3 == null);

        // "w4" must still return the same data after "w3" is gone.
        w4ClientForTask0.clock().get();
        w4ClientForTask1.clock().get();
        row1 = (DenseDoubleVector) w4ClientForTask0.getRow(0);
        sum1 = sum(row1.getValues());
        assertEquals(1000000.0, sum1, 0.000001);
        row2 = (DenseDoubleVector) w4ClientForTask1.getRow(0);
        sum2 = sum(row2.getValues());
        assertEquals(1000000.0, sum2, 0.000001);
    } catch (Exception x) {
        LOG.error("run testCreateMatrix failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.psagent.task.TaskContext) DenseDoubleVector(com.tencent.angel.ml.math.vector.DenseDoubleVector) MasterClient(com.tencent.angel.psagent.client.MasterClient) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap) AngelException(com.tencent.angel.exception.AngelException) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) PSMatrixMetaManager(com.tencent.angel.ps.impl.PSMatrixMetaManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) AMMatrixMetaManager(com.tencent.angel.master.matrixmeta.AMMatrixMetaManager) MatrixStorageManager(com.tencent.angel.ps.impl.MatrixStorageManager) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) AMTask(com.tencent.angel.master.task.AMTask) Test(org.junit.Test)

Example 2 with ParameterServer

use of com.tencent.angel.ps.impl.ParameterServer in project angel by Tencent.

The method testPSFailedReport of the class PSFailedReportTest.

/**
 * Pushes updates to matrix "w1" in three phases: with both parameter servers running, after
 * the second server has been reported as failed to the master, and after attempt 1 of the
 * second server is looked up. After every update it checks the row-0 partition sums stored on
 * the servers.
 *
 * @throws Exception if any cluster call fails or the test thread is interrupted while sleeping
 */
@Test
public void testPSFailedReport() throws Exception {
    ParameterServerId ps1Id = new ParameterServerId(0);
    final ParameterServerId ps2Id = new ParameterServerId(1);
    PSAttemptId ps1Attempt0Id = new PSAttemptId(ps1Id, 0);
    PSAttemptId ps2Attempt0Id = new PSAttemptId(ps2Id, 0);
    PSAttemptId ps2Attempt1Id = new PSAttemptId(ps2Id, 1);
    ParameterServer ps1Attempt0 = LocalClusterContext.get().getPS(ps1Attempt0Id).getPS();
    ParameterServer ps2Attempt0 = LocalClusterContext.get().getPS(ps2Attempt0Id).getPS();
    WorkerId worker0Id = new WorkerId(new WorkerGroupId(0), 0);
    WorkerAttemptId worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
    Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    TaskContext task0Context = worker0.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
    MatrixClient matrixClient = task0Context.getMatrix("w1");
    int iterNum = 20;

    // Phase 1: both servers are running; check both after every update.
    for (int i = 0; i < iterNum; i++) {
        pushUnitUpdateAndClock(matrixClient);
        long expectedPartSum = (i + 1) * dim / 2;
        assertRow0PartSums(ps1Attempt0, matrixClient.getMatrixId(), expectedPartSum);
        assertRow0PartSums(ps2Attempt0, matrixClient.getMatrixId(), expectedPartSum);
    }

    // Report the second server as failed to the master, then wait 20 seconds.
    LOG.info("===================================================================ps2 failed");
    HashMap<PSLocation, Integer> failedCounters = new HashMap<>();
    PSLocation psLoc = new PSLocation(ps2Id, ps2Attempt0.getLocationManager().getPsLocation(ps2Id));
    failedCounters.put(psLoc, 10000);
    worker0.getPSAgent().getMasterClient().psFailedReport(failedCounters);
    Thread.sleep(20000);

    // Phase 2: only the first server is checked.
    for (int i = iterNum; i < 2 * iterNum; i++) {
        pushUnitUpdateAndClock(matrixClient);
        assertRow0PartSums(ps1Attempt0, matrixClient.getMatrixId(), (i + 1) * dim / 2);
    }

    // Phase 3: check the first server and attempt 1 of the second server.
    ParameterServer ps2Attempt = LocalClusterContext.get().getPS(ps2Attempt1Id).getPS();
    for (int i = iterNum * 2; i < 3 * iterNum; i++) {
        pushUnitUpdateAndClock(matrixClient);
        long expectedPartSum = (i + 1) * dim / 2;
        assertRow0PartSums(ps1Attempt0, matrixClient.getMatrixId(), expectedPartSum);
        assertRow0PartSums(ps2Attempt, matrixClient.getMatrixId(), expectedPartSum);
    }
}

/** Adds 1 to each of the {@code dim} entries of row 0, waits for the clock, then sleeps 1s. */
private void pushUnitUpdateAndClock(MatrixClient matrixClient) throws Exception {
    DenseIntVector update = new DenseIntVector(dim);
    for (int j = 0; j < dim; j++) {
        update.set(j, 1);
    }
    update.setMatrixId(matrixClient.getMatrixId());
    update.setRowId(0);
    matrixClient.increment(update);
    matrixClient.clock().get();
    Thread.sleep(1000);
}

/** Asserts that partitions 0 and 1 exist on {@code ps} and that each row-0 part sums to the expected value. */
private void assertRow0PartSums(ParameterServer ps, int matrixId, long expectedPartSum) throws Exception {
    MatrixStorageManager storage = ps.getMatrixStorageManager();
    ServerMatrix serverMatrix = storage.getMatrix(matrixId);
    assertNotNull(serverMatrix.getPartition(0));
    assertNotNull(serverMatrix.getPartition(1));
    IntBuffer row0Part0 = ((ServerDenseIntRow) serverMatrix.getRow(0, 0)).getData();
    int part0Size = serverMatrix.getRow(0, 0).size();
    IntBuffer row0Part1 = ((ServerDenseIntRow) serverMatrix.getRow(1, 0)).getData();
    int part1Size = serverMatrix.getRow(1, 0).size();
    assertEquals(expectedPartSum, sum(row0Part0, part0Size));
    assertEquals(expectedPartSum, sum(row0Part1, part1Size));
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) HashMap(java.util.HashMap) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) WorkerId(com.tencent.angel.worker.WorkerId) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) DenseIntVector(com.tencent.angel.ml.math.vector.DenseIntVector) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) MatrixStorageManager(com.tencent.angel.ps.impl.MatrixStorageManager) IntBuffer(java.nio.IntBuffer) ServerDenseIntRow(com.tencent.angel.ps.impl.matrix.ServerDenseIntRow) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Test(org.junit.Test)

Example 3 with ParameterServer

use of com.tencent.angel.ps.impl.ParameterServer in project angel by Tencent.

The method testPSDone of the class PSManagerTest.

/**
 * Reports worker completion and then PS completion to the master over RPC, and checks that the
 * master records PS attempt 0 and its parameter server as SUCCESS.
 *
 * @throws Exception if an RPC or lookup fails; the failure is logged and rethrown
 */
@SuppressWarnings("unchecked")
@Test
public void testPSDone() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Location masterLoc = ps.getMasterLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        // Report the worker attempt as done and expect the master to acknowledge it.
        WorkerDoneRequest workerRequest = WorkerDoneRequest.newBuilder().setWorkerAttemptId(ProtobufUtil.convertToIdProto(worker0Attempt0Id)).build();
        WorkerDoneResponse workerResponse = master.workerDone(null, workerRequest);
        assertEquals(WorkerCommandProto.W_SUCCESS, workerResponse.getCommand());
        Thread.sleep(5000);

        // Send a COMMIT event to the application, then report the PS attempt as done.
        angelAppMaster.getAppContext().getEventHandler().handle(new AppEvent(AppEventType.COMMIT));
        PSDoneRequest request = PSDoneRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).build();
        master.psDone(null, request);
        Thread.sleep(5000);

        // State the master holds for the PS and its only attempt.
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
        assertEquals(PSAttemptStateInternal.SUCCESS, psAttempt.getInternalState());
        assertEquals(AMParameterServerState.SUCCESS, amPs.getState());
        assertEquals(1, amPs.getNextAttemptNumber());
        assertNull(amPs.getRunningAttemptId());
        assertEquals(psAttempt0Id, amPs.getSuccessAttemptId());
        assertEquals(1, amPs.getPSAttempts().size());
    } catch (Exception x) {
        LOG.error("run testPSDone failed ", x);
        throw x;
    }
}
Also used : WorkerDoneRequest(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneRequest) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) AngelException(com.tencent.angel.exception.AngelException) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) AppEvent(com.tencent.angel.master.app.AppEvent) TConnection(com.tencent.angel.ipc.TConnection) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) WorkerDoneResponse(com.tencent.angel.protobuf.generated.WorkerMasterServiceProtos.WorkerDoneResponse) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Example 4 with ParameterServer

use of com.tencent.angel.ps.impl.ParameterServer in project angel by Tencent.

The method testPSReport of the class PSManagerTest.

/**
 * Sends a PS report carrying two metrics and the server's matrix list to the master, checks the
 * response and the metrics stored on the master, then repeats the report under attempt id 1 and
 * expects a shutdown command.
 *
 * @throws Exception if an RPC or lookup fails; the failure is logged and rethrown
 */
@Test
public void testPSReport() throws Exception {
    try {
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Location masterLoc = ps.getMasterLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        // Build the report: attempt id, two metric pairs, empty failure reports.
        PSReportRequest.Builder builder = PSReportRequest.newBuilder();
        builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id));
        Pair.Builder pairBuilder = Pair.newBuilder();
        pairBuilder.setKey("ps_key1");
        pairBuilder.setValue("100");
        builder.addMetrics(pairBuilder.build());
        pairBuilder.setKey("ps_key2");
        pairBuilder.setValue("200");
        builder.addMetrics(pairBuilder.build());
        builder.setPsFailedReports(MLProtos.PSFailedReportsProto.getDefaultInstance());

        // Add one matrix report per matrix currently held by the server.
        MatrixReportProto.Builder matrixBuilder = MatrixReportProto.newBuilder();
        ConcurrentHashMap<Integer, ServerMatrix> matrixIdMap = ps.getMatrixStorageManager().getMatrices();
        for (Entry<Integer, ServerMatrix> matrixEntry : matrixIdMap.entrySet()) {
            builder.addMatrixReports((matrixBuilder.setMatrixId(matrixEntry.getKey()).setMatrixName(matrixEntry.getValue().getName())));
        }

        PSReportResponse response = master.psReport(null, builder.build());
        assertEquals(PSCommandProto.PSCOMMAND_OK, response.getPsCommand());
        assertEquals(0, response.getNeedCreateMatricesCount());
        assertEquals(0, response.getNeedReleaseMatrixIdsCount());

        // The reported metrics must be stored on the master for attempt 0.
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt = amPs.getPSAttempt(psAttempt0Id);
        Map<String, String> metrices = psAttempt.getMetrices();
        // assertEquals fails with both values shown and does not NPE when a key is missing.
        assertEquals("100", metrices.get("ps_key1"));
        assertEquals("200", metrices.get("ps_key2"));

        // The same report sent under attempt id 1 is expected to be answered with a shutdown command.
        PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
        builder.setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id));
        response = master.psReport(null, builder.build());
        assertEquals(PSCommandProto.PSCOMMAND_SHUTDOWN, response.getPsCommand());
    } catch (Exception x) {
        LOG.error("run testPSReport failed ", x);
        throw x;
    }
}
Also used : AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) AngelException(com.tencent.angel.exception.AngelException) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) TConnection(com.tencent.angel.ipc.TConnection) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) Location(com.tencent.angel.common.location.Location) Pair(com.tencent.angel.protobuf.generated.MLProtos.Pair) Test(org.junit.Test)

Example 5 with ParameterServer

use of com.tencent.angel.ps.impl.ParameterServer in project angel by Tencent.

The method testPSError of the class PSManagerTest.

/**
 * Stops the parameter server and reports an "out of memory" error to the master four times in
 * a row (the maximum attempt count is asserted to be 4). After each of the first three failures
 * it checks that a new attempt is RUNNING and that {@code checkMatrixInfo} (defined elsewhere)
 * accepts the expected matrix clocks. After the fourth failure it checks that the parameter
 * server and the application are FAILED.
 *
 * @throws Exception if an RPC or lookup fails; the failure is logged and rethrown
 */
@Test
public void testPSError() throws Exception {
    try {
        int heartbeatInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_HEARTBEAT_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_HEARTBEAT_INTERVAL_MS);
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        AMParameterServer amPs = psManager.getParameterServer(psId);
        PSAttempt psAttempt0 = amPs.getPSAttempt(psAttempt0Id);
        ParameterServer ps = LocalClusterContext.get().getPS(psAttempt0Id).getPS();
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        int w1Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w1").getId();
        int w2Id = angelAppMaster.getAppContext().getMatrixMetaManager().getMatrix("w2").getId();
        Location masterLoc = LocalClusterContext.get().getMaster().getAppMaster().getAppContext().getMasterService().getLocation();
        TConnection connection = TConnectionManager.getConnection(ps.getConf());
        MasterProtocol master = connection.getMasterService(masterLoc.getIp(), masterLoc.getPort());

        // Per-task iterations and clocks reported to the master below.
        int task0Iteration = 2;
        int task1Iteration = 1;
        int task0w1Clock = 10;
        int task0w2Clock = 20;
        int task1w1Clock = 9;
        int task1w2Clock = 19;
        // The clock used for each matrix is the smaller of the two task clocks.
        int w1Clock = Math.min(task0w1Clock, task1w1Clock);
        int w2Clock = Math.min(task0w2Clock, task1w2Clock);
        TaskContext task0Context = worker.getTaskManager().getRunningTask().get(task0Id).getTaskContext().getContext();
        TaskContext task1Context = worker.getTaskManager().getRunningTask().get(task1Id).getTaskContext().getContext();
        task0Context.setMatrixClock(w1Id, w1Clock);
        task1Context.setMatrixClock(w1Id, w1Clock);
        task0Context.setMatrixClock(w2Id, w2Clock);
        task1Context.setMatrixClock(w2Id, w2Clock);
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task0Iteration).setTaskId(ProtobufUtil.convertToIdProto(task0Id)).build());
        master.taskIteration(null, TaskIterationRequest.newBuilder().setIteration(task1Iteration).setTaskId(ProtobufUtil.convertToIdProto(task1Id)).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task0w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task0Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task0w2Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w1Id).setClock(task1w1Clock).build()).build());
        master.taskClock(null, TaskClockRequest.newBuilder().setTaskId(ProtobufUtil.convertToIdProto(task1Id)).setMatrixClock(MatrixClock.newBuilder().setMatrixId(w2Id).setClock(task1w2Clock).build()).build());
        assertEquals(4, amPs.getMaxAttempts());
        PSAttemptId psAttempt1Id = new PSAttemptId(psId, 1);
        PSAttemptId psAttempt2Id = new PSAttemptId(psId, 2);
        PSAttemptId psAttempt3Id = new PSAttemptId(psId, 3);

        // Fail attempt 0.
        ps.stop(-1);
        PSErrorRequest request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt0Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt1 = amPs.getPSAttempt(psAttempt1Id);
        assertTrue(psAttempt1 != null);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt0.getInternalState());
        assertEquals(PSAttemptStateInternal.RUNNING, psAttempt1.getInternalState());
        assertEquals(AMParameterServerState.RUNNING, amPs.getState());
        assertEquals(2, amPs.getNextAttemptNumber());
        assertEquals(psAttempt1Id, amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(2, amPs.getPSAttempts().size());
        List<String> diagnostics = amPs.getDiagnostics();
        assertEquals(1, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock, w2Clock);

        // Add 2 to every entry of row 0 of "w1" from both task clients, then clock both.
        MatrixClient w1Task0Client = worker.getPSAgent().getMatrixClient("w1", 0);
        MatrixClient w1Task1Client = worker.getPSAgent().getMatrixClient("w1", 1);
        int matrixW1Id = w1Task0Client.getMatrixId();
        int[] delta = new int[100000];
        for (int i = 0; i < 100000; i++) {
            delta[i] = 2;
        }
        DenseIntVector deltaVec = new DenseIntVector(100000, delta);
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task0Client.increment(deltaVec);
        deltaVec = new DenseIntVector(100000, delta);
        deltaVec.setMatrixId(matrixW1Id);
        deltaVec.setRowId(0);
        w1Task1Client.increment(deltaVec);
        w1Task0Client.clock().get();
        w1Task1Client.clock().get();
        ps = LocalClusterContext.get().getPS(psAttempt1Id).getPS();
        // Wait two backup intervals before failing the attempt (presumably so a snapshot is written).
        int snapshotInterval = LocalClusterContext.get().getConf().getInt(AngelConf.ANGEL_PS_BACKUP_INTERVAL_MS, AngelConf.DEFAULT_ANGEL_PS_BACKUP_INTERVAL_MS);
        Thread.sleep(snapshotInterval * 2);

        // Fail attempt 1.
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt1Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt2 = amPs.getPSAttempt(psAttempt2Id);
        assertTrue(psAttempt2 != null);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt1.getInternalState());
        assertEquals(PSAttemptStateInternal.RUNNING, psAttempt2.getInternalState());
        assertEquals(AMParameterServerState.RUNNING, amPs.getState());
        assertEquals(3, amPs.getNextAttemptNumber());
        assertEquals(psAttempt2Id, amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(3, amPs.getPSAttempts().size());
        diagnostics = amPs.getDiagnostics();
        assertEquals(2, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        assertEquals(psAttempt1Id + " failed due to: out of memory", diagnostics.get(1));
        ps = LocalClusterContext.get().getPS(psAttempt2Id).getPS();
        // "w1" was clocked once by the task clients above, hence w1Clock + 1.
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);
        // 2 tasks x 100000 entries x 2 = 400000.
        assertEquals(400000, sum((DenseIntVector) w1Task0Client.getRow(0)));

        // Fail attempt 2.
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt2Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        PSAttempt psAttempt3 = amPs.getPSAttempt(psAttempt3Id);
        assertTrue(psAttempt3 != null);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt2.getInternalState());
        assertEquals(PSAttemptStateInternal.RUNNING, psAttempt3.getInternalState());
        assertEquals(AMParameterServerState.RUNNING, amPs.getState());
        assertEquals(4, amPs.getNextAttemptNumber());
        assertEquals(psAttempt3Id, amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(4, amPs.getPSAttempts().size());
        diagnostics = amPs.getDiagnostics();
        assertEquals(3, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        assertEquals(psAttempt1Id + " failed due to: out of memory", diagnostics.get(1));
        assertEquals(psAttempt2Id + " failed due to: out of memory", diagnostics.get(2));
        ps = LocalClusterContext.get().getPS(psAttempt3Id).getPS();
        checkMatrixInfo(ps, w1Id, w2Id, w1Clock + 1, w2Clock);

        // Fail attempt 3: the last allowed attempt, so the PS and the application must fail.
        ps.stop(-1);
        request = PSErrorRequest.newBuilder().setPsAttemptId(ProtobufUtil.convertToIdProto(psAttempt3Id)).setMsg("out of memory").build();
        master.psError(null, request);
        Thread.sleep(heartbeatInterval * 2);
        assertEquals(PSAttemptStateInternal.FAILED, psAttempt3.getInternalState());
        assertEquals(AMParameterServerState.FAILED, amPs.getState());
        assertEquals(AppState.FAILED, angelAppMaster.getAppContext().getApp().getExternAppState());
        assertEquals(4, amPs.getNextAttemptNumber());
        assertNull(amPs.getRunningAttemptId());
        assertNull(amPs.getSuccessAttemptId());
        assertEquals(4, amPs.getPSAttempts().size());
        diagnostics = amPs.getDiagnostics();
        assertEquals(4, diagnostics.size());
        assertEquals(psAttempt0Id + " failed due to: out of memory", diagnostics.get(0));
        assertEquals(psAttempt1Id + " failed due to: out of memory", diagnostics.get(1));
        assertEquals(psAttempt2Id + " failed due to: out of memory", diagnostics.get(2));
        assertEquals(psAttempt3Id + " failed due to: out of memory", diagnostics.get(3));
    } catch (Exception x) {
        LOG.error("run testPSError failed ", x);
        throw x;
    }
}
Also used : TaskContext(com.tencent.angel.psagent.task.TaskContext) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) AngelException(com.tencent.angel.exception.AngelException) AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) DenseIntVector(com.tencent.angel.ml.math.vector.DenseIntVector) TConnection(com.tencent.angel.ipc.TConnection) PSAttemptId(com.tencent.angel.ps.PSAttemptId) ClockVectorManager(com.tencent.angel.ps.impl.ClockVectorManager) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Aggregations

ParameterServer (com.tencent.angel.ps.impl.ParameterServer)7 Test (org.junit.Test)7 Worker (com.tencent.angel.worker.Worker)5 Location (com.tencent.angel.common.location.Location)4 AngelException (com.tencent.angel.exception.AngelException)4 TConnection (com.tencent.angel.ipc.TConnection)4 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)4 PSAttemptId (com.tencent.angel.ps.PSAttemptId)4 ServerMatrix (com.tencent.angel.ps.impl.matrix.ServerMatrix)4 MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient)4 PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt)3 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)3 DenseIntVector (com.tencent.angel.ml.math.vector.DenseIntVector)3 MatrixStorageManager (com.tencent.angel.ps.impl.MatrixStorageManager)3 TaskContext (com.tencent.angel.worker.task.TaskContext)3 ParameterServerId (com.tencent.angel.ps.ParameterServerId)2 ServerDenseIntRow (com.tencent.angel.ps.impl.matrix.ServerDenseIntRow)2 TaskContext (com.tencent.angel.psagent.task.TaskContext)2 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)2 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)2