Search in sources:

Example 21 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class ParameterServerThreadStackBlock method render.

/**
 * Renders the thread stack reported by a parameter server attempt.
 *
 * <p>The attempt is identified by the {@code PSATTEMPT_ID} page parameter. The page title is
 * set first; the id string is then parsed and a {@link PSClient} is used to fetch the thread
 * stack, which is emitted through {@code html.pre()}. If the id cannot be parsed, or fetching
 * the stack fails, the error is logged and no stack output is produced.
 *
 * @param html block the thread stack is written to
 */
@Override
protected void render(Block html) {
    final String rawAttemptId = $(PSATTEMPT_ID);
    set(TITLE, join("Angel ParameterServerThread ", rawAttemptId));

    // Parse the attempt id; without a valid id there is nothing to query.
    final PSAttemptId psAttemptId;
    try {
        psAttemptId = new PSAttemptId(rawAttemptId);
    } catch (UnvalidIdStrException e) {
        LOG.error("unvalid id string, ", e);
        return;
    }

    try {
        LOG.info("start init PSClient");
        PSClient client = new PSClient(amContext, psAttemptId);
        // Fetch the stack before opening the <pre> element so a failed call leaves no output.
        String threadStack = client.getThreadStack();
        html.pre()._(threadStack)._();
    } catch (IOException | ServiceException e) {
        LOG.error("get thread stack from ps " + psAttemptId + " failed. ", e);
    }
}
Also used : PSAttemptId(com.tencent.angel.ps.PSAttemptId) ServiceException(com.google.protobuf.ServiceException) PSClient(com.tencent.angel.master.client.PSClient) IOException(java.io.IOException) UnvalidIdStrException(com.tencent.angel.exception.UnvalidIdStrException)

Example 22 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class MasterRecoverTest method setup.

/**
 * Starts a LOCAL-mode Angel application for the test.
 *
 * <p>Builds the job configuration (dummy task, dummy data splitter, one worker group, one
 * parameter server, two tasks per worker), registers the matrices {@code w1} and {@code w2},
 * starts the parameter servers and the job, sleeps for five seconds, and finally creates the
 * worker/task/PS ids the tests refer to.
 *
 * @throws Exception any failure during setup; it is logged and rethrown unchanged
 */
@Before
public void setup() throws Exception {
    try {
        // Job-level settings.
        Configuration jobConf = new Configuration();
        jobConf.setBoolean("mapred.mapper.new-api", true);
        jobConf.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
        jobConf.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());

        // Local deployment with the dummy data splitter.
        jobConf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
        jobConf.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
        jobConf.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
        jobConf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
        jobConf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
        jobConf.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
        jobConf.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
        jobConf.setInt(AngelConf.ANGEL_PS_NUMBER, 1);
        jobConf.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 2);

        angelClient = AngelClientFactory.get(jobConf);

        // Matrix "w1": one dense int row of 100000 columns, blocks of 50000 columns.
        MatrixContext intMatrix = new MatrixContext();
        intMatrix.setName("w1");
        intMatrix.setRowNum(1);
        intMatrix.setColNum(100000);
        intMatrix.setMaxRowNumInBlock(1);
        intMatrix.setMaxColNumInBlock(50000);
        intMatrix.setRowType(RowType.T_INT_DENSE);
        intMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        intMatrix.set(MatrixConf.MATRIX_HOGWILD, "true");
        intMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
        intMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_INT");
        angelClient.addMatrix(intMatrix);

        // Matrix "w2": same layout as "w1" but dense double rows and hogwild disabled.
        MatrixContext doubleMatrix = new MatrixContext();
        doubleMatrix.setName("w2");
        doubleMatrix.setRowNum(1);
        doubleMatrix.setColNum(100000);
        doubleMatrix.setMaxRowNumInBlock(1);
        doubleMatrix.setMaxColNumInBlock(50000);
        doubleMatrix.setRowType(RowType.T_DOUBLE_DENSE);
        doubleMatrix.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        doubleMatrix.set(MatrixConf.MATRIX_HOGWILD, "false");
        doubleMatrix.set(MatrixConf.MATRIX_AVERAGE, "false");
        doubleMatrix.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_DOUBLE");
        angelClient.addMatrix(doubleMatrix);

        // Launch the parameter servers and the job, then pause before the tests run.
        angelClient.startPSServer();
        angelClient.run();
        Thread.sleep(5000);

        // Ids of the components the tests look up.
        group0Id = new WorkerGroupId(0);
        worker0Id = new WorkerId(group0Id, 0);
        worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
        task0Id = new TaskId(0);
        task1Id = new TaskId(1);
        psId = new ParameterServerId(0);
        psAttempt0Id = new PSAttemptId(psId, 0);
    } catch (Exception e) {
        LOG.error("setup failed ", e);
        throw e;
    }
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) MatrixContext(com.tencent.angel.ml.matrix.MatrixContext) TaskId(com.tencent.angel.worker.task.TaskId) Configuration(org.apache.hadoop.conf.Configuration) PSAttemptId(com.tencent.angel.ps.PSAttemptId) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) WorkerId(com.tencent.angel.worker.WorkerId) ParameterServerId(com.tencent.angel.ps.ParameterServerId) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) Before(org.junit.Before)

Example 23 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class MatrixMetaManagerTest method setup.

/**
 * Starts one LOCAL-mode Angel application shared by all tests of the class.
 *
 * <p>Builds the job configuration (dummy task, dummy data splitter, one worker group, two
 * tasks per worker, PS-agent cache sync interval of 10000 ms), registers the matrices
 * {@code w1} and {@code w2}, starts the parameter servers and the job, sleeps for five
 * seconds, and finally creates the worker/task/PS ids the tests refer to.
 *
 * @throws Exception any failure during setup; it is logged and rethrown unchanged
 */
@BeforeClass
public static void setup() throws Exception {
    try {
        // Job-level settings.
        Configuration settings = new Configuration();
        settings.setBoolean("mapred.mapper.new-api", true);
        settings.setBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, true);
        settings.set(AngelConf.ANGEL_TASK_USER_TASKCLASS, DummyTask.class.getName());

        // Local deployment with the dummy data splitter.
        settings.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL");
        settings.setBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, true);
        settings.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, CombineTextInputFormat.class.getName());
        settings.set(AngelConf.ANGEL_SAVE_MODEL_PATH, LOCAL_FS + TMP_PATH + "/out");
        settings.set(AngelConf.ANGEL_TRAIN_DATA_PATH, LOCAL_FS + TMP_PATH + "/in");
        settings.set(AngelConf.ANGEL_LOG_PATH, LOCAL_FS + TMP_PATH + "/log");
        settings.setInt(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1);
        settings.setInt(AngelConf.ANGEL_WORKER_TASK_NUMBER, 2);
        settings.setInt(AngelConf.ANGEL_PSAGENT_CACHE_SYNC_TIMEINTERVAL_MS, 10000);

        angelClient = AngelClientFactory.get(settings);

        // Matrix "w1": one dense int row of 100000 columns, blocks of 50000 columns.
        MatrixContext w1Context = new MatrixContext();
        w1Context.setName("w1");
        w1Context.setRowNum(1);
        w1Context.setColNum(100000);
        w1Context.setMaxRowNumInBlock(1);
        w1Context.setMaxColNumInBlock(50000);
        w1Context.setRowType(RowType.T_INT_DENSE);
        w1Context.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        w1Context.set(MatrixConf.MATRIX_HOGWILD, "true");
        w1Context.set(MatrixConf.MATRIX_AVERAGE, "false");
        w1Context.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_INT");
        angelClient.addMatrix(w1Context);

        // Matrix "w2": same layout as "w1" but dense double rows and hogwild disabled.
        MatrixContext w2Context = new MatrixContext();
        w2Context.setName("w2");
        w2Context.setRowNum(1);
        w2Context.setColNum(100000);
        w2Context.setMaxRowNumInBlock(1);
        w2Context.setMaxColNumInBlock(50000);
        w2Context.setRowType(RowType.T_DOUBLE_DENSE);
        w2Context.set(MatrixConf.MATRIX_OPLOG_ENABLEFILTER, "false");
        w2Context.set(MatrixConf.MATRIX_HOGWILD, "false");
        w2Context.set(MatrixConf.MATRIX_AVERAGE, "false");
        w2Context.set(MatrixConf.MATRIX_OPLOG_TYPE, "DENSE_DOUBLE");
        angelClient.addMatrix(w2Context);

        // Launch the parameter servers and the job, then pause before the tests run.
        angelClient.startPSServer();
        angelClient.run();
        Thread.sleep(5000);

        // Ids of the components the tests look up.
        group0Id = new WorkerGroupId(0);
        worker0Id = new WorkerId(group0Id, 0);
        worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
        task0Id = new TaskId(0);
        task1Id = new TaskId(1);
        psId = new ParameterServerId(0);
        psAttempt0Id = new PSAttemptId(psId, 0);
    } catch (Exception failure) {
        LOG.error("setup failed ", failure);
        throw failure;
    }
}
Also used : CombineTextInputFormat(org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat) TaskId(com.tencent.angel.worker.task.TaskId) Configuration(org.apache.hadoop.conf.Configuration) PSAttemptId(com.tencent.angel.ps.PSAttemptId) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) WorkerId(com.tencent.angel.worker.WorkerId) ParameterServerId(com.tencent.angel.ps.ParameterServerId) AngelException(com.tencent.angel.exception.AngelException) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) BeforeClass(org.junit.BeforeClass)

Example 24 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class PSManagerTest method testPSManager.

/**
 * Checks the parameter server bookkeeping held by the application master.
 *
 * <p>Expects exactly one parameter server, registered under {@code psId} and in state
 * {@code RUNNING}, with exactly one attempt ({@code psAttempt0Id}) whose internal state is
 * {@code RUNNING}.
 *
 * @throws Exception any failure while inspecting the master; it is logged and rethrown
 */
@Test
public void testPSManager() throws Exception {
    try {
        LOG.info("===========================testPSManager===============================");
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue("application master is not available", angelAppMaster != null);

        ParameterServerManager psManager = angelAppMaster.getAppContext().getParameterServerManager();
        Map<ParameterServerId, AMParameterServer> psMap = psManager.getParameterServerMap();
        // JUnit's assertEquals takes (expected, actual); keeping that order makes failure
        // messages report the right value as "expected".
        assertEquals(1, psMap.size());

        AMParameterServer ps = psMap.get(psId);
        assertTrue("no parameter server registered for " + psId, ps != null);
        assertEquals(psId, ps.getId());
        assertEquals(AMParameterServerState.RUNNING, ps.getState());

        Map<PSAttemptId, PSAttempt> psAttempts = ps.getPSAttempts();
        assertEquals(1, psAttempts.size());
        PSAttempt psAttempt = psAttempts.get(psAttempt0Id);
        // Fail with a readable message instead of a NullPointerException when the attempt is missing.
        assertTrue("no attempt registered for " + psAttempt0Id, psAttempt != null);
        assertEquals(PSAttemptStateInternal.RUNNING, psAttempt.getInternalState());
    } catch (Exception x) {
        LOG.error("run testPSManager failed ", x);
        throw x;
    }
}
Also used : AMParameterServer(com.tencent.angel.master.ps.ps.AMParameterServer) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSAttempt(com.tencent.angel.master.ps.attempt.PSAttempt) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) ParameterServerId(com.tencent.angel.ps.ParameterServerId) AngelException(com.tencent.angel.exception.AngelException) Test(org.junit.Test)

Example 25 with PSAttemptId

use of com.tencent.angel.ps.PSAttemptId in project angel by Tencent.

the class PeriodHATest method testHA.

/**
 * Exercises parameter server fail-over while a task keeps updating matrix {@code w1}.
 *
 * <p>The test runs three phases of {@code iterNum} iterations each. Every iteration adds 1 to
 * each element of row 0 and advances the clock, then checks the stored partition sums:
 * <ol>
 *   <li>both ps1 (attempt 0) and ps2 (attempt 0) are checked;</li>
 *   <li>{@code ps2Attempt0.failed("exit")} is invoked and only ps1 is checked;</li>
 *   <li>ps1 and attempt 1 of ps2 are checked (presumably the replacement attempt started
 *       after the failure; confirm against the master's recovery logic).</li>
 * </ol>
 *
 * @throws Exception if an update, a clock operation or a lookup fails
 */
@Test
public void testHA() throws Exception {
    ParameterServerId ps1Id = new ParameterServerId(0);
    final ParameterServerId ps2Id = new ParameterServerId(1);
    PSAttemptId ps1Attempt0Id = new PSAttemptId(ps1Id, 0);
    PSAttemptId ps2Attempt0Id = new PSAttemptId(ps2Id, 0);
    PSAttemptId ps2Attempt1Id = new PSAttemptId(ps2Id, 1);
    ParameterServer ps1Attempt0 = LocalClusterContext.get().getPS(ps1Attempt0Id).getPS();
    ParameterServer ps2Attempt0 = LocalClusterContext.get().getPS(ps2Attempt0Id).getPS();
    WorkerId worker0Id = new WorkerId(new WorkerGroupId(0), 0);
    WorkerAttemptId worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
    Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    TaskContext task0Context = worker0.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
    MatrixClient matrixClient = task0Context.getMatrix("w1");
    int iterNum = 20;

    // Phase 1: both parameter servers are running and must hold identical sums.
    for (int i = 0; i < iterNum; i++) {
        incrementRowZeroAndClock(matrixClient);
        assertRowZeroPartitionSums(ps1Attempt0, matrixClient, i + 1);
        assertRowZeroPartitionSums(ps2Attempt0, matrixClient, i + 1);
    }

    LOG.info("===================================================================ps2 failed");
    ps2Attempt0.failed("exit");

    // Phase 2: after ps2 was told to fail, only ps1 is verified.
    for (int i = iterNum; i < 2 * iterNum; i++) {
        incrementRowZeroAndClock(matrixClient);
        assertRowZeroPartitionSums(ps1Attempt0, matrixClient, i + 1);
    }

    // Phase 3: ps1 and attempt 1 of ps2 are verified together.
    ParameterServer ps2Attempt = LocalClusterContext.get().getPS(ps2Attempt1Id).getPS();
    for (int i = iterNum * 2; i < 3 * iterNum; i++) {
        incrementRowZeroAndClock(matrixClient);
        assertRowZeroPartitionSums(ps1Attempt0, matrixClient, i + 1);
        assertRowZeroPartitionSums(ps2Attempt, matrixClient, i + 1);
    }
}

/** Adds 1 to every element of row 0, waits for the clock to complete, then sleeps one second. */
private void incrementRowZeroAndClock(MatrixClient matrixClient) throws Exception {
    DenseIntVector update = new DenseIntVector(dim);
    for (int j = 0; j < dim; j++) {
        update.set(j, 1);
    }
    update.setMatrixId(matrixClient.getMatrixId());
    update.setRowId(0);
    matrixClient.increment(update);
    matrixClient.clock().get();
    Thread.sleep(1000);
}

/**
 * Asserts that the given parameter server holds partitions 0 and 1 of the matrix and that the
 * data read via {@code getRow(0, 0)} and {@code getRow(1, 0)} each sum to
 * {@code completedIterations * dim / 2}.
 */
private void assertRowZeroPartitionSums(ParameterServer ps, MatrixClient matrixClient,
        int completedIterations) throws Exception {
    MatrixStorageManager storage = ps.getMatrixStorageManager();
    ServerMatrix matrix = storage.getMatrix(matrixClient.getMatrixId());
    assertNotNull(matrix.getPartition(0));
    assertNotNull(matrix.getPartition(1));
    IntBuffer row0Part0 = ((ServerDenseIntRow) matrix.getRow(0, 0)).getData();
    int part0Size = matrix.getRow(0, 0).size();
    IntBuffer row0Part1 = ((ServerDenseIntRow) matrix.getRow(1, 0)).getData();
    int part1Size = matrix.getRow(1, 0).size();
    // JUnit's assertEquals takes (expected, actual).
    assertEquals(completedIterations * dim / 2, sum(row0Part0, part0Size));
    assertEquals(completedIterations * dim / 2, sum(row0Part1, part1Size));
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) WorkerId(com.tencent.angel.worker.WorkerId) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) DenseIntVector(com.tencent.angel.ml.math.vector.DenseIntVector) PSAttemptId(com.tencent.angel.ps.PSAttemptId) MatrixStorageManager(com.tencent.angel.ps.impl.MatrixStorageManager) IntBuffer(java.nio.IntBuffer) ServerDenseIntRow(com.tencent.angel.ps.impl.matrix.ServerDenseIntRow) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Test(org.junit.Test)

Aggregations

PSAttemptId (com.tencent.angel.ps.PSAttemptId)27 ParameterServerId (com.tencent.angel.ps.ParameterServerId)15 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)15 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)11 WorkerId (com.tencent.angel.worker.WorkerId)11 Configuration (org.apache.hadoop.conf.Configuration)11 MatrixContext (com.tencent.angel.ml.matrix.MatrixContext)10 CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat)10 TaskId (com.tencent.angel.worker.task.TaskId)8 Before (org.junit.Before)6 Test (org.junit.Test)6 AngelException (com.tencent.angel.exception.AngelException)5 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)5 DummyTask (com.tencent.angel.master.DummyTask)4 PSAttempt (com.tencent.angel.master.ps.attempt.PSAttempt)4 ParameterServer (com.tencent.angel.ps.impl.ParameterServer)4 PSAgentAttemptId (com.tencent.angel.psagent.PSAgentAttemptId)4 Id (com.tencent.angel.common.Id)3 Location (com.tencent.angel.common.location.Location)3 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)3