Search in sources :

Example 61 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class MatrixTransportClient method getRowsSplit.

/**
 * Queues a request for a set of rows of one partition and returns a future for the result.
 *
 * <p>The target server is the master PS reported by the matrix meta manager for
 * {@code partKey}. The request is registered in {@code requestToResultMap} before it is
 * handed to the per-server get queue, and {@code startGet()} is called afterwards.
 *
 * @param partKey    partition the rows belong to
 * @param rowIndexes indexes of the rows to fetch
 * @param clock      clock value passed through to the request
 * @return the future registered for this request
 */
@Override
public Future<List<ServerRow>> getRowsSplit(PartitionKey partKey, List<Integer> rowIndexes, int clock) {
    final ParameterServerId masterPs = PSAgentContext.get().getMatrixMetaManager().getMasterPS(partKey);
    final GetRowsSplitRequest rowsRequest = new GetRowsSplitRequest(clock, partKey, rowIndexes);
    final FutureResult<List<ServerRow>> pendingRows = new FutureResult<List<ServerRow>>();

    // Register the future first so it is already in the map when the request is queued.
    requestToResultMap.put(rowsRequest, pendingRows);
    addToGetQueueForServer(masterPs, rowsRequest);
    startGet();

    return pendingRows;
}
Also used : ParameterServerId(com.tencent.angel.ps.ParameterServerId)

Example 62 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class MatrixTransportClient method getPart.

/**
 * Queues a request for a whole partition and returns a future for the result.
 *
 * <p>The target server is the master PS reported by the matrix meta manager for
 * {@code partKey}. The request is registered in {@code requestToResultMap} before it is
 * handed to the per-server get queue, and {@code startGet()} is called afterwards.
 *
 * @param partKey partition to fetch
 * @param clock   clock value passed through to the request
 * @return the future registered for this request
 */
@Override
public Future<ServerPartition> getPart(PartitionKey partKey, int clock) {
    final ParameterServerId masterPs = PSAgentContext.get().getMatrixMetaManager().getMasterPS(partKey);
    final GetPartitionRequest partRequest = new GetPartitionRequest(partKey, clock);
    final FutureResult<ServerPartition> pendingPart = new FutureResult<ServerPartition>();

    // Register the future first so it is already in the map when the request is queued.
    requestToResultMap.put(partRequest, pendingPart);
    addToGetQueueForServer(masterPs, partRequest);
    startGet();

    return pendingPart;
}
Also used : ParameterServerId(com.tencent.angel.ps.ParameterServerId) ServerPartition(com.tencent.angel.ps.impl.matrix.ServerPartition)

Example 63 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class MatrixTransportClient method putPart.

/**
 * Queues an update for one partition and returns a future for its completion.
 *
 * <p>The target server is the master PS reported by the matrix meta manager for
 * {@code partKey}. The request is registered in {@code requestToResultMap} before it is
 * handed to the per-server put queue, and {@code startPut()} is called afterwards.
 *
 * @param partKey     partition the update applies to
 * @param rowsSplit   row update splits carried by the request
 * @param taskIndex   task index passed through to the request
 * @param clock       clock value passed through to the request
 * @param updateClock flag passed through to the request
 * @return the future registered for this request
 */
@Override
public Future<VoidResult> putPart(PartitionKey partKey, List<RowUpdateSplit> rowsSplit, int taskIndex, int clock, boolean updateClock) {
    final ParameterServerId masterPs = PSAgentContext.get().getMatrixMetaManager().getMasterPS(partKey);
    final PutPartitionUpdateRequest updateRequest =
        new PutPartitionUpdateRequest(taskIndex, clock, partKey, rowsSplit, updateClock);
    final FutureResult<VoidResult> pendingAck = new FutureResult<VoidResult>();

    // Register the future first so it is already in the map when the request is queued.
    requestToResultMap.put(updateRequest, pendingAck);
    addToPutQueueForServer(masterPs, updateRequest);
    startPut();

    return pendingAck;
}
Also used : VoidResult(com.tencent.angel.ml.matrix.psf.update.enhance.VoidResult) ParameterServerId(com.tencent.angel.ps.ParameterServerId)

Example 64 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class PeriodHATest method testHA.

/**
 * Checks that row 0 of matrix "w1" stays consistent on the parameter servers while
 * ps2's attempt 0 is failed part-way through.
 *
 * <p>The test runs three phases of {@code iterNum} iterations each. Every iteration adds 1
 * to each element of row 0 and advances the clock, then verifies the stored sums:
 * <ol>
 *   <li>both ps1 attempt 0 and ps2 attempt 0 are verified;</li>
 *   <li>after {@code ps2Attempt0.failed("exit")}, only ps1 attempt 0 is verified;</li>
 *   <li>ps1 attempt 0 and ps2 attempt 1 (looked up from the local cluster context) are
 *       verified.</li>
 * </ol>
 */
@Test
public void testHA() throws Exception {
    ParameterServerId ps1Id = new ParameterServerId(0);
    final ParameterServerId ps2Id = new ParameterServerId(1);
    PSAttemptId ps1Attempt0Id = new PSAttemptId(ps1Id, 0);
    PSAttemptId ps2Attempt0Id = new PSAttemptId(ps2Id, 0);
    PSAttemptId ps2Attempt1Id = new PSAttemptId(ps2Id, 1);
    ParameterServer ps1Attempt0 = LocalClusterContext.get().getPS(ps1Attempt0Id).getPS();
    ParameterServer ps2Attempt0 = LocalClusterContext.get().getPS(ps2Attempt0Id).getPS();
    WorkerId worker0Id = new WorkerId(new WorkerGroupId(0), 0);
    WorkerAttemptId worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
    Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    TaskContext task0Context = worker0.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
    MatrixClient matrixClient = task0Context.getMatrix("w1");
    int iterNum = 20;

    // Phase 1: both servers are running their first attempt.
    for (int i = 0; i < iterNum; i++) {
        incrementRow0AndClock(matrixClient);
        assertRow0PartSums(ps1Attempt0, matrixClient, i + 1);
        assertRow0PartSums(ps2Attempt0, matrixClient, i + 1);
    }

    LOG.info("===================================================================ps2 failed");
    ps2Attempt0.failed("exit");

    // Phase 2: only ps1 is inspected after ps2's first attempt has been failed.
    for (int i = iterNum; i < 2 * iterNum; i++) {
        incrementRow0AndClock(matrixClient);
        assertRow0PartSums(ps1Attempt0, matrixClient, i + 1);
    }

    // Phase 3: ps2's second attempt must hold the same accumulated values as ps1.
    ParameterServer ps2Attempt = LocalClusterContext.get().getPS(ps2Attempt1Id).getPS();
    for (int i = iterNum * 2; i < 3 * iterNum; i++) {
        incrementRow0AndClock(matrixClient);
        assertRow0PartSums(ps1Attempt0, matrixClient, i + 1);
        assertRow0PartSums(ps2Attempt, matrixClient, i + 1);
    }
}

/**
 * Adds 1 to every element of row 0 through {@code matrixClient}, waits for the clock
 * operation to complete, then sleeps for one second.
 */
private void incrementRow0AndClock(MatrixClient matrixClient) throws Exception {
    DenseIntVector update = new DenseIntVector(dim);
    for (int j = 0; j < dim; j++) {
        update.set(j, 1);
    }
    update.setMatrixId(matrixClient.getMatrixId());
    update.setRowId(0);
    matrixClient.increment(update);
    matrixClient.clock().get();
    // NOTE(review): presumably gives the servers time to apply/replicate the update - confirm.
    Thread.sleep(1000);
}

/**
 * Asserts that {@code ps} holds partitions 0 and 1 of the client's matrix and that row 0 of
 * each partition sums to {@code completedIterations * dim / 2}.
 */
private void assertRow0PartSums(ParameterServer ps, MatrixClient matrixClient, int completedIterations) {
    MatrixStorageManager storage = ps.getMatrixStorageManager();
    ServerMatrix w1 = storage.getMatrix(matrixClient.getMatrixId());
    assertNotNull(w1.getPartition(0));
    assertNotNull(w1.getPartition(1));
    IntBuffer row0Part0 = ((ServerDenseIntRow) w1.getRow(0, 0)).getData();
    int part0Size = w1.getRow(0, 0).size();
    IntBuffer row0Part1 = ((ServerDenseIntRow) w1.getRow(1, 0)).getData();
    int part1Size = w1.getRow(1, 0).size();
    // JUnit's assertEquals takes (expected, actual); this order keeps failure messages accurate.
    assertEquals(completedIterations * dim / 2, sum(row0Part0, part0Size));
    assertEquals(completedIterations * dim / 2, sum(row0Part1, part1Size));
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) WorkerId(com.tencent.angel.worker.WorkerId) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) DenseIntVector(com.tencent.angel.ml.math.vector.DenseIntVector) PSAttemptId(com.tencent.angel.ps.PSAttemptId) MatrixStorageManager(com.tencent.angel.ps.impl.MatrixStorageManager) IntBuffer(java.nio.IntBuffer) ServerDenseIntRow(com.tencent.angel.ps.impl.matrix.ServerDenseIntRow) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Test(org.junit.Test)

Example 65 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class PeriodPusher method start.

/**
 * Starts the pusher: calls {@code super.start()} and launches the
 * "psha-push-dispatcher" thread.
 *
 * <p>Until {@code stopped} is set or the thread is interrupted, the dispatcher repeatedly:
 * <ol>
 *   <li>sleeps for {@code pushIntervalMs};</li>
 *   <li>takes the partitions returned by {@code getAndClearAllNeedRecoverParts()};</li>
 *   <li>for each partition with more than one PS location whose first location is this PS,
 *       calls {@code recover} for every other location;</li>
 *   <li>waits for the collected futures via {@code waitResults}.</li>
 * </ol>
 *
 * <p>An {@link InterruptedException} ends the loop with the thread's interrupt flag
 * restored. Any other exception is logged (unless already stopped) and the loop continues.
 */
public void start() {
    super.start();
    dispatcher = new Thread(() -> {
        ParameterServerId psId = context.getPSAttemptId().getPsId();
        while (!stopped.get() && !Thread.interrupted()) {
            try {
                Thread.sleep(pushIntervalMs);
                Map<PartitionKey, Integer> parts = getAndClearAllNeedRecoverParts();
                Map<RecoverPartKey, FutureResult> futures = new HashMap<>(parts.size());
                for (PartitionKey part : parts.keySet()) {
                    PartitionLocation partLoc = context.getMaster().getPartLocation(part.getMatrixId(), part.getPartitionId());
                    // Only the PS listed first for the partition pushes it to the other locations.
                    if ((partLoc.psLocs.size() > 1) && psId.equals(partLoc.psLocs.get(0).psId)) {
                        int size = partLoc.psLocs.size();
                        for (int i = 1; i < size; i++) {
                            RecoverPartKey partKey = new RecoverPartKey(part, partLoc.psLocs.get(i));
                            LOG.info("Start to backup partition " + partKey.partKey + " to " + partKey.psLoc);
                            futures.put(partKey, recover(partKey));
                        }
                    }
                }
                waitResults(futures);
            } catch (InterruptedException e) {
                // Catching InterruptedException clears the interrupt flag, so the loop condition
                // would never observe it. Restore the flag and leave the loop explicitly.
                if (!stopped.get()) {
                    LOG.error("recover parts failed ", e);
                }
                Thread.currentThread().interrupt();
                break;
            } catch (Exception e) {
                if (!stopped.get()) {
                    LOG.error("recover parts failed ", e);
                }
            }
        }
    });
    dispatcher.setName("psha-push-dispatcher");
    dispatcher.start();
}
Also used : RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) PartitionKey(com.tencent.angel.PartitionKey) ParameterServerId(com.tencent.angel.ps.ParameterServerId) HashMap(java.util.HashMap) Map(java.util.Map) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Aggregations

ParameterServerId (com.tencent.angel.ps.ParameterServerId): 65; PSAttemptId (com.tencent.angel.ps.PSAttemptId): 33; WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId): 28; WorkerGroupId (com.tencent.angel.worker.WorkerGroupId): 28; WorkerId (com.tencent.angel.worker.WorkerId): 28; Configuration (org.apache.hadoop.conf.Configuration): 28; MatrixContext (com.tencent.angel.ml.matrix.MatrixContext): 27; CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat): 27; Before (org.junit.Before): 23; TaskId (com.tencent.angel.worker.task.TaskId): 9; PSLocation (com.tencent.angel.ps.server.data.PSLocation): 6; HashMap (java.util.HashMap): 6; Location (com.tencent.angel.common.location.Location): 5; MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta): 5; PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation): 5; ArrayList (java.util.ArrayList): 5; ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 5; Path (org.apache.hadoop.fs.Path): 5; Test (org.junit.Test): 5; AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer): 4