Search in sources :

Example 6 with PSLocation

use of com.tencent.angel.ml.matrix.transport.PSLocation in project angel by Tencent.

the class MasterService method getPSLocation.

/**
 * get a specific parameter server location.
 * @param controller rpc controller of protobuf
 * @param request parameter server id
 * @throws ServiceException
 */
@Override
public GetPSLocationReponse getPSLocation(RpcController controller, GetPSLocationRequest request) throws ServiceException {
    GetPSLocationReponse.Builder resBuilder = GetPSLocationReponse.newBuilder();
    ParameterServerId psId = ProtobufUtil.convertToId(request.getPsId());
    Location psLocation = context.getLocationManager().getPsLocation(psId);
    if (psLocation == null) {
        resBuilder.setPsLocation(PSLocationProto.newBuilder().setPsId(request.getPsId()).setPsStatus(PSStatus.PS_NOTREADY).build());
    } else {
        resBuilder.setPsLocation(ProtobufUtil.convertToPSLocProto(psId, psLocation));
    }
    return resBuilder.build();
}
Also used : ParameterServerId(com.tencent.angel.ps.ParameterServerId) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 7 with PSLocation

use of com.tencent.angel.ml.matrix.transport.PSLocation in project angel by Tencent.

the class MasterService method getPartLocation.

/**
 * Get locations for a partition
 * @param controller
 * @param request
 * @return
 * @throws ServiceException
 */
@Override
public GetPartLocationResponse getPartLocation(RpcController controller, GetPartLocationRequest request) throws ServiceException {
    GetPartLocationResponse.Builder builder = GetPartLocationResponse.newBuilder();
    List<ParameterServerId> psIds = context.getMatrixMetaManager().getPss(request.getMatrixId(), request.getPartId());
    if (psIds != null) {
        int size = psIds.size();
        for (int i = 0; i < size; i++) {
            Location psLocation = context.getLocationManager().getPsLocation(psIds.get(i));
            if (psLocation == null) {
                builder.addLocations((PSLocationProto.newBuilder().setPsId(ProtobufUtil.convertToIdProto(psIds.get(i))).setPsStatus(PSStatus.PS_NOTREADY).build()));
            } else {
                builder.addLocations(ProtobufUtil.convertToPSLocProto(psIds.get(i), psLocation));
            }
        }
    }
    return builder.build();
}
Also used : ParameterServerId(com.tencent.angel.ps.ParameterServerId) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 8 with PSLocation

use of com.tencent.angel.ml.matrix.transport.PSLocation in project angel by Tencent.

the class PSFailedReportTest method testPSFailedReport.

@Test
public void testPSFailedReport() throws Exception {
    ParameterServerId ps1Id = new ParameterServerId(0);
    final ParameterServerId ps2Id = new ParameterServerId(1);
    PSAttemptId ps1Attempt0Id = new PSAttemptId(ps1Id, 0);
    PSAttemptId ps2Attempt0Id = new PSAttemptId(ps2Id, 0);
    PSAttemptId ps2Attempt1Id = new PSAttemptId(ps2Id, 1);
    ParameterServer ps1Attempt0 = LocalClusterContext.get().getPS(ps1Attempt0Id).getPS();
    ParameterServer ps2Attempt0 = LocalClusterContext.get().getPS(ps2Attempt0Id).getPS();
    WorkerId worker0Id = new WorkerId(new WorkerGroupId(0), 0);
    WorkerAttemptId worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
    Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    TaskContext task0Context = worker0.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
    MatrixClient matrixClient = task0Context.getMatrix("w1");
    int iterNum = 20;
    for (int i = 0; i < iterNum; i++) {
        DenseIntVector update = new DenseIntVector(dim);
        for (int j = 0; j < dim; j++) {
            update.set(j, 1);
        }
        update.setMatrixId(matrixClient.getMatrixId());
        update.setRowId(0);
        matrixClient.increment(update);
        matrixClient.clock().get();
        Thread.sleep(1000);
        MatrixStorageManager ps1Storage = ps1Attempt0.getMatrixStorageManager();
        ServerMatrix ps1w1 = ps1Storage.getMatrix(matrixClient.getMatrixId());
        assertNotNull(ps1w1.getPartition(0));
        assertNotNull(ps1w1.getPartition(1));
        IntBuffer row0Part0 = ((ServerDenseIntRow) ps1w1.getRow(0, 0)).getData();
        int part0Size = ps1w1.getRow(0, 0).size();
        IntBuffer row0Part1 = ((ServerDenseIntRow) ps1w1.getRow(1, 0)).getData();
        int part1Size = ps1w1.getRow(1, 0).size();
        assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
        assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
        MatrixStorageManager ps2Storage = ps2Attempt0.getMatrixStorageManager();
        ServerMatrix ps2w1 = ps2Storage.getMatrix(matrixClient.getMatrixId());
        assertNotNull(ps2w1.getPartition(0));
        assertNotNull(ps2w1.getPartition(1));
        row0Part0 = ((ServerDenseIntRow) ps2w1.getRow(0, 0)).getData();
        part0Size = ps2w1.getRow(0, 0).size();
        row0Part1 = ((ServerDenseIntRow) ps2w1.getRow(1, 0)).getData();
        part1Size = ps2w1.getRow(1, 0).size();
        assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
        assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
    }
    LOG.info("===================================================================ps2 failed");
    HashMap<PSLocation, Integer> failedCounters = new HashMap<>();
    PSLocation psLoc = new PSLocation(ps2Id, ps2Attempt0.getLocationManager().getPsLocation(ps2Id));
    failedCounters.put(psLoc, 10000);
    worker0.getPSAgent().getMasterClient().psFailedReport(failedCounters);
    Thread.sleep(20000);
    for (int i = iterNum; i < 2 * iterNum; i++) {
        DenseIntVector update = new DenseIntVector(dim);
        for (int j = 0; j < dim; j++) {
            update.set(j, 1);
        }
        update.setMatrixId(matrixClient.getMatrixId());
        update.setRowId(0);
        matrixClient.increment(update);
        matrixClient.clock().get();
        Thread.sleep(1000);
        MatrixStorageManager ps1Storage = ps1Attempt0.getMatrixStorageManager();
        ServerMatrix ps1w1 = ps1Storage.getMatrix(matrixClient.getMatrixId());
        assertNotNull(ps1w1.getPartition(0));
        assertNotNull(ps1w1.getPartition(1));
        IntBuffer row0Part0 = ((ServerDenseIntRow) ps1w1.getRow(0, 0)).getData();
        int part0Size = ps1w1.getRow(0, 0).size();
        IntBuffer row0Part1 = ((ServerDenseIntRow) ps1w1.getRow(1, 0)).getData();
        int part1Size = ps1w1.getRow(1, 0).size();
        assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
        assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
    }
    ParameterServer ps2Attempt = LocalClusterContext.get().getPS(ps2Attempt1Id).getPS();
    for (int i = iterNum * 2; i < 3 * iterNum; i++) {
        DenseIntVector update = new DenseIntVector(dim);
        for (int j = 0; j < dim; j++) {
            update.set(j, 1);
        }
        update.setMatrixId(matrixClient.getMatrixId());
        update.setRowId(0);
        matrixClient.increment(update);
        matrixClient.clock().get();
        Thread.sleep(1000);
        MatrixStorageManager ps1Storage = ps1Attempt0.getMatrixStorageManager();
        ServerMatrix ps1w1 = ps1Storage.getMatrix(matrixClient.getMatrixId());
        assertNotNull(ps1w1.getPartition(0));
        assertNotNull(ps1w1.getPartition(1));
        IntBuffer row0Part0 = ((ServerDenseIntRow) ps1w1.getRow(0, 0)).getData();
        int part0Size = ps1w1.getRow(0, 0).size();
        IntBuffer row0Part1 = ((ServerDenseIntRow) ps1w1.getRow(1, 0)).getData();
        int part1Size = ps1w1.getRow(1, 0).size();
        assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
        assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
        MatrixStorageManager ps2Storage = ps2Attempt.getMatrixStorageManager();
        ServerMatrix ps2w1 = ps2Storage.getMatrix(matrixClient.getMatrixId());
        assertNotNull(ps2w1.getPartition(0));
        assertNotNull(ps2w1.getPartition(1));
        row0Part0 = ((ServerDenseIntRow) ps2w1.getRow(0, 0)).getData();
        part0Size = ps2w1.getRow(0, 0).size();
        row0Part1 = ((ServerDenseIntRow) ps2w1.getRow(1, 0)).getData();
        part1Size = ps2w1.getRow(1, 0).size();
        assertEquals(sum(row0Part0, part0Size), (i + 1) * dim / 2);
        assertEquals(sum(row0Part1, part1Size), (i + 1) * dim / 2);
    }
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) HashMap(java.util.HashMap) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) WorkerId(com.tencent.angel.worker.WorkerId) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) DenseIntVector(com.tencent.angel.ml.math.vector.DenseIntVector) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) MatrixStorageManager(com.tencent.angel.ps.impl.MatrixStorageManager) IntBuffer(java.nio.IntBuffer) ServerDenseIntRow(com.tencent.angel.ps.impl.matrix.ServerDenseIntRow) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Test(org.junit.Test)

Example 9 with PSLocation

use of com.tencent.angel.ml.matrix.transport.PSLocation in project angel by Tencent.

the class ProtobufUtil method convert.

public static HashMap<PSLocation, Integer> convert(PSFailedReportsProto reportsProto) {
    HashMap<PSLocation, Integer> reports = new HashMap<>();
    List<PSFailedReportProto> reportList = reportsProto.getPsFailedReportsList();
    int size = reportList.size();
    for (int i = 0; i < size; i++) {
        reports.put(new PSLocation(convertToId(reportList.get(i).getPsLoc().getPsId()), convert(reportList.get(i).getPsLoc().getLocation())), reportList.get(i).getFailedCounter());
    }
    return reports;
}
Also used : PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) HashMap(java.util.HashMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Int2IntOpenHashMap(it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap)

Example 10 with PSLocation

use of com.tencent.angel.ml.matrix.transport.PSLocation in project angel by Tencent.

the class AMMatrixMetaManager method handlePartReport.

private void handlePartReport(ParameterServerId psId, int matrixId, PartReport partReport) {
    ParameterServerId master = matrixMetaManager.getMasterPs(matrixId, partReport.partId);
    if (!psId.equals(master)) {
        MatrixMeta matrixMeta = matrixMetaManager.getMatrixMeta(matrixId);
        if (matrixMeta == null) {
            return;
        }
        matrixMeta.getPartitionMeta(partReport.partId).addReplicationPS(psId);
        if (partReport.state == PartitionState.INITIALIZING) {
            addNeedRecoverPart(master, new RecoverPartKey(new PartitionKey(matrixId, partReport.partId), new PSLocation(psId, context.getLocationManager().getPsLocation(psId))));
        } else if (partReport.state == PartitionState.READ_AND_WRITE) {
            ParameterServerId orignalMaster = matrixPartitionsOnPS.get(psId).get(matrixId).getPartitionMeta(partReport.partId).getMasterPs();
            if (orignalMaster.equals(psId)) {
                matrixMetaManager.getMatrixMeta(matrixId).getPartitionMeta(partReport.partId).makePsToMaster(psId);
            }
        }
    }
}
Also used : RecoverPartKey(com.tencent.angel.ps.recovery.ha.RecoverPartKey) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) PartitionKey(com.tencent.angel.PartitionKey) ParameterServerId(com.tencent.angel.ps.ParameterServerId)

Aggregations

PSLocation (com.tencent.angel.ml.matrix.transport.PSLocation)10 ParameterServerId (com.tencent.angel.ps.ParameterServerId)6 PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)5 HashMap (java.util.HashMap)3 PartitionKey (com.tencent.angel.PartitionKey)2 Location (com.tencent.angel.common.location.Location)2 RecoverPartKey (com.tencent.angel.ps.recovery.ha.RecoverPartKey)2 DenseIntVector (com.tencent.angel.ml.math.vector.DenseIntVector)1 MLProtos (com.tencent.angel.protobuf.generated.MLProtos)1 PSAttemptId (com.tencent.angel.ps.PSAttemptId)1 MatrixStorageManager (com.tencent.angel.ps.impl.MatrixStorageManager)1 ParameterServer (com.tencent.angel.ps.impl.ParameterServer)1 ServerDenseIntRow (com.tencent.angel.ps.impl.matrix.ServerDenseIntRow)1 ServerMatrix (com.tencent.angel.ps.impl.matrix.ServerMatrix)1 MatrixClient (com.tencent.angel.psagent.matrix.MatrixClient)1 Worker (com.tencent.angel.worker.Worker)1 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)1 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)1 WorkerId (com.tencent.angel.worker.WorkerId)1 TaskContext (com.tencent.angel.worker.task.TaskContext)1