Search in sources :

Example 6 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class PSFailedReportTest method testPSFailedReport.

/**
 * Exercises the "PS failed" report path: a worker tells the master that PS 2 is failing and the
 * test checks that the row sums stored on the parameter servers keep matching the number of
 * increments pushed so far.
 *
 * <p>The test runs in three phases of {@code iterNum} iterations each:
 * <ol>
 *   <li>before the report, both PS 1 (attempt 0) and PS 2 (attempt 0) are verified;</li>
 *   <li>after the report (and a 20 second pause), only PS 1 (attempt 0) is verified;</li>
 *   <li>finally PS 1 (attempt 0) and PS 2 (attempt 1) are verified.</li>
 * </ol>
 *
 * @throws Exception if any cluster call, clock wait or sleep fails
 */
@Test
public void testPSFailedReport() throws Exception {
    ParameterServerId ps1Id = new ParameterServerId(0);
    final ParameterServerId ps2Id = new ParameterServerId(1);
    PSAttemptId ps1Attempt0Id = new PSAttemptId(ps1Id, 0);
    PSAttemptId ps2Attempt0Id = new PSAttemptId(ps2Id, 0);
    PSAttemptId ps2Attempt1Id = new PSAttemptId(ps2Id, 1);
    ParameterServer ps1Attempt0 = LocalClusterContext.get().getPS(ps1Attempt0Id).getPS();
    ParameterServer ps2Attempt0 = LocalClusterContext.get().getPS(ps2Attempt0Id).getPS();
    WorkerId worker0Id = new WorkerId(new WorkerGroupId(0), 0);
    WorkerAttemptId worker0Attempt0Id = new WorkerAttemptId(worker0Id, 0);
    Worker worker0 = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
    TaskContext task0Context = worker0.getTaskManager().getRunningTask().get(task0Id).getTaskContext();
    MatrixClient matrixClient = task0Context.getMatrix("w1");
    int iterNum = 20;

    // Phase 1: both parameter servers are on their first attempt.
    for (int i = 0; i < iterNum; i++) {
        incrementRow0AndClock(matrixClient);
        assertRow0PartitionSums(ps1Attempt0, matrixClient, i + 1);
        assertRow0PartitionSums(ps2Attempt0, matrixClient, i + 1);
    }

    LOG.info("===================================================================ps2 failed");
    // Report PS 2 (at its current location) to the master with a failure counter of 10000.
    HashMap<PSLocation, Integer> failedCounters = new HashMap<>();
    PSLocation psLoc = new PSLocation(ps2Id, ps2Attempt0.getLocationManager().getPsLocation(ps2Id));
    failedCounters.put(psLoc, 10000);
    worker0.getPSAgent().getMasterClient().psFailedReport(failedCounters);
    // NOTE(review): presumably gives the master time to replace PS 2 - confirm the needed delay.
    Thread.sleep(20000);

    // Phase 2: only PS 1 is inspected here.
    for (int i = iterNum; i < 2 * iterNum; i++) {
        incrementRow0AndClock(matrixClient);
        assertRow0PartitionSums(ps1Attempt0, matrixClient, i + 1);
    }

    // Phase 3: PS 2 is now looked up through its second attempt id.
    ParameterServer ps2Attempt = LocalClusterContext.get().getPS(ps2Attempt1Id).getPS();
    for (int i = iterNum * 2; i < 3 * iterNum; i++) {
        incrementRow0AndClock(matrixClient);
        assertRow0PartitionSums(ps1Attempt0, matrixClient, i + 1);
        assertRow0PartitionSums(ps2Attempt, matrixClient, i + 1);
    }
}

/**
 * Pushes an all-ones update of length {@code dim} to row 0 of the given matrix, waits for the
 * clock operation to complete and then sleeps one second before the caller inspects the servers.
 *
 * @param matrixClient client of the matrix to update
 * @throws Exception if the increment, the clock wait or the sleep fails
 */
private void incrementRow0AndClock(MatrixClient matrixClient) throws Exception {
    DenseIntVector update = new DenseIntVector(dim);
    for (int j = 0; j < dim; j++) {
        update.set(j, 1);
    }
    update.setMatrixId(matrixClient.getMatrixId());
    update.setRowId(0);
    matrixClient.increment(update);
    matrixClient.clock().get();
    Thread.sleep(1000);
}

/**
 * Asserts that the given parameter server holds partitions 0 and 1 of the matrix and that the
 * data of both inspected rows sums to {@code clockCount * dim / 2}.
 *
 * @param ps parameter server whose storage is inspected
 * @param matrixClient client used to resolve the matrix id
 * @param clockCount number of all-ones increments pushed so far
 * @throws Exception if the storage lookup fails
 */
private void assertRow0PartitionSums(ParameterServer ps, MatrixClient matrixClient, int clockCount) throws Exception {
    MatrixStorageManager storage = ps.getMatrixStorageManager();
    ServerMatrix matrix = storage.getMatrix(matrixClient.getMatrixId());
    assertNotNull(matrix.getPartition(0));
    assertNotNull(matrix.getPartition(1));
    // NOTE(review): getRow arguments look like (partition id, row id) - confirm against ServerMatrix.
    IntBuffer row0Part0 = ((ServerDenseIntRow) matrix.getRow(0, 0)).getData();
    int part0Size = matrix.getRow(0, 0).size();
    IntBuffer row0Part1 = ((ServerDenseIntRow) matrix.getRow(1, 0)).getData();
    int part1Size = matrix.getRow(1, 0).size();
    // JUnit expects (expected, actual); this order keeps failure messages truthful.
    assertEquals(clockCount * dim / 2, sum(row0Part0, part0Size));
    assertEquals(clockCount * dim / 2, sum(row0Part1, part1Size));
}
Also used : TaskContext(com.tencent.angel.worker.task.TaskContext) HashMap(java.util.HashMap) WorkerAttemptId(com.tencent.angel.worker.WorkerAttemptId) ServerMatrix(com.tencent.angel.ps.impl.matrix.ServerMatrix) WorkerId(com.tencent.angel.worker.WorkerId) ParameterServer(com.tencent.angel.ps.impl.ParameterServer) WorkerGroupId(com.tencent.angel.worker.WorkerGroupId) DenseIntVector(com.tencent.angel.ml.math.vector.DenseIntVector) PSAttemptId(com.tencent.angel.ps.PSAttemptId) PSLocation(com.tencent.angel.ml.matrix.transport.PSLocation) MatrixStorageManager(com.tencent.angel.ps.impl.MatrixStorageManager) IntBuffer(java.nio.IntBuffer) ServerDenseIntRow(com.tencent.angel.ps.impl.matrix.ServerDenseIntRow) Worker(com.tencent.angel.worker.Worker) MatrixClient(com.tencent.angel.psagent.matrix.MatrixClient) ParameterServerId(com.tencent.angel.ps.ParameterServerId) Test(org.junit.Test)

Example 7 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class PSMatrixMetaManager method getPartLocation.

/**
 * Resolve where a partition is stored: the parameter servers that hold it, each paired with the
 * location currently registered for that server in the location manager.
 *
 * @param partitionKey partition information
 * @return partition location; it wraps an empty list when no parameter servers are known for
 *         the partition
 * @throws ServiceException
 */
public PartitionLocation getPartLocation(PartitionKey partitionKey) throws ServiceException {
    List<ParameterServerId> holders = getPss(partitionKey);
    List<PSLocation> locations;
    if (holders != null) {
        int holderCount = holders.size();
        locations = new ArrayList<>(holderCount);
        for (int idx = 0; idx < holderCount; idx++) {
            ParameterServerId holder = holders.get(idx);
            // The looked-up location is passed through as-is, whatever the manager returns.
            locations.add(new PSLocation(holder, context.getLocationManager().getPsLocation(holder)));
        }
    } else {
        locations = new ArrayList<>();
    }
    return new PartitionLocation(locations);
}
Also used : PSLocation(com.tencent.angel.ps.server.data.PSLocation) ArrayList(java.util.ArrayList) ParameterServerId(com.tencent.angel.ps.ParameterServerId) PartitionLocation(com.tencent.angel.ml.matrix.PartitionLocation)

Example 8 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class MasterService method getPartLocation.

/**
 * Get the locations of all parameter servers that store a partition.
 *
 * <p>A server without a registered location is still reported, with status
 * {@code PS_NOTREADY}. If no servers are known for the partition the response is empty.
 *
 * @param controller rpc controller of protobuf
 * @param request matrix id and partition id to look up
 * @return one location entry per parameter server of the partition
 */
@Override
public GetPartLocationResponse getPartLocation(RpcController controller, GetPartLocationRequest request) throws ServiceException {
    GetPartLocationResponse.Builder response = GetPartLocationResponse.newBuilder();
    List<ParameterServerId> holders = context.getMatrixMetaManager().getPss(request.getMatrixId(), request.getPartId());
    if (holders == null) {
        return response.build();
    }

    int holderCount = holders.size();
    for (int idx = 0; idx < holderCount; idx++) {
        ParameterServerId holder = holders.get(idx);
        Location registered = context.getLocationManager().getPsLocation(holder);
        if (registered != null) {
            response.addLocations(ProtobufUtil.convertToPSLocProto(holder, registered));
            continue;
        }
        // No location registered: report the server id with a not-ready status.
        response.addLocations((PSLocationProto.newBuilder().setPsId(ProtobufUtil.convertToIdProto(holder)).setPsStatus(PSStatus.PS_NOTREADY).build()));
    }
    return response.build();
}
Also used : GetPartLocationResponse(com.tencent.angel.protobuf.generated.MLProtos.GetPartLocationResponse) ParameterServerId(com.tencent.angel.ps.ParameterServerId) PSLocation(com.tencent.angel.ps.server.data.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 9 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class MasterService method getPSLocation.

/**
 * Look up the location of one parameter server.
 *
 * <p>When the location manager has no location for the server, the response carries the
 * requested id with status {@code PS_NOTREADY}.
 *
 * @param controller rpc controller of protobuf
 * @param request parameter server id
 * @return response holding the location entry of the requested parameter server
 */
@Override
public GetPSLocationReponse getPSLocation(RpcController controller, GetPSLocationRequest request) throws ServiceException {
    GetPSLocationReponse.Builder response = GetPSLocationReponse.newBuilder();
    ParameterServerId requestedId = ProtobufUtil.convertToId(request.getPsId());
    Location registered = context.getLocationManager().getPsLocation(requestedId);
    if (registered != null) {
        response.setPsLocation(ProtobufUtil.convertToPSLocProto(requestedId, registered));
    } else {
        // No location registered: echo the requested id with a not-ready status.
        response.setPsLocation(PSLocationProto.newBuilder().setPsId(request.getPsId()).setPsStatus(PSStatus.PS_NOTREADY).build());
    }
    return response.build();
}
Also used : GetPSLocationReponse(com.tencent.angel.protobuf.generated.MLProtos.GetPSLocationReponse) ParameterServerId(com.tencent.angel.ps.ParameterServerId) PSLocation(com.tencent.angel.ps.server.data.PSLocation) Location(com.tencent.angel.common.location.Location)

Example 10 with ParameterServerId

use of com.tencent.angel.ps.ParameterServerId in project angel by Tencent.

the class AngelApplicationMaster method initAndStart.

/**
 * Init and start all service modules for the angel application master.
 *
 * <p>The method builds the modules in a fixed order (app state storage, event dispatcher,
 * location manager, container allocator/launcher, master rpc service, client/ps-agent/ps
 * managers, optional worker manager, slow checker, metrics, model saver/loader, heartbeat
 * monitor), registers their event handlers on the dispatcher, initializes and starts the
 * services, and finally starts all parameter servers. The statement order is significant and
 * must be preserved.
 *
 * @throws Exception if any module fails to build, initialize or start
 */
public void initAndStart() throws Exception {
    addIfService(angelApp);
    // init app state storage
    String tmpOutPath = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
    Path appStatePath = new Path(tmpOutPath, "app");
    LOG.info("app state output path = " + appStatePath.toUri().toString());
    FileSystem fs = appStatePath.getFileSystem(conf);
    appStateStorage = new AppStateStorage(appContext, appStatePath.toUri().toString(), fs);
    addIfService(appStateStorage);
    LOG.info("build app state storage success");
    // init event dispatcher
    dispatcher = new AsyncDispatcher();
    addIfService(dispatcher);
    LOG.info("build event dispacher");
    // init location manager
    locationManager = new LocationManager();
    // init container allocator and launcher: local implementations for LOCAL deploy mode,
    // yarn implementations otherwise
    AngelDeployMode deployMode = appContext.getDeployMode();
    LOG.info("deploy mode=" + deployMode);
    if (deployMode == AngelDeployMode.LOCAL) {
        containerAllocator = new LocalContainerAllocator(appContext);
        containerLauncher = new LocalContainerLauncher(appContext);
    } else {
        containerAllocator = new YarnContainerAllocator(appContext);
        containerLauncher = new YarnContainerLauncher(appContext);
    }
    addIfService(containerAllocator);
    dispatcher.register(ContainerAllocatorEventType.class, containerAllocator);
    LOG.info("build containerAllocator success");
    addIfService(containerLauncher);
    dispatcher.register(ContainerLauncherEventType.class, containerLauncher);
    LOG.info("build containerLauncher success");
    // init a rpc service
    masterService = new MasterService(appContext);
    LOG.info("build master service success");
    // recover matrix meta if needed
    recoverMatrixMeta();
    // recover ps attempt information if needed
    Map<ParameterServerId, Integer> psIdToAttemptIndexMap = recoverPSAttemptIndex();
    if (psIdToAttemptIndexMap == null) {
        LOG.info("recoverPSAttemptIndex return is null");
    } else {
        for (Entry<ParameterServerId, Integer> entry : psIdToAttemptIndexMap.entrySet()) {
            LOG.info("psId=" + entry.getKey() + ",attemptIndex=" + entry.getValue());
        }
    }
    // Init Client manager
    clientManager = new ClientManager(appContext);
    addIfService(clientManager);
    // Init PS Client manager
    psAgentManager = new PSAgentManager(appContext);
    addIfService(psAgentManager);
    // init parameter server manager (the recovered attempt index map may be null)
    psManager = new ParameterServerManager(appContext, psIdToAttemptIndexMap);
    addIfService(psManager);
    psManager.init();
    // hand the ps ids to the location manager sorted by ascending index
    List<ParameterServerId> psIds = new ArrayList<>(psManager.getParameterServerMap().keySet());
    Collections.sort(psIds, new Comparator<ParameterServerId>() {

        @Override
        public int compare(ParameterServerId s1, ParameterServerId s2) {
            // Integer.compare avoids the overflow that plain subtraction can produce
            return Integer.compare(s1.getIndex(), s2.getIndex());
        }
    });
    locationManager.setPsIds(psIds.toArray(new ParameterServerId[0]));
    dispatcher.register(ParameterServerManagerEventType.class, psManager);
    dispatcher.register(AMParameterServerEventType.class, new ParameterServerEventHandler());
    dispatcher.register(PSAttemptEventType.class, new PSAttemptEventDispatcher());
    LOG.info("build PSManager success");
    // recover task information if needed
    recoverTaskState();
    RunningMode mode = appContext.getRunningMode();
    LOG.info("running mode=" + mode);
    switch(mode) {
        case ANGEL_PS_WORKER:
            {
                // a dummy data spliter is just for test now
                boolean useDummyDataSpliter = conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER);
                if (useDummyDataSpliter) {
                    dataSpliter = new DummyDataSpliter(appContext);
                } else {
                    // recover data splits information if needed
                    // NOTE(review): dataSpliter is read right below; presumably
                    // recoveryDataSplits() assigns it - confirm
                    recoveryDataSplits();
                }
                // init worker manager and register worker manager event
                workerManager = new WorkerManager(appContext);
                workerManager.adjustTaskNumber(dataSpliter.getSplitNum());
                addIfService(workerManager);
                dispatcher.register(WorkerManagerEventType.class, workerManager);
                dispatcher.register(AMWorkerGroupEventType.class, new WorkerGroupEventHandler());
                dispatcher.register(AMWorkerEventType.class, new WorkerEventHandler());
                dispatcher.register(WorkerAttemptEventType.class, new WorkerAttemptEventHandler());
                LOG.info("build WorkerManager success");
                break;
            }
        case ANGEL_PS:
            // no worker manager is built in this mode
            break;
    }
    // register slow worker/ps checker
    addIfService(new SlowChecker(appContext));
    algoMetricsService = new MetricsService(appContext);
    addIfService(algoMetricsService);
    dispatcher.register(MetricsEventType.class, algoMetricsService);
    // register app manager event and finish event
    dispatcher.register(AppEventType.class, angelApp);
    dispatcher.register(AppFinishEventType.class, new AppFinishEventHandler());
    // Init model saver & loader
    modelSaver = new AMModelSaver(appContext);
    addIfService(modelSaver);
    modelLoader = new AMModelLoader(appContext);
    addIfService(modelLoader);
    hbMonitor = new HeartbeatMonitor(appContext);
    addIfService(hbMonitor);
    // initialize the rpc service first, then the services added above via the superclass
    masterService.init(conf);
    super.init(conf);
    // start a web service if use yarn deploy mode
    if (deployMode == AngelDeployMode.YARN) {
        try {
            webApp = WebApps.$for("angel", AMContext.class, appContext).with(conf).start(new AngelWebApp());
            LOG.info("start webapp server success");
            LOG.info("webApp.port()=" + webApp.port());
        } catch (Exception e) {
            // deliberate best-effort: a web UI failure must not abort the master
            LOG.error("Webapps failed to start. Ignoring for now:", e);
        }
    }
    // the master location is only published after the rpc service has been started
    masterService.start();
    locationManager.setMasterLocation(masterService.getLocation());
    super.serviceStart();
    psManager.startAllPS();
    AngelServiceLoader.startServiceIfNeed(this, getConfig());
    LOG.info("appAttemptId.getAttemptId()=" + appAttemptId.getAttemptId());
    // for application attempts after the first one, wait for ps registration and metrics
    // initialization and then start execution from here
    if (appAttemptId.getAttemptId() > 1) {
        waitForAllPsRegisted();
        waitForAllMetricsInited();
        angelApp.startExecute();
    }
}
Also used : YarnContainerLauncher(com.tencent.angel.master.deploy.yarn.YarnContainerLauncher) AngelDeployMode(com.tencent.angel.AngelDeployMode) AMModelSaver(com.tencent.angel.master.matrix.committer.AMModelSaver) RunningMode(com.tencent.angel.RunningMode) DummyDataSpliter(com.tencent.angel.master.data.DummyDataSpliter) AMModelLoader(com.tencent.angel.master.matrix.committer.AMModelLoader) LocalContainerLauncher(com.tencent.angel.master.deploy.local.LocalContainerLauncher) WorkerAttemptEventType(com.tencent.angel.master.worker.attempt.WorkerAttemptEventType) FileSystem(org.apache.hadoop.fs.FileSystem) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) AMWorkerEventType(com.tencent.angel.master.worker.worker.AMWorkerEventType) Path(org.apache.hadoop.fs.Path) LocationManager(com.tencent.angel.common.location.LocationManager) LocalContainerAllocator(com.tencent.angel.master.deploy.local.LocalContainerAllocator) SlowChecker(com.tencent.angel.master.slowcheck.SlowChecker) MetricsService(com.tencent.angel.master.metrics.MetricsService) WorkerManagerEventType(com.tencent.angel.master.worker.WorkerManagerEventType) AppStateStorage(com.tencent.angel.master.oplog.AppStateStorage) IOException(java.io.IOException) WorkerManager(com.tencent.angel.master.worker.WorkerManager) YarnContainerAllocator(com.tencent.angel.master.deploy.yarn.YarnContainerAllocator) AsyncDispatcher(org.apache.hadoop.yarn.event.AsyncDispatcher) AMWorkerGroupEventType(com.tencent.angel.master.worker.workergroup.AMWorkerGroupEventType) ClientManager(com.tencent.angel.master.client.ClientManager) ParameterServerId(com.tencent.angel.ps.ParameterServerId) AngelWebApp(com.tencent.angel.webapp.AngelWebApp)

Aggregations

ParameterServerId (com.tencent.angel.ps.ParameterServerId)65 PSAttemptId (com.tencent.angel.ps.PSAttemptId)33 WorkerAttemptId (com.tencent.angel.worker.WorkerAttemptId)28 WorkerGroupId (com.tencent.angel.worker.WorkerGroupId)28 WorkerId (com.tencent.angel.worker.WorkerId)28 Configuration (org.apache.hadoop.conf.Configuration)28 MatrixContext (com.tencent.angel.ml.matrix.MatrixContext)27 CombineTextInputFormat (org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat)27 Before (org.junit.Before)23 TaskId (com.tencent.angel.worker.task.TaskId)9 PSLocation (com.tencent.angel.ps.server.data.PSLocation)6 HashMap (java.util.HashMap)6 Location (com.tencent.angel.common.location.Location)5 MatrixMeta (com.tencent.angel.ml.matrix.MatrixMeta)5 PartitionLocation (com.tencent.angel.ml.matrix.PartitionLocation)5 ArrayList (java.util.ArrayList)5 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)5 Path (org.apache.hadoop.fs.Path)5 Test (org.junit.Test)5 AMParameterServer (com.tencent.angel.master.ps.ps.AMParameterServer)4