Search in sources :

Example 1 with WorkerManager

use of com.tencent.angel.master.worker.WorkerManager in project angel by Tencent.

the class PSAgentTest method testClockCache.

/**
 * Walks from the local cluster's application master down to the PS agent of
 * worker attempt {@code worker0Attempt0Id}, checks that every object on the
 * way is present, and verifies that the clock cache reports clock 0 for
 * {@code getClock(1, 0)}.
 *
 * @throws Exception any exception raised while accessing the cluster objects;
 *                   it is logged and rethrown so the test fails
 */
@Test
public void testClockCache() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        assertTrue(taskManager != null);
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        assertTrue(worker != null);
        PSAgent psAgent = worker.getPSAgent();
        assertTrue(psAgent != null);
        ClockCache clockCache = psAgent.getClockCache();
        assertTrue(clockCache != null);
        int rowClock = clockCache.getClock(1, 0);
        // JUnit convention: expected value first, actual value second, so that
        // a failure message reads "expected:<0> but was:<...>".
        assertEquals(0, rowClock);
    } catch (Exception x) {
        LOG.error("run testClockCache failed ", x);
        throw x;
    }
}
Also used : WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) ClockCache(com.tencent.angel.psagent.clock.ClockCache) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) Worker(com.tencent.angel.worker.Worker) Test(org.junit.Test)

Example 2 with WorkerManager

use of com.tencent.angel.master.worker.WorkerManager in project angel by Tencent.

the class PSAgentTest method testPSAgentContext.

/**
 * Verifies the {@code PSAgentContext} singleton: its component getters return
 * non-null objects, its delegating getters agree with the values reported by
 * the worker's {@code PSAgent}, and task contexts 1 and 2 exist with a total
 * and local task count of 2.
 *
 * @throws Exception any exception raised while accessing the cluster objects;
 *                   it is logged and rethrown so the test fails
 */
@Test
public void testPSAgentContext() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        assertTrue(taskManager != null);
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        assertTrue(worker != null);
        PSAgent psAgent = worker.getPSAgent();
        assertTrue(psAgent != null);
        PSAgentContext psAgentContext = PSAgentContext.get();

        // Components exposed by the context must all be present.
        assertTrue(psAgentContext.getPsAgent() != null);
        assertTrue(psAgentContext.getConf() != null);
        assertTrue(psAgentContext.getMetrics() != null);
        assertTrue(psAgentContext.getMasterClient() != null);
        assertTrue(psAgentContext.getIdProto() != null);
        assertTrue(psAgentContext.getOpLogCache() != null);
        assertTrue(psAgentContext.getMatrixTransportClient() != null);
        // The original asserted getMatrixMetaManager() twice in a row; the
        // redundant duplicate has been dropped.
        assertTrue(psAgentContext.getMatrixMetaManager() != null);
        assertTrue(psAgentContext.getLocationManager() != null);

        // Values reported by the context must match those of the PS agent.
        // Both sides are live objects, so argument order is left as written.
        assertEquals(psAgentContext.getRunningMode(), psAgent.getRunningMode());
        assertEquals(psAgentContext.getIp(), psAgent.getIp());
        // Expected value (from configuration) goes first per JUnit convention.
        assertEquals(psAgent.getConf().getInt(AngelConf.ANGEL_STALENESS, AngelConf.DEFAULT_ANGEL_STALENESS), psAgentContext.getStaleness());
        assertEquals(psAgentContext.getConsistencyController(), psAgent.getConsistencyController());
        assertEquals(psAgentContext.getMatrixOpLogCache(), psAgent.getOpLogCache());
        assertEquals(psAgentContext.getClockCache(), psAgent.getClockCache());
        assertEquals(psAgentContext.getMatricesCache(), psAgent.getMatricesCache());
        assertEquals(psAgentContext.getMatrixStorageManager(), psAgent.getMatrixStorageManager());
        assertEquals(psAgentContext.getMatrixClientAdapter(), psAgent.getMatrixClientAdapter());
        assertEquals(psAgentContext.getExecutor(), psAgent.getExecutor());

        // Task bookkeeping: contexts for task ids 1 and 2, two tasks overall.
        assertTrue(psAgentContext.getTaskContext(1) != null);
        assertTrue(psAgentContext.getTaskContext(2) != null);
        int taskNum = psAgentContext.getTotalTaskNum();
        assertEquals(2, taskNum);
        int localTaskNum = psAgentContext.getLocalTaskNum();
        assertEquals(2, localTaskNum);
    } catch (Exception x) {
        LOG.error("run testPSAgentContext failed ", x);
        throw x;
    }
}
Also used : WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) Worker(com.tencent.angel.worker.Worker) Test(org.junit.Test)

Example 3 with WorkerManager

use of com.tencent.angel.master.worker.WorkerManager in project angel by Tencent.

the class PSAgentTest method testTaskContext.

/**
 * Verifies the initial state of the task contexts with ids 1 and 2 obtained
 * from {@code PSAgentContext}: index, epoch, matrix clock, number of tracked
 * matrix clocks and progress.
 *
 * @throws Exception any exception raised while accessing the cluster objects;
 *                   it is logged and rethrown so the test fails
 */
@Test
public void testTaskContext() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        assertTrue(taskManager != null);
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        assertTrue(worker != null);
        PSAgent psAgent = worker.getPSAgent();
        assertTrue(psAgent != null);
        PSAgentContext psAgentContext = PSAgentContext.get();
        assertTrue(psAgentContext.getPsAgent() != null);
        TaskContext taskContext1 = psAgentContext.getTaskContext(1);
        TaskContext taskContext2 = psAgentContext.getTaskContext(2);
        assertTrue(taskContext1 != null);
        assertTrue(taskContext2 != null);
        // All comparisons below use JUnit's (expected, actual) argument order
        // so that failure messages report the right value as "expected".
        assertEquals(1, taskContext1.getIndex());
        assertEquals(2, taskContext2.getIndex());
        assertEquals(0, taskContext1.getEpoch());
        assertEquals(0, taskContext2.getEpoch());
        assertEquals(0, taskContext1.getMatrixClock(1));
        assertEquals(0, taskContext2.getMatrixClock(2));
        assertEquals(1, taskContext1.getMatrixClocks().size());
        assertEquals(1, taskContext2.getMatrixClocks().size());
        assertEquals(0.0, taskContext1.getProgress(), 1e-5);
        assertEquals(0.0, taskContext2.getProgress(), 1e-5);
    } catch (Exception x) {
        LOG.error("run testTaskContext failed ", x);
        throw x;
    }
}
Also used : WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) TaskContext(com.tencent.angel.psagent.task.TaskContext) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) Worker(com.tencent.angel.worker.Worker) Test(org.junit.Test)

Example 4 with WorkerManager

use of com.tencent.angel.master.worker.WorkerManager in project angel by Tencent.

the class AngelApplicationMaster method initAndStart.

/**
 * Init and start all service modules for the angel application master.
 *
 * <p>In order, this method builds the app state storage, the event dispatcher,
 * the location manager, the container allocator/launcher (local or YARN
 * depending on the deploy mode), the master RPC service, the parameter server
 * manager and, depending on the running mode, the PS agent manager or the
 * worker manager. It then registers the remaining event handlers, initializes
 * and starts the services, and starts all parameter servers. For application
 * attempts after the first one (attempt id &gt; 1) it additionally waits for
 * the parameter servers and metrics before starting execution.
 *
 * @throws Exception if any of the initialization or start-up steps fails
 */
public void initAndStart() throws Exception {
    addIfService(angelApp);
    // init app state storage
    String tmpOutPath = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
    Path appStatePath = new Path(tmpOutPath, "app");
    LOG.info("app state output path = " + appStatePath.toUri().toString());
    FileSystem fs = appStatePath.getFileSystem(conf);
    appStateStorage = new AppStateStorage(appContext, appStatePath.toUri().toString(), fs);
    addIfService(appStateStorage);
    LOG.info("build app state storage success");
    // init event dispatcher
    dispatcher = new AsyncDispatcher();
    addIfService(dispatcher);
    LOG.info("build event dispacher");
    // init location manager
    locationManager = new LocationManager();
    // init container allocator and launcher matching the deploy mode
    AngelDeployMode deployMode = appContext.getDeployMode();
    LOG.info("deploy mode=" + deployMode);
    if (deployMode == AngelDeployMode.LOCAL) {
        containerAllocator = new LocalContainerAllocator(appContext);
        containerLauncher = new LocalContainerLauncher(appContext);
    } else {
        containerAllocator = new YarnContainerAllocator(appContext);
        containerLauncher = new YarnContainerLauncher(appContext);
    }
    addIfService(containerAllocator);
    dispatcher.register(ContainerAllocatorEventType.class, containerAllocator);
    LOG.info("build containerAllocator success");
    addIfService(containerLauncher);
    dispatcher.register(ContainerLauncherEventType.class, containerLauncher);
    LOG.info("build containerLauncher success");
    // init a rpc service
    masterService = new MasterService(appContext);
    LOG.info("build master service success");
    // recover matrix meta if needed
    recoverMatrixMeta();
    // recover ps attempt information if needed
    Map<ParameterServerId, Integer> psIdToAttemptIndexMap = recoverPSAttemptIndex();
    if (psIdToAttemptIndexMap == null) {
        LOG.info("recoverPSAttemptIndex return is null");
    } else {
        for (Entry<ParameterServerId, Integer> entry : psIdToAttemptIndexMap.entrySet()) {
            LOG.info("psId=" + entry.getKey() + ",attemptIndex=" + entry.getValue());
        }
    }
    // init parameter server manager
    psManager = new ParameterServerManager(appContext, psIdToAttemptIndexMap);
    addIfService(psManager);
    psManager.init();
    // hand the ps ids to the location manager sorted by ascending index
    List<ParameterServerId> psIds = new ArrayList<>(psManager.getParameterServerMap().keySet());
    Collections.sort(psIds, new Comparator<ParameterServerId>() {

        @Override
        public int compare(ParameterServerId s1, ParameterServerId s2) {
            // Integer.compare instead of subtraction: the difference of two
            // ints can overflow and yield a result with the wrong sign.
            return Integer.compare(s1.getIndex(), s2.getIndex());
        }
    });
    locationManager.setPsIds(psIds.toArray(new ParameterServerId[0]));
    dispatcher.register(ParameterServerManagerEventType.class, psManager);
    dispatcher.register(AMParameterServerEventType.class, new ParameterServerEventHandler());
    dispatcher.register(PSAttemptEventType.class, new PSAttemptEventDispatcher());
    LOG.info("build PSManager success");
    // recover task information if needed
    recoverTaskState();
    RunningMode mode = appContext.getRunningMode();
    LOG.info("running mode=" + mode);
    switch(mode) {
        case ANGEL_PS_PSAGENT:
            {
                // init psagent manager and register psagent manager event
                psAgentManager = new PSAgentManager(appContext);
                addIfService(psAgentManager);
                dispatcher.register(PSAgentManagerEventType.class, psAgentManager);
                dispatcher.register(AMPSAgentEventType.class, new PSAgentEventHandler());
                dispatcher.register(PSAgentAttemptEventType.class, new PSAgentAttemptEventHandler());
                LOG.info("build PSAgentManager success");
                break;
            }
        case ANGEL_PS_WORKER:
            {
                // a dummy data spliter is just for test now
                boolean useDummyDataSpliter = conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER);
                if (useDummyDataSpliter) {
                    dataSpliter = new DummyDataSpliter(appContext);
                } else {
                    // recover data splits information if needed
                    // NOTE(review): dataSpliter is dereferenced right below, so
                    // recoveryDataSplits() presumably assigns it - confirm.
                    recoveryDataSplits();
                }
                // init worker manager and register worker manager event
                workerManager = new WorkerManager(appContext);
                workerManager.adjustTaskNumber(dataSpliter.getSplitNum());
                addIfService(workerManager);
                dispatcher.register(WorkerManagerEventType.class, workerManager);
                dispatcher.register(AMWorkerGroupEventType.class, new WorkerGroupEventHandler());
                dispatcher.register(AMWorkerEventType.class, new WorkerEventHandler());
                dispatcher.register(WorkerAttemptEventType.class, new WorkerAttemptEventHandler());
                LOG.info("build WorkerManager success");
                break;
            }
        case ANGEL_PS:
            // nothing beyond the parameter servers is set up in this mode
            break;
    }
    // register slow worker/ps checker
    addIfService(new SlowChecker(appContext));
    algoMetricsService = new MetricsService(appContext);
    addIfService(algoMetricsService);
    dispatcher.register(MetricsEventType.class, algoMetricsService);
    // register app manager event and finish event
    dispatcher.register(AppEventType.class, angelApp);
    dispatcher.register(AppFinishEventType.class, new AppFinishEventHandler());
    masterService.init(conf);
    super.init(conf);
    // start a web service if use yarn deploy mode
    if (deployMode == AngelDeployMode.YARN) {
        try {
            webApp = WebApps.$for("angel", AMContext.class, appContext).with(conf).start(new AngelWebApp());
            LOG.info("start webapp server success");
            LOG.info("webApp.port()=" + webApp.port());
        } catch (Exception e) {
            // deliberate best effort: a web UI failure must not abort the master
            LOG.error("Webapps failed to start. Ignoring for now:", e);
        }
    }
    // the master location is only known once the rpc service has been started
    masterService.start();
    locationManager.setMasterLocation(masterService.getLocation());
    super.serviceStart();
    psManager.startAllPS();
    AngelServiceLoader.startServiceIfNeed(this, getConfig());
    LOG.info("appAttemptId.getAttemptId()=" + appAttemptId.getAttemptId());
    // for attempts after the first one, wait for the parameter servers and
    // metrics and then start execution from here
    if (appAttemptId.getAttemptId() > 1) {
        waitForAllPsRegisted();
        waitForAllMetricsInited();
        angelApp.startExecute();
    }
}
Also used : YarnContainerLauncher(com.tencent.angel.master.deploy.yarn.YarnContainerLauncher) AngelDeployMode(com.tencent.angel.AngelDeployMode) RunningMode(com.tencent.angel.RunningMode) DummyDataSpliter(com.tencent.angel.master.data.DummyDataSpliter) LocalContainerLauncher(com.tencent.angel.master.deploy.local.LocalContainerLauncher) WorkerAttemptEventType(com.tencent.angel.master.worker.attempt.WorkerAttemptEventType) FileSystem(org.apache.hadoop.fs.FileSystem) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) AMWorkerEventType(com.tencent.angel.master.worker.worker.AMWorkerEventType) Path(org.apache.hadoop.fs.Path) LocationManager(com.tencent.angel.common.location.LocationManager) LocalContainerAllocator(com.tencent.angel.master.deploy.local.LocalContainerAllocator) SlowChecker(com.tencent.angel.master.slowcheck.SlowChecker) MetricsService(com.tencent.angel.master.metrics.MetricsService) WorkerManagerEventType(com.tencent.angel.master.worker.WorkerManagerEventType) AppStateStorage(com.tencent.angel.master.oplog.AppStateStorage) IOException(java.io.IOException) WorkerManager(com.tencent.angel.master.worker.WorkerManager) YarnContainerAllocator(com.tencent.angel.master.deploy.yarn.YarnContainerAllocator) AsyncDispatcher(org.apache.hadoop.yarn.event.AsyncDispatcher) AMWorkerGroupEventType(com.tencent.angel.master.worker.workergroup.AMWorkerGroupEventType) ParameterServerId(com.tencent.angel.ps.ParameterServerId) AngelWebApp(com.tencent.angel.webapp.AngelWebApp)

Example 5 with WorkerManager

use of com.tencent.angel.master.worker.WorkerManager in project angel by Tencent.

the class PSAgentTest method testPSClient.

/**
 * Verifies the basic properties of the PS agent attached to worker attempt
 * {@code worker0Attempt0Id}: configuration, master location, attempt id and
 * agent id, connection, master client, and the agent's own ip and location.
 *
 * @throws Exception any exception raised while accessing the cluster objects;
 *                   it is logged and rethrown so the test fails
 */
@Test
public void testPSClient() throws Exception {
    try {
        AngelApplicationMaster angelAppMaster = LocalClusterContext.get().getMaster().getAppMaster();
        assertTrue(angelAppMaster != null);
        AMTaskManager taskManager = angelAppMaster.getAppContext().getTaskManager();
        assertTrue(taskManager != null);
        WorkerManager workerManager = angelAppMaster.getAppContext().getWorkerManager();
        assertTrue(workerManager != null);
        Worker worker = LocalClusterContext.get().getWorker(worker0Attempt0Id).getWorker();
        assertTrue(worker != null);
        PSAgent psAgent = worker.getPSAgent();
        assertTrue(psAgent != null);
        // test conf (expected value first, per JUnit convention)
        Configuration conf = psAgent.getConf();
        assertTrue(conf != null);
        assertEquals("LOCAL", conf.get(AngelConf.ANGEL_DEPLOY_MODE));
        // test master location: ip must be a dotted-quad IPv4 address and the
        // port must lie in the valid range 1..65535
        Location masterLoc = psAgent.getMasterLocation();
        String ipRegex = "(2[5][0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})\\.(25[0-5]|2[0-4]\\d|1\\d{2}|\\d{1,2})";
        Pattern pattern = Pattern.compile(ipRegex);
        Matcher matcher = pattern.matcher(masterLoc.getIp());
        assertTrue(matcher.matches());
        assertTrue(masterLoc.getPort() >= 1 && masterLoc.getPort() <= 65535);
        // test app id and user: the getters are only invoked, nothing is
        // asserted on their results, so no locals are kept for them
        psAgent.getAppId();
        psAgent.getUser();
        // test ps agent attempt id
        PSAgentAttemptId psAgentAttemptId = psAgent.getId();
        assertEquals("PSAgentAttempt_0_0", psAgentAttemptId.toString());
        assertEquals(0, psAgentAttemptId.getIndex());
        // test ps agent id
        PSAgentId psAgentId = psAgentAttemptId.getPsAgentId();
        assertEquals("PSAgent_0", psAgentId.toString());
        assertEquals(0, psAgentId.getIndex());
        // test connection
        TConnection conn = psAgent.getConnection();
        assertTrue(conn != null);
        // test master client
        MasterClient masterClient = psAgent.getMasterClient();
        assertTrue(masterClient != null);
        // test ip
        String ip = psAgent.getIp();
        matcher = pattern.matcher(ip);
        assertTrue(matcher.matches());
        // test loc
        Location loc = psAgent.getLocation();
        assertTrue(loc != null);
        matcher = pattern.matcher(loc.getIp());
        assertTrue(matcher.matches());
        assertTrue(loc.getPort() >= 1 && loc.getPort() <= 65535);
    } catch (Exception x) {
        LOG.error("run testPSClient failed ", x);
        throw x;
    }
}
Also used : Pattern(java.util.regex.Pattern) Configuration(org.apache.hadoop.conf.Configuration) Matcher(java.util.regex.Matcher) MasterClient(com.tencent.angel.psagent.client.MasterClient) WorkerManager(com.tencent.angel.master.worker.WorkerManager) AMTaskManager(com.tencent.angel.master.task.AMTaskManager) TConnection(com.tencent.angel.ipc.TConnection) AngelApplicationMaster(com.tencent.angel.master.AngelApplicationMaster) Worker(com.tencent.angel.worker.Worker) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Location(com.tencent.angel.common.location.Location) Test(org.junit.Test)

Aggregations

WorkerManager (com.tencent.angel.master.worker.WorkerManager)12 AMTaskManager (com.tencent.angel.master.task.AMTaskManager)11 Worker (com.tencent.angel.worker.Worker)10 Test (org.junit.Test)10 AngelApplicationMaster (com.tencent.angel.master.AngelApplicationMaster)9 Location (com.tencent.angel.common.location.Location)4 Matcher (java.util.regex.Matcher)4 Pattern (java.util.regex.Pattern)4 ParameterServerId (com.tencent.angel.ps.ParameterServerId)3 MasterClient (com.tencent.angel.psagent.client.MasterClient)3 PSAgentMatrixMetaManager (com.tencent.angel.psagent.matrix.PSAgentMatrixMetaManager)3 AMTask (com.tencent.angel.master.task.AMTask)2 AMWorker (com.tencent.angel.master.worker.worker.AMWorker)2 PSAgentLocationManager (com.tencent.angel.psagent.matrix.PSAgentLocationManager)2 TaskContext (com.tencent.angel.psagent.task.TaskContext)2 ServiceException (com.google.protobuf.ServiceException)1 AngelDeployMode (com.tencent.angel.AngelDeployMode)1 PartitionKey (com.tencent.angel.PartitionKey)1 RunningMode (com.tencent.angel.RunningMode)1 Id (com.tencent.angel.common.Id)1