Search in sources :

Example 1 with RunningMode

use of com.tencent.angel.RunningMode in project angel by Tencent.

the class PSAgent method done.

/**
 * Reports a successful run and shuts this PSAgent down.
 *
 * <p>The body runs only for the caller that flips {@code exitedFlag} from false to true;
 * any call made after the flag is set returns immediately. In ANGEL_PS_PSAGENT running mode
 * the master is notified, the connection is closed and the process exits with status 0.
 */
public void done() {
    // Somebody already started the exit sequence: nothing left to do
    if (exitedFlag.getAndSet(true)) {
        return;
    }

    LOG.info("psagent success done");
    final boolean psAgentMode =
        PSAgentContext.get().getRunningMode() == RunningMode.ANGEL_PS_PSAGENT;

    // The master only needs the success message on ANGEL_PS_PSAGENT running mode
    if (psAgentMode) {
        try {
            masterClient.psAgentDone();
            LOG.info("send done message to appmaster success");
        } catch (ServiceException e) {
            LOG.error("send done message error ", e);
        } finally {
            // Close the connection whether or not the notification went through
            try {
                connection.close();
            } catch (Exception e) {
                LOG.error("close connection error", e);
            }
        }
    }

    // Stop all modules: through the executor when there is one, directly otherwise
    if (executor == null) {
        stop();
    } else {
        executor.done();
    }

    // On ANGEL_PS_PSAGENT mode the process itself is terminated
    if (psAgentMode) {
        System.exit(0);
    }
}
Also used : ServiceException(com.google.protobuf.ServiceException) RunningMode(com.tencent.angel.RunningMode) InvalidParameterException(com.tencent.angel.exception.InvalidParameterException) TimeOutException(com.tencent.angel.exception.TimeOutException) AngelException(com.tencent.angel.exception.AngelException) IOException(java.io.IOException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException)

Example 2 with RunningMode

use of com.tencent.angel.RunningMode in project angel by Tencent.

the class AngelApplicationMaster method initAndStart.

/**
 * Init and start all service modules for angel application master.
 *
 * <p>Modules are built and registered in this order: app state storage, event dispatcher,
 * location manager, container allocator/launcher (local or yarn, depending on the deploy mode),
 * master rpc service, parameter server manager, the running-mode specific manager
 * (psagent manager or worker manager), slow checker and metrics service. After that the
 * services are initialized and started, all parameter servers are launched and, when this is
 * not the first application attempt, execution is resumed.
 *
 * @throws Exception if building, initializing or starting any module fails
 */
public void initAndStart() throws Exception {
    addIfService(angelApp);
    // init app state storage
    String tmpOutPath = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
    Path appStatePath = new Path(tmpOutPath, "app");
    LOG.info("app state output path = " + appStatePath.toUri().toString());
    FileSystem fs = appStatePath.getFileSystem(conf);
    appStateStorage = new AppStateStorage(appContext, appStatePath.toUri().toString(), fs);
    addIfService(appStateStorage);
    LOG.info("build app state storage success");
    // init event dispatcher
    dispatcher = new AsyncDispatcher();
    addIfService(dispatcher);
    LOG.info("build event dispacher");
    // init location manager
    locationManager = new LocationManager();
    // init container allocator
    AngelDeployMode deployMode = appContext.getDeployMode();
    LOG.info("deploy mode=" + deployMode);
    if (deployMode == AngelDeployMode.LOCAL) {
        containerAllocator = new LocalContainerAllocator(appContext);
        containerLauncher = new LocalContainerLauncher(appContext);
    } else {
        containerAllocator = new YarnContainerAllocator(appContext);
        containerLauncher = new YarnContainerLauncher(appContext);
    }
    addIfService(containerAllocator);
    dispatcher.register(ContainerAllocatorEventType.class, containerAllocator);
    LOG.info("build containerAllocator success");
    addIfService(containerLauncher);
    dispatcher.register(ContainerLauncherEventType.class, containerLauncher);
    LOG.info("build containerLauncher success");
    // init a rpc service
    masterService = new MasterService(appContext);
    LOG.info("build master service success");
    // recover matrix meta if needed
    recoverMatrixMeta();
    // recover ps attempt information if need
    Map<ParameterServerId, Integer> psIdToAttemptIndexMap = recoverPSAttemptIndex();
    if (psIdToAttemptIndexMap == null) {
        LOG.info("recoverPSAttemptIndex return is null");
    } else {
        for (Entry<ParameterServerId, Integer> entry : psIdToAttemptIndexMap.entrySet()) {
            LOG.info("psId=" + entry.getKey() + ",attemptIndex=" + entry.getValue());
        }
    }
    // init parameter server manager
    psManager = new ParameterServerManager(appContext, psIdToAttemptIndexMap);
    addIfService(psManager);
    psManager.init();
    // hand the ps ids to the location manager ordered by ps index
    List<ParameterServerId> psIds = new ArrayList<>(psManager.getParameterServerMap().keySet());
    Collections.sort(psIds, new Comparator<ParameterServerId>() {

        @Override
        public int compare(ParameterServerId s1, ParameterServerId s2) {
            // Integer.compare instead of subtraction: a difference of two ints can overflow
            // and report the wrong sign
            return Integer.compare(s1.getIndex(), s2.getIndex());
        }
    });
    locationManager.setPsIds(psIds.toArray(new ParameterServerId[0]));
    dispatcher.register(ParameterServerManagerEventType.class, psManager);
    dispatcher.register(AMParameterServerEventType.class, new ParameterServerEventHandler());
    dispatcher.register(PSAttemptEventType.class, new PSAttemptEventDispatcher());
    LOG.info("build PSManager success");
    // recover task information if needed
    recoverTaskState();
    RunningMode mode = appContext.getRunningMode();
    LOG.info("running mode=" + mode);
    switch(mode) {
        case ANGEL_PS_PSAGENT:
            {
                // init psagent manager and register psagent manager event
                psAgentManager = new PSAgentManager(appContext);
                addIfService(psAgentManager);
                dispatcher.register(PSAgentManagerEventType.class, psAgentManager);
                dispatcher.register(AMPSAgentEventType.class, new PSAgentEventHandler());
                dispatcher.register(PSAgentAttemptEventType.class, new PSAgentAttemptEventHandler());
                LOG.info("build PSAgentManager success");
                break;
            }
        case ANGEL_PS_WORKER:
            {
                // a dummy data spliter is just for test now
                boolean useDummyDataSpliter = conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER);
                if (useDummyDataSpliter) {
                    dataSpliter = new DummyDataSpliter(appContext);
                } else {
                    // recover data splits information if needed
                    // NOTE(review): presumably this assigns dataSpliter, which is dereferenced
                    // below - confirm against recoveryDataSplits()
                    recoveryDataSplits();
                }
                // init worker manager and register worker manager event
                workerManager = new WorkerManager(appContext);
                workerManager.adjustTaskNumber(dataSpliter.getSplitNum());
                addIfService(workerManager);
                dispatcher.register(WorkerManagerEventType.class, workerManager);
                dispatcher.register(AMWorkerGroupEventType.class, new WorkerGroupEventHandler());
                dispatcher.register(AMWorkerEventType.class, new WorkerEventHandler());
                dispatcher.register(WorkerAttemptEventType.class, new WorkerAttemptEventHandler());
                LOG.info("build WorkerManager success");
                break;
            }
        case ANGEL_PS:
            // no psagent/worker manager is built in this mode
            break;
    }
    // register slow worker/ps checker
    addIfService(new SlowChecker(appContext));
    algoMetricsService = new MetricsService(appContext);
    addIfService(algoMetricsService);
    dispatcher.register(MetricsEventType.class, algoMetricsService);
    // register app manager event and finish event
    dispatcher.register(AppEventType.class, angelApp);
    dispatcher.register(AppFinishEventType.class, new AppFinishEventHandler());
    masterService.init(conf);
    super.init(conf);
    // start a web service if use yarn deploy mode
    if (deployMode == AngelDeployMode.YARN) {
        try {
            webApp = WebApps.$for("angel", AMContext.class, appContext).with(conf).start(new AngelWebApp());
            LOG.info("start webapp server success");
            LOG.info("webApp.port()=" + webApp.port());
        } catch (Exception e) {
            // a web app start failure is only logged; startup continues
            LOG.error("Webapps failed to start. Ignoring for now:", e);
        }
    }
    masterService.start();
    locationManager.setMasterLocation(masterService.getLocation());
    super.serviceStart();
    psManager.startAllPS();
    AngelServiceLoader.startServiceIfNeed(this, getConfig());
    LOG.info("appAttemptId.getAttemptId()=" + appAttemptId.getAttemptId());
    // for any attempt after the first one: wait for ps registration and metrics
    // initialization, then start execution right away
    if (appAttemptId.getAttemptId() > 1) {
        waitForAllPsRegisted();
        waitForAllMetricsInited();
        angelApp.startExecute();
    }
}
Also used : YarnContainerLauncher(com.tencent.angel.master.deploy.yarn.YarnContainerLauncher) AngelDeployMode(com.tencent.angel.AngelDeployMode) RunningMode(com.tencent.angel.RunningMode) DummyDataSpliter(com.tencent.angel.master.data.DummyDataSpliter) LocalContainerLauncher(com.tencent.angel.master.deploy.local.LocalContainerLauncher) WorkerAttemptEventType(com.tencent.angel.master.worker.attempt.WorkerAttemptEventType) FileSystem(org.apache.hadoop.fs.FileSystem) ParameterServerManager(com.tencent.angel.master.ps.ParameterServerManager) AMWorkerEventType(com.tencent.angel.master.worker.worker.AMWorkerEventType) Path(org.apache.hadoop.fs.Path) LocationManager(com.tencent.angel.common.location.LocationManager) LocalContainerAllocator(com.tencent.angel.master.deploy.local.LocalContainerAllocator) SlowChecker(com.tencent.angel.master.slowcheck.SlowChecker) MetricsService(com.tencent.angel.master.metrics.MetricsService) WorkerManagerEventType(com.tencent.angel.master.worker.WorkerManagerEventType) AppStateStorage(com.tencent.angel.master.oplog.AppStateStorage) IOException(java.io.IOException) WorkerManager(com.tencent.angel.master.worker.WorkerManager) YarnContainerAllocator(com.tencent.angel.master.deploy.yarn.YarnContainerAllocator) AsyncDispatcher(org.apache.hadoop.yarn.event.AsyncDispatcher) AMWorkerGroupEventType(com.tencent.angel.master.worker.workergroup.AMWorkerGroupEventType) ParameterServerId(com.tencent.angel.ps.ParameterServerId) AngelWebApp(com.tencent.angel.webapp.AngelWebApp)

Example 3 with RunningMode

use of com.tencent.angel.RunningMode in project angel by Tencent.

the class AngelClient method setOutputDirectory.

/**
 * Resolves the job output location from the configuration and prepares the file system for it.
 *
 * <p>The output path is the predict path when the action type is "predict" and the save-model
 * path otherwise. An existing output path (and, in ANGEL_PS_WORKER mode, an existing log path)
 * is deleted when delete-on-exist is enabled and rejected otherwise. A missing parent directory
 * of the output path is created. Finally the temporary output path and the internal state file
 * are generated and written back into the configuration.
 *
 * @throws IOException if no output path is configured, an existing path may not be deleted,
 *         the parent directory cannot be created, or a file system call fails
 */
protected void setOutputDirectory() throws IOException {
    final String action = conf.get(AngelConf.ANGEL_ACTION_TYPE, AngelConf.DEFAULT_ANGEL_ACTION_TYPE);
    final RunningMode mode = RunningMode.valueOf(
        conf.get(AngelConf.ANGEL_RUNNING_MODE, AngelConf.DEFAULT_ANGEL_RUNNING_MODE));
    LOG.info("running mode = " + mode);
    final boolean overwrite = conf.getBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
        AngelConf.DEFAULT_ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST);

    // Predict jobs write to the predict path, everything else to the model path
    final String outputDir = action.equals("predict")
        ? conf.get(AngelConf.ANGEL_PREDICT_PATH)
        : conf.get(AngelConf.ANGEL_SAVE_MODEL_PATH);
    if (outputDir == null) {
        throw new IOException("output directory is null. you must set " + AngelConf.ANGEL_SAVE_MODEL_PATH + " at training mode or set " + AngelConf.ANGEL_PREDICT_PATH + " at predict mode");
    }
    conf.set(AngelConf.ANGEL_JOB_OUTPUT_PATH, outputDir);

    // An existing output path is either wiped or treated as an error
    final Path outputPath = new Path(outputDir);
    final FileSystem outFs = outputPath.getFileSystem(conf);
    if (outFs.exists(outputPath)) {
        if (!overwrite) {
            throw new IOException("output path " + outputPath + " already exist, please check");
        }
        outFs.delete(outputPath, true);
    }

    // Make sure the directory that will hold the output exists
    final Path outputParentPath = outputPath.getParent();
    if (!outFs.exists(outputParentPath)) {
        LOG.info("Make dir for model output parent path: " + outputParentPath);
        if (!outFs.mkdirs(outputParentPath)) {
            throw new IOException("Failed to make dir for model output parent path: " + outputParentPath);
        }
    }

    // The log path is only looked at on ANGEL_PS_WORKER mode, same wipe-or-fail policy
    final String logDir = (mode == RunningMode.ANGEL_PS_WORKER) ? conf.get(AngelConf.ANGEL_LOG_PATH) : null;
    if (logDir != null) {
        final Path logPath = new Path(logDir);
        final FileSystem logFs = logPath.getFileSystem(conf);
        if (logFs.exists(logPath)) {
            if (!overwrite) {
                throw new IOException("log path " + logPath + " already exist, please check");
            }
            logFs.delete(logPath, true);
        }
    }

    // NOTE(review): generateTmpDirectory is invoked twice; whether both calls yield the same
    // directory depends on HdfsUtil - confirm before merging them into one call
    final Path tmpOutputPath = HdfsUtil.generateTmpDirectory(conf, getAppId(), outputPath);
    internalStateFile = new Path(HdfsUtil.generateTmpDirectory(conf, getAppId(), outputPath), "state");

    conf.set(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH, tmpOutputPath.toString());
    LOG.info(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH + "=" + tmpOutputPath.toString());
    LOG.info("internal state file is " + internalStateFile);
    conf.set(AngelConf.ANGEL_APP_SERILIZE_STATE_FILE, internalStateFile.toString());
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) IOException(java.io.IOException) RunningMode(com.tencent.angel.RunningMode)

Example 4 with RunningMode

use of com.tencent.angel.RunningMode in project angel by Tencent.

the class AngelClient method setInputDirectory.

/**
 * Copies the configured input data path into the job input path setting.
 *
 * <p>Nothing is done when the dummy data spliter is enabled. Otherwise the predict data path
 * (action type "predict") or the train data path (any other action type) is read, and in
 * ANGEL_PS_WORKER running mode it is stored as the job input path.
 *
 * @throws IOException if the running mode is ANGEL_PS_WORKER and no input path is configured
 */
protected void setInputDirectory() throws IOException {
    // With the dummy data spliter there is no input directory to set up
    if (conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER, AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER)) {
        return;
    }

    final String action = conf.get(AngelConf.ANGEL_ACTION_TYPE, AngelConf.DEFAULT_ANGEL_ACTION_TYPE);
    final RunningMode mode = RunningMode.valueOf(
        conf.get(AngelConf.ANGEL_RUNNING_MODE, AngelConf.DEFAULT_ANGEL_RUNNING_MODE));
    final String inputDir = action.equals("predict")
        ? conf.get(AngelConf.ANGEL_PREDICT_DATA_PATH)
        : conf.get(AngelConf.ANGEL_TRAIN_DATA_PATH);

    // Only ANGEL_PS_WORKER mode records (and requires) an input path
    if (mode != RunningMode.ANGEL_PS_WORKER) {
        return;
    }
    if (inputDir == null) {
        throw new IOException("input data directory is empty, you should set it");
    }
    conf.set(AngelConf.ANGEL_JOB_INPUT_PATH, inputDir);
}
Also used : IOException(java.io.IOException) RunningMode(com.tencent.angel.RunningMode)

Example 5 with RunningMode

use of com.tencent.angel.RunningMode in project angel by Tencent.

the class PSAgent method error.

/**
 * Reports a failed run and shuts this PSAgent down.
 *
 * <p>The body runs only for the caller that flips {@code exitedFlag} from false to true;
 * any call made after the flag is set returns immediately. In ANGEL_PS_PSAGENT running mode
 * the failure is sent to the master, the connection is closed and the process exits with
 * status -1.
 *
 * @param errorMsg detail failed message
 */
public void error(String errorMsg) {
    // Somebody already started the exit sequence: nothing left to do
    if (exitedFlag.getAndSet(true)) {
        return;
    }

    LOG.info("psagent falied");
    final boolean psAgentMode =
        PSAgentContext.get().getRunningMode() == RunningMode.ANGEL_PS_PSAGENT;

    // The master only needs the failure message on ANGEL_PS_PSAGENT running mode
    if (psAgentMode) {
        try {
            masterClient.psAgentError(errorMsg);
            LOG.info("psagent failed message : " + errorMsg + ", send it to appmaster success");
        } catch (ServiceException e) {
            LOG.error("send error message error ", e);
        } finally {
            // Close the connection whether or not the notification went through
            try {
                connection.close();
            } catch (Exception e) {
                LOG.error("close connection error", e);
            }
        }
    }

    // Stop all modules: through the executor when there is one, directly otherwise
    if (executor == null) {
        stop();
    } else {
        executor.error(errorMsg);
    }

    // On ANGEL_PS_PSAGENT mode the process itself is terminated with a failure status
    if (psAgentMode) {
        System.exit(-1);
    }
}
Also used : ServiceException(com.google.protobuf.ServiceException) RunningMode(com.tencent.angel.RunningMode) InvalidParameterException(com.tencent.angel.exception.InvalidParameterException) TimeOutException(com.tencent.angel.exception.TimeOutException) AngelException(com.tencent.angel.exception.AngelException) IOException(java.io.IOException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException)

Aggregations

RunningMode (com.tencent.angel.RunningMode)5 IOException (java.io.IOException)5 ServiceException (com.google.protobuf.ServiceException)2 AngelException (com.tencent.angel.exception.AngelException)2 InvalidParameterException (com.tencent.angel.exception.InvalidParameterException)2 TimeOutException (com.tencent.angel.exception.TimeOutException)2 FileSystem (org.apache.hadoop.fs.FileSystem)2 Path (org.apache.hadoop.fs.Path)2 YarnRuntimeException (org.apache.hadoop.yarn.exceptions.YarnRuntimeException)2 AngelDeployMode (com.tencent.angel.AngelDeployMode)1 LocationManager (com.tencent.angel.common.location.LocationManager)1 DummyDataSpliter (com.tencent.angel.master.data.DummyDataSpliter)1 LocalContainerAllocator (com.tencent.angel.master.deploy.local.LocalContainerAllocator)1 LocalContainerLauncher (com.tencent.angel.master.deploy.local.LocalContainerLauncher)1 YarnContainerAllocator (com.tencent.angel.master.deploy.yarn.YarnContainerAllocator)1 YarnContainerLauncher (com.tencent.angel.master.deploy.yarn.YarnContainerLauncher)1 MetricsService (com.tencent.angel.master.metrics.MetricsService)1 AppStateStorage (com.tencent.angel.master.oplog.AppStateStorage)1 ParameterServerManager (com.tencent.angel.master.ps.ParameterServerManager)1 SlowChecker (com.tencent.angel.master.slowcheck.SlowChecker)1