use of com.tencent.angel.RunningMode in project angel by Tencent.
the class PSAgent method done.
/**
 * Reports successful completion and shuts this PSAgent down.
 * <p>
 * The body runs at most once per {@code exitedFlag}: a call that finds the flag
 * already set returns immediately. When the running mode is
 * {@code ANGEL_PS_PSAGENT} the master is notified, the connection is closed and
 * the JVM exits with status 0 once the modules have been stopped.
 */
public void done() {
  // Only the caller that flips the flag performs the shutdown sequence
  if (exitedFlag.getAndSet(true)) {
    return;
  }

  LOG.info("psagent success done");
  final boolean agentMode =
      PSAgentContext.get().getRunningMode() == RunningMode.ANGEL_PS_PSAGENT;

  // The master is told about the success only in ANGEL_PS_PSAGENT mode
  if (agentMode) {
    try {
      masterClient.psAgentDone();
      LOG.info("send done message to appmaster success");
    } catch (ServiceException rpcFailure) {
      LOG.error("send done message error ", rpcFailure);
    } finally {
      // Close the connection whether or not the notification went through
      try {
        connection.close();
      } catch (Exception closeFailure) {
        LOG.error("close connection error", closeFailure);
      }
    }
  }

  // Stop all modules: through the executor when there is one, directly otherwise
  if (executor == null) {
    stop();
  } else {
    executor.done();
  }

  // In ANGEL_PS_PSAGENT mode the process terminates here
  if (agentMode) {
    System.exit(0);
  }
}
use of com.tencent.angel.RunningMode in project angel by Tencent.
the class AngelApplicationMaster method initAndStart.
/**
 * Initializes and starts all service modules of the Angel application master.
 * <p>
 * In order, this builds the app state storage, the event dispatcher, the location
 * manager, the container allocator/launcher (local or YARN depending on the deploy
 * mode), the master RPC service, the parameter server manager and the
 * running-mode-specific managers; it then initializes and starts the services and
 * finally launches all parameter servers. When the application attempt id is
 * greater than 1, it additionally waits for all PS to register and for the metrics
 * to be initialized before starting execution.
 *
 * @throws Exception if any step of the initialization or start-up fails
 */
public void initAndStart() throws Exception {
  addIfService(angelApp);

  // init app state storage
  String tmpOutPath = conf.get(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH);
  Path appStatePath = new Path(tmpOutPath, "app");
  LOG.info("app state output path = " + appStatePath.toUri().toString());
  FileSystem fs = appStatePath.getFileSystem(conf);
  appStateStorage = new AppStateStorage(appContext, appStatePath.toUri().toString(), fs);
  addIfService(appStateStorage);
  LOG.info("build app state storage success");

  // init event dispatcher
  dispatcher = new AsyncDispatcher();
  addIfService(dispatcher);
  LOG.info("build event dispacher");

  // init location manager
  locationManager = new LocationManager();

  // init container allocator and launcher according to the deploy mode
  AngelDeployMode deployMode = appContext.getDeployMode();
  LOG.info("deploy mode=" + deployMode);
  if (deployMode == AngelDeployMode.LOCAL) {
    containerAllocator = new LocalContainerAllocator(appContext);
    containerLauncher = new LocalContainerLauncher(appContext);
  } else {
    containerAllocator = new YarnContainerAllocator(appContext);
    containerLauncher = new YarnContainerLauncher(appContext);
  }
  addIfService(containerAllocator);
  dispatcher.register(ContainerAllocatorEventType.class, containerAllocator);
  LOG.info("build containerAllocator success");
  addIfService(containerLauncher);
  dispatcher.register(ContainerLauncherEventType.class, containerLauncher);
  LOG.info("build containerLauncher success");

  // init the rpc service
  masterService = new MasterService(appContext);
  LOG.info("build master service success");

  // recover matrix meta if needed
  recoverMatrixMeta();

  // recover ps attempt information if needed
  Map<ParameterServerId, Integer> psIdToAttemptIndexMap = recoverPSAttemptIndex();
  if (psIdToAttemptIndexMap == null) {
    LOG.info("recoverPSAttemptIndex return is null");
  } else {
    for (Entry<ParameterServerId, Integer> entry : psIdToAttemptIndexMap.entrySet()) {
      LOG.info("psId=" + entry.getKey() + ",attemptIndex=" + entry.getValue());
    }
  }

  // init parameter server manager
  psManager = new ParameterServerManager(appContext, psIdToAttemptIndexMap);
  addIfService(psManager);
  psManager.init();

  // hand the PS ids to the location manager, ordered by ascending index
  List<ParameterServerId> psIds = new ArrayList<>(psManager.getParameterServerMap().keySet());
  Collections.sort(psIds, new Comparator<ParameterServerId>() {
    @Override
    public int compare(ParameterServerId s1, ParameterServerId s2) {
      // Integer.compare instead of subtraction: a difference of two ints can overflow
      return Integer.compare(s1.getIndex(), s2.getIndex());
    }
  });
  locationManager.setPsIds(psIds.toArray(new ParameterServerId[0]));
  dispatcher.register(ParameterServerManagerEventType.class, psManager);
  dispatcher.register(AMParameterServerEventType.class, new ParameterServerEventHandler());
  dispatcher.register(PSAttemptEventType.class, new PSAttemptEventDispatcher());
  LOG.info("build PSManager success");

  // recover task information if needed
  recoverTaskState();

  RunningMode mode = appContext.getRunningMode();
  LOG.info("running mode=" + mode);
  switch (mode) {
    case ANGEL_PS_PSAGENT:
    {
      // init psagent manager and register psagent manager event
      psAgentManager = new PSAgentManager(appContext);
      addIfService(psAgentManager);
      dispatcher.register(PSAgentManagerEventType.class, psAgentManager);
      dispatcher.register(AMPSAgentEventType.class, new PSAgentEventHandler());
      dispatcher.register(PSAgentAttemptEventType.class, new PSAgentAttemptEventHandler());
      LOG.info("build PSAgentManager success");
      break;
    }
    case ANGEL_PS_WORKER:
    {
      // a dummy data splitter is just for test now
      boolean useDummyDataSpliter = conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER,
          AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER);
      if (useDummyDataSpliter) {
        dataSpliter = new DummyDataSpliter(appContext);
      } else {
        // recover data splits information if needed
        // NOTE(review): presumably this assigns dataSpliter, which is dereferenced
        // just below - confirm against recoveryDataSplits()
        recoveryDataSplits();
      }

      // init worker manager and register worker manager event
      workerManager = new WorkerManager(appContext);
      workerManager.adjustTaskNumber(dataSpliter.getSplitNum());
      addIfService(workerManager);
      dispatcher.register(WorkerManagerEventType.class, workerManager);
      dispatcher.register(AMWorkerGroupEventType.class, new WorkerGroupEventHandler());
      dispatcher.register(AMWorkerEventType.class, new WorkerEventHandler());
      dispatcher.register(WorkerAttemptEventType.class, new WorkerAttemptEventHandler());
      LOG.info("build WorkerManager success");
      break;
    }
    case ANGEL_PS:
      // no extra manager is created in this mode
      break;
  }

  // register slow worker/ps checker
  addIfService(new SlowChecker(appContext));
  algoMetricsService = new MetricsService(appContext);
  addIfService(algoMetricsService);
  dispatcher.register(MetricsEventType.class, algoMetricsService);

  // register app manager event and finish event
  dispatcher.register(AppEventType.class, angelApp);
  dispatcher.register(AppFinishEventType.class, new AppFinishEventHandler());

  masterService.init(conf);
  super.init(conf);

  // start a web service if use yarn deploy mode
  if (deployMode == AngelDeployMode.YARN) {
    try {
      webApp = WebApps.$for("angel", AMContext.class, appContext).with(conf).start(new AngelWebApp());
      LOG.info("start webapp server success");
      LOG.info("webApp.port()=" + webApp.port());
    } catch (Exception e) {
      // a web app start failure is logged and otherwise ignored
      LOG.error("Webapps failed to start. Ignoring for now:", e);
    }
  }

  // the master location is published only after the rpc service has been started
  masterService.start();
  locationManager.setMasterLocation(masterService.getLocation());
  super.serviceStart();
  psManager.startAllPS();
  AngelServiceLoader.startServiceIfNeed(this, getConfig());

  LOG.info("appAttemptId.getAttemptId()=" + appAttemptId.getAttemptId());
  // for any attempt after the first, wait for PS registration and metrics
  // initialization, then start execution from here
  if (appAttemptId.getAttemptId() > 1) {
    waitForAllPsRegisted();
    waitForAllMetricsInited();
    angelApp.startExecute();
  }
}
use of com.tencent.angel.RunningMode in project angel by Tencent.
the class AngelClient method setOutputDirectory.
/**
 * Resolves and prepares the job output locations and records them in {@code conf}.
 * <p>
 * The output path is read from {@code AngelConf.ANGEL_PREDICT_PATH} when the action
 * type is "predict" and from {@code AngelConf.ANGEL_SAVE_MODEL_PATH} otherwise. An
 * existing output path is deleted or rejected depending on
 * {@code AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST}, and a missing parent
 * directory is created. In {@code ANGEL_PS_WORKER} mode the configured log path gets
 * the same delete-or-reject treatment. Finally the temporary output directory and
 * the internal state file are generated and stored in {@code conf}.
 *
 * @throws IOException if no output path is configured, if the output or log path
 *         already exists and must not be deleted, or if the parent directory cannot
 *         be created
 */
protected void setOutputDirectory() throws IOException {
  String actionType = conf.get(AngelConf.ANGEL_ACTION_TYPE, AngelConf.DEFAULT_ANGEL_ACTION_TYPE);
  RunningMode runningMode = RunningMode.valueOf(
      conf.get(AngelConf.ANGEL_RUNNING_MODE, AngelConf.DEFAULT_ANGEL_RUNNING_MODE));
  LOG.info("running mode = " + runningMode);
  boolean deleteOnExist = conf.getBoolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
      AngelConf.DEFAULT_ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST);

  // Literal comparison instead of String.matches: no regex is needed, and a null
  // action type is treated as "not predict" rather than raising a NullPointerException
  String path;
  if ("predict".equals(actionType)) {
    path = conf.get(AngelConf.ANGEL_PREDICT_PATH);
  } else {
    path = conf.get(AngelConf.ANGEL_SAVE_MODEL_PATH);
  }
  if (path == null) {
    throw new IOException("output directory is null. you must set " + AngelConf.ANGEL_SAVE_MODEL_PATH + " at training mode or set " + AngelConf.ANGEL_PREDICT_PATH + " at predict mode");
  }
  conf.set(AngelConf.ANGEL_JOB_OUTPUT_PATH, path);

  Path outputPath = new Path(path);
  FileSystem outFs = outputPath.getFileSystem(conf);
  if (outFs.exists(outputPath)) {
    if (deleteOnExist) {
      outFs.delete(outputPath, true);
    } else {
      throw new IOException("output path " + outputPath + " already exist, please check");
    }
  }

  // Path.getParent() returns null for a root path; there is no parent to create then
  Path outputParentPath = outputPath.getParent();
  if (outputParentPath != null && !outFs.exists(outputParentPath)) {
    LOG.info("Make dir for model output parent path: " + outputParentPath);
    if (!outFs.mkdirs(outputParentPath)) {
      throw new IOException("Failed to make dir for model output parent path: " + outputParentPath);
    }
  }

  // The log path is only checked in ANGEL_PS_WORKER mode
  if (runningMode == RunningMode.ANGEL_PS_WORKER) {
    String logPathStr = conf.get(AngelConf.ANGEL_LOG_PATH);
    if (logPathStr != null) {
      Path logPath = new Path(logPathStr);
      FileSystem logFs = logPath.getFileSystem(conf);
      if (logFs.exists(logPath)) {
        if (deleteOnExist) {
          logFs.delete(logPath, true);
        } else {
          throw new IOException("log path " + logPath + " already exist, please check");
        }
      }
    }
  }

  Path tmpOutputPath = HdfsUtil.generateTmpDirectory(conf, getAppId(), outputPath);
  // NOTE(review): generateTmpDirectory is called a second time here instead of
  // reusing tmpOutputPath; whether both calls yield the same directory depends on
  // that helper - confirm the state file is meant to live where it ends up
  internalStateFile = new Path(HdfsUtil.generateTmpDirectory(conf, getAppId(), outputPath), "state");
  conf.set(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH, tmpOutputPath.toString());
  LOG.info(AngelConf.ANGEL_JOB_TMP_OUTPUT_PATH + "=" + tmpOutputPath.toString());
  LOG.info("internal state file is " + internalStateFile);
  conf.set(AngelConf.ANGEL_APP_SERILIZE_STATE_FILE, internalStateFile.toString());
}
use of com.tencent.angel.RunningMode in project angel by Tencent.
the class AngelClient method setInputDirectory.
/**
 * Resolves the job input data path and records it in {@code conf}.
 * <p>
 * Nothing is done when the dummy data splitter is enabled or when the running mode
 * is not {@code ANGEL_PS_WORKER}. Otherwise the path is read from
 * {@code AngelConf.ANGEL_PREDICT_DATA_PATH} when the action type is "predict" and
 * from {@code AngelConf.ANGEL_TRAIN_DATA_PATH} otherwise, and stored under
 * {@code AngelConf.ANGEL_JOB_INPUT_PATH}.
 *
 * @throws IOException if the running mode is {@code ANGEL_PS_WORKER} and no input
 *         path is configured
 */
protected void setInputDirectory() throws IOException {
  boolean isUseDummy = conf.getBoolean(AngelConf.ANGEL_AM_USE_DUMMY_DATASPLITER,
      AngelConf.DEFAULT_ANGEL_AM_USE_DUMMY_DATASPLITER);
  if (isUseDummy) {
    return;
  }

  String actionType = conf.get(AngelConf.ANGEL_ACTION_TYPE, AngelConf.DEFAULT_ANGEL_ACTION_TYPE);
  RunningMode runningMode = RunningMode.valueOf(
      conf.get(AngelConf.ANGEL_RUNNING_MODE, AngelConf.DEFAULT_ANGEL_RUNNING_MODE));

  // The input path is only needed in ANGEL_PS_WORKER mode
  if (runningMode != RunningMode.ANGEL_PS_WORKER) {
    return;
  }

  // Literal comparison instead of String.matches: no regex is needed, and a null
  // action type is treated as "not predict" rather than raising a NullPointerException
  String path;
  if ("predict".equals(actionType)) {
    path = conf.get(AngelConf.ANGEL_PREDICT_DATA_PATH);
  } else {
    path = conf.get(AngelConf.ANGEL_TRAIN_DATA_PATH);
  }

  if (path == null) {
    throw new IOException("input data directory is empty, you should set it");
  }
  conf.set(AngelConf.ANGEL_JOB_INPUT_PATH, path);
}
use of com.tencent.angel.RunningMode in project angel by Tencent.
the class PSAgent method error.
/**
 * Reports a failure and shuts this PSAgent down.
 * <p>
 * The body runs at most once per {@code exitedFlag}: a call that finds the flag
 * already set returns immediately. The failure message is always logged. When the
 * running mode is {@code ANGEL_PS_PSAGENT} the master is notified, the connection is
 * closed and the JVM exits with status -1 once the modules have been stopped.
 *
 * @param errorMsg detail failed message
 */
public void error(String errorMsg) {
  if (!exitedFlag.getAndSet(true)) {
    // Log the reason up front so it is recorded in every running mode, including
    // the path where neither the master nor an executor receives it
    LOG.error("psagent failed: " + errorMsg);

    // Notify run failure to master only on ANGEL_PS_PSAGENT running mode
    RunningMode mode = PSAgentContext.get().getRunningMode();
    if (mode == RunningMode.ANGEL_PS_PSAGENT) {
      try {
        masterClient.psAgentError(errorMsg);
        LOG.info("psagent failed message : " + errorMsg + ", send it to appmaster success");
      } catch (ServiceException e) {
        LOG.error("send error message error ", e);
      } finally {
        // Close the connection whether or not the notification went through
        try {
          connection.close();
        } catch (Exception e) {
          LOG.error("close connection error", e);
        }
      }
    }

    // Stop all modules
    if (executor != null) {
      executor.error(errorMsg);
    } else {
      stop();
    }

    // Exit the process if on ANGEL_PS_PSAGENT mode
    if (mode == RunningMode.ANGEL_PS_PSAGENT) {
      System.exit(-1);
    }
  }
}
Aggregations