Example usage of edu.iu.dsc.tws.api.exceptions.Twister2Exception in the twister2 project (DSC-SPIDAL): the jobScaled method of the ZKMasterController class.
@Override
public void jobScaled(int change, int numberOfWorkers1) {
  // on scale-down, remember the ids of the workers that are being removed:
  // they are the ids from the new worker count up to the old worker count
  if (change < 0) {
    LinkedList<Integer> removedWorkers = new LinkedList<>();
    int workerID = numberOfWorkers1;
    while (workerID < numberOfWorkers) {
      removedWorkers.add(workerID);
      workerID++;
    }
    scaledDownWorkers = removedWorkers;
  }
  this.numberOfWorkers = numberOfWorkers1;

  // build a JobScaled event and publish it so all other workers learn of the change
  JobMasterAPI.JobScaled.Builder scaledBuilder = JobMasterAPI.JobScaled.newBuilder();
  scaledBuilder.setNumberOfWorkers(numberOfWorkers1);
  scaledBuilder.setChange(change);
  JobMasterAPI.JobEvent event =
      JobMasterAPI.JobEvent.newBuilder().setJobScaled(scaledBuilder.build()).build();
  try {
    ZKEventsManager.publishEvent(client, rootPath, jobID, event);
  } catch (Twister2Exception e) {
    // publishing failures are not recoverable here; surface as unchecked
    throw new Twister2RuntimeException(e);
  }
}
Example usage of edu.iu.dsc.tws.api.exceptions.Twister2Exception in the twister2 project (DSC-SPIDAL): the allJoined method of the ZKMasterController class.
@Override
public void allJoined() {
  // snapshot of every worker currently registered with the worker monitor
  List<JobMasterAPI.WorkerInfo> workerInfoList = workerMonitor.getWorkerInfoList();

  // build an AllJoined event carrying the full worker list and its size
  JobMasterAPI.AllJoined.Builder allJoinedBuilder = JobMasterAPI.AllJoined.newBuilder();
  allJoinedBuilder.addAllWorkerInfo(workerInfoList);
  allJoinedBuilder.setNumberOfWorkers(workerInfoList.size());
  JobMasterAPI.JobEvent event =
      JobMasterAPI.JobEvent.newBuilder().setAllJoined(allJoinedBuilder.build()).build();

  try {
    ZKEventsManager.publishEvent(client, rootPath, jobID, event);
  } catch (Twister2Exception e) {
    // publishing failures are not recoverable here; surface as unchecked
    throw new Twister2RuntimeException(e);
  }
}
Example usage of edu.iu.dsc.tws.api.exceptions.Twister2Exception in the twister2 project (DSC-SPIDAL): the initialize method of the ZKBarrierHandler class.
/**
 * Initialize ZKBarrierHandler:
 * connect to the ZooKeeper server and create znode children caches
 * for the job master to watch barrier events on both the default and the init barriers.
 *
 * @param initialState must be JM_STARTED (fresh start) or JM_RESTARTED (restart after failure)
 * @throws Twister2Exception if initialState is invalid or ZooKeeper initialization fails
 */
public void initialize(JobMasterAPI.JobMasterState initialState) throws Twister2Exception {
  if (!(initialState == JobMasterAPI.JobMasterState.JM_STARTED || initialState == JobMasterAPI.JobMasterState.JM_RESTARTED)) {
    throw new Twister2Exception("initialState has to be either JobMasterState.JM_STARTED or " + "JobMasterState.JM_RESTARTED. Supplied value: " + initialState);
  }
  try {
    String zkServerAddresses = ZKContext.serverAddresses(config);
    int sessionTimeoutMs = FaultToleranceContext.sessionTimeout(config);
    client = ZKUtils.connectToServer(zkServerAddresses, sessionTimeoutMs);
    // with scaling up/down, it may have been changed
    if (initialState == JobMasterAPI.JobMasterState.JM_RESTARTED) {
      // do not get previous events on barriers:
      // build the cache from the current snapshot of the default barrier at restart
      String defaultBarrierDir = ZKUtils.defaultBarrierDir(rootPath, jobID);
      defaultBarrierCache = new PathChildrenCache(client, defaultBarrierDir, true);
      addBarrierChildrenCacheListener(defaultBarrierCache, JobMasterAPI.BarrierType.DEFAULT);
      defaultBarrierCache.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);
      TreeSet<Integer> existingWorkers = new TreeSet<>();
      long timeout = getInitialWorkersAtBarrier(defaultBarrierCache, existingWorkers);
      if (!existingWorkers.isEmpty()) {
        barrierMonitor.initDefaultAfterRestart(existingWorkers, timeout, numberOfWorkers);
        LOG.info("Existing workers at default barrier: " + existingWorkers.size());
        // reuse the same set for the init barrier below
        existingWorkers.clear();
      }
      // likewise, build the cache from the current snapshot of the init barrier at restart
      String initBarrierDir = ZKUtils.initBarrierDir(rootPath, jobID);
      initBarrierCache = new PathChildrenCache(client, initBarrierDir, true);
      addBarrierChildrenCacheListener(initBarrierCache, JobMasterAPI.BarrierType.INIT);
      initBarrierCache.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);
      timeout = getInitialWorkersAtBarrier(initBarrierCache, existingWorkers);
      if (!existingWorkers.isEmpty()) {
        barrierMonitor.initInitAfterRestart(existingWorkers, timeout, numberOfWorkers);
        LOG.info("Existing workers at init barrier: " + existingWorkers);
      }
    } else {
      // fresh start: listen for status updates on the default barrier path
      String defaultBarrierDir = ZKUtils.defaultBarrierDir(rootPath, jobID);
      defaultBarrierCache = new PathChildrenCache(client, defaultBarrierDir, true);
      addBarrierChildrenCacheListener(defaultBarrierCache, JobMasterAPI.BarrierType.DEFAULT);
      defaultBarrierCache.start();
      // and on the init barrier path
      String initBarrierDir = ZKUtils.initBarrierDir(rootPath, jobID);
      initBarrierCache = new PathChildrenCache(client, initBarrierDir, true);
      addBarrierChildrenCacheListener(initBarrierCache, JobMasterAPI.BarrierType.INIT);
      initBarrierCache.start();
    }
  } catch (Twister2Exception e) {
    // already the declared exception type; rethrow without re-wrapping
    throw e;
  } catch (Exception e) {
    // fixed: the message previously said "ZKMasterController" (copy-paste error)
    throw new Twister2Exception("Exception when initializing ZKBarrierHandler.", e);
  }
}
Example usage of edu.iu.dsc.tws.api.exceptions.Twister2Exception in the twister2 project (DSC-SPIDAL): the main method of the JobMasterStarter class.
/**
 * Entry point for the Job Master pod in a Kubernetes deployment.
 * Reads its configuration from environment variables and mounted volumes,
 * optionally registers with ZooKeeper, starts the JobMaster server blocking,
 * and terminates the job on unrecoverable failures.
 */
public static void main(String[] args) {
// we can not initialize the logger fully yet,
// but we need to set the format as the first thing
LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);
// get environment variables set by the Kubernetes deployment
String jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
String encodedNodeInfoList = System.getenv(K8sEnvVariables.ENCODED_NODE_INFO_LIST.name());
String hostIP = System.getenv(K8sEnvVariables.HOST_IP.name());
boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));
// load the configuration parameters from configuration directory
String configDir = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY;
Config config = K8sWorkerUtils.loadConfig(configDir);
// read job description file
String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
jobDescFileName = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY + File.separator + jobDescFileName;
job = JobUtils.readJobFile(jobDescFileName);
LOG.info("Job description file is loaded: " + jobDescFileName);
// add any configuration from job file to the config object
// if there are the same config parameters in both,
// job file configurations will override
config = JobUtils.overrideConfigs(job, config);
config = JobUtils.updateConfigs(job, config);
config = Config.newBuilder().putAll(config).put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob).build();
// init logger (needs the fully-merged config, so it can only happen now)
K8sWorkerUtils.initLogger(config, "jobMaster", JobMasterContext.persistentVolumeRequested(config));
LOG.info("JobMaster is starting. Current time: " + System.currentTimeMillis());
LOG.info("Number of configuration parameters: " + config.size());
// get podIP from localhost
InetAddress localHost = null;
try {
localHost = InetAddress.getLocalHost();
} catch (UnknownHostException e) {
// without the pod IP the job master cannot serve workers; fail fast
throw new RuntimeException("Cannot get localHost.", e);
}
String podIP = localHost.getHostAddress();
// construct nodeInfo for Job Master:
// either resolved from the config's node locations or decoded from the env variable
JobMasterAPI.NodeInfo nodeInfo = KubernetesContext.nodeLocationsFromConfig(config) ? KubernetesContext.getNodeInfo(config, hostIP) : K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfoList, hostIP);
LOG.info("NodeInfo for JobMaster: " + nodeInfo);
KubernetesController controller = KubernetesController.init(KubernetesContext.namespace(config));
JobTerminator jobTerminator = new JobTerminator(config, controller);
K8sScaler k8sScaler = new K8sScaler(config, job, controller);
// get restart count from job ConfigMap
// if jm is running for the first time, initialize restart count at CM
String keyName = KubernetesUtils.createRestartJobMasterKey();
int restartCount = K8sWorkerUtils.initRestartFromCM(controller, jobID, keyName);
JobMasterState initialState = JobMasterState.JM_STARTED;
if (restartCount > 0) {
initialState = JobMasterState.JM_RESTARTED;
// a restarted job master can only recover state through ZooKeeper;
// without it, we terminate the job with failure
if (!ZKContext.isZooKeeperServerUsed(config)) {
jobTerminator.terminateJob(jobID, JobAPI.JobState.FAILED);
return;
}
}
if (ZKContext.isZooKeeperServerUsed(config)) {
boolean zkInitialized = initializeZooKeeper(config, jobID, podIP, initialState);
if (!zkInitialized) {
// ZK registration failed: the job cannot be coordinated, so terminate it
jobTerminator.terminateJob(jobID, JobAPI.JobState.FAILED);
return;
}
}
// start JobMaster
JobMaster jobMaster = new JobMaster(config, podIP, jobTerminator, job, nodeInfo, k8sScaler, initialState);
// start configMap watcher to watch the kill parameter from twister2 client
JobKillWatcher jkWatcher = new JobKillWatcher(KubernetesContext.namespace(config), jobID, controller, jobMaster);
jkWatcher.start();
// on any uncaught exception, we will label the job master as FAILED and
// throw a RuntimeException
// JVM will be restarted by K8s
Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
LOG.log(Level.SEVERE, "Uncaught exception in the thread " + thread + ". Job Master FAILED...", throwable);
jobMaster.jmFailed();
jkWatcher.close();
controller.close();
throw new RuntimeException("Worker failed with the exception", throwable);
});
try {
// blocks until the job completes or fails
jobMaster.startJobMasterBlocking();
} catch (Twister2Exception e) {
LOG.log(Level.SEVERE, e.getMessage(), e);
}
// close the controller
controller.close();
}
Example usage of edu.iu.dsc.tws.api.exceptions.Twister2Exception in the twister2 project (DSC-SPIDAL): the launch method of the NomadJobMasterStarter class.
/**
 * Launch the job master on a Nomad allocation:
 * register the job master address in ZooKeeper, start the JobMaster server blocking,
 * and finally start the job through the controller.
 *
 * @return false if setup fails, otherwise the result of controller.start(job)
 */
public boolean launch() {
  String indexEnv = System.getenv("NOMAD_ALLOC_INDEX");
  String idEnv = System.getenv("NOMAD_ALLOC_ID");
  // parseInt avoids the boxed Integer of Integer.valueOf
  int workerID = Integer.parseInt(indexEnv);
  MPIWorkerStarter.initJMLogger(config);
  LOG.log(Level.INFO, String.format("Worker id = %s and index = %d", idEnv, workerID));

  int port = JobMasterContext.jobMasterPort(config);
  String hostAddress;
  try {
    hostAddress = Inet4Address.getLocalHost().getHostAddress();
  } catch (UnknownHostException e) {
    // without a host address neither ZK registration nor the server can work
    LOG.log(Level.SEVERE, "Cannot get the local host address.", e);
    return false;
  }

  ZKJobMasterRegistrar registrar;
  try {
    registrar = new ZKJobMasterRegistrar(config, hostAddress, port, job.getJobId());
    LOG.info("JobMaster REGISTERED..:" + hostAddress);
  } catch (Exception e) {
    // previously only logged and continued, which caused a guaranteed NPE
    // on the registrar.initialize() call below; fail the setup instead
    LOG.log(Level.SEVERE, "JobMaster CAN NOT BE REGISTERED:", e);
    return false;
  }

  boolean initialized = registrar.initialize();
  if (!initialized) {
    LOG.info("CAN NOT INITIALIZE");
  }
  // a znode left over from a previous run can block initialization:
  // delete it and retry once
  if (!initialized && registrar.sameZNodeExist()) {
    registrar.deleteJobMasterZNode();
    registrar.initialize();
  }

  // start the Job Master locally
  JobMasterAPI.NodeInfo jobMasterNodeInfo = NomadContext.getNodeInfo(config, hostAddress);
  IScalerPerCluster clusterScaler = new NullScaler();
  int workerCount = job.getNumberOfWorkers();
  LOG.info("Worker Count..: " + workerCount);
  LOG.log(Level.INFO, String.format("Starting the Job Master: %s:%d", hostAddress, port));
  JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
  NullTerminator nt = new NullTerminator();
  JobMaster jobMaster = new JobMaster(config, hostAddress, nt, job, jobMasterNodeInfo, clusterScaler, initialState);
  jobMaster.addShutdownHook(true);
  try {
    // blocks until the job master finishes
    jobMaster.startJobMasterBlocking();
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }
  waitIndefinitely();

  // clean up the ZooKeeper registration before starting the job
  registrar.deleteJobMasterZNode();
  registrar.close();
  return controller.start(job);
}
Aggregations