Use of edu.iu.dsc.tws.rsched.schedulers.k8s.KubernetesController in project twister2 by DSC-SPIDAL.
From the class K8sWorkerUtils, method getAndInitRestartCount.
/**
 * Looks up the restart count for a worker through a short-lived {@link KubernetesController}.
 *
 * <p>The key is built from the worker ID with
 * {@code KubernetesUtils.createRestartWorkerKey} and the count is obtained by delegating to
 * {@code initRestartFromCM}. The controller is always closed before this method returns,
 * including when the lookup throws.
 *
 * @param cnfg configuration used to resolve the Kubernetes namespace
 * @param jbID ID of the job the worker belongs to
 * @param wInfo worker whose restart count is requested
 * @return the value returned by {@code initRestartFromCM} for this worker's key
 */
public static int getAndInitRestartCount(Config cnfg, String jbID, JobMasterAPI.WorkerInfo wInfo) {
  // initialize the controller to talk to Kubernetes master
  KubernetesController controller = KubernetesController.init(KubernetesContext.namespace(cnfg));
  try {
    String keyName = KubernetesUtils.createRestartWorkerKey(wInfo.getWorkerID());
    int restartCount = initRestartFromCM(controller, jbID, keyName);
    LOG.info("Worker restartCount: " + restartCount);
    return restartCount;
  } finally {
    // close the controller even if the lookup above failed, so it is never leaked
    controller.close();
  }
}
Use of edu.iu.dsc.tws.rsched.schedulers.k8s.KubernetesController in project twister2 by DSC-SPIDAL.
From the class Twister2Submitter, method clearAllJobs.
/**
 * Clears leftover resources for all jobs that are not currently running:
 * <ul>
 *   <li>local job directories under the uploader job directory, and</li>
 *   <li>on a Kubernetes cluster, persistent volume claims whose names are not among the
 *       running jobs.</li>
 * </ul>
 *
 * <p>Running jobs are currently only discovered on Kubernetes (from the Twister2 ConfigMap
 * names). On other cluster types the running-job list is empty, so every local job directory
 * is removed.
 *
 * <p>The Kubernetes controller, when one is created, is closed before this method returns,
 * including when a listing or deletion step throws.
 *
 * @param config configuration used to detect the cluster type, the namespace and the
 *               uploader job directory
 */
public static void clearAllJobs(Config config) {
  KubernetesController controller = null;
  try {
    // get the list of currently running jobs
    List<String> runningJobs = new LinkedList<>();
    if (KubernetesContext.isKubernetesCluster(config)) {
      controller = KubernetesController.init(KubernetesContext.namespace(config));
      runningJobs = controller.getTwister2ConfigMapNames();
    }
    // todo: get running jobs from other clusters

    // get all local job directories
    List<String> localJobDirs = FileUtils.getDirectories(FsContext.uploaderJobDirectory(config));
    // remove running jobs
    localJobDirs.removeIf(runningJobs::contains);
    // remove local directories
    localJobDirs.forEach(jd -> deleteJobDir(jd, config));

    // delete PVC for the jobs that have ended in Kubernetes
    // a non-null controller means the Kubernetes branch above was taken
    if (controller != null) {
      List<String> pvcList = controller.getTwister2PersistentVolumeClaims();
      pvcList.removeIf(runningJobs::contains);
      pvcList.forEach(controller::deletePersistentVolumeClaim);
    }
    // todo: need to delete checkpointed data in other clusters and on HDFS
  } finally {
    // release the controller even if one of the steps above failed
    if (controller != null) {
      controller.close();
    }
  }
}
Use of edu.iu.dsc.tws.rsched.schedulers.k8s.KubernetesController in project twister2 by DSC-SPIDAL.
From the class JobMasterStarter, method main.
/**
 * Entry point that starts the Job Master.
 *
 * <p>Sequence, as implemented below:
 * <ol>
 *   <li>read job ID, encoded node info list, host IP and the restore flag from environment
 *       variables;</li>
 *   <li>load the configuration and the job description file, merge job settings into the
 *       configuration and record the restore flag in it;</li>
 *   <li>initialize logging and determine the pod IP and node info;</li>
 *   <li>create the Kubernetes controller, job terminator and scaler;</li>
 *   <li>obtain the restart count and decide the initial Job Master state;</li>
 *   <li>optionally initialize ZooKeeper, terminating the job as FAILED if that fails;</li>
 *   <li>create the JobMaster, start the job-kill watcher, install an uncaught-exception
 *       handler and run the JobMaster until {@code startJobMasterBlocking} returns.</li>
 * </ol>
 *
 * @param args command line arguments; not read by this method
 */
public static void main(String[] args) {
// we can not initialize the logger fully yet,
// but we need to set the format as the first thing
LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);
// get environment variables
String jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
String encodedNodeInfoList = System.getenv(K8sEnvVariables.ENCODED_NODE_INFO_LIST.name());
String hostIP = System.getenv(K8sEnvVariables.HOST_IP.name());
// Boolean.parseBoolean yields false when the variable is unset or not "true"
boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));
// load the configuration parameters from configuration directory
String configDir = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY;
Config config = K8sWorkerUtils.loadConfig(configDir);
// read job description file
String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
jobDescFileName = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY + File.separator + jobDescFileName;
// job is not declared in this method; presumably a static field of the enclosing class
job = JobUtils.readJobFile(jobDescFileName);
LOG.info("Job description file is loaded: " + jobDescFileName);
// add any configuration from job file to the config object
// if there are the same config parameters in both,
// job file configurations will override
config = JobUtils.overrideConfigs(job, config);
config = JobUtils.updateConfigs(job, config);
// record the restore flag read from the environment in the config
config = Config.newBuilder().putAll(config).put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob).build();
// init logger
K8sWorkerUtils.initLogger(config, "jobMaster", JobMasterContext.persistentVolumeRequested(config));
LOG.info("JobMaster is starting. Current time: " + System.currentTimeMillis());
LOG.info("Number of configuration parameters: " + config.size());
// get podIP from localhost
InetAddress localHost = null;
try {
localHost = InetAddress.getLocalHost();
} catch (UnknownHostException e) {
throw new RuntimeException("Cannot get localHost.", e);
}
String podIP = localHost.getHostAddress();
// construct nodeInfo for Job Master:
// taken from the config when node locations are configured there,
// otherwise decoded from the encoded node info list passed through the environment
JobMasterAPI.NodeInfo nodeInfo = KubernetesContext.nodeLocationsFromConfig(config) ? KubernetesContext.getNodeInfo(config, hostIP) : K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfoList, hostIP);
LOG.info("NodeInfo for JobMaster: " + nodeInfo);
// the same controller instance is shared by the terminator, the scaler and the kill watcher
KubernetesController controller = KubernetesController.init(KubernetesContext.namespace(config));
JobTerminator jobTerminator = new JobTerminator(config, controller);
K8sScaler k8sScaler = new K8sScaler(config, job, controller);
// get restart count from job ConfigMap
// if jm is running for the first time, initialize restart count at CM
String keyName = KubernetesUtils.createRestartJobMasterKey();
int restartCount = K8sWorkerUtils.initRestartFromCM(controller, jobID, keyName);
JobMasterState initialState = JobMasterState.JM_STARTED;
// a positive restart count means this is not the first start of the job master
if (restartCount > 0) {
initialState = JobMasterState.JM_RESTARTED;
// NOTE(review): presumably a restarted job master can not recover without ZooKeeper - confirm
// so, we terminate the job with failure
if (!ZKContext.isZooKeeperServerUsed(config)) {
jobTerminator.terminateJob(jobID, JobAPI.JobState.FAILED);
// NOTE(review): controller.close() is not called in this method on this path
return;
}
}
if (ZKContext.isZooKeeperServerUsed(config)) {
boolean zkInitialized = initializeZooKeeper(config, jobID, podIP, initialState);
// without a working ZooKeeper connection the job is terminated as FAILED
if (!zkInitialized) {
jobTerminator.terminateJob(jobID, JobAPI.JobState.FAILED);
// NOTE(review): controller.close() is not called in this method on this path
return;
}
}
// start JobMaster
JobMaster jobMaster = new JobMaster(config, podIP, jobTerminator, job, nodeInfo, k8sScaler, initialState);
// start configMap watcher to watch the kill parameter from twister2 client
JobKillWatcher jkWatcher = new JobKillWatcher(KubernetesContext.namespace(config), jobID, controller, jobMaster);
jkWatcher.start();
// on any uncaught exception, we will label the job master as FAILED and
// throw a RuntimeException
// JVM will be restarted by K8s
Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
LOG.log(Level.SEVERE, "Uncaught exception in the thread " + thread + ". Job Master FAILED...", throwable);
jobMaster.jmFailed();
jkWatcher.close();
controller.close();
// NOTE(review): the message says "Worker" although this is the job master - confirm wording
throw new RuntimeException("Worker failed with the exception", throwable);
});
try {
// as the name suggests, this call is expected to block while the job master runs
jobMaster.startJobMasterBlocking();
} catch (Twister2Exception e) {
// the failure is only logged; execution continues to the controller close below
LOG.log(Level.SEVERE, e.getMessage(), e);
}
// close the controller
// NOTE(review): jkWatcher.close() is only called from the uncaught-exception handler,
// not on this normal exit path - confirm this is intended
controller.close();
}
Use of edu.iu.dsc.tws.rsched.schedulers.k8s.KubernetesController in project twister2 by DSC-SPIDAL.
From the class UploaderToWebServers, method undo.
/**
 * Undoes the upload by asking a {@link KubernetesController} to delete the job package.
 *
 * <p>The full path of the job package is resolved from the config and job ID and passed,
 * together with {@code webServerPodNames}, to {@code deleteJobPackage}. The controller created
 * for this call is closed before returning, including when the deletion throws.
 *
 * @return the result reported by {@code deleteJobPackage}
 */
@Override
public boolean undo() {
  String jobPackageFile = KubernetesUtils.jobPackageFullPath(config, jobID);
  KubernetesController controller = KubernetesController.init(KubernetesContext.namespace(config));
  try {
    return controller.deleteJobPackage(webServerPodNames, jobPackageFile);
  } finally {
    // close the controller after use, as the other users of KubernetesController do
    controller.close();
  }
}
Aggregations