Use of edu.iu.dsc.tws.rsched.schedulers.k8s.worker.K8sPersistentVolume in project twister2 by DSC-SPIDAL.
The class MPIWorkerStarter, method main:
@SuppressWarnings("RegexpSinglelineJava")
public static void main(String[] args) {
  // we cannot initialize the logger fully yet,
  // but we need to set the format as the first thing
  LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);
  String jobMasterIP = System.getenv(K8sEnvVariables.JOB_MASTER_IP.name());
  jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
  boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));
  if (jobMasterIP == null) {
    throw new RuntimeException("JobMasterIP address is null");
  }
  if (jobID == null) {
    throw new RuntimeException("jobID is null");
  }
  // load the configuration parameters from the configuration directory
  String configDir = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY;
  config = K8sWorkerUtils.loadConfig(configDir);
  // initialize MPI
  try {
    MPI.Init(args);
    workerID = MPI.COMM_WORLD.getRank();
    numberOfWorkers = MPI.COMM_WORLD.getSize();
  } catch (MPIException e) {
    LOG.log(Level.SEVERE, "Could not get rank or size from MPI.COMM_WORLD", e);
    throw new RuntimeException(e);
  }
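  // from here on, workerID is the MPI rank and numberOfWorkers is the MPI
  // communicator size; the worker-specific resources below (persistent volume
  // directory, logger, port) are all derived from workerID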
  // initialize persistent volume
  K8sPersistentVolume pv = null;
  if (KubernetesContext.persistentVolumeRequested(config)) {
    // create persistent volume object
    String persistentJobDir = KubernetesConstants.PERSISTENT_VOLUME_MOUNT;
    pv = new K8sPersistentVolume(persistentJobDir, workerID);
  }
  // initialize persistent logging
  K8sWorkerUtils.initWorkerLogger(workerID, pv, config);
  // read the job description file
  String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
  jobDescFileName = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY + "/" + jobDescFileName;
  job = JobUtils.readJobFile(jobDescFileName);
  LOG.info("Job description file is loaded: " + jobDescFileName);
  // add any configuration from the job file to the config object;
  // if the same config parameter exists in both, the job file value overrides
  config = JobUtils.overrideConfigs(job, config);
  config = JobUtils.updateConfigs(job, config);
  // update the config with jobMasterIP and the restore flag
  config = Config.newBuilder()
      .putAll(config)
      .put(JobMasterContext.JOB_MASTER_IP, jobMasterIP)
      .put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob)
      .build();
  InetAddress localHost;
  try {
    localHost = InetAddress.getLocalHost();
  } catch (UnknownHostException e) {
    // without the local address we cannot build WorkerInfo, so fail fast
    LOG.log(Level.SEVERE, "Cannot get localHost.", e);
    throw new RuntimeException("Cannot get localHost address", e);
  }
  String podIP = localHost.getHostAddress();
  String podName = localHost.getHostName();
  // keep only the part up to the first dot: t2-job-0-0
  if (podName.indexOf(".") > 0) {
    podName = podName.substring(0, podName.indexOf("."));
  }
  int workerPort = KubernetesContext.workerBasePort(config)
      + workerID * (SchedulerContext.numberOfAdditionalPorts(config) + 1);
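  // each worker reserves a contiguous block of (1 + additionalPorts) ports;
  // with illustrative numbers: base port 9000 and one additional port per worker,
  // worker 3 gets workerPort = 9000 + 3 * (1 + 1) = 9006, and port 9007 remains
  // in its block (worker 4 starts at 9008)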
  String nodeIP = null;
  try {
    nodeIP = Files.readAllLines(Paths.get("hostip.txt")).get(0);
  } catch (IOException e) {
    LOG.log(Level.WARNING, "Could not get host-ip from hostip.txt file.", e);
  }
  JobMasterAPI.NodeInfo nodeInfo = null;
  if (nodeIP == null) {
    LOG.warning("Could not get nodeIP for this pod. Using podIP as nodeIP.");
    nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
  } else if (KubernetesContext.nodeLocationsFromConfig(config)) {
    nodeInfo = KubernetesContext.getNodeInfo(config, nodeIP);
  } else {
    try {
      String encodedNodeInfos = Files.readAllLines(Paths.get("node-info-list.txt")).get(0);
      nodeInfo = K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfos, nodeIP);
    } catch (IOException e) {
      LOG.log(Level.WARNING,
          "Could not get node-info list from file: node-info-list.txt. Will use podIP as nodeIP", e);
      nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
    }
  }
  LOG.info(String.format("PodName: %s, NodeInfo for this worker: %s", podName, nodeInfo));
  computeResource = K8sWorkerUtils.getComputeResource(job, podName);
  // generate additional ports if requested
  Map<String, Integer> additionalPorts =
      K8sWorkerUtils.generateAdditionalPorts(config, workerPort);
  workerInfo = WorkerInfoUtils.createWorkerInfo(
      workerID, podIP, workerPort, nodeInfo, computeResource, additionalPorts);
  LOG.info("Worker information summary: \n"
      + "MPI Rank(workerID): " + workerID + "\n"
      + "MPI Size(number of workers): " + numberOfWorkers + "\n"
      + "POD_IP: " + podIP + "\n"
      + "HOSTNAME(podname): " + podName);
  int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, jobID, workerInfo);
  WorkerRuntime.init(config, job, workerInfo, restartCount);
  // interfaces to interact with other workers and the Job Master, if there is any
  IWorkerController workerController = WorkerRuntime.getWorkerController();
  IWorkerStatusUpdater workerStatusUpdater = WorkerRuntime.getWorkerStatusUpdater();
  // if this worker has already used up all allowed restarts,
  // mark it as FULLY_FAILED and finish up
  if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config)) {
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    WorkerRuntime.close();
    return;
  }
  // on any uncaught exception, label the worker as FAILED
  // (or FULLY_FAILED if this was its last allowed restart) and exit the JVM with error code 1
  Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
    LOG.log(Level.SEVERE,
        "Uncaught exception in the thread " + thread + ". Worker FAILED...", throwable);
    if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config) - 1) {
      workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    } else {
      workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
    }
    WorkerRuntime.close();
    System.exit(1);
  });
  // start the worker
  boolean completed = startWorker(workerController, pv, podName);
  if (completed) {
    // update worker status to COMPLETED
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
  } else {
    // if the worker did not complete successfully, it is considered fully failed
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
  }
  // finalize MPI
  try {
    MPI.Finalize();
  } catch (MPIException ignore) {
  }
  WorkerRuntime.close();
}
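The failure handling above hinges on a JVM-wide default uncaught exception handler. The following is a minimal, self-contained sketch of that pattern using only JDK APIs; the class name and the simulated failure are illustrative (not part of twister2), and the comments mark where MPIWorkerStarter reports worker state instead of merely exiting.

import java.util.logging.Level;
import java.util.logging.Logger;

public class UncaughtHandlerSketch {
  private static final Logger LOG = Logger.getLogger(UncaughtHandlerSketch.class.getName());

  public static void main(String[] args) throws InterruptedException {
    // install a JVM-wide handler: log the failure, then exit with a non-zero code;
    // in MPIWorkerStarter this is also where the worker state (FAILED vs FULLY_FAILED)
    // is reported and WorkerRuntime is closed
    Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
      LOG.log(Level.SEVERE, "Uncaught exception in thread " + thread.getName(), throwable);
      System.exit(1);
    });

    // a worker thread that fails; its exception reaches the default handler
    Thread worker = new Thread(() -> {
      throw new IllegalStateException("simulated worker failure");
    });
    worker.start();
    worker.join();
  }
}

Registering the handler before starting any worker logic, as MPIWorkerStarter does, guarantees that no thread can die silently without the job master learning about it.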