Use of edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater in project twister2 by DSC-SPIDAL.
Class MPIWorkerStarter, method main:
@SuppressWarnings("RegexpSinglelineJava")
public static void main(String[] args) {
  // we cannot initialize the logger fully yet,
  // but we need to set the format as the first thing
  LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);

  String jobMasterIP = System.getenv(K8sEnvVariables.JOB_MASTER_IP.name());
  jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
  boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));

  if (jobMasterIP == null) {
    throw new RuntimeException("JobMasterIP address is null");
  }
  if (jobID == null) {
    throw new RuntimeException("jobID is null");
  }

  // load the configuration parameters from the configuration directory
  String configDir = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY;
  config = K8sWorkerUtils.loadConfig(configDir);

  // initialize MPI
  try {
    MPI.Init(args);
    workerID = MPI.COMM_WORLD.getRank();
    numberOfWorkers = MPI.COMM_WORLD.getSize();
  } catch (MPIException e) {
    LOG.log(Level.SEVERE, "Could not get rank or size from MPI.COMM_WORLD", e);
    throw new RuntimeException(e);
  }

  // initialize persistent volume
  K8sPersistentVolume pv = null;
  if (KubernetesContext.persistentVolumeRequested(config)) {
    // create persistent volume object
    String persistentJobDir = KubernetesConstants.PERSISTENT_VOLUME_MOUNT;
    pv = new K8sPersistentVolume(persistentJobDir, workerID);
  }

  // initialize persistent logging
  K8sWorkerUtils.initWorkerLogger(workerID, pv, config);

  // read the job description file
  String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
  jobDescFileName = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY + "/" + jobDescFileName;
  job = JobUtils.readJobFile(jobDescFileName);
  LOG.info("Job description file is loaded: " + jobDescFileName);

  // add any configuration from the job file to the config object;
  // if the same config parameter appears in both, the job file value overrides
  config = JobUtils.overrideConfigs(job, config);
  config = JobUtils.updateConfigs(job, config);

  // update config with jobMasterIP and the restore flag
  config = Config.newBuilder()
      .putAll(config)
      .put(JobMasterContext.JOB_MASTER_IP, jobMasterIP)
      .put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob)
      .build();
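
  // resolve podIP and podName from the local host address of this pod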
  InetAddress localHost = null;
  String podName = null;
  try {
    localHost = InetAddress.getLocalHost();
  } catch (UnknownHostException e) {
    LOG.log(Level.SEVERE, "Cannot get localHost.", e);
  }
  String podIP = localHost.getHostAddress();
  podName = localHost.getHostName();
  // keep only the part up to the first dot, e.g. t2-job-0-0
  if (podName.indexOf(".") > 0) {
    podName = podName.substring(0, podName.indexOf("."));
  }
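
  // compute this worker's port: each worker reserves (numberOfAdditionalPorts + 1)
  // consecutive ports starting from the configured base port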
  int workerPort = KubernetesContext.workerBasePort(config)
      + workerID * (SchedulerContext.numberOfAdditionalPorts(config) + 1);
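
  // hostip.txt is expected to contain the IP of the node this pod runs on;
  // if it cannot be read, the pod IP is used as the node IP below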
  String nodeIP = null;
  try {
    nodeIP = Files.readAllLines(Paths.get("hostip.txt")).get(0);
  } catch (IOException e) {
    LOG.log(Level.WARNING, "Could not get host-ip from hostip.txt file.", e);
  }

  JobMasterAPI.NodeInfo nodeInfo = null;
  if (nodeIP == null) {
    LOG.warning("Could not get nodeIP for this pod. Using podIP as nodeIP.");
    nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
  } else if (KubernetesContext.nodeLocationsFromConfig(config)) {
    nodeInfo = KubernetesContext.getNodeInfo(config, nodeIP);
  } else {
    try {
      String encodedNodeInfos = Files.readAllLines(Paths.get("node-info-list.txt")).get(0);
      nodeInfo = K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfos, nodeIP);
    } catch (IOException e) {
      LOG.log(Level.WARNING,
          "Could not get node-info list from file: node-info-list.txt. Will use podIP as nodeIP", e);
      nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
    }
  }

  LOG.info(String.format("PodName: %s, NodeInfo for this worker: %s", podName, nodeInfo));

  computeResource = K8sWorkerUtils.getComputeResource(job, podName);

  // generate additional ports if requested
  Map<String, Integer> additionalPorts =
      K8sWorkerUtils.generateAdditionalPorts(config, workerPort);

  workerInfo = WorkerInfoUtils.createWorkerInfo(
      workerID, podIP, workerPort, nodeInfo, computeResource, additionalPorts);

  LOG.info("Worker information summary: \n"
      + "MPI Rank(workerID): " + workerID + "\n"
      + "MPI Size(number of workers): " + numberOfWorkers + "\n"
      + "POD_IP: " + podIP + "\n"
      + "HOSTNAME(podname): " + podName);

  int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, jobID, workerInfo);
  WorkerRuntime.init(config, job, workerInfo, restartCount);

  /**
   * Interfaces to interact with other workers and the Job Master, if there is one
   */
  IWorkerController workerController = WorkerRuntime.getWorkerController();
  IWorkerStatusUpdater workerStatusUpdater = WorkerRuntime.getWorkerStatusUpdater();

  // if this worker has already been restarted too many times,
  // mark it as FULLY_FAILED and finish up
  if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config)) {
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    WorkerRuntime.close();
    return;
  }

  // on any uncaught exception, label the worker as FAILED
  // (or FULLY_FAILED on its last allowed restart) and exit the JVM with error code 1
  Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
    LOG.log(Level.SEVERE,
        "Uncaught exception in the thread " + thread + ". Worker FAILED...", throwable);
    if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config) - 1) {
      workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    } else {
      workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
    }
    WorkerRuntime.close();
    System.exit(1);
  });

  // start the worker
  boolean completed = startWorker(workerController, pv, podName);

  if (completed) {
    // update worker status to COMPLETED
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
  } else {
    // if the worker did not complete successfully, it is fully failed
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
  }

  // finalize MPI
  try {
    MPI.Finalize();
  } catch (MPIException ignore) {
    // ignore finalize errors
  }

  WorkerRuntime.close();
}
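
Both worker starters in this listing follow the same IWorkerStatusUpdater lifecycle: initialize WorkerRuntime, obtain the status updater, report FULLY_FAILED when the restart budget is exhausted, otherwise run the worker and report COMPLETED or FULLY_FAILED, then close the runtime. The sketch below condenses that pattern for illustration only; runWorkerBody and maxRestarts are hypothetical stand-ins for startWorker(...) and the FaultToleranceContext limit, while the WorkerRuntime and updateWorkerStatus calls are used exactly as in the snippet above.

// Illustrative sketch only: condenses the status-reporting pattern of the snippets above.
// runWorkerBody() and maxRestarts are hypothetical stand-ins, not twister2 API.
static void runWithStatusReporting(Config config, JobAPI.Job job,
                                   JobMasterAPI.WorkerInfo workerInfo,
                                   int restartCount, int maxRestarts) {
  WorkerRuntime.init(config, job, workerInfo, restartCount);
  IWorkerStatusUpdater updater = WorkerRuntime.getWorkerStatusUpdater();

  if (restartCount >= maxRestarts) {
    // restart budget exhausted: give up permanently
    updater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    WorkerRuntime.close();
    return;
  }

  boolean completed = runWorkerBody();
  updater.updateWorkerStatus(completed
      ? JobMasterAPI.WorkerState.COMPLETED
      : JobMasterAPI.WorkerState.FULLY_FAILED);
  WorkerRuntime.close();
}

static boolean runWorkerBody() {
  // hypothetical stand-in for startWorker(workerController, pv, podName)
  return true;
}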
Use of edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater in project twister2 by DSC-SPIDAL.
Class K8sWorkerStarter, method main:
@SuppressWarnings("RegexpSinglelineJava")
public static void main(String[] args) {
  // we cannot initialize the logger fully yet,
  // but we need to set the format as the first thing
  LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);

  // read all environment variables
  int workerPort = Integer.parseInt(System.getenv(K8sEnvVariables.WORKER_PORT.name()));
  String containerName = System.getenv(K8sEnvVariables.CONTAINER_NAME.name());
  String podName = System.getenv(K8sEnvVariables.POD_NAME.name());
  String hostIP = System.getenv(K8sEnvVariables.HOST_IP.name());
  String hostName = System.getenv(K8sEnvVariables.HOST_NAME.name());
  String jobMasterIP = System.getenv(K8sEnvVariables.JOB_MASTER_IP.name());
  String encodedNodeInfoList = System.getenv(K8sEnvVariables.ENCODED_NODE_INFO_LIST.name());
  jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
  boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));

  if (jobID == null) {
    throw new RuntimeException("JobID is null");
  }

  // load the configuration parameters from the configuration directory
  String configDir = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY;
  config = K8sWorkerUtils.loadConfig(configDir);

  // read the job description file
  String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
  jobDescFileName = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY
      + File.separator + jobDescFileName;
  job = JobUtils.readJobFile(jobDescFileName);
  LOG.info("Job description file is loaded: " + jobDescFileName);

  // add any configuration from the job file to the config object;
  // if the same config parameter appears in both, the job file value overrides
  config = JobUtils.overrideConfigs(job, config);
  config = JobUtils.updateConfigs(job, config);
  config = Config.newBuilder()
      .putAll(config)
      .put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob)
      .build();

  // if there is a driver, ZooKeeper is not used for group management,
  // or checkpointing is enabled, we need to connect to the Job Master
  if (!job.getDriverClassName().isEmpty()
      || !ZKContext.isZooKeeperServerUsed(config)
      || CheckpointingContext.isCheckpointingEnabled(config)) {
    jobMasterIP = updateJobMasterIp(jobMasterIP);
  }

  // get podIP from localhost
  InetAddress localHost = null;
  try {
    localHost = InetAddress.getLocalHost();
  } catch (UnknownHostException e) {
    throw new RuntimeException("Cannot get localHost.", e);
  }
  String podIP = localHost.getHostAddress();
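
  // determine NodeInfo either from the node-location settings in the configuration
  // or from the encoded node-info list passed in as an environment variable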
  JobMasterAPI.NodeInfo nodeInfo = KubernetesContext.nodeLocationsFromConfig(config)
      ? KubernetesContext.getNodeInfo(config, hostIP)
      : K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfoList, hostIP);
  LOG.info("PodName: " + podName + ", NodeInfo: " + nodeInfo);

  // set workerID
  workerID = K8sWorkerUtils.calculateWorkerID(job, podName, containerName);

  // get the computeResource for this worker
  computeResource = K8sWorkerUtils.getComputeResource(job, podName);

  // generate additional ports if requested
  Map<String, Integer> additionalPorts =
      K8sWorkerUtils.generateAdditionalPorts(config, workerPort);

  // construct WorkerInfo
  workerInfo = WorkerInfoUtils.createWorkerInfo(
      workerID, podIP, workerPort, nodeInfo, computeResource, additionalPorts);

  // initialize persistent volume
  K8sPersistentVolume pv = null;
  if (KubernetesContext.persistentVolumeRequested(config)) {
    // create persistent volume object
    String persistentJobDir = KubernetesConstants.PERSISTENT_VOLUME_MOUNT;
    pv = new K8sPersistentVolume(persistentJobDir, workerID);
  }

  // initialize persistent logging
  K8sWorkerUtils.initWorkerLogger(workerID, pv, config);

  LOG.info("Worker information summary: \n"
      + "workerID: " + workerID + "\n"
      + "POD_IP: " + podIP + "\n"
      + "HOSTNAME(podname): " + podName + "\n"
      + "workerPort: " + workerPort + "\n"
      + "hostName(nodeName): " + hostName + "\n"
      + "hostIP(nodeIP): " + hostIP + "\n");

  int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, jobID, workerInfo);
  WorkerRuntime.init(config, job, workerInfo, restartCount);

  /**
   * Interfaces to interact with other workers and the Job Master, if there is one
   */
  IWorkerController workerController = WorkerRuntime.getWorkerController();
  IWorkerStatusUpdater workerStatusUpdater = WorkerRuntime.getWorkerStatusUpdater();

  // if this worker has already been restarted too many times,
  // mark it as FULLY_FAILED and finish up
  if (restartCount >= FaultToleranceContext.maxWorkerRestarts(config)) {
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    WorkerRuntime.close();
    externallyKilled = false;
    return;
  }

  // add shutdown hook
  addShutdownHook(workerStatusUpdater);

  // on any uncaught exception, label the worker as FAILED
  // (or FULLY_FAILED on its last allowed restart) and exit the JVM with error code 1,
  // so that K8s restarts the container
  Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
    LOG.log(Level.SEVERE,
        "Uncaught exception in the thread " + thread + ". Worker FAILED...", throwable);
    if (restartCount >= FaultToleranceContext.maxWorkerRestarts(config) - 1) {
      workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    } else {
      workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
    }
    WorkerRuntime.close();
    externallyKilled = false;
    System.exit(1);
  });

  // start the worker
  boolean completed = startWorker(workerController, pv);

  if (completed) {
    // update worker status to COMPLETED
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
  } else {
    // if the worker did not complete successfully, it is fully failed
    workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
  }

  WorkerRuntime.close();
  externallyKilled = false;
}
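
addShutdownHook(workerStatusUpdater) is not included in this listing. Judging from the way externallyKilled is cleared on every normal exit path above, a plausible reading is that the hook reports a failure state only when the JVM is being torn down unexpectedly, for example when the pod is deleted. The sketch below is an assumption-based illustration, not the project's actual implementation; the choice of WorkerState.FAILED in particular is assumed.

// Assumption-based sketch of a shutdown hook; the real addShutdownHook in
// K8sWorkerStarter is not shown in this listing and may differ.
private static void addShutdownHook(IWorkerStatusUpdater workerStatusUpdater) {
  Runtime.getRuntime().addShutdownHook(new Thread(() -> {
    // externallyKilled is cleared on every normal exit path in main(),
    // so reaching this point with the flag still set means an external shutdown
    if (externallyKilled) {
      workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
      WorkerRuntime.close();
    }
  }));
}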
Use of edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater in project twister2 by DSC-SPIDAL.
Class JobMasterClientExample, method simulateClient:
/**
 * A method to simulate JMWorkerAgent running in workers.
 */
public static void simulateClient(Config config, JobAPI.Job job, int workerID) {
  String workerIP = JMWorkerController.convertStringToIP("localhost").getHostAddress();
  int workerPort = 10000 + (int) (Math.random() * 10000);
  JobMasterAPI.NodeInfo nodeInfo = NodeInfoUtils.createNodeInfo("node.ip", "rack01", null);
  JobAPI.ComputeResource computeResource = job.getComputeResource(0);
  Map<String, Integer> additionalPorts = generateAdditionalPorts(config, workerPort);

  JobMasterAPI.WorkerInfo workerInfo = WorkerInfoUtils.createWorkerInfo(
      workerID, workerIP, workerPort, nodeInfo, computeResource, additionalPorts);

  int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, job.getJobId(), workerInfo);

  // measure how long WorkerRuntime initialization takes
  long start = System.currentTimeMillis();
  WorkerRuntime.init(config, job, workerInfo, restartCount);
  long delay = System.currentTimeMillis() - start;
  LOG.severe("worker-" + workerID + " startupDelay " + delay);

  IWorkerStatusUpdater statusUpdater = WorkerRuntime.getWorkerStatusUpdater();
  IWorkerController workerController = WorkerRuntime.getWorkerController();
  ISenderToDriver senderToDriver = WorkerRuntime.getSenderToDriver();
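
  // register a receiver that echoes every message from the driver back to the driver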
  WorkerRuntime.addReceiverFromDriver(new IReceiverFromDriver() {
    @Override
    public void driverMessageReceived(Any anyMessage) {
      LOG.info("Received message from IDriver: \n" + anyMessage);
      senderToDriver.sendToDriver(anyMessage);
    }
  });

  try {
    List<JobMasterAPI.WorkerInfo> workerList = workerController.getAllWorkers();
    LOG.info("All workers joined... IDs: " + getIDs(workerList));
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    return;
  }

  // wait
  sleeeep(2 * 1000);

  try {
    workerController.waitOnBarrier();
    LOG.info("All workers reached the barrier. Proceeding.......");
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
  }

  // int id = job.getNumberOfWorkers() - 1;
  // JobMasterAPI.WorkerInfo info = workerController.getWorkerInfoForID(id);
  // LOG.info("WorkerInfo for " + id + ": \n" + info);
  // wait a random amount of time, up to 10 seconds
  sleeeep((long) (Math.random() * 10 * 1000));

  // run the simulated worker computation; throwException may simulate a failure
  try {
    throwException(workerID);
  } catch (Throwable t) {
    // update worker status to FAILED
    statusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
    WorkerRuntime.close();
    // System.exit(1);
    throw t;
  }

  statusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
  WorkerRuntime.close();
  System.out.println("Client has finished the computation. Client exiting.");
}
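
The helpers sleeeep, getIDs and throwException are defined elsewhere in JobMasterClientExample and are not part of this listing. The sketches below are minimal reconstructions consistent with how they are called above; in particular, the failure condition in throwException is an arbitrary placeholder, not the example's actual logic, and the getWorkerID() getter on the WorkerInfo proto is assumed.

// Minimal, illustrative reconstructions of the helpers used above.
private static void sleeeep(long durationMillis) {
  try {
    Thread.sleep(durationMillis);
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
  }
}

private static String getIDs(List<JobMasterAPI.WorkerInfo> workerList) {
  // collect worker IDs into a printable string
  StringBuilder ids = new StringBuilder();
  for (JobMasterAPI.WorkerInfo info : workerList) {
    ids.append(info.getWorkerID()).append(" ");
  }
  return ids.toString().trim();
}

private static void throwException(int workerID) {
  // decide whether this worker should simulate a failure;
  // the condition below is an arbitrary placeholder
  if (workerID == 0) {
    throw new RuntimeException("Simulated failure in worker " + workerID);
  }
}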