use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.
the class MPILauncher method launch.
@Override
public Twister2JobState launch(JobAPI.Job job) {
  LOG.log(Level.INFO, "Launching job for cluster {0}", MPIContext.clusterType(config));
  Twister2JobState state = new Twister2JobState(false);
  if (!configsOK()) {
    return state;
  }
  // distribute the bundle if not running on a shared file system
  if (!MPIContext.isSharedFs(config)) {
    LOG.info("Configured as NON SHARED file system. "
        + "Running bootstrap procedure to distribute files...");
    try {
      this.distributeJobFiles(job);
    } catch (IOException e) {
      LOG.log(Level.SEVERE, "Error in distributing job files", e);
      throw new RuntimeException("Error in distributing job files");
    }
  } else {
    LOG.info("Configured as SHARED file system. "
        + "Skipping bootstrap procedure & setting up working directory");
    if (!setupWorkingDirectory(job.getJobId())) {
      throw new RuntimeException("Failed to setup the directory");
    }
  }
  config = Config.newBuilder().putAll(config)
      .put(SchedulerContext.WORKING_DIRECTORY, jobWorkingDirectory).build();
  JobMaster jobMaster = null;
  Thread jmThread = null;
  if (JobMasterContext.isJobMasterUsed(config)
      && JobMasterContext.jobMasterRunsInClient(config)) {
    // since the job master runs on the client, we can collect job information
    state.setDetached(false);
    try {
      int port = NetworkUtils.getFreePort();
      String hostAddress = JobMasterContext.jobMasterIP(config);
      if (hostAddress == null) {
        hostAddress = ResourceSchedulerUtils.getHostIP(config);
      }
      // add the port and ip to config
      config = Config.newBuilder().putAll(config)
          .put("__job_master_port__", port)
          .put("__job_master_ip__", hostAddress).build();
      LOG.log(Level.INFO, String.format("Starting the job master: %s:%d", hostAddress, port));
      JobMasterAPI.NodeInfo jobMasterNodeInfo =
          NodeInfoUtils.createNodeInfo(hostAddress, "default", "default");
      IScalerPerCluster nullScaler = new NullScaler();
      JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
      NullTerminator nt = new NullTerminator();
      jobMaster = new JobMaster(
          config, "0.0.0.0", port, nt, job, jobMasterNodeInfo, nullScaler, initialState);
      jobMaster.addShutdownHook(true);
      jmThread = jobMaster.startJobMasterThreaded();
    } catch (Twister2Exception e) {
      LOG.log(Level.SEVERE, "Exception when starting Job master: ", e);
      throw new RuntimeException(e);
    }
  }
  final boolean[] start = {false};
  // now start the controller, which will acquire the resources and start the job
  Thread controllerThread = new Thread(() -> {
    IController controller = new MPIController(true);
    controller.initialize(config);
    start[0] = controller.start(job);
  });
  controllerThread.setName("MPIController");
  controllerThread.start();
  // wait until the controller finishes
  try {
    controllerThread.join();
  } catch (InterruptedException ignore) {
  }
  // now wait on the job master running in the client
  if (jmThread != null && JobMasterContext.isJobMasterUsed(config)
      && JobMasterContext.jobMasterRunsInClient(config)) {
    try {
      jmThread.join();
    } catch (InterruptedException ignore) {
    }
  }
  if (jobMaster != null && jobMaster.getDriver() != null) {
    if (jobMaster.getDriver().getState() != DriverJobState.FAILED) {
      state.setJobstate(DriverJobState.COMPLETED);
    } else {
      state.setJobstate(jobMaster.getDriver().getState());
    }
    state.setFinalMessages(jobMaster.getDriver().getMessages());
  }
  state.setRequestGranted(start[0]);
  return state;
}
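The client-side coordination in launch() — run the job master on one thread and the resource controller on another, then join both before reading the outcome — reduces to the following framework-free sketch. All names here are illustrative stand-ins, not twister2 API.

// Minimal sketch of the launch() coordination pattern: one thread acquires
// resources (the controller), another serves as the job master, and the
// submitting client joins both before reporting the result.
public final class LaunchPatternSketch {

  public static void main(String[] args) throws InterruptedException {
    final boolean[] requestGranted = {false};

    Thread controllerThread = new Thread(() -> {
      // stands in for controller.initialize(config) + controller.start(job)
      requestGranted[0] = true;
    });
    controllerThread.setName("MPIController");

    Thread jmThread = new Thread(() -> {
      // stands in for the job master server loop started by startJobMasterThreaded()
    });
    jmThread.setName("JobMaster");

    jmThread.start();
    controllerThread.start();

    // mirror launch(): wait for the controller first, then the job master
    controllerThread.join();
    jmThread.join();

    System.out.println("request granted: " + requestGranted[0]);
  }
}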
use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.
the class MPIWorkerStarter method startMaster.
/**
* Start the JobMaster
*/
private void startMaster() {
  try {
    // init the logger
    initJMLogger(config);
    // release the port for JM
    NetworkUtils.releaseWorkerPorts();
    int port = JobMasterContext.jobMasterPort(config);
    String hostAddress = ResourceSchedulerUtils.getHostIP(config);
    LOG.log(Level.INFO, String.format("Starting the job master: %s:%d", hostAddress, port));
    JobMasterAPI.NodeInfo jobMasterNodeInfo = null;
    IScalerPerCluster clusterScaler = new NullScaler();
    JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
    NullTerminator nt = new NullTerminator();
    jobMaster = new JobMaster(
        config, "0.0.0.0", port, nt, job, jobMasterNodeInfo, clusterScaler, initialState);
    jobMaster.startJobMasterBlocking();
    LOG.log(Level.INFO, "JobMaster done... ");
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, "Exception when starting Job master: ", e);
    throw new RuntimeException(e);
  }
}
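Here startMaster() blocks the MPI starter until the job master finishes. If blocking is not desirable, the same JobMaster can be run on its own thread, as the MPILauncher snippet above does with startJobMasterThreaded(). A hedged sketch of that variant, assuming the same config, job, and port wiring as in startMaster():

// Sketch: threaded variant of startMaster(); config, job and port are
// assumed to be set up exactly as in the blocking version above.
JobMasterAPI.NodeInfo jobMasterNodeInfo = null;
JobMaster jobMaster = new JobMaster(config, "0.0.0.0", port, new NullTerminator(),
    job, jobMasterNodeInfo, new NullScaler(), JobMasterAPI.JobMasterState.JM_STARTED);
try {
  // startJobMasterThreaded() returns the job master thread (see MPILauncher above)
  Thread jmThread = jobMaster.startJobMasterThreaded();
  // ... the starter could do other work here while the master runs ...
  jmThread.join();
} catch (Twister2Exception e) {
  throw new RuntimeException(e);
} catch (InterruptedException e) {
  Thread.currentThread().interrupt();
}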
use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.
the class JobMasterExample method main.
/**
* this main method is for locally testing only
* A JobMaster instance is started locally on the default port:
* edu.iu.dsc.tws.master.JobMasterContext.JOB_MASTER_PORT_DEFAULT = 11011
* <p>
* numberOfWorkers to join is expected as a parameter
* <p>
* When all workers have joined and all have sent completed messages,
* this server also completes and exits
* <p>
* An example usage of JobMaster can be seen in:
* edu.iu.dsc.tws.rsched.schedulers.k8s.master.JobMasterStarter
*/
public static void main(String[] args) {
  // expected arguments: numberOfWorkers and an optional start/restart flag
  if (args.length < 1) {
    LOG.info("usage: java JobMasterExample numberOfWorkers [start|restart]");
    return;
  }
  int numberOfWorkers = Integer.parseInt(args[0]);
  String host = "0.0.0.0";
  // we assume that the twister2Home is the current directory
  // String configDir = "../twister2/config/src/yaml/";
  String configDir = "";
  String twister2Home = Paths.get(configDir).toAbsolutePath().toString();
  Config config = ConfigLoader.loadConfig(twister2Home, "conf", "kubernetes");
  config = JobMasterClientExample.updateConfig(config, config, host);
  LOG.info("Loaded: " + config.size() + " configuration parameters.");
  // Twister2Job twister2Job = Twister2Job.loadTwister2Job(config, null);
  Twister2Job twister2Job = Twister2Job.newBuilder()
      .setJobName("hello-world-job")
      .setWorkerClass(HelloWorld.class)
      .addComputeResource(.2, 128, numberOfWorkers)
      .build();
  twister2Job.setUserName(System.getProperty("user.name"));
  JobAPI.Job job = twister2Job.serialize();
  LOG.info("JobID: " + job.getJobId());
  JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
  JobMasterStarter.job = job;
  if (ZKContext.isZooKeeperServerUsed(config)) {
    // the optional second argument selects a fresh start or a restart
    String mode = args.length > 1 ? args[1] : "start";
    if ("start".equalsIgnoreCase(mode)) {
      JobMasterStarter.initializeZooKeeper(config, job.getJobId(), host, initialState);
    } else if ("restart".equalsIgnoreCase(mode)) {
      initialState = JobMasterAPI.JobMasterState.JM_RESTARTED;
      JobMasterStarter.initializeZooKeeper(config, job.getJobId(), host, initialState);
      job = JobMasterStarter.job;
    } else {
      LOG.info("usage: java JobMasterExample numberOfWorkers [start|restart]");
      return;
    }
  }
  // write jobID to file
  String dir = System.getProperty("user.home") + "/.twister2";
  if (!FileUtils.isDirectoryExists(dir)) {
    FileUtils.createDirectory(dir);
  }
  String filename = dir + "/last-job-id.txt";
  FileUtils.writeToFile(filename, (job.getJobId() + "").getBytes(), true);
  LOG.info("Written jobID to file: " + job.getJobId());
  String ip = null;
  try {
    ip = Inet4Address.getLocalHost().getHostAddress();
  } catch (UnknownHostException e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return;
  }
  JobMasterAPI.NodeInfo jobMasterNode = NodeInfoUtils.createNodeInfo(ip, null, null);
  KubernetesController controller = KubernetesController.init("default");
  K8sScaler k8sScaler = new K8sScaler(config, job, controller);
  IJobTerminator jobTerminator = new NullTerminator();
  JobMaster jobMaster = new JobMaster(
      config, host, jobTerminator, job, jobMasterNode, k8sScaler, initialState);
  try {
    // jobMaster.startJobMasterThreaded();
    jobMaster.startJobMasterBlocking();
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, "Exception when starting Job master: ", e);
    throw new RuntimeException(e);
  }
  LOG.info("Job Master completed:"
      + "\nnumberOfWorkers: " + job.getNumberOfWorkers()
      + "\njobID: " + job.getJobId());
}
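The example persists the job id so that follow-up tooling can locate the last submitted job. A minimal sketch of reading it back, using only the file path written above (the class name is illustrative):

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Sketch: read back ~/.twister2/last-job-id.txt written by JobMasterExample.
public final class LastJobIdReader {

  public static void main(String[] args) throws Exception {
    Path file = Paths.get(System.getProperty("user.home"), ".twister2", "last-job-id.txt");
    String jobID = new String(Files.readAllBytes(file)).trim();
    System.out.println("last submitted jobID: " + jobID);
  }
}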
use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.
the class KubernetesLauncher method startJobMasterOnClient.
/**
* start the JobMaster locally on the submitting client
* this is a blocking call
* it finishes after the job has completed
*/
private boolean startJobMasterOnClient(JobAPI.Job job) {
  // get the Dashboard address from the dashboard service name
  String dashAddress = JobMasterContext.dashboardHost(config);
  // get the IP address for the service name by querying the Kubernetes master
  if (dashAddress.endsWith("svc.cluster.local")) {
    String dashIP = getDashboardIP(dashAddress);
    String dashURL = null;
    if (dashIP == null) {
      LOG.warning("Could not get the Dashboard server IP address from the dashboard "
          + "service name: " + dashAddress + ". Will not connect to Dashboard.");
    } else {
      dashURL = "http://" + dashIP;
    }
    config = JobMasterContext.updateDashboardHost(config, dashURL);
    LOG.info("Dashboard server HTTP URL: " + dashURL);
  }
  String hostAddress = RequestObjectBuilder.getJobMasterIP();
  JobMasterAPI.NodeInfo nodeInfo = NodeInfoUtils.createNodeInfo(hostAddress, null, null);
  K8sScaler k8sScaler = new K8sScaler(config, job, controller);
  JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
  JobTerminator jobTerminator = new JobTerminator(config, controller);
  JobMaster jobMaster = new JobMaster(
      config, hostAddress, jobTerminator, job, nodeInfo, k8sScaler, initialState);
  jobMaster.addShutdownHook(true);
  // jobMaster.startJobMasterThreaded();
  try {
    jobMaster.startJobMasterBlocking();
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return false;
  }
  return true;
}
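A hypothetical caller sketch showing how the boolean returned by startJobMasterOnClient could be folded into the Twister2JobState that launchers hand back to the submitting client. The surrounding launch method is assumed, not shown in this snippet; only Twister2JobState calls that appear in the MPILauncher example above are used.

// Hypothetical caller: in non-detached mode the launcher blocks here until
// the job ends, then maps the boolean result onto the returned state.
Twister2JobState state = new Twister2JobState(false);
state.setDetached(false);
boolean completed = startJobMasterOnClient(job);  // blocking call
state.setRequestGranted(completed);
if (!completed) {
  LOG.severe("JobMaster did not complete successfully for job: " + job.getJobId());
}
return state;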
use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.
the class JobMasterStarter method main.
public static void main(String[] args) {
  // we cannot initialize the logger fully yet,
  // but we need to set the format as the first thing
  LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);
  // get environment variables
  String jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
  String encodedNodeInfoList = System.getenv(K8sEnvVariables.ENCODED_NODE_INFO_LIST.name());
  String hostIP = System.getenv(K8sEnvVariables.HOST_IP.name());
  boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));
  // load the configuration parameters from the configuration directory
  String configDir = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY;
  Config config = K8sWorkerUtils.loadConfig(configDir);
  // read the job description file
  String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
  jobDescFileName = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY
      + File.separator + jobDescFileName;
  job = JobUtils.readJobFile(jobDescFileName);
  LOG.info("Job description file is loaded: " + jobDescFileName);
  // add any configuration from the job file to the config object;
  // if the same parameter is present in both, the job file value overrides
  config = JobUtils.overrideConfigs(job, config);
  config = JobUtils.updateConfigs(job, config);
  config = Config.newBuilder().putAll(config)
      .put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob).build();
  // init logger
  K8sWorkerUtils.initLogger(config, "jobMaster",
      JobMasterContext.persistentVolumeRequested(config));
  LOG.info("JobMaster is starting. Current time: " + System.currentTimeMillis());
  LOG.info("Number of configuration parameters: " + config.size());
  // get podIP from localhost
  InetAddress localHost = null;
  try {
    localHost = InetAddress.getLocalHost();
  } catch (UnknownHostException e) {
    throw new RuntimeException("Cannot get localHost.", e);
  }
  String podIP = localHost.getHostAddress();
  // construct nodeInfo for the Job Master
  JobMasterAPI.NodeInfo nodeInfo = KubernetesContext.nodeLocationsFromConfig(config)
      ? KubernetesContext.getNodeInfo(config, hostIP)
      : K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfoList, hostIP);
  LOG.info("NodeInfo for JobMaster: " + nodeInfo);
  KubernetesController controller =
      KubernetesController.init(KubernetesContext.namespace(config));
  JobTerminator jobTerminator = new JobTerminator(config, controller);
  K8sScaler k8sScaler = new K8sScaler(config, job, controller);
  // get the restart count from the job ConfigMap;
  // if jm is running for the first time, initialize the restart count at the CM
  String keyName = KubernetesUtils.createRestartJobMasterKey();
  int restartCount = K8sWorkerUtils.initRestartFromCM(controller, jobID, keyName);
  JobMasterState initialState = JobMasterState.JM_STARTED;
  if (restartCount > 0) {
    initialState = JobMasterState.JM_RESTARTED;
    // without ZooKeeper the job state cannot be recovered after a restart,
    // so we terminate the job with failure
    if (!ZKContext.isZooKeeperServerUsed(config)) {
      jobTerminator.terminateJob(jobID, JobAPI.JobState.FAILED);
      return;
    }
  }
  if (ZKContext.isZooKeeperServerUsed(config)) {
    boolean zkInitialized = initializeZooKeeper(config, jobID, podIP, initialState);
    if (!zkInitialized) {
      jobTerminator.terminateJob(jobID, JobAPI.JobState.FAILED);
      return;
    }
  }
  // start JobMaster
  JobMaster jobMaster = new JobMaster(
      config, podIP, jobTerminator, job, nodeInfo, k8sScaler, initialState);
  // start the ConfigMap watcher to watch the kill parameter from the twister2 client
  JobKillWatcher jkWatcher = new JobKillWatcher(
      KubernetesContext.namespace(config), jobID, controller, jobMaster);
  jkWatcher.start();
  // on any uncaught exception, we label the job master as FAILED and
  // throw a RuntimeException;
  // the JVM will be restarted by K8s
  Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
    LOG.log(Level.SEVERE,
        "Uncaught exception in the thread " + thread + ". Job Master FAILED...", throwable);
    jobMaster.jmFailed();
    jkWatcher.close();
    controller.close();
    throw new RuntimeException("Job Master failed with the exception", throwable);
  });
  try {
    jobMaster.startJobMasterBlocking();
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }
  // close the controller
  controller.close();
}
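The uncaught-exception handler above implements a fail-fast policy: mark the job FAILED, release the Kubernetes resources, and let the JVM die so that the pod's restart policy brings the job master back. A framework-free sketch of that pattern, with illustrative names standing in for the twister2 calls:

// Sketch of the fail-fast handler installed in JobMasterStarter.main().
public final class FailFastSketch {

  public static void main(String[] args) {
    Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
      // stands in for jobMaster.jmFailed(), jkWatcher.close(), controller.close()
      System.err.println("Uncaught exception in " + thread.getName()
          + "; marking the job FAILED and releasing resources");
      // the rethrow mirrors the source; the failing thread is terminating
      // anyway, and once no non-daemon threads remain the JVM exits,
      // handing control to the container restart policy
      throw new RuntimeException("failed with the exception", throwable);
    });
    // simulate a failure in the main thread
    throw new IllegalStateException("simulated failure");
  }
}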