use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.
the class NomadWorkerStarter method createWorkerController.
/**
 * Builds the {@code WorkerInfo} for this Nomad worker, resolves the Job Master
 * address and returns the worker controller obtained from the master agent.
 * <p>
 * The worker id is parsed from the {@code NOMAD_ALLOC_INDEX} environment variable.
 * When the Job Master does not run in the client, its address is looked up through
 * a {@link ZKJobMasterFinder}; otherwise it is read from the configuration.
 * As a side effect, {@code config} is replaced by the job-overridden/updated
 * configuration and {@code masterClient} is assigned.
 *
 * @return the worker controller provided by the created master agent
 * @throws NumberFormatException if {@code NOMAD_ALLOC_INDEX} is missing or not an integer
 * @throws IllegalStateException if the Job Master address cannot be obtained or is
 *     not of the form {@code ip:port}
 */
private IWorkerController createWorkerController() {
  // the worker id is the Nomad allocation index
  String indexEnv = System.getenv("NOMAD_ALLOC_INDEX");
  String idEnv = System.getenv("NOMAD_ALLOC_ID");
  int workerID = Integer.parseInt(indexEnv);

  MPIWorkerStarter.initWorkerLogger(config, workerID);
  LOG.log(Level.INFO, String.format("Worker id = %s and index = %d", idEnv, workerID));

  Map<String, Integer> ports = getPorts(config);
  Map<String, String> localIps = getIPAddress(ports);

  int numberOfWorkers = job.getNumberOfWorkers();
  LOG.info("Worker Count..: " + numberOfWorkers);

  JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, 0);
  int port = ports.get("worker");
  String host = localIps.get("worker");
  JobMasterAPI.NodeInfo nodeInfo = NomadContext.getNodeInfo(config, host);
  JobMasterAPI.WorkerInfo workerInfo = WorkerInfoUtils.createWorkerInfo(
      workerID, host, port, nodeInfo, computeResource, ports);

  int jobMasterPort;
  String jobMasterIP;

  // find the jobmaster
  if (!JobMasterContext.jobMasterRunsInClient(config)) {
    String jobMasterIPandPort;
    ZKJobMasterFinder finder = new ZKJobMasterFinder(config, job.getJobId());
    finder.initialize();
    try {
      jobMasterIPandPort = finder.getJobMasterIPandPort();
      if (jobMasterIPandPort == null) {
        LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
        jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
      }
    } finally {
      // release the finder even when the lookup throws
      finder.close();
    }
    LOG.info("Job Master address: " + jobMasterIPandPort);

    // NOTE(review): the wait call presumably returns null when the Job Master never
    // registers; fail with a clear message instead of a NullPointerException below.
    if (jobMasterIPandPort == null) {
      throw new IllegalStateException(
          "Could not get the Job Master address for job " + job.getJobId());
    }

    // split on the last ':' so that the IP part may itself contain colons
    int separatorIndex = jobMasterIPandPort.lastIndexOf(':');
    if (separatorIndex < 0) {
      throw new IllegalStateException(
          "Job Master address is not in ip:port form: " + jobMasterIPandPort);
    }
    jobMasterIP = jobMasterIPandPort.substring(0, separatorIndex);
    jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separatorIndex + 1));
  } else {
    jobMasterIP = JobMasterContext.jobMasterIP(config);
    jobMasterPort = JobMasterContext.jobMasterPort(config);
  }

  config = JobUtils.overrideConfigs(job, config);
  config = JobUtils.updateConfigs(job, config);

  this.masterClient = createMasterAgent(
      config, jobMasterIP, jobMasterPort, workerInfo, numberOfWorkers);
  return masterClient.getJMWorkerController();
}
use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.
the class ZKJobMasterFinderExample method main.
/**
 * Companion of ZKJobMasterRegistrarExample.java: that class registers the
 * Job Master and this one discovers it.
 * <p>
 * Asks a ZooKeeper server for the Job Master address. When the Job Master
 * has not been registered yet, waits for the registration and asks again.
 *
 * @param args exactly one argument, the ZooKeeper address; otherwise the usage is printed
 */
public static void main(String[] args) {
  if (args.length != 1) {
    printUsage();
    return;
  }

  Config testConfig = buildTestConfig(args[0]);
  ZKJobMasterFinder masterFinder = new ZKJobMasterFinder(testConfig, "test-job");
  masterFinder.initialize();

  String masterAddress = masterFinder.getJobMasterIPandPort();
  if (masterAddress == null) {
    LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
    masterAddress = masterFinder.waitAndGetJobMasterIPandPort(20000);
  }
  // the address is reported the same way whether or not we had to wait for it
  LOG.info("Job Master address: " + masterAddress);

  masterFinder.close();
  LOG.info("Done, exiting ...");
}
use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.
the class MesosDockerWorker method main.
/**
 * Entry point of the Mesos docker worker.
 * <p>
 * Reads {@code WORKER_ID}, {@code JOB_NAME} and {@code COMPUTE_RESOURCE_INDEX} from the
 * environment, loads the "mesos" configuration and the job file, creates the worker
 * controller, looks up the Job Master through a {@link ZKJobMasterFinder}, starts the
 * job master agent and finally starts and closes the worker.
 *
 * @param args not used
 * @throws Exception if the job file cannot be read, the worker controller cannot be
 *     created, or the Job Master address cannot be obtained
 */
public static void main(String[] args) throws Exception {
  workerId = Integer.parseInt(System.getenv("WORKER_ID"));
  jobID = System.getenv("JOB_NAME");
  MesosDockerWorker worker = new MesosDockerWorker();

  String twister2Home = Paths.get("").toAbsolutePath().toString();
  String configDir = "twister2-job";
  config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
  resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));

  MesosWorkerLogger logger =
      new MesosWorkerLogger(config, "/persistent-volume/logs", "worker" + workerId);
  logger.initLogging();
  LOG.info("WORKER ID ..:" + workerId);

  Map<String, Integer> additionalPorts =
      MesosWorkerUtils.generateAdditionalPorts(config, startingPort);
  JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobID + ".job");

  MesosWorkerController workerController;
  try {
    JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
    workerController = new MesosWorkerController(config, job,
        Inet4Address.getLocalHost().getHostAddress(), 2023, workerId,
        computeResource, additionalPorts);
  } catch (Exception e) {
    // Without a controller the worker cannot proceed; stop here instead of
    // hitting a NullPointerException when the controller is used further down.
    LOG.severe("Error " + e.getMessage());
    throw e;
  }

  // find the jobmaster
  String jobMasterIPandPort;
  ZKJobMasterFinder finder = new ZKJobMasterFinder(config, job.getJobId());
  finder.initialize();
  try {
    jobMasterIPandPort = finder.getJobMasterIPandPort();
    if (jobMasterIPandPort == null) {
      LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
      jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
    }
  } finally {
    // release the finder even when the lookup throws
    finder.close();
  }
  LOG.info("Job Master address: " + jobMasterIPandPort);

  // NOTE(review): the wait call presumably returns null when the Job Master never
  // registers; fail with a clear message instead of a NullPointerException below.
  if (jobMasterIPandPort == null) {
    throw new IllegalStateException(
        "Could not get the Job Master address for job " + job.getJobId());
  }

  // split on the last ':' so that the IP part may itself contain colons
  int separatorIndex = jobMasterIPandPort.lastIndexOf(':');
  if (separatorIndex < 0) {
    throw new IllegalStateException(
        "Job Master address is not in ip:port form: " + jobMasterIPandPort);
  }
  String jobMasterIP = jobMasterIPandPort.substring(0, separatorIndex);
  int jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separatorIndex + 1));

  int workerCount = job.getNumberOfWorkers();
  LOG.info("Worker Count..: " + workerCount);
  LOG.info(workerController.getWorkerInfo().toString());

  // start job master client
  worker.startJobMasterAgent(
      workerController.getWorkerInfo(), jobMasterIP, jobMasterPort, workerCount);

  config = JobUtils.overrideConfigs(job, config);
  config = JobUtils.updateConfigs(job, config);
  startWorker(workerController, null);

  try {
    Thread.sleep(3000);
  } catch (InterruptedException e) {
    LOG.info("sleep exception" + e.getMessage());
    // keep the interrupt visible to the code that runs after this method
    Thread.currentThread().interrupt();
  }
  closeWorker();
}
use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.
the class MesosMPIMasterStarter method main.
/**
 * Entry point of the Mesos MPI master.
 * <p>
 * Reads {@code WORKER_ID}, {@code JOB_NAME} and {@code COMPUTE_RESOURCE_INDEX} from the
 * environment, loads the "mesos" configuration and the job file, and creates a worker
 * controller that is initialized with ZooKeeper and asked for all workers. It then looks
 * up the Job Master through a {@link ZKJobMasterFinder}, starts the job master agent,
 * appends the worker IPs to {@code /twister2/hostFile}, runs {@code mpirun} with that
 * host file and finally reports completion and closes the agent and the controller.
 *
 * @param args not used
 * @throws Exception if the job file cannot be read, the worker controller cannot be
 *     initialized, the Job Master address cannot be obtained, or the host file cannot
 *     be written
 */
public static void main(String[] args) throws Exception {
  MesosMPIMasterStarter mpiMaster = new MesosMPIMasterStarter();

  int workerId = Integer.parseInt(System.getenv("WORKER_ID"));
  mpiMaster.jobName = System.getenv("JOB_NAME");
  resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));

  String twister2Home = Paths.get("").toAbsolutePath().toString();
  String configDir = "twister2-job";
  mpiMaster.config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");

  MesosWorkerLogger logger =
      new MesosWorkerLogger(mpiMaster.config, "/persistent-volume/logs", "mpiMaster");
  logger.initLogging();

  Map<String, Integer> additionalPorts =
      MesosWorkerUtils.generateAdditionalPorts(mpiMaster.config, startingPort);
  JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + mpiMaster.jobName + ".job");

  MesosWorkerController workerController;
  List<JobMasterAPI.WorkerInfo> workerInfoList = new ArrayList<>();
  try {
    JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
    workerController = new MesosWorkerController(mpiMaster.config, job,
        Inet4Address.getLocalHost().getHostAddress(), 2023, workerId,
        computeResource, additionalPorts);
    LOG.info("Initializing with zookeeper");
    workerController.initializeWithZooKeeper();
    LOG.info("Waiting for all workers to join");
    workerInfoList = workerController.getAllWorkers();
    LOG.info("Everyone has joined");
  } catch (Exception e) {
    // Any failure here (not only an unknown host) leaves the master without a usable
    // controller, so stop instead of failing later with a NullPointerException.
    LOG.severe("Failed to initialize the worker controller: " + e.getMessage());
    throw e;
  }

  // find the jobmaster
  String jobMasterIPandPort;
  ZKJobMasterFinder finder = new ZKJobMasterFinder(mpiMaster.config, job.getJobId());
  finder.initialize();
  try {
    jobMasterIPandPort = finder.getJobMasterIPandPort();
    if (jobMasterIPandPort == null) {
      LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
      jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
    }
  } finally {
    // release the finder even when the lookup throws
    finder.close();
  }
  LOG.info("Job Master address: " + jobMasterIPandPort);

  // NOTE(review): the wait call presumably returns null when the Job Master never
  // registers; fail with a clear message instead of a NullPointerException below.
  if (jobMasterIPandPort == null) {
    throw new IllegalStateException(
        "Could not get the Job Master address for job " + job.getJobId());
  }

  // split on the last ':' so that the IP part may itself contain colons
  int separatorIndex = jobMasterIPandPort.lastIndexOf(':');
  if (separatorIndex < 0) {
    throw new IllegalStateException(
        "Job Master address is not in ip:port form: " + jobMasterIPandPort);
  }
  String jobMasterIP = jobMasterIPandPort.substring(0, separatorIndex);
  int jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separatorIndex + 1));
  LOG.info("JobMaster IP..: " + jobMasterIP);
  LOG.info("Worker ID..: " + workerId);

  StringBuilder outputBuilder = new StringBuilder();
  int workerCount = workerController.getNumberOfWorkers();
  LOG.info("Worker Count..: " + workerCount);

  // Pass the real worker count; a local that was initialized to 0 and never
  // updated used to be passed here.
  mpiMaster.startJobMasterAgent(
      workerController.getWorkerInfo(), jobMasterIP, jobMasterPort, workerCount);

  // append one worker IP per line to the MPI host file; the writer is closed
  // even when a write fails
  try (Writer writer = new BufferedWriter(
      new OutputStreamWriter(new FileOutputStream("/twister2/hostFile", true)))) {
    for (int i = 0; i < workerCount; i++) {
      writer.write(workerInfoList.get(i).getWorkerIP() + "\n");
      LOG.info("Host IP..: " + workerInfoList.get(i).getWorkerIP());
    }
  }

  // mpi master has the id equals to 1
  // id==0 is job master
  String mpiClassNameToRun = "edu.iu.dsc.tws.rsched.schedulers.mesos.mpi.MesosMPIWorkerStarter";
  LOG.info("Before mpirun");
  String[] command = {
      "mpirun", "-allow-run-as-root", "-npernode", "1",
      "--mca", "btl_tcp_if_include", "eth0",
      "--hostfile", "/twister2/hostFile",
      "java", "-cp", "twister2-job/libexamples-java.jar:twister2-core/lib/*",
      mpiClassNameToRun, mpiMaster.jobName, jobMasterIP
  };
  LOG.info("command:" + String.join(" ", command));
  ProcessUtils.runSyncProcess(false, command, outputBuilder, new File("."), true);

  mpiMaster.jobMasterAgent.sendWorkerCompletedMessage(JobMasterAPI.WorkerState.COMPLETED);
  mpiMaster.jobMasterAgent.close();
  workerController.close();
  LOG.info("Job DONE");
}
Aggregations