use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger in project twister2 by DSC-SPIDAL.
the class MesosJobMasterStarter method main.
/**
 * Entry point of the Mesos job master process.
 *
 * <p>Loads the "mesos" configuration from the {@code twister2-job} directory under the
 * current working directory, adds the job id from the {@code JOB_ID} environment variable,
 * initializes the {@code MesosWorkerLogger}, registers the job master through a
 * {@code ZKJobMasterRegistrar} and, unless the configuration states that the job master
 * runs in the client, creates a {@code JobMaster} and starts it in blocking mode.
 *
 * <p>Environment variables read: {@code JOB_NAME}, {@code JOB_ID}, {@code FRAMEWORK_ID}.
 *
 * @param args command line arguments; not used
 * @throws IllegalStateException if the local host address can not be resolved, since
 *     neither the registration nor the job master can proceed without it
 */
public static void main(String[] args) {
  String jobName = System.getenv("JOB_NAME");
  String jobId = System.getenv("JOB_ID");
  String frameworkId = System.getenv("FRAMEWORK_ID");

  String twister2Home = Paths.get("").toAbsolutePath().toString();
  String configDir = "twister2-job";
  Config config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
  Config.Builder builder = Config.newBuilder().putAll(config);
  builder.put(Context.JOB_ID, jobId);
  config = builder.build();

  JobTerminator terminator = new JobTerminator(config, frameworkId);

  MesosWorkerLogger logger = new MesosWorkerLogger(config, "/persistent-volume/logs", "master");
  logger.initLogging();

  edu.iu.dsc.tws.rsched.schedulers.mesos.MesosController controller =
      new edu.iu.dsc.tws.rsched.schedulers.mesos.MesosController(config);
  JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobName + ".job");

  // this block is for ZKjobmaster register
  // The host address is resolved once and reused for the registrar and the job master.
  String hostAddress;
  ZKJobMasterRegistrar registrar;
  try {
    hostAddress = Inet4Address.getLocalHost().getHostAddress();
    registrar = new ZKJobMasterRegistrar(config, hostAddress, 11011, job.getJobId());
    LOG.info("JobMaster REGISTERED..:" + hostAddress);
  } catch (UnknownHostException e) {
    // Without a registrar nothing below can work; stop with the real cause
    // instead of running into a NullPointerException later.
    LOG.log(Level.SEVERE, "JobMaster CAN NOT BE REGISTERED:", e);
    throw new IllegalStateException("Can not resolve the local host address", e);
  }

  boolean initialized = registrar.initialize();
  if (!initialized) {
    LOG.info("CAN NOT INITIALIZE");
    // NOTE(review): presumably a znode left over from an earlier run; it is deleted
    // and the initialization is retried once.
    if (registrar.sameZNodeExist()) {
      registrar.deleteJobMasterZNode();
      if (!registrar.initialize()) {
        LOG.severe("CAN NOT INITIALIZE after deleting the existing JobMaster znode");
      }
    }
  }

  if (!JobMasterContext.jobMasterRunsInClient(config)) {
    try {
      JobMasterAPI.NodeInfo jobMasterNodeInfo = MesosContext.getNodeInfo(config, hostAddress);
      IScalerPerCluster clusterScaler = new NullScaler();
      // NOTE(review): mesosScaler is configured but the NullScaler is what gets passed
      // to the JobMaster; kept as is, confirm whether that is intended.
      MesosScaler mesosScaler = new MesosScaler(config, job, controller);
      mesosScaler.setFrameWorkId(frameworkId);
      JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;

      JobMaster jobMaster = new JobMaster(config, hostAddress, terminator, job,
          jobMasterNodeInfo, clusterScaler, initialState);
      LOG.info("JobMaster host address...:" + hostAddress);
      jobMaster.startJobMasterBlocking();
    } catch (Exception e) {
      LOG.log(Level.SEVERE, "Exception when starting the JobMaster: ", e);
    }
  }

  waitIndefinitely();
  registrar.deleteJobMasterZNode();
  registrar.close();
}
use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger in project twister2 by DSC-SPIDAL.
the class MesosMPIWorkerStarter method main.
/**
 * Entry point of an MPI worker process.
 *
 * <p>Initializes MPI and takes the worker id and the number of workers from the rank and
 * size of {@code MPI.COMM_WORLD}. It then loads the "mesos" configuration, initializes the
 * {@code MesosWorkerLogger}, reads the job file, creates a {@code MesosWorkerController}
 * and initializes it with ZooKeeper, starts the job master agent and the worker, and
 * finally finalizes MPI and closes the worker.
 *
 * @param args {@code args[0]} is the job name and {@code args[1]} is the job master IP
 * @throws IllegalArgumentException if fewer than two arguments are given
 * @throws RuntimeException if MPI can not be initialized or the worker controller can not
 *     be set up
 */
public static void main(String[] args) {
  if (args.length < 2) {
    throw new IllegalArgumentException(
        "Expected arguments: <jobName> <jobMasterIP>, but got " + args.length + " argument(s)");
  }

  try {
    MPI.Init(args);
    workerID = MPI.COMM_WORLD.getRank();
    numberOfWorkers = MPI.COMM_WORLD.getSize();
    System.out.println("Worker ranking..:" + workerID + " Number of workers..:" + numberOfWorkers);
  } catch (MPIException e) {
    LOG.log(Level.SEVERE, "Could not get rank or size from mpi.COMM_WORLD", e);
    throw new RuntimeException(e);
  }

  jobName = args[0];
  // can not access docker env variable so it was passed as a parameter
  String jobMasterIP = args[1];

  String twister2Home = Paths.get("").toAbsolutePath().toString();
  String configDir = "twister2-job";
  config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");

  MesosWorkerLogger logger = new MesosWorkerLogger(config, "/persistent-volume/logs", "worker" + workerID);
  logger.initLogging();

  Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(config, startingPort);

  MesosWorkerController workerController;
  try {
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobName + ".job");
    // add any configuration from job file to the config object
    // if there are the same config parameters in both,
    // job file configurations will override
    config = JobUtils.overrideConfigs(job, config);
    config = JobUtils.updateConfigs(job, config);
    // this will change to get proper resource index.
    JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
    LOG.info("in worker starter...... job worker count:" + job.getNumberOfWorkers());
    workerController = new MesosWorkerController(config, job,
        Inet4Address.getLocalHost().getHostAddress(), 2023, workerID, computeResource, additionalPorts);
    workerController.initializeWithZooKeeper();
  } catch (Exception e) {
    // Every step below needs the controller, so stop here with the cause attached
    // instead of continuing with a null reference.
    LOG.log(Level.SEVERE, "Error " + e.getMessage(), e);
    throw new RuntimeException("Could not set up the worker controller", e);
  }

  LOG.info("JobMaster IP..: " + jobMasterIP);
  LOG.info("Worker ID..: " + workerID);
  int jobMasterPort = JobMasterContext.jobMasterPort(config);
  startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort);
  LOG.info("\nWorker Controller\nWorker ID..: " + workerController.getWorkerInfo().getWorkerID()
      + "\nIP address..: " + workerController.getWorkerInfo().getWorkerIP());
  startWorker(workerController, null);

  try {
    Thread.sleep(2000);
  } catch (InterruptedException e) {
    LOG.info("sleep exception" + e.getMessage());
  }

  try {
    MPI.Finalize();
  } catch (MPIException e) {
    // Best effort: the worker is closed below even if MPI could not be finalized.
    LOG.log(Level.WARNING, "MPI Finalize Exception" + e.getMessage(), e);
  }
  closeWorker();
  // workerController.close();
}
use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger in project twister2 by DSC-SPIDAL.
the class MesosMPIMasterStarter method main.
/**
 * Entry point of the MPI master process.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>loads the "mesos" configuration and initializes the {@code MesosWorkerLogger};</li>
 *   <li>reads the job file, creates a {@code MesosWorkerController}, initializes it with
 *       ZooKeeper and gets the list of all workers;</li>
 *   <li>gets the job master address with a {@code ZKJobMasterFinder}, waiting for it when
 *       it is not available immediately;</li>
 *   <li>starts the job master agent;</li>
 *   <li>appends the worker IPs to {@code /twister2/hostFile};</li>
 *   <li>runs {@code mpirun} with {@code MesosMPIWorkerStarter} as the class to run;</li>
 *   <li>sends the worker completed message and closes the agent and the controller.</li>
 * </ol>
 *
 * <p>Environment variables read: {@code WORKER_ID}, {@code JOB_NAME},
 * {@code COMPUTE_RESOURCE_INDEX}.
 *
 * @param args command line arguments; not used
 * @throws IllegalStateException if the job master address is missing or not in
 *     {@code ip:port} form
 * @throws Exception if the worker controller can not be set up, or any later step fails
 */
public static void main(String[] args) throws Exception {
  MesosMPIMasterStarter mpiMaster = new MesosMPIMasterStarter();

  int workerId = Integer.parseInt(System.getenv("WORKER_ID"));
  mpiMaster.jobName = System.getenv("JOB_NAME");
  resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));

  String twister2Home = Paths.get("").toAbsolutePath().toString();
  String configDir = "twister2-job";
  mpiMaster.config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");

  MesosWorkerLogger logger = new MesosWorkerLogger(mpiMaster.config, "/persistent-volume/logs", "mpiMaster");
  logger.initLogging();

  Map<String, Integer> additionalPorts =
      MesosWorkerUtils.generateAdditionalPorts(mpiMaster.config, startingPort);

  // NOTE(review): this stays 0 and is passed to startJobMasterAgent below, although the
  // real count is available as workerCount; confirm whether that is intended.
  int numberOfWorkers = 0;

  JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + mpiMaster.jobName + ".job");

  MesosWorkerController workerController;
  List<JobMasterAPI.WorkerInfo> workerInfoList;
  try {
    JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
    workerController = new MesosWorkerController(mpiMaster.config, job,
        Inet4Address.getLocalHost().getHostAddress(), 2023, workerId, computeResource, additionalPorts);
    LOG.info("Initializing with zookeeper");
    workerController.initializeWithZooKeeper();
    LOG.info("Waiting for all workers to join");
    workerInfoList = workerController.getAllWorkers();
    LOG.info("Everyone has joined");
  } catch (Exception e) {
    // The controller and the worker list are required below; rethrow so the process
    // stops with the real cause instead of a later NullPointerException.
    LOG.severe("Could not set up the worker controller or get the worker list: " + e.getMessage());
    throw e;
  }

  ZKJobMasterFinder finder = new ZKJobMasterFinder(mpiMaster.config, job.getJobId());
  String jobMasterIPandPort;
  try {
    finder.initialize();
    jobMasterIPandPort = finder.getJobMasterIPandPort();
    if (jobMasterIPandPort == null) {
      LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
      jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
    }
    LOG.info("Job Master address: " + jobMasterIPandPort);
  } finally {
    // closed on every path, including when a lookup throws
    finder.close();
  }

  if (jobMasterIPandPort == null) {
    throw new IllegalStateException("Job Master address could not be found");
  }
  // the port follows the last ':' of the address
  int separatorIndex = jobMasterIPandPort.lastIndexOf(':');
  if (separatorIndex < 0) {
    throw new IllegalStateException(
        "Job Master address is not in ip:port form: " + jobMasterIPandPort);
  }
  String jobMasterIP = jobMasterIPandPort.substring(0, separatorIndex);
  int jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separatorIndex + 1));
  LOG.info("JobMaster IP..: " + jobMasterIP);
  LOG.info("Worker ID..: " + workerId);

  int workerCount = workerController.getNumberOfWorkers();
  LOG.info("Worker Count..: " + workerCount);

  mpiMaster.startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort,
      numberOfWorkers);

  // The host file is opened in append mode; try-with-resources closes it even when a
  // write fails.
  try (Writer writer = new BufferedWriter(
      new OutputStreamWriter(new FileOutputStream("/twister2/hostFile", true)))) {
    for (int i = 0; i < workerCount; i++) {
      writer.write(workerInfoList.get(i).getWorkerIP() + "\n");
      LOG.info("Host IP..: " + workerInfoList.get(i).getWorkerIP());
    }
  }

  // mpi master has the id equals to 1
  // id==0 is job master
  String mpiClassNameToRun = "edu.iu.dsc.tws.rsched.schedulers.mesos.mpi.MesosMPIWorkerStarter";
  LOG.info("Before mpirun");
  String[] command = { "mpirun", "-allow-run-as-root", "-npernode", "1", "--mca", "btl_tcp_if_include", "eth0", "--hostfile", "/twister2/hostFile", "java", "-cp", "twister2-job/libexamples-java.jar:twister2-core/lib/*", mpiClassNameToRun, mpiMaster.jobName, jobMasterIP };
  LOG.info("command:" + String.join(" ", command));

  StringBuilder outputBuilder = new StringBuilder();
  ProcessUtils.runSyncProcess(false, command, outputBuilder, new File("."), true);

  mpiMaster.jobMasterAgent.sendWorkerCompletedMessage(JobMasterAPI.WorkerState.COMPLETED);
  mpiMaster.jobMasterAgent.close();
  workerController.close();
  LOG.info("Job DONE");
}
Aggregations