Search in sources :

Example 1 with MesosWorkerController

use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController in project twister2 by DSC-SPIDAL.

The method main of the class MesosMPISlaveStarter.

/**
 * Entry point of the Mesos MPI slave process.
 *
 * <p>Reads the worker id, job name and compute-resource index from the
 * {@code WORKER_ID}, {@code JOB_NAME} and {@code COMPUTE_RESOURCE_INDEX}
 * environment variables, loads the "mesos" configuration from the
 * {@code twister2-job} directory under the current working directory, registers
 * this worker through a {@link MesosWorkerController}, and asks the controller
 * for the list of all workers. The process is then kept alive by a long sleep.
 *
 * <p>A failure during controller setup is logged with its stack trace and does
 * not terminate the process; the controller, if it was created, is always closed.
 *
 * @param args command-line arguments; not used
 * @throws Exception if the environment variables are missing or malformed, if the
 *     configuration cannot be loaded, or if the final sleep is interrupted
 */
public static void main(String[] args) throws Exception {
    workerID = Integer.parseInt(System.getenv("WORKER_ID"));
    jobName = System.getenv("JOB_NAME");
    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));
    Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(config, startingPort);
    MesosWorkerController workerController = null;
    try {
        JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobName + ".job");
        JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
        workerController = new MesosWorkerController(config, job, Inet4Address.getLocalHost().getHostAddress(), 2023, workerID, computeResource, additionalPorts);
        LOG.info("Initializing with zookeeper ");
        workerController.initializeWithZooKeeper();
        LOG.info("Waiting for all workers to join");
        // The returned list is not needed here; the call is kept because the
        // surrounding log messages indicate it is used to wait for the other workers.
        workerController.getAllWorkers();
        LOG.info("Everyone has joined");
        Thread.sleep(30000);
    } catch (Exception e) {
        // Any step above can fail (job file, host lookup, ZooKeeper, interruption),
        // so log the real cause with its stack trace instead of a fixed "Host unknown".
        LOG.log(java.util.logging.Level.SEVERE, "Worker controller setup failed", e);
    } finally {
        // Release the controller on both the success and the failure path.
        if (workerController != null) {
            try {
                workerController.close();
            } catch (Exception e) {
                LOG.log(java.util.logging.Level.WARNING, "Could not close the worker controller", e);
            }
        }
    }
    // Keep the process alive (3,000,000 ms = 50 minutes).
    Thread.sleep(3000000);
}
Also used : ArrayList(java.util.ArrayList) MesosWorkerController(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI)

Example 2 with MesosWorkerController

use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController in project twister2 by DSC-SPIDAL.

The method main of the class MesosMPIWorkerStarter.

/**
 * Entry point of an MPI worker process.
 *
 * <p>Initializes MPI and takes the worker id and worker count from the rank and
 * size of {@code MPI.COMM_WORLD}. It then loads the "mesos" configuration, sets up
 * file logging, creates and registers a {@link MesosWorkerController}, starts the
 * job master agent and the worker, and finally shuts MPI down.
 *
 * @param args {@code args[0]} is the job name and {@code args[1]} is the job
 *     master IP address (passed as a parameter because the docker environment
 *     variable is not accessible here)
 * @throws IllegalArgumentException if fewer than two arguments are supplied
 * @throws RuntimeException if MPI cannot report rank/size, or if the worker
 *     controller cannot be created and initialized
 */
public static void main(String[] args) {
    try {
        MPI.Init(args);
        workerID = MPI.COMM_WORLD.getRank();
        numberOfWorkers = MPI.COMM_WORLD.getSize();
        // The file logger is not initialized yet, so this goes to stdout.
        System.out.println("Worker ranking..:" + workerID + " Number of workers..:" + numberOfWorkers);
    } catch (MPIException e) {
        LOG.log(Level.SEVERE, "Could not get rank or size from mpi.COMM_WORLD", e);
        throw new RuntimeException(e);
    }
    // Validate up front instead of failing later with ArrayIndexOutOfBoundsException.
    if (args.length < 2) {
        throw new IllegalArgumentException("Expected arguments <jobName> <jobMasterIP> but got " + args.length + " argument(s)");
    }
    jobName = args[0];
    // can not access docker env variable so it was passed as a parameter
    String jobMasterIP = args[1];
    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    MesosWorkerLogger logger = new MesosWorkerLogger(config, "/persistent-volume/logs", "worker" + workerID);
    logger.initLogging();
    Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(config, startingPort);
    MesosWorkerController workerController;
    try {
        JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobName + ".job");
        // add any configuration from job file to the config object
        // if there are the same config parameters in both,
        // job file configurations will override
        config = JobUtils.overrideConfigs(job, config);
        config = JobUtils.updateConfigs(job, config);
        // this will change to get proper resource index.
        JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
        LOG.info("in worker starter...... job worker count:" + job.getNumberOfWorkers());
        workerController = new MesosWorkerController(config, job, Inet4Address.getLocalHost().getHostAddress(), 2023, workerID, computeResource, additionalPorts);
        workerController.initializeWithZooKeeper();
    } catch (Exception e) {
        // Everything below dereferences the controller. Stop here with the real
        // cause attached instead of hitting a NullPointerException further down.
        LOG.log(Level.SEVERE, "Could not create or initialize the worker controller", e);
        throw new RuntimeException(e);
    }
    LOG.info("JobMaster IP..: " + jobMasterIP);
    LOG.info("Worker ID..: " + workerID);
    int jobMasterPort = JobMasterContext.jobMasterPort(config);
    startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort);
    LOG.info("\nWorker Controller\nWorker ID..: " + workerController.getWorkerInfo().getWorkerID() + "\nIP address..: " + workerController.getWorkerInfo().getWorkerIP());
    startWorker(workerController, null);
    try {
        Thread.sleep(2000);
    } catch (InterruptedException e) {
        LOG.info("sleep exception" + e.getMessage());
    }
    try {
        MPI.Finalize();
    } catch (MPIException e) {
        LOG.info("MPI Finalize Exception" + e.getMessage());
    }
    closeWorker();
}
Also used : MesosWorkerLogger(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger) MesosWorkerController(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) MPIException(mpi.MPIException)

Example 3 with MesosWorkerController

use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController in project twister2 by DSC-SPIDAL.

The method main of the class MesosMPIMasterStarter.

/**
 * Entry point of the Mesos MPI master process.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Read the worker id, job name and compute-resource index from the
 *       {@code WORKER_ID}, {@code JOB_NAME} and {@code COMPUTE_RESOURCE_INDEX}
 *       environment variables, load the "mesos" configuration and set up logging.</li>
 *   <li>Create and register a {@link MesosWorkerController} and fetch the list of
 *       all workers.</li>
 *   <li>Look up the job master address ({@code ip:port}) through
 *       {@link ZKJobMasterFinder}, waiting up to 20000 (the argument passed to
 *       {@code waitAndGetJobMasterIPandPort}) if it is not available yet.</li>
 *   <li>Start the job master agent and append every worker IP to
 *       {@code /twister2/hostFile}.</li>
 *   <li>Run {@code mpirun} synchronously to start {@code MesosMPIWorkerStarter},
 *       then report completion and release resources.</li>
 * </ol>
 *
 * @param args command-line arguments; not used
 * @throws IllegalStateException if the job master address cannot be obtained or
 *     is not of the form {@code ip:port}
 * @throws Exception if any setup step, the host-file write or the MPI launch fails
 */
public static void main(String[] args) throws Exception {
    MesosMPIMasterStarter mpiMaster = new MesosMPIMasterStarter();
    int workerId = Integer.parseInt(System.getenv("WORKER_ID"));
    mpiMaster.jobName = System.getenv("JOB_NAME");
    resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));
    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    mpiMaster.config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    MesosWorkerLogger logger = new MesosWorkerLogger(mpiMaster.config, "/persistent-volume/logs", "mpiMaster");
    logger.initLogging();
    Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(mpiMaster.config, startingPort);
    // NOTE(review): this stays 0 and is what gets passed to startJobMasterAgent below,
    // while the real count is read into workerCount - confirm which value is intended.
    int numberOfWorkers = 0;
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + mpiMaster.jobName + ".job");
    MesosWorkerController workerController;
    List<JobMasterAPI.WorkerInfo> workerInfoList;
    try {
        JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
        workerController = new MesosWorkerController(mpiMaster.config, job, Inet4Address.getLocalHost().getHostAddress(), 2023, workerId, computeResource, additionalPorts);
        LOG.info("Initializing with zookeeper");
        workerController.initializeWithZooKeeper();
        LOG.info("Waiting for all workers to join");
        workerInfoList = workerController.getAllWorkers();
        LOG.info("Everyone has joined");
    } catch (Exception e) {
        // The controller and the worker list are required below. Log the real cause
        // and stop, instead of continuing into a NullPointerException or an
        // IndexOutOfBoundsException.
        LOG.log(java.util.logging.Level.SEVERE, "Could not create or initialize the worker controller", e);
        throw e;
    }
    ZKJobMasterFinder finder = new ZKJobMasterFinder(mpiMaster.config, job.getJobId());
    String jobMasterIPandPort;
    try {
        finder.initialize();
        jobMasterIPandPort = finder.getJobMasterIPandPort();
        if (jobMasterIPandPort == null) {
            LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
            jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
        }
        LOG.info("Job Master address: " + jobMasterIPandPort);
    } finally {
        finder.close();
    }
    if (jobMasterIPandPort == null) {
        throw new IllegalStateException("Job Master address could not be obtained from ZooKeeper");
    }
    // The address has the form ip:port; split on the last colon.
    int separator = jobMasterIPandPort.lastIndexOf(':');
    if (separator < 0) {
        throw new IllegalStateException("Malformed Job Master address (expected ip:port): " + jobMasterIPandPort);
    }
    int jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separator + 1));
    String jobMasterIP = jobMasterIPandPort.substring(0, separator);
    LOG.info("JobMaster IP..: " + jobMasterIP);
    LOG.info("Worker ID..: " + workerId);
    StringBuilder outputBuilder = new StringBuilder();
    int workerCount = workerController.getNumberOfWorkers();
    LOG.info("Worker Count..: " + workerCount);
    mpiMaster.startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort, numberOfWorkers);
    // Append one worker IP per line; try-with-resources closes the file even when a write fails.
    try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("/twister2/hostFile", true)))) {
        for (int i = 0; i < workerCount; i++) {
            writer.write(workerInfoList.get(i).getWorkerIP() + "\n");
            LOG.info("Host IP..: " + workerInfoList.get(i).getWorkerIP());
        }
    }
    // mpi master has the id equals to 1
    // id==0 is job master
    String mpiClassNameToRun = "edu.iu.dsc.tws.rsched.schedulers.mesos.mpi.MesosMPIWorkerStarter";
    LOG.info("Before mpirun");
    // The last two entries become args[0] (job name) and args[1] (job master IP) of the started class.
    String[] command = { "mpirun", "-allow-run-as-root", "-npernode", "1", "--mca", "btl_tcp_if_include", "eth0", "--hostfile", "/twister2/hostFile", "java", "-cp", "twister2-job/libexamples-java.jar:twister2-core/lib/*", mpiClassNameToRun, mpiMaster.jobName, jobMasterIP };
    LOG.info("command:" + String.join(" ", command));
    ProcessUtils.runSyncProcess(false, command, outputBuilder, new File("."), true);
    mpiMaster.jobMasterAgent.sendWorkerCompletedMessage(JobMasterAPI.WorkerState.COMPLETED);
    mpiMaster.jobMasterAgent.close();
    workerController.close();
    LOG.info("Job DONE");
}
Also used : MesosWorkerLogger(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger) ZKJobMasterFinder(edu.iu.dsc.tws.common.zk.ZKJobMasterFinder) ArrayList(java.util.ArrayList) MesosWorkerController(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) FileOutputStream(java.io.FileOutputStream) File(java.io.File) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Aggregations

JobAPI (edu.iu.dsc.tws.proto.system.job.JobAPI)3 MesosWorkerController (edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController)3 MesosWorkerLogger (edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger)2 ArrayList (java.util.ArrayList)2 ZKJobMasterFinder (edu.iu.dsc.tws.common.zk.ZKJobMasterFinder)1 JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI)1 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Writer (java.io.Writer)1 MPIException (mpi.MPIException)1