Search in sources :

Example 1 with ZKJobMasterFinder

use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.

the class NomadWorkerStarter method createWorkerController.

/**
 * Creates the worker controller for this Nomad worker.
 * <p>
 * The worker index is read from the NOMAD_ALLOC_INDEX environment variable and used
 * as the worker id. The Job Master address is taken from the config when the Job
 * Master runs in the client; otherwise it is looked up with a ZKJobMasterFinder,
 * waiting up to 20 seconds for the Job Master to appear. Finally the master agent
 * is created and its worker controller returned.
 *
 * @return the worker controller provided by the job master agent
 * @throws NumberFormatException if NOMAD_ALLOC_INDEX is missing or is not an integer
 * @throws IllegalStateException if no Job Master address is available after waiting
 */
private IWorkerController createWorkerController() {
    // first get the worker id
    String indexEnv = System.getenv("NOMAD_ALLOC_INDEX");
    String idEnv = System.getenv("NOMAD_ALLOC_ID");
    // parseInt yields the primitive directly; valueOf would box and then unbox
    int workerID = Integer.parseInt(indexEnv);
    MPIWorkerStarter.initWorkerLogger(config, workerID);
    LOG.log(Level.INFO, String.format("Worker id = %s and index = %d", idEnv, workerID));
    Map<String, Integer> ports = getPorts(config);
    Map<String, String> localIps = getIPAddress(ports);
    int numberOfWorkers = job.getNumberOfWorkers();
    LOG.info("Worker Count..: " + numberOfWorkers);
    JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, 0);
    int port = ports.get("worker");
    String host = localIps.get("worker");
    JobMasterAPI.NodeInfo nodeInfo = NomadContext.getNodeInfo(config, host);
    JobMasterAPI.WorkerInfo workerInfo = WorkerInfoUtils.createWorkerInfo(workerID, host, port, nodeInfo, computeResource, ports);
    int jobMasterPort;
    String jobMasterIP;
    // find the jobmaster
    if (!JobMasterContext.jobMasterRunsInClient(config)) {
        ZKJobMasterFinder finder = new ZKJobMasterFinder(config, job.getJobId());
        finder.initialize();
        String jobMasterIPandPort;
        try {
            jobMasterIPandPort = finder.getJobMasterIPandPort();
            if (jobMasterIPandPort == null) {
                LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
                jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
            }
            LOG.info("Job Master address: " + jobMasterIPandPort);
        } finally {
            // release the finder even when the lookup fails
            finder.close();
        }
        // fail with a clear message instead of a NullPointerException on substring below
        if (jobMasterIPandPort == null) {
            throw new IllegalStateException("Job Master address is not available for job: " + job.getJobId());
        }
        // the address has the form <ip>:<port>; split on the last colon
        int separator = jobMasterIPandPort.lastIndexOf(":");
        jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separator + 1));
        jobMasterIP = jobMasterIPandPort.substring(0, separator);
    } else {
        jobMasterIP = JobMasterContext.jobMasterIP(config);
        jobMasterPort = JobMasterContext.jobMasterPort(config);
    }
    // apply job-level config overrides before the master agent is created
    config = JobUtils.overrideConfigs(job, config);
    config = JobUtils.updateConfigs(job, config);
    this.masterClient = createMasterAgent(config, jobMasterIP, jobMasterPort, workerInfo, numberOfWorkers);
    return masterClient.getJMWorkerController();
}
Also used : JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) ZKJobMasterFinder(edu.iu.dsc.tws.common.zk.ZKJobMasterFinder) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI)

Example 2 with ZKJobMasterFinder

use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.

the class ZKJobMasterFinderExample method main.

/**
 * Looks up the Job Master address of the job "test-job" with a ZKJobMasterFinder.
 * <p>
 * Intended to be run together with ZKJobMasterRegistrarExample.java, which
 * registers the Job Master that this program discovers.
 * <p>
 * If the address is not available on the first attempt, the finder is asked to
 * wait (with a limit of 20000) for the Job Master to be registered.
 *
 * @param args exactly one argument: the ZooKeeper server address;
 *             otherwise the usage text is printed and the program returns
 */
public static void main(String[] args) {
    // exactly one argument (the ZooKeeper address) is expected
    if (args.length != 1) {
        printUsage();
        return;
    }

    Config testConfig = buildTestConfig(args[0]);
    ZKJobMasterFinder jobMasterFinder = new ZKJobMasterFinder(testConfig, "test-job");
    jobMasterFinder.initialize();

    String address = jobMasterFinder.getJobMasterIPandPort();
    if (address == null) {
        // not registered yet: block until it joins or the wait limit elapses
        LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
        address = jobMasterFinder.waitAndGetJobMasterIPandPort(20000);
    }
    LOG.info("Job Master address: " + address);

    jobMasterFinder.close();
    LOG.info("Done, exiting ...");
}
Also used : ZKJobMasterFinder(edu.iu.dsc.tws.common.zk.ZKJobMasterFinder) Config(edu.iu.dsc.tws.api.config.Config)

Example 3 with ZKJobMasterFinder

use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.

the class MesosDockerWorker method main.

/**
 * Entry point of the Mesos docker worker.
 * <p>
 * Reads WORKER_ID, JOB_NAME and COMPUTE_RESOURCE_INDEX from the environment, loads
 * the "mesos" config and the job file, creates the worker controller, looks up the
 * Job Master address with a ZKJobMasterFinder, starts the job master agent and then
 * starts and closes the worker.
 *
 * @param args not used
 * @throws Exception if the worker controller cannot be created, if the Job Master
 *                   address is not available after waiting, or if any later step fails
 */
public static void main(String[] args) throws Exception {
    workerId = Integer.parseInt(System.getenv("WORKER_ID"));
    jobID = System.getenv("JOB_NAME");
    MesosDockerWorker worker = new MesosDockerWorker();
    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));
    MesosWorkerLogger logger = new MesosWorkerLogger(config, "/persistent-volume/logs", "worker" + workerId);
    logger.initLogging();
    LOG.info("WORKER ID ..:" + workerId);
    Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(config, startingPort);
    MesosWorkerController workerController;
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobID + ".job");
    try {
        JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
        workerController = new MesosWorkerController(config, job, Inet4Address.getLocalHost().getHostAddress(), 2023, workerId, computeResource, additionalPorts);
    } catch (Exception e) {
        LOG.severe("Error " + e.getMessage());
        // without a controller the worker cannot run; previously execution continued
        // and failed later with a NullPointerException on workerController
        throw e;
    }
    // find the jobmaster
    ZKJobMasterFinder finder = new ZKJobMasterFinder(config, job.getJobId());
    finder.initialize();
    String jobMasterIPandPort;
    try {
        jobMasterIPandPort = finder.getJobMasterIPandPort();
        if (jobMasterIPandPort == null) {
            LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
            jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
        }
        LOG.info("Job Master address: " + jobMasterIPandPort);
    } finally {
        // release the finder even when the lookup fails
        finder.close();
    }
    // fail with a clear message instead of a NullPointerException on substring below
    if (jobMasterIPandPort == null) {
        throw new IllegalStateException("Job Master address is not available for job: " + job.getJobId());
    }
    // the address has the form <ip>:<port>; split on the last colon
    int separator = jobMasterIPandPort.lastIndexOf(":");
    int jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separator + 1));
    String jobMasterIP = jobMasterIPandPort.substring(0, separator);
    int workerCount = job.getNumberOfWorkers();
    LOG.info("Worker Count..: " + workerCount);
    LOG.info(workerController.getWorkerInfo().toString());
    // start job master client
    worker.startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort, workerCount);
    config = JobUtils.overrideConfigs(job, config);
    config = JobUtils.updateConfigs(job, config);
    startWorker(workerController, null);
    try {
        Thread.sleep(3000);
    } catch (InterruptedException e) {
        LOG.info("sleep exception" + e.getMessage());
    }
    closeWorker();
}
Also used : ZKJobMasterFinder(edu.iu.dsc.tws.common.zk.ZKJobMasterFinder) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI)

Example 4 with ZKJobMasterFinder

use of edu.iu.dsc.tws.common.zk.ZKJobMasterFinder in project twister2 by DSC-SPIDAL.

the class MesosMPIMasterStarter method main.

/**
 * Entry point of the Mesos MPI master.
 * <p>
 * Reads WORKER_ID, JOB_NAME and COMPUTE_RESOURCE_INDEX from the environment, loads
 * the "mesos" config and the job file, creates the worker controller and obtains the
 * list of all workers from it, looks up the Job Master address with a
 * ZKJobMasterFinder, starts the job master agent, appends every worker IP to
 * /twister2/hostFile and finally launches MesosMPIWorkerStarter through mpirun.
 *
 * @param args not used
 * @throws Exception if the worker controller cannot be set up, if the Job Master
 *                   address is not available after waiting, or if any later step fails
 */
public static void main(String[] args) throws Exception {
    MesosMPIMasterStarter mpiMaster = new MesosMPIMasterStarter();
    int workerId = Integer.parseInt(System.getenv("WORKER_ID"));
    mpiMaster.jobName = System.getenv("JOB_NAME");
    resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));
    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    mpiMaster.config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    MesosWorkerLogger logger = new MesosWorkerLogger(mpiMaster.config, "/persistent-volume/logs", "mpiMaster");
    logger.initLogging();
    Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(mpiMaster.config, startingPort);
    MesosWorkerController workerController;
    List<JobMasterAPI.WorkerInfo> workerInfoList;
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + mpiMaster.jobName + ".job");
    try {
        JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
        workerController = new MesosWorkerController(mpiMaster.config, job, Inet4Address.getLocalHost().getHostAddress(), 2023, workerId, computeResource, additionalPorts);
        LOG.info("Initializing with zookeeper");
        workerController.initializeWithZooKeeper();
        LOG.info("Waiting for all workers to join");
        workerInfoList = workerController.getAllWorkers();
        LOG.info("Everyone has joined");
    } catch (Exception e) {
        LOG.severe("Host unkown " + e.getMessage());
        // without a controller and worker list nothing below can work; previously
        // execution continued and failed later on the null workerController
        throw e;
    }
    ZKJobMasterFinder finder = new ZKJobMasterFinder(mpiMaster.config, job.getJobId());
    finder.initialize();
    String jobMasterIPandPort;
    try {
        jobMasterIPandPort = finder.getJobMasterIPandPort();
        if (jobMasterIPandPort == null) {
            LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
            jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
        }
        LOG.info("Job Master address: " + jobMasterIPandPort);
    } finally {
        // release the finder even when the lookup fails
        finder.close();
    }
    // fail with a clear message instead of a NullPointerException on substring below
    if (jobMasterIPandPort == null) {
        throw new IllegalStateException("Job Master address is not available for job: " + job.getJobId());
    }
    // the address has the form <ip>:<port>; split on the last colon
    int separator = jobMasterIPandPort.lastIndexOf(":");
    int jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separator + 1));
    String jobMasterIP = jobMasterIPandPort.substring(0, separator);
    LOG.info("JobMaster IP..: " + jobMasterIP);
    LOG.info("Worker ID..: " + workerId);
    StringBuilder outputBuilder = new StringBuilder();
    int workerCount = workerController.getNumberOfWorkers();
    LOG.info("Worker Count..: " + workerCount);
    // pass the real worker count; the previous code passed a local that was always 0
    mpiMaster.startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort, workerCount);
    // append one worker IP per line to the MPI host file; try-with-resources closes
    // the writer even if a write fails
    try (Writer writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("/twister2/hostFile", true)))) {
        for (int i = 0; i < workerCount; i++) {
            writer.write(workerInfoList.get(i).getWorkerIP() + "\n");
            LOG.info("Host IP..: " + workerInfoList.get(i).getWorkerIP());
        }
    }
    // mpi master has the id equals to 1
    // id==0 is job master
    String mpiClassNameToRun = "edu.iu.dsc.tws.rsched.schedulers.mesos.mpi.MesosMPIWorkerStarter";
    LOG.info("Before mpirun");
    String[] command = { "mpirun", "-allow-run-as-root", "-npernode", "1", "--mca", "btl_tcp_if_include", "eth0", "--hostfile", "/twister2/hostFile", "java", "-cp", "twister2-job/libexamples-java.jar:twister2-core/lib/*", mpiClassNameToRun, mpiMaster.jobName, jobMasterIP };
    LOG.info("command:" + String.join(" ", command));
    ProcessUtils.runSyncProcess(false, command, outputBuilder, new File("."), true);
    mpiMaster.jobMasterAgent.sendWorkerCompletedMessage(JobMasterAPI.WorkerState.COMPLETED);
    mpiMaster.jobMasterAgent.close();
    workerController.close();
    LOG.info("Job DONE");
}
Also used : MesosWorkerLogger(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger) ZKJobMasterFinder(edu.iu.dsc.tws.common.zk.ZKJobMasterFinder) ArrayList(java.util.ArrayList) MesosWorkerController(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) BufferedWriter(java.io.BufferedWriter) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) Writer(java.io.Writer)

Aggregations

ZKJobMasterFinder (edu.iu.dsc.tws.common.zk.ZKJobMasterFinder)4 JobAPI (edu.iu.dsc.tws.proto.system.job.JobAPI)3 JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI)2 Config (edu.iu.dsc.tws.api.config.Config)1 MesosWorkerController (edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController)1 MesosWorkerLogger (edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger)1 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Writer (java.io.Writer)1 ArrayList (java.util.ArrayList)1