Search in sources :

Example 6 with JobMaster

use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.

the class MesosJobMasterStarter method main.

/**
 * Entry point of the Mesos job master process.
 *
 * <p>Reads the job description from environment variables and the
 * {@code twister2-job} directory, registers the job master in ZooKeeper
 * through a {@link ZKJobMasterRegistrar}, starts the {@link JobMaster}
 * (unless it is configured to run in the client) and finally removes the
 * ZooKeeper registration again.
 *
 * <p>Environment variables read: {@code WORKER_ID}, {@code JOB_NAME},
 * {@code JOB_ID} and {@code FRAMEWORK_ID}.
 *
 * @param args command line arguments, not used
 * @throws NumberFormatException if {@code WORKER_ID} is missing or not an integer
 * @throws IllegalStateException if the local host address can not be resolved,
 *         since the job master can then neither be registered nor started
 */
public static void main(String[] args) {
    // we can not initialize the logger fully yet,
    // but we need to set the format as the first thing

    // The worker id itself is not used below; parsing is kept so that a missing
    // or malformed WORKER_ID still fails immediately, as it did before.
    Integer.parseInt(System.getenv("WORKER_ID"));
    String jobName = System.getenv("JOB_NAME");
    String jobId = System.getenv("JOB_ID");
    String frameworkId = System.getenv("FRAMEWORK_ID");

    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    Config config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    Config.Builder builder = Config.newBuilder().putAll(config);
    builder.put(Context.JOB_ID, jobId);
    config = builder.build();

    JobTerminator terminator = new JobTerminator(config, frameworkId);
    MesosWorkerLogger logger = new MesosWorkerLogger(config, "/persistent-volume/logs", "master");
    logger.initLogging();

    edu.iu.dsc.tws.rsched.schedulers.mesos.MesosController controller =
        new edu.iu.dsc.tws.rsched.schedulers.mesos.MesosController(config);
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobName + ".job");

    // Resolve the local address once and register the job master in ZooKeeper.
    // Without an address there is nothing to register and nothing to start, so
    // fail with the real cause instead of a NullPointerException further down.
    String hostAddress;
    ZKJobMasterRegistrar registrar;
    try {
        hostAddress = Inet4Address.getLocalHost().getHostAddress();
        registrar = new ZKJobMasterRegistrar(config, hostAddress, 11011, job.getJobId());
        LOG.info("JobMaster REGISTERED..:" + hostAddress);
    } catch (UnknownHostException e) {
        LOG.log(Level.SEVERE, "JobMaster CAN NOT BE REGISTERED:", e);
        throw new IllegalStateException(
            "Can not resolve the local host address, job master can not be registered", e);
    }

    boolean initialized = registrar.initialize();
    if (!initialized) {
        LOG.info("CAN NOT INITIALIZE");
        // a znode left over from a previous run blocks the registration:
        // remove it and try once more
        if (registrar.sameZNodeExist()) {
            registrar.deleteJobMasterZNode();
            initialized = registrar.initialize();
            if (!initialized) {
                LOG.warning("CAN NOT INITIALIZE after deleting the existing job master znode");
            }
        }
    }

    try {
        if (!JobMasterContext.jobMasterRunsInClient(config)) {
            try {
                JobMasterAPI.NodeInfo jobMasterNodeInfo = MesosContext.getNodeInfo(config, hostAddress);
                IScalerPerCluster clusterScaler = new NullScaler();
                // NOTE(review): the MesosScaler is created and configured but the
                // NullScaler is what gets handed to the JobMaster - confirm this is intended.
                MesosScaler mesosScaler = new MesosScaler(config, job, controller);
                mesosScaler.setFrameWorkId(frameworkId);
                JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;

                JobMaster jobMaster = new JobMaster(config, hostAddress, terminator, job,
                    jobMasterNodeInfo, clusterScaler, initialState);
                LOG.info("JobMaster host address...:" + hostAddress);
                jobMaster.startJobMasterBlocking();
            } catch (Exception e) {
                // the host address is resolved above, so anything caught here
                // comes from creating or running the job master
                LOG.log(Level.SEVERE, "Exception when starting Job master: ", e);
            }
        }
        waitIndefinitely();
    } finally {
        // always remove the ZooKeeper registration, also on unexpected failures
        registrar.deleteJobMasterZNode();
        registrar.close();
    }
}
Also used : JobMaster(edu.iu.dsc.tws.master.server.JobMaster) MesosWorkerLogger(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger) UnknownHostException(java.net.UnknownHostException) NullScaler(edu.iu.dsc.tws.api.driver.NullScaler) Config(edu.iu.dsc.tws.api.config.Config) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) IScalerPerCluster(edu.iu.dsc.tws.api.driver.IScalerPerCluster) UnknownHostException(java.net.UnknownHostException) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) MesosScaler(edu.iu.dsc.tws.rsched.schedulers.mesos.driver.MesosScaler) ZKJobMasterRegistrar(edu.iu.dsc.tws.common.zk.ZKJobMasterRegistrar)

Example 7 with JobMaster

use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.

the class NomadJobMasterStarter method launch.

/**
 * Launches the job master inside a Nomad allocation.
 *
 * <p>Reads {@code NOMAD_ALLOC_INDEX} and {@code NOMAD_ALLOC_ID} from the
 * environment, registers the job master in ZooKeeper, runs the
 * {@link JobMaster} with a blocking start, removes the ZooKeeper
 * registration and finally starts the job through the controller.
 *
 * @return false if setup fails, i.e. the local host address can not be
 *         resolved or the ZooKeeper registrar can not be created; otherwise
 *         the result of {@code controller.start(job)}
 * @throws NumberFormatException if {@code NOMAD_ALLOC_INDEX} is missing or not an integer
 */
public boolean launch() {
    String indexEnv = System.getenv("NOMAD_ALLOC_INDEX");
    String idEnv = System.getenv("NOMAD_ALLOC_ID");
    int workerID = Integer.parseInt(indexEnv);
    MPIWorkerStarter.initJMLogger(config);
    LOG.log(Level.INFO, String.format("Worker id = %s and index = %d", idEnv, workerID));

    int port = JobMasterContext.jobMasterPort(config);

    // Everything below needs the local address; give up early instead of
    // carrying a null host into the registrar and the job master.
    String hostAddress;
    try {
        hostAddress = Inet4Address.getLocalHost().getHostAddress();
    } catch (UnknownHostException e) {
        LOG.log(Level.SEVERE, "Can not resolve the local host address", e);
        return false;
    }

    ZKJobMasterRegistrar registrar;
    try {
        registrar = new ZKJobMasterRegistrar(config, hostAddress, port, job.getJobId());
        LOG.info("JobMaster REGISTERED..:" + hostAddress);
    } catch (Exception e) {
        LOG.log(Level.SEVERE, "JobMaster CAN NOT BE REGISTERED:", e);
        return false;
    }

    boolean initialized = registrar.initialize();
    if (!initialized) {
        LOG.info("CAN NOT INITIALIZE");
        // a znode left over from a previous run blocks the registration:
        // remove it and try once more
        if (registrar.sameZNodeExist()) {
            registrar.deleteJobMasterZNode();
            initialized = registrar.initialize();
            if (!initialized) {
                LOG.warning("CAN NOT INITIALIZE after deleting the existing job master znode");
            }
        }
    }

    try {
        // start the Job Master locally
        JobMasterAPI.NodeInfo jobMasterNodeInfo = NomadContext.getNodeInfo(config, hostAddress);
        IScalerPerCluster clusterScaler = new NullScaler();
        int workerCount = job.getNumberOfWorkers();
        LOG.info("Worker Count..: " + workerCount);

        LOG.log(Level.INFO, String.format("Starting the Job Master: %s:%d", hostAddress, port));
        JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
        NullTerminator nt = new NullTerminator();
        JobMaster jobMaster = new JobMaster(config, hostAddress, nt, job, jobMasterNodeInfo,
            clusterScaler, initialState);
        jobMaster.addShutdownHook(true);
        try {
            jobMaster.startJobMasterBlocking();
        } catch (Twister2Exception e) {
            LOG.log(Level.SEVERE, e.getMessage(), e);
        }
        waitIndefinitely();
    } finally {
        // always remove the ZooKeeper registration, also on unexpected failures
        registrar.deleteJobMasterZNode();
        registrar.close();
    }

    return controller.start(job);
}
Also used : JobMaster(edu.iu.dsc.tws.master.server.JobMaster) Twister2Exception(edu.iu.dsc.tws.api.exceptions.Twister2Exception) UnknownHostException(java.net.UnknownHostException) NullScaler(edu.iu.dsc.tws.api.driver.NullScaler) Twister2Exception(edu.iu.dsc.tws.api.exceptions.Twister2Exception) UnknownHostException(java.net.UnknownHostException) ParseException(org.apache.commons.cli.ParseException) IScalerPerCluster(edu.iu.dsc.tws.api.driver.IScalerPerCluster) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) ZKJobMasterRegistrar(edu.iu.dsc.tws.common.zk.ZKJobMasterRegistrar) NullTerminator(edu.iu.dsc.tws.rsched.schedulers.NullTerminator)

Example 8 with JobMaster

use of edu.iu.dsc.tws.master.server.JobMaster in project twister2 by DSC-SPIDAL.

the class NomadMasterStarter method launch.

/**
 * Launches the job: prepares the working directory, initializes the Nomad
 * controller, optionally starts the job master in this (client) process and
 * then starts the job through the controller.
 *
 * <p>When the job master runs in the client, this method blocks after
 * starting the job until the job master thread finishes or the calling
 * thread is interrupted.
 *
 * @return the result of {@code controller.start(job)}
 * @throws RuntimeException if the working directory can not be set up, the
 *         local host address can not be resolved or the job master can not be started
 */
public boolean launch() {
    // get the job working directory
    String jobWorkingDirectory = NomadContext.workingDirectory(config);
    LOG.log(Level.INFO, "job working directory ....." + jobWorkingDirectory);

    if (NomadContext.sharedFileSystem(config)
        && !setupWorkingDirectory(job, jobWorkingDirectory)) {
        throw new RuntimeException("Failed to setup the directory");
    }

    Config newConfig = Config.newBuilder()
        .putAll(config)
        .put(SchedulerContext.WORKING_DIRECTORY, jobWorkingDirectory)
        .build();

    // now create and initialize the controller that is used to start the job
    IController controller = new NomadController(true);
    controller.initialize(newConfig);

    // start the Job Master locally, if it is configured to run in the client
    boolean runsInClient = JobMasterContext.jobMasterRunsInClient(config);
    Thread jmThread = null;
    if (runsInClient) {
        try {
            int port = JobMasterContext.jobMasterPort(config);
            String hostAddress = JobMasterContext.jobMasterIP(config);
            if (hostAddress == null) {
                // no address configured: fall back to the address of this machine
                hostAddress = InetAddress.getLocalHost().getHostAddress();
            }
            LOG.log(Level.INFO, String.format("Starting the job manager: %s:%d", hostAddress, port));

            JobMasterAPI.NodeInfo jobMasterNodeInfo = NomadContext.getNodeInfo(config, hostAddress);
            IScalerPerCluster clusterScaler = new NullScaler();
            JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
            NullTerminator nt = new NullTerminator();
            JobMaster jobMaster = new JobMaster(config, hostAddress, nt, job, jobMasterNodeInfo,
                clusterScaler, initialState);
            jobMaster.addShutdownHook(true);
            jmThread = jobMaster.startJobMasterThreaded();
        } catch (UnknownHostException e) {
            LOG.log(Level.SEVERE, "Exception when getting local host address: ", e);
            throw new RuntimeException(e);
        } catch (Twister2Exception e) {
            LOG.log(Level.SEVERE, "Exception when starting Job master: ", e);
            throw new RuntimeException(e);
        }
    }

    boolean start = controller.start(job);

    // now lets wait on client
    if (runsInClient && jmThread != null) {
        try {
            jmThread.join();
        } catch (InterruptedException e) {
            // stop waiting, but keep the interrupt visible to the caller
            Thread.currentThread().interrupt();
        }
    }
    return start;
}
Also used : JobMaster(edu.iu.dsc.tws.master.server.JobMaster) Twister2Exception(edu.iu.dsc.tws.api.exceptions.Twister2Exception) IController(edu.iu.dsc.tws.api.scheduler.IController) UnknownHostException(java.net.UnknownHostException) NullScaler(edu.iu.dsc.tws.api.driver.NullScaler) Config(edu.iu.dsc.tws.api.config.Config) IScalerPerCluster(edu.iu.dsc.tws.api.driver.IScalerPerCluster) NomadController(edu.iu.dsc.tws.rsched.schedulers.nomad.NomadController) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) NullTerminator(edu.iu.dsc.tws.rsched.schedulers.NullTerminator)

Aggregations

JobMaster (edu.iu.dsc.tws.master.server.JobMaster)8 JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI)8 Twister2Exception (edu.iu.dsc.tws.api.exceptions.Twister2Exception)7 IScalerPerCluster (edu.iu.dsc.tws.api.driver.IScalerPerCluster)5 NullScaler (edu.iu.dsc.tws.api.driver.NullScaler)5 NullTerminator (edu.iu.dsc.tws.rsched.schedulers.NullTerminator)5 UnknownHostException (java.net.UnknownHostException)5 Config (edu.iu.dsc.tws.api.config.Config)4 K8sScaler (edu.iu.dsc.tws.rsched.schedulers.k8s.driver.K8sScaler)3 IController (edu.iu.dsc.tws.api.scheduler.IController)2 ZKJobMasterRegistrar (edu.iu.dsc.tws.common.zk.ZKJobMasterRegistrar)2 JobAPI (edu.iu.dsc.tws.proto.system.job.JobAPI)2 KubernetesController (edu.iu.dsc.tws.rsched.schedulers.k8s.KubernetesController)2 Twister2Job (edu.iu.dsc.tws.api.Twister2Job)1 Twister2RuntimeException (edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException)1 Twister2JobState (edu.iu.dsc.tws.api.scheduler.Twister2JobState)1 IJobTerminator (edu.iu.dsc.tws.master.IJobTerminator)1 JobMasterState (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI.JobMasterState)1 JobTerminator (edu.iu.dsc.tws.rsched.schedulers.k8s.master.JobTerminator)1 MesosWorkerLogger (edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger)1