Search in sources :

Example 1 with MesosWorkerLogger

use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger in project twister2 by DSC-SPIDAL.

The main method of the class MesosJobMasterStarter.

/**
 * Entry point of the Mesos job master container.
 *
 * <p>Reads {@code WORKER_ID}, {@code JOB_NAME}, {@code JOB_ID} and {@code FRAMEWORK_ID}
 * from the environment, loads the "mesos" configuration from {@code twister2-job},
 * initializes the worker logger, registers the job master through a
 * {@code ZKJobMasterRegistrar} and, unless the configuration says the job master runs
 * in the client, constructs a {@code JobMaster} and calls
 * {@code startJobMasterBlocking()}. Afterwards it calls {@code waitIndefinitely()} and
 * finally removes the job master znode and closes the registrar.
 *
 * @param args command line arguments; not read by this method
 * @throws IllegalStateException if the local host address can not be resolved, in which
 *     case the job master can not be registered at all
 * @throws NumberFormatException if {@code WORKER_ID} is missing or not an integer
 */
public static void main(String[] args) {
    // WORKER_ID is not used below. It is still parsed so that a missing or malformed
    // value keeps failing at startup, as it did before.
    Integer.parseInt(System.getenv("WORKER_ID"));
    String jobName = System.getenv("JOB_NAME");
    String jobId = System.getenv("JOB_ID");
    String frameworkId = System.getenv("FRAMEWORK_ID");

    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    Config config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    Config.Builder builder = Config.newBuilder().putAll(config);
    builder.put(Context.JOB_ID, jobId);
    config = builder.build();

    JobTerminator terminator = new JobTerminator(config, frameworkId);
    MesosWorkerLogger logger = new MesosWorkerLogger(config, "/persistent-volume/logs", "master");
    logger.initLogging();

    edu.iu.dsc.tws.rsched.schedulers.mesos.MesosController controller =
        new edu.iu.dsc.tws.rsched.schedulers.mesos.MesosController(config);
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobName + ".job");

    // Register the job master in ZooKeeper. Without a resolvable local address there is
    // nothing to register, so fail with the real cause instead of hitting a
    // NullPointerException on the registrar a few lines later.
    ZKJobMasterRegistrar registrar;
    try {
        String registeredAddress = Inet4Address.getLocalHost().getHostAddress();
        registrar = new ZKJobMasterRegistrar(config, registeredAddress, 11011, job.getJobId());
        LOG.info("JobMaster REGISTERED..:" + registeredAddress);
    } catch (UnknownHostException e) {
        LOG.log(Level.SEVERE, "JobMaster CAN NOT BE REGISTERED:", e);
        throw new IllegalStateException(
            "Can not resolve the local host address to register the job master", e);
    }

    try {
        if (!registrar.initialize()) {
            LOG.info("CAN NOT INITIALIZE");
            // A znode for the same job master already exists: remove it and retry once.
            if (registrar.sameZNodeExist()) {
                registrar.deleteJobMasterZNode();
                if (!registrar.initialize()) {
                    LOG.severe("CAN NOT INITIALIZE after deleting the existing job master znode");
                }
            }
        }

        if (!JobMasterContext.jobMasterRunsInClient(config)) {
            try {
                String workerIp = Inet4Address.getLocalHost().getHostAddress();
                JobMasterAPI.NodeInfo jobMasterNodeInfo = MesosContext.getNodeInfo(config, workerIp);
                IScalerPerCluster clusterScaler = new NullScaler();
                // NOTE(review): mesosScaler is configured but the NullScaler above is what
                // gets passed to JobMaster -- confirm whether that is intended.
                MesosScaler mesosScaler = new MesosScaler(config, job, controller);
                mesosScaler.setFrameWorkId(frameworkId);
                JobMasterAPI.JobMasterState initialState = JobMasterAPI.JobMasterState.JM_STARTED;
                JobMaster jobMaster = new JobMaster(config, workerIp, terminator, job,
                    jobMasterNodeInfo, clusterScaler, initialState);
                LOG.info("JobMaster host address...:" + workerIp);
                jobMaster.startJobMasterBlocking();
            } catch (Exception e) {
                // Any failure while building or running the job master ends up here,
                // not only host-resolution problems.
                LOG.log(Level.SEVERE, "Exception when starting the job master: ", e);
            }
        }
        waitIndefinitely();
    } finally {
        // Always remove the registration, even when the steps above throw.
        registrar.deleteJobMasterZNode();
        registrar.close();
    }
}
Also used : JobMaster(edu.iu.dsc.tws.master.server.JobMaster) MesosWorkerLogger(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger) UnknownHostException(java.net.UnknownHostException) NullScaler(edu.iu.dsc.tws.api.driver.NullScaler) Config(edu.iu.dsc.tws.api.config.Config) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) IScalerPerCluster(edu.iu.dsc.tws.api.driver.IScalerPerCluster) UnknownHostException(java.net.UnknownHostException) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) MesosScaler(edu.iu.dsc.tws.rsched.schedulers.mesos.driver.MesosScaler) ZKJobMasterRegistrar(edu.iu.dsc.tws.common.zk.ZKJobMasterRegistrar)

Example 2 with MesosWorkerLogger

use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger in project twister2 by DSC-SPIDAL.

The main method of the class MesosMPIWorkerStarter.

/**
 * Entry point of an MPI worker launched for a Mesos job.
 *
 * <p>Initializes MPI and takes the worker id and worker count from
 * {@code MPI.COMM_WORLD}, loads the "mesos" configuration, initializes logging, reads the
 * job file, creates a {@code MesosWorkerController} and initializes it with ZooKeeper.
 * It then starts the job master agent and the worker, sleeps for two seconds,
 * finalizes MPI and closes the worker.
 *
 * @param args {@code args[0]} is the job name and {@code args[1]} is the job master IP
 *     (passed as a parameter because the docker environment variable is not accessible)
 * @throws RuntimeException if rank or size can not be read from MPI
 * @throws IllegalStateException if the worker controller could not be created
 */
public static void main(String[] args) {
    try {
        MPI.Init(args);
        workerID = MPI.COMM_WORLD.getRank();
        numberOfWorkers = MPI.COMM_WORLD.getSize();
        System.out.println("Worker ranking..:" + workerID + " Number of workers..:" + numberOfWorkers);
    } catch (MPIException e) {
        LOG.log(Level.SEVERE, "Could not get rank or size from mpi.COMM_WORLD", e);
        throw new RuntimeException(e);
    }

    jobName = args[0];
    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    MesosWorkerLogger logger = new MesosWorkerLogger(config, "/persistent-volume/logs", "worker" + workerID);
    logger.initLogging();

    MesosWorkerController workerController = null;
    Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(config, startingPort);
    try {
        JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobName + ".job");
        // add any configuration from job file to the config object
        // if there are the same config parameters in both,
        // job file configurations will override
        config = JobUtils.overrideConfigs(job, config);
        config = JobUtils.updateConfigs(job, config);
        // this will change to get proper resource index.
        JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
        LOG.info("in worker starter...... job worker count:" + job.getNumberOfWorkers());
        workerController = new MesosWorkerController(config, job,
            Inet4Address.getLocalHost().getHostAddress(), 2023, workerID, computeResource, additionalPorts);
        workerController.initializeWithZooKeeper();
    } catch (Exception e) {
        // Log the full exception, not just its message, so the cause is diagnosable.
        LOG.log(Level.SEVERE, "Error " + e.getMessage(), e);
        if (workerController == null) {
            // Nothing below can work without a controller; fail with the real cause
            // instead of a NullPointerException at the first use.
            throw new IllegalStateException(
                "Could not create the worker controller for worker " + workerID, e);
        }
        // The controller exists but its ZooKeeper initialization failed: keep going,
        // as before.
    }

    // can not access docker env variable so it was passed as a parameter
    String jobMasterIP = args[1];
    LOG.info("JobMaster IP..: " + jobMasterIP);
    LOG.info("Worker ID..: " + workerID);
    int jobMasterPort = JobMasterContext.jobMasterPort(config);
    startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort);
    LOG.info("\nWorker Controller\nWorker ID..: " + workerController.getWorkerInfo().getWorkerID() + "\nIP address..: " + workerController.getWorkerInfo().getWorkerIP());
    startWorker(workerController, null);

    boolean interrupted = false;
    try {
        Thread.sleep(2000);
    } catch (InterruptedException e) {
        LOG.info("sleep exception" + e.getMessage());
        interrupted = true;
    }
    try {
        MPI.Finalize();
    } catch (MPIException e) {
        LOG.info("MPI Finalize Exception" + e.getMessage());
    }
    closeWorker();
    if (interrupted) {
        // Restore the interrupt status only after shutdown so the cleanup calls above
        // are not affected by it.
        Thread.currentThread().interrupt();
    }
}
Also used : MesosWorkerLogger(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger) MesosWorkerController(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) MPIException(mpi.MPIException) MPIException(mpi.MPIException)

Example 3 with MesosWorkerLogger

use of edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger in project twister2 by DSC-SPIDAL.

The main method of the class MesosMPIMasterStarter.

/**
 * Entry point of the MPI master for a Mesos job.
 *
 * <p>Reads {@code WORKER_ID}, {@code JOB_NAME} and {@code COMPUTE_RESOURCE_INDEX} from
 * the environment, loads the "mesos" configuration, initializes logging, creates a
 * {@code MesosWorkerController}, initializes it with ZooKeeper and fetches the list of
 * all workers. It then looks up the job master address through a
 * {@code ZKJobMasterFinder}, starts the job master agent, appends the worker IPs to
 * {@code /twister2/hostFile}, launches {@code MesosMPIWorkerStarter} through
 * {@code mpirun}, and finally reports completion and closes the agent and the
 * controller.
 *
 * @param args command line arguments; not read by this method
 * @throws Exception if the worker controller can not be initialized, or on any I/O or
 *     process failure in the steps above
 * @throws IllegalStateException if no job master address of the form {@code ip:port}
 *     could be obtained
 */
public static void main(String[] args) throws Exception {
    MesosMPIMasterStarter mpiMaster = new MesosMPIMasterStarter();
    int workerId = Integer.parseInt(System.getenv("WORKER_ID"));
    mpiMaster.jobName = System.getenv("JOB_NAME");
    resourceIndex = Integer.parseInt(System.getenv("COMPUTE_RESOURCE_INDEX"));

    String twister2Home = Paths.get("").toAbsolutePath().toString();
    String configDir = "twister2-job";
    mpiMaster.config = ConfigLoader.loadConfig(twister2Home, configDir, "mesos");
    MesosWorkerLogger logger = new MesosWorkerLogger(mpiMaster.config, "/persistent-volume/logs", "mpiMaster");
    logger.initLogging();

    Map<String, Integer> additionalPorts = MesosWorkerUtils.generateAdditionalPorts(mpiMaster.config, startingPort);
    MesosWorkerController workerController;
    List<JobMasterAPI.WorkerInfo> workerInfoList = new ArrayList<>();
    // NOTE(review): this stays 0 and is what gets passed to startJobMasterAgent below,
    // while the real count is read into workerCount -- confirm which one is intended.
    int numberOfWorkers = 0;
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + mpiMaster.jobName + ".job");
    try {
        JobAPI.ComputeResource computeResource = JobUtils.getComputeResource(job, resourceIndex);
        workerController = new MesosWorkerController(mpiMaster.config, job,
            Inet4Address.getLocalHost().getHostAddress(), 2023, workerId, computeResource, additionalPorts);
        LOG.info("Initializing with zookeeper");
        workerController.initializeWithZooKeeper();
        LOG.info("Waiting for all workers to join");
        workerInfoList = workerController.getAllWorkers();
        LOG.info("Everyone has joined");
    } catch (Exception e) {
        // Without the controller and the worker list the rest of this method can only
        // fail later with a NullPointerException or IndexOutOfBoundsException, so
        // propagate the real cause now.
        LOG.severe("Could not initialize the worker controller: " + e.getMessage());
        throw e;
    }

    // Look up the job master address; the finder is closed even if the lookup throws.
    ZKJobMasterFinder finder = new ZKJobMasterFinder(mpiMaster.config, job.getJobId());
    String jobMasterIPandPort;
    try {
        finder.initialize();
        jobMasterIPandPort = finder.getJobMasterIPandPort();
        if (jobMasterIPandPort == null) {
            LOG.info("Job Master has not joined yet. Will wait and try to get the address ...");
            jobMasterIPandPort = finder.waitAndGetJobMasterIPandPort(20000);
        }
        LOG.info("Job Master address: " + jobMasterIPandPort);
    } finally {
        finder.close();
    }

    // The address is split at the last ':' into IP and port. Presumably the timed wait
    // above can also come back without an address, so guard before parsing.
    int separator = jobMasterIPandPort == null ? -1 : jobMasterIPandPort.lastIndexOf(':');
    if (separator < 0) {
        throw new IllegalStateException(
            "Job master address is missing or not of the form ip:port: " + jobMasterIPandPort);
    }
    int jobMasterPort = Integer.parseInt(jobMasterIPandPort.substring(separator + 1));
    String jobMasterIP = jobMasterIPandPort.substring(0, separator);
    LOG.info("JobMaster IP..: " + jobMasterIP);
    LOG.info("Worker ID..: " + workerId);

    int workerCount = workerController.getNumberOfWorkers();
    LOG.info("Worker Count..: " + workerCount);
    mpiMaster.startJobMasterAgent(workerController.getWorkerInfo(), jobMasterIP, jobMasterPort, numberOfWorkers);

    // Append one worker IP per line to the host file that mpirun reads below.
    try (Writer writer = new BufferedWriter(
        new OutputStreamWriter(new FileOutputStream("/twister2/hostFile", true)))) {
        for (int i = 0; i < workerCount; i++) {
            writer.write(workerInfoList.get(i).getWorkerIP() + "\n");
            LOG.info("Host IP..: " + workerInfoList.get(i).getWorkerIP());
        }
    }

    // mpi master has the id equals to 1
    // id==0 is job master
    String mpiClassNameToRun = "edu.iu.dsc.tws.rsched.schedulers.mesos.mpi.MesosMPIWorkerStarter";
    LOG.info("Before mpirun");
    String[] command = { "mpirun", "-allow-run-as-root", "-npernode", "1", "--mca", "btl_tcp_if_include", "eth0", "--hostfile", "/twister2/hostFile", "java", "-cp", "twister2-job/libexamples-java.jar:twister2-core/lib/*", mpiClassNameToRun, mpiMaster.jobName, jobMasterIP };
    LOG.info("command:" + String.join(" ", command));
    StringBuilder outputBuilder = new StringBuilder();
    ProcessUtils.runSyncProcess(false, command, outputBuilder, new File("."), true);

    mpiMaster.jobMasterAgent.sendWorkerCompletedMessage(JobMasterAPI.WorkerState.COMPLETED);
    mpiMaster.jobMasterAgent.close();
    workerController.close();
    LOG.info("Job DONE");
}
Also used : MesosWorkerLogger(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger) ZKJobMasterFinder(edu.iu.dsc.tws.common.zk.ZKJobMasterFinder) ArrayList(java.util.ArrayList) MesosWorkerController(edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) BufferedWriter(java.io.BufferedWriter) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) FileOutputStream(java.io.FileOutputStream) OutputStreamWriter(java.io.OutputStreamWriter) File(java.io.File) BufferedWriter(java.io.BufferedWriter) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Aggregations

JobAPI (edu.iu.dsc.tws.proto.system.job.JobAPI)3 MesosWorkerLogger (edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerLogger)3 JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI)2 MesosWorkerController (edu.iu.dsc.tws.rsched.schedulers.mesos.MesosWorkerController)2 Config (edu.iu.dsc.tws.api.config.Config)1 IScalerPerCluster (edu.iu.dsc.tws.api.driver.IScalerPerCluster)1 NullScaler (edu.iu.dsc.tws.api.driver.NullScaler)1 ZKJobMasterFinder (edu.iu.dsc.tws.common.zk.ZKJobMasterFinder)1 ZKJobMasterRegistrar (edu.iu.dsc.tws.common.zk.ZKJobMasterRegistrar)1 JobMaster (edu.iu.dsc.tws.master.server.JobMaster)1 MesosScaler (edu.iu.dsc.tws.rsched.schedulers.mesos.driver.MesosScaler)1 BufferedWriter (java.io.BufferedWriter)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 OutputStreamWriter (java.io.OutputStreamWriter)1 Writer (java.io.Writer)1 UnknownHostException (java.net.UnknownHostException)1 ArrayList (java.util.ArrayList)1 MPIException (mpi.MPIException)1