Search in sources:

Example 1 with K8sPersistentVolume

Use of edu.iu.dsc.tws.rsched.schedulers.k8s.worker.K8sPersistentVolume in project twister2 by DSC-SPIDAL.

From the class MPIWorkerStarter, the main method:

@SuppressWarnings("RegexpSinglelineJava")
public static void main(String[] args) {
    // we cannot fully initialize the logger yet,
    // but we need to set the logging format as the first thing
    LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);
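    // read the Job Master address, job ID, and restore flag from pod environment variables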
    String jobMasterIP = System.getenv(K8sEnvVariables.JOB_MASTER_IP.name());
    jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
    boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));
    if (jobMasterIP == null) {
        throw new RuntimeException("JobMasterIP address is null");
    }
    if (jobID == null) {
        throw new RuntimeException("jobID is null");
    }
    // load the configuration parameters from configuration directory
    String configDir = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY;
    config = K8sWorkerUtils.loadConfig(configDir);
    // initialize MPI
    try {
        MPI.Init(args);
        workerID = MPI.COMM_WORLD.getRank();
        numberOfWorkers = MPI.COMM_WORLD.getSize();
    } catch (MPIException e) {
        LOG.log(Level.SEVERE, "Could not get rank or size from MPI.COMM_WORLD", e);
        throw new RuntimeException(e);
    }
    // initialize persistent volume
    K8sPersistentVolume pv = null;
    if (KubernetesContext.persistentVolumeRequested(config)) {
        // create persistent volume object
        String persistentJobDir = KubernetesConstants.PERSISTENT_VOLUME_MOUNT;
        pv = new K8sPersistentVolume(persistentJobDir, workerID);
    }
    // initialize persistent logging
    K8sWorkerUtils.initWorkerLogger(workerID, pv, config);
    // read job description file
    String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
    jobDescFileName = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY + "/" + jobDescFileName;
    job = JobUtils.readJobFile(jobDescFileName);
    LOG.info("Job description file is loaded: " + jobDescFileName);
    // add any configuration from the job file to the config object;
    // if the same parameter appears in both, the job file value overrides
    config = JobUtils.overrideConfigs(job, config);
    config = JobUtils.updateConfigs(job, config);
    // update config with jobMasterIP and the restore flag
    config = Config.newBuilder().putAll(config)
        .put(JobMasterContext.JOB_MASTER_IP, jobMasterIP)
        .put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob)
        .build();
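    // determine this pod's IP address and pod name from the local host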
    InetAddress localHost = null;
    String podName = null;
    try {
        localHost = InetAddress.getLocalHost();
    } catch (UnknownHostException e) {
        LOG.log(Level.SEVERE, "Cannot get localHost.", e);
        // localHost is dereferenced below, so we cannot continue without it
        throw new RuntimeException("Cannot get localHost.", e);
    }
    String podIP = localHost.getHostAddress();
    podName = localHost.getHostName();
    // keep only the pod name up to the first dot (e.g. t2-job-0-0)
    if (podName.indexOf(".") > 0) {
        podName = podName.substring(0, podName.indexOf("."));
    }
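    // compute this worker's port from the base port, its ID, and the number of additional ports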
    int workerPort = KubernetesContext.workerBasePort(config)
        + workerID * (SchedulerContext.numberOfAdditionalPorts(config) + 1);
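    // read the IP address of the Kubernetes node this pod runs on from hostip.txt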
    String nodeIP = null;
    try {
        nodeIP = Files.readAllLines(Paths.get("hostip.txt")).get(0);
    } catch (IOException e) {
        LOG.log(Level.WARNING, "Could not get host-ip from hostip.txt file.", e);
    }
    JobMasterAPI.NodeInfo nodeInfo = null;
    if (nodeIP == null) {
        LOG.warning("Could not get nodeIP for this pod. Using podIP as nodeIP.");
        nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
    } else if (KubernetesContext.nodeLocationsFromConfig(config)) {
        nodeInfo = KubernetesContext.getNodeInfo(config, nodeIP);
    } else {
        try {
            String encodedNodeInfos = Files.readAllLines(Paths.get("node-info-list.txt")).get(0);
            nodeInfo = K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfos, nodeIP);
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Could not get node-info list from file: node-info-list.txt. " + "Will use podIP as nodeIP", e);
            nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
        }
    }
    LOG.info(String.format("PodName: %s, NodeInfo for this worker: %s", podName, nodeInfo));
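    // determine the compute resource assigned to this pod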
    computeResource = K8sWorkerUtils.getComputeResource(job, podName);
    // generate additional ports if requested
    Map<String, Integer> additionalPorts = K8sWorkerUtils.generateAdditionalPorts(config, workerPort);
    workerInfo = WorkerInfoUtils.createWorkerInfo(workerID, podIP, workerPort, nodeInfo, computeResource, additionalPorts);
    LOG.info("Worker information summary: \n" + "MPI Rank(workerID): " + workerID + "\n" + "MPI Size(number of workers): " + numberOfWorkers + "\n" + "POD_IP: " + podIP + "\n" + "HOSTNAME(podname): " + podName);
    int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, jobID, workerInfo);
    WorkerRuntime.init(config, job, workerInfo, restartCount);
    // interfaces to interact with other workers and with the Job Master, if there is one
    IWorkerController workerController = WorkerRuntime.getWorkerController();
    IWorkerStatusUpdater workerStatusUpdater = WorkerRuntime.getWorkerStatusUpdater();
    // if this worker has already reached the restart limit, mark it FULLY_FAILED and finish
    if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config)) {
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
        WorkerRuntime.close();
        return;
    }
    // on any uncaught exception,
    // we will label the worker as FULLY_FAILED and exit JVM with error code 1
    Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
        LOG.log(Level.SEVERE, "Uncaught exception in the thread " + thread + ". Worker FAILED...", throwable);
        if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config) - 1) {
            workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
        } else {
            workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
        }
        WorkerRuntime.close();
        System.exit(1);
    });
    // start the worker
    boolean completed = startWorker(workerController, pv, podName);
    if (completed) {
        // update worker status to COMPLETED
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
    } else {
        // if not successfully completed, it means it is fully failed
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    }
    // finalize MPI
    try {
        MPI.Finalize();
    } catch (MPIException ignore) {
    }
    WorkerRuntime.close();
}
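
For reference, the K8sPersistentVolume created above is handed to startWorker so the worker body can write to job-wide persistent storage. The sketch below illustrates that pattern; the accessors getJobDirPath() and getWorkerDirPath() follow twister2's IPersistentVolume interface as I read it, but treat the exact method names as assumptions rather than verified API, and note that pv is null whenever no persistent volume was requested.

import java.util.logging.Logger;
import edu.iu.dsc.tws.rsched.schedulers.k8s.worker.K8sPersistentVolume;

public final class PersistentVolumeSketch {

    private static final Logger LOG = Logger.getLogger(PersistentVolumeSketch.class.getName());

    // hypothetical helper illustrating the pattern; not taken from the twister2 source
    static void logPersistentDirs(K8sPersistentVolume pv) {
        if (pv == null) {
            // no persistent volume was requested for this job
            return;
        }
        // assumed IPersistentVolume-style accessors: a job-wide directory
        // shared by all workers plus a per-worker subdirectory
        LOG.info("Job dir: " + pv.getJobDirPath());
        LOG.info("Worker dir: " + pv.getWorkerDirPath());
    }
}

A worker would typically call such a helper early in its run method, before writing checkpoints or logs to the mounted volume.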
Also used: UnknownHostException (java.net.UnknownHostException), IWorkerStatusUpdater (edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater), IWorkerController (edu.iu.dsc.tws.api.resource.IWorkerController), IOException (java.io.IOException), MPIException (mpi.MPIException), JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI), K8sPersistentVolume (edu.iu.dsc.tws.rsched.schedulers.k8s.worker.K8sPersistentVolume), InetAddress (java.net.InetAddress)

Aggregations

IWorkerController (edu.iu.dsc.tws.api.resource.IWorkerController): 1
IWorkerStatusUpdater (edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater): 1
JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI): 1
K8sPersistentVolume (edu.iu.dsc.tws.rsched.schedulers.k8s.worker.K8sPersistentVolume): 1
IOException (java.io.IOException): 1
InetAddress (java.net.InetAddress): 1
UnknownHostException (java.net.UnknownHostException): 1
MPIException (mpi.MPIException): 1