Example 1 with IWorkerStatusUpdater

Use of edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater in project twister2 by DSC-SPIDAL.

The class MPIWorkerStarter, method main:

@SuppressWarnings("RegexpSinglelineJava")
public static void main(String[] args) {
    // we cannot fully initialize the logger yet,
    // but we need to set the format as the first thing
    LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);
    String jobMasterIP = System.getenv(K8sEnvVariables.JOB_MASTER_IP.name());
    jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
    boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));
    if (jobMasterIP == null) {
        throw new RuntimeException("JobMasterIP address is null");
    }
    if (jobID == null) {
        throw new RuntimeException("jobID is null");
    }
    // load the configuration parameters from configuration directory
    String configDir = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY;
    config = K8sWorkerUtils.loadConfig(configDir);
    // initialize MPI
    try {
        MPI.Init(args);
        workerID = MPI.COMM_WORLD.getRank();
        numberOfWorkers = MPI.COMM_WORLD.getSize();
    } catch (MPIException e) {
        LOG.log(Level.SEVERE, "Could not get rank or size from MPI.COMM_WORLD", e);
        throw new RuntimeException(e);
    }
    // initialize persistent volume
    K8sPersistentVolume pv = null;
    if (KubernetesContext.persistentVolumeRequested(config)) {
        // create persistent volume object
        String persistentJobDir = KubernetesConstants.PERSISTENT_VOLUME_MOUNT;
        pv = new K8sPersistentVolume(persistentJobDir, workerID);
    }
    // initialize persistent logging
    K8sWorkerUtils.initWorkerLogger(workerID, pv, config);
    // read job description file
    String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
    jobDescFileName = POD_MEMORY_VOLUME + "/" + JOB_ARCHIVE_DIRECTORY + "/" + jobDescFileName;
    job = JobUtils.readJobFile(jobDescFileName);
    LOG.info("Job description file is loaded: " + jobDescFileName);
    // add any configuration from job file to the config object
    // if there are the same config parameters in both,
    // job file configurations will override
    config = JobUtils.overrideConfigs(job, config);
    config = JobUtils.updateConfigs(job, config);
    // update config with jobMasterIP and the checkpointing restore flag
    config = Config.newBuilder()
        .putAll(config)
        .put(JobMasterContext.JOB_MASTER_IP, jobMasterIP)
        .put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob)
        .build();
    // get podIP and podName from localhost; fail fast, since both are required below
    InetAddress localHost;
    try {
        localHost = InetAddress.getLocalHost();
    } catch (UnknownHostException e) {
        throw new RuntimeException("Cannot get localHost.", e);
    }
    String podIP = localHost.getHostAddress();
    String podName = localHost.getHostName();
    // trim the pod name at the first dot, e.g. t2-job-0-0
    if (podName.indexOf(".") > 0) {
        podName = podName.substring(0, podName.indexOf("."));
    }
    int workerPort = KubernetesContext.workerBasePort(config) + workerID * (SchedulerContext.numberOfAdditionalPorts(config) + 1);
    String nodeIP = null;
    try {
        nodeIP = Files.readAllLines(Paths.get("hostip.txt")).get(0);
    } catch (IOException e) {
        LOG.log(Level.WARNING, "Could not get host-ip from hostip.txt file.", e);
    }
    JobMasterAPI.NodeInfo nodeInfo = null;
    if (nodeIP == null) {
        LOG.warning("Could not get nodeIP for this pod. Using podIP as nodeIP.");
        nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
    } else if (KubernetesContext.nodeLocationsFromConfig(config)) {
        nodeInfo = KubernetesContext.getNodeInfo(config, nodeIP);
    } else {
        try {
            String encodedNodeInfos = Files.readAllLines(Paths.get("node-info-list.txt")).get(0);
            nodeInfo = K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfos, nodeIP);
        } catch (IOException e) {
            LOG.log(Level.WARNING, "Could not get node-info list from file: node-info-list.txt. " + "Will use podIP as nodeIP", e);
            nodeInfo = NodeInfoUtils.createNodeInfo(podIP, null, null);
        }
    }
    LOG.info(String.format("PodName: %s, NodeInfo for this worker: %s", podName, nodeInfo));
    computeResource = K8sWorkerUtils.getComputeResource(job, podName);
    // generate additional ports if requested
    Map<String, Integer> additionalPorts = K8sWorkerUtils.generateAdditionalPorts(config, workerPort);
    workerInfo = WorkerInfoUtils.createWorkerInfo(workerID, podIP, workerPort, nodeInfo, computeResource, additionalPorts);
    LOG.info("Worker information summary: \n" + "MPI Rank(workerID): " + workerID + "\n" + "MPI Size(number of workers): " + numberOfWorkers + "\n" + "POD_IP: " + podIP + "\n" + "HOSTNAME(podname): " + podName);
    int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, jobID, workerInfo);
    WorkerRuntime.init(config, job, workerInfo, restartCount);
    // interfaces to interact with other workers and the Job Master, if there is one
    IWorkerController workerController = WorkerRuntime.getWorkerController();
    IWorkerStatusUpdater workerStatusUpdater = WorkerRuntime.getWorkerStatusUpdater();
    // if this worker has reached the maximum number of restarts,
    // mark it FULLY_FAILED and finish up
    if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config)) {
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
        WorkerRuntime.close();
        return;
    }
    // on any uncaught exception,
    // we will label the worker as FULLY_FAILED and exit JVM with error code 1
    Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
        LOG.log(Level.SEVERE, "Uncaught exception in the thread " + thread + ". Worker FAILED...", throwable);
        if (restartCount >= FaultToleranceContext.maxMpiJobRestarts(config) - 1) {
            workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
        } else {
            workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
        }
        WorkerRuntime.close();
        System.exit(1);
    });
    // start the worker
    boolean completed = startWorker(workerController, pv, podName);
    if (completed) {
        // update worker status to COMPLETED
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
    } else {
        // not successfully completed means the worker is fully failed
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    }
    // finalize MPI
    try {
        MPI.Finalize();
    } catch (MPIException ignore) {
        // nothing more to do; the worker is shutting down anyway
    }
    WorkerRuntime.close();
}
Also used : UnknownHostException(java.net.UnknownHostException) IWorkerStatusUpdater(edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater) IWorkerController(edu.iu.dsc.tws.api.resource.IWorkerController) IOException(java.io.IOException) MPIException(mpi.MPIException) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) K8sPersistentVolume(edu.iu.dsc.tws.rsched.schedulers.k8s.worker.K8sPersistentVolume) InetAddress(java.net.InetAddress)
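
The example above (and the two that follow) shares one status-reporting lifecycle: initialize WorkerRuntime, obtain an IWorkerStatusUpdater, run the worker body, report a terminal WorkerState, and close the runtime. A minimal sketch of that pattern follows, using the twister2 types shown on this page; runWorkerBody is a hypothetical stand-in for the actual worker logic, not a project method:

// Minimal lifecycle sketch: not the project's code, just the common pattern.
// Config, JobAPI.Job, JobMasterAPI, WorkerRuntime, and IWorkerStatusUpdater
// are the twister2 types used in the examples on this page.
public static void runAndReport(Config config, JobAPI.Job job,
                                JobMasterAPI.WorkerInfo workerInfo, int restartCount) {
    WorkerRuntime.init(config, job, workerInfo, restartCount);
    IWorkerStatusUpdater statusUpdater = WorkerRuntime.getWorkerStatusUpdater();
    boolean completed = false;
    try {
        // hypothetical worker body; in the examples this is startWorker(...)
        completed = runWorkerBody();
    } finally {
        // always report a terminal state before closing the runtime
        statusUpdater.updateWorkerStatus(completed
            ? JobMasterAPI.WorkerState.COMPLETED
            : JobMasterAPI.WorkerState.FULLY_FAILED);
        WorkerRuntime.close();
    }
}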

Example 2 with IWorkerStatusUpdater

Use of edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater in project twister2 by DSC-SPIDAL.

The class K8sWorkerStarter, method main:

@SuppressWarnings("RegexpSinglelineJava")
public static void main(String[] args) {
    // we cannot fully initialize the logger yet,
    // but we need to set the format as the first thing
    LoggingHelper.setLoggingFormat(LoggingHelper.DEFAULT_FORMAT);
    // all environment variables
    int workerPort = Integer.parseInt(System.getenv(K8sEnvVariables.WORKER_PORT.name()));
    String containerName = System.getenv(K8sEnvVariables.CONTAINER_NAME.name());
    String podName = System.getenv(K8sEnvVariables.POD_NAME.name());
    String hostIP = System.getenv(K8sEnvVariables.HOST_IP.name());
    String hostName = System.getenv(K8sEnvVariables.HOST_NAME.name());
    String jobMasterIP = System.getenv(K8sEnvVariables.JOB_MASTER_IP.name());
    String encodedNodeInfoList = System.getenv(K8sEnvVariables.ENCODED_NODE_INFO_LIST.name());
    jobID = System.getenv(K8sEnvVariables.JOB_ID.name());
    boolean restoreJob = Boolean.parseBoolean(System.getenv(K8sEnvVariables.RESTORE_JOB.name()));
    if (jobID == null) {
        throw new RuntimeException("JobID is null");
    }
    // load the configuration parameters from configuration directory
    String configDir = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY;
    config = K8sWorkerUtils.loadConfig(configDir);
    // read job description file
    String jobDescFileName = SchedulerContext.createJobDescriptionFileName(jobID);
    jobDescFileName = POD_MEMORY_VOLUME + File.separator + JOB_ARCHIVE_DIRECTORY + File.separator + jobDescFileName;
    job = JobUtils.readJobFile(jobDescFileName);
    LOG.info("Job description file is loaded: " + jobDescFileName);
    // add any configuration from job file to the config object
    // if there are the same config parameters in both,
    // job file configurations will override
    config = JobUtils.overrideConfigs(job, config);
    config = JobUtils.updateConfigs(job, config);
    config = Config.newBuilder()
        .putAll(config)
        .put(CheckpointingContext.CHECKPOINTING_RESTORE_JOB, restoreJob)
        .build();
    // connect to the Job Master if there is a driver, if ZooKeeper is not used
    // for group management, or if checkpointing is enabled
    if (!job.getDriverClassName().isEmpty()
        || !ZKContext.isZooKeeperServerUsed(config)
        || CheckpointingContext.isCheckpointingEnabled(config)) {
        jobMasterIP = updateJobMasterIp(jobMasterIP);
    }
    // get podIP from localhost
    InetAddress localHost = null;
    try {
        localHost = InetAddress.getLocalHost();
    } catch (UnknownHostException e) {
        throw new RuntimeException("Cannot get localHost.", e);
    }
    String podIP = localHost.getHostAddress();
    JobMasterAPI.NodeInfo nodeInfo = KubernetesContext.nodeLocationsFromConfig(config)
        ? KubernetesContext.getNodeInfo(config, hostIP)
        : K8sWorkerUtils.getNodeInfoFromEncodedStr(encodedNodeInfoList, hostIP);
    LOG.info("PodName: " + podName + ", NodeInfo: " + nodeInfo);
    // set workerID
    workerID = K8sWorkerUtils.calculateWorkerID(job, podName, containerName);
    // get computeResource for this worker
    computeResource = K8sWorkerUtils.getComputeResource(job, podName);
    // generate additional ports if requested
    Map<String, Integer> additionalPorts = K8sWorkerUtils.generateAdditionalPorts(config, workerPort);
    // construct WorkerInfo
    workerInfo = WorkerInfoUtils.createWorkerInfo(workerID, podIP, workerPort, nodeInfo, computeResource, additionalPorts);
    // initialize persistent volume
    K8sPersistentVolume pv = null;
    if (KubernetesContext.persistentVolumeRequested(config)) {
        // create persistent volume object
        String persistentJobDir = KubernetesConstants.PERSISTENT_VOLUME_MOUNT;
        pv = new K8sPersistentVolume(persistentJobDir, workerID);
    }
    // initialize persistent logging
    K8sWorkerUtils.initWorkerLogger(workerID, pv, config);
    LOG.info("Worker information summary: \n" + "workerID: " + workerID + "\n" + "POD_IP: " + podIP + "\n" + "HOSTNAME(podname): " + podName + "\n" + "workerPort: " + workerPort + "\n" + "hostName(nodeName): " + hostName + "\n" + "hostIP(nodeIP): " + hostIP + "\n");
    int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, jobID, workerInfo);
    WorkerRuntime.init(config, job, workerInfo, restartCount);
    // interfaces to interact with other workers and the Job Master, if there is one
    IWorkerController workerController = WorkerRuntime.getWorkerController();
    IWorkerStatusUpdater workerStatusUpdater = WorkerRuntime.getWorkerStatusUpdater();
    // if this worker has reached the maximum number of restarts,
    // mark it FULLY_FAILED and finish up
    if (restartCount >= FaultToleranceContext.maxWorkerRestarts(config)) {
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
        WorkerRuntime.close();
        externallyKilled = false;
        return;
    }
    // add shut down hook
    addShutdownHook(workerStatusUpdater);
    // on any uncaught exception, we will label the worker as FAILED and throw a RuntimeException
    // JVM will be restarted by K8s
    Thread.setDefaultUncaughtExceptionHandler((thread, throwable) -> {
        LOG.log(Level.SEVERE, "Uncaught exception in the thread " + thread + ". Worker FAILED...", throwable);
        if (restartCount >= FaultToleranceContext.maxWorkerRestarts(config) - 1) {
            workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
        } else {
            workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
        }
        WorkerRuntime.close();
        externallyKilled = false;
        System.exit(1);
    });
    // start the worker
    boolean completed = startWorker(workerController, pv);
    if (completed) {
        // update worker status to COMPLETED
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
    } else {
        // not successfully completed means the worker is fully failed
        workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FULLY_FAILED);
    }
    WorkerRuntime.close();
    externallyKilled = false;
}
Also used : UnknownHostException(java.net.UnknownHostException) IWorkerStatusUpdater(edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater) IWorkerController(edu.iu.dsc.tws.api.resource.IWorkerController) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) InetAddress(java.net.InetAddress)
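
The addShutdownHook(workerStatusUpdater) call above is not shown in this snippet. Below is a hedged sketch of what such a hook might look like, assuming the externallyKilled flag seen in the example guards against double-reporting on normal exit paths; the real implementation may report a different WorkerState:

// Assumed sketch only: registers a JVM shutdown hook so an external kill
// (e.g. pod deletion) is still reported to the Job Master before the JVM exits.
// externallyKilled is the static flag the example sets to false on normal paths.
private static void addShutdownHook(IWorkerStatusUpdater workerStatusUpdater) {
    Runtime.getRuntime().addShutdownHook(new Thread(() -> {
        if (externallyKilled) {
            workerStatusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
            WorkerRuntime.close();
        }
    }));
}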

Example 3 with IWorkerStatusUpdater

Use of edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater in project twister2 by DSC-SPIDAL.

The class JobMasterClientExample, method simulateClient:

/**
 * A method to simulate the JMWorkerAgent running in workers
 */
public static void simulateClient(Config config, JobAPI.Job job, int workerID) {
    String workerIP = JMWorkerController.convertStringToIP("localhost").getHostAddress();
    int workerPort = 10000 + (int) (Math.random() * 10000);
    JobMasterAPI.NodeInfo nodeInfo = NodeInfoUtils.createNodeInfo("node.ip", "rack01", null);
    JobAPI.ComputeResource computeResource = job.getComputeResource(0);
    Map<String, Integer> additionalPorts = generateAdditionalPorts(config, workerPort);
    JobMasterAPI.WorkerInfo workerInfo = WorkerInfoUtils.createWorkerInfo(workerID, workerIP, workerPort, nodeInfo, computeResource, additionalPorts);
    int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, job.getJobId(), workerInfo);
    long start = System.currentTimeMillis();
    WorkerRuntime.init(config, job, workerInfo, restartCount);
    long delay = System.currentTimeMillis() - start;
    LOG.severe("worker-" + workerID + " startupDelay " + delay);
    IWorkerStatusUpdater statusUpdater = WorkerRuntime.getWorkerStatusUpdater();
    IWorkerController workerController = WorkerRuntime.getWorkerController();
    ISenderToDriver senderToDriver = WorkerRuntime.getSenderToDriver();
    WorkerRuntime.addReceiverFromDriver(new IReceiverFromDriver() {

        @Override
        public void driverMessageReceived(Any anyMessage) {
            LOG.info("Received message from IDriver: \n" + anyMessage);
            senderToDriver.sendToDriver(anyMessage);
        }
    });
    try {
        List<JobMasterAPI.WorkerInfo> workerList = workerController.getAllWorkers();
        LOG.info("All workers joined... IDs: " + getIDs(workerList));
    } catch (TimeoutException timeoutException) {
        LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
        return;
    }
    // wait for 2 seconds
    sleeeep(2 * 1000);
    try {
        workerController.waitOnBarrier();
        LOG.info("All workers reached the barrier. Proceeding.......");
    } catch (TimeoutException timeoutException) {
        LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    }
    // int id = job.getNumberOfWorkers() - 1;
    // JobMasterAPI.WorkerInfo info = workerController.getWorkerInfoForID(id);
    // LOG.info("WorkerInfo for " + id + ": \n" + info);
    // wait a random time, up to 10 seconds
    sleeeep((long) (Math.random() * 10 * 1000));
    // run the simulated worker body; throwException may throw to simulate a failed worker
    try {
        throwException(workerID);
    } catch (Throwable t) {
        // update worker status to FAILED
        statusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
        WorkerRuntime.close();
        // System.exit(1);
        throw t;
    }
    statusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
    WorkerRuntime.close();
    System.out.println("Client has finished the computation. Client exiting.");
}
Also used : ISenderToDriver(edu.iu.dsc.tws.api.resource.ISenderToDriver) IWorkerStatusUpdater(edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater) IWorkerController(edu.iu.dsc.tws.api.resource.IWorkerController) JobAPI(edu.iu.dsc.tws.proto.system.job.JobAPI) Any(com.google.protobuf.Any) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) IReceiverFromDriver(edu.iu.dsc.tws.api.resource.IReceiverFromDriver) TimeoutException(edu.iu.dsc.tws.api.exceptions.TimeoutException)
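
As a usage sketch (illustrative only, not part of JobMasterClientExample as shown here), simulateClient can be fanned out across threads, one per simulated worker:

// Illustrative driver loop: run one simulated client per worker ID.
// Assumes config and job are constructed elsewhere, as in the example's setup.
for (int workerID = 0; workerID < job.getNumberOfWorkers(); workerID++) {
    final int id = workerID;
    new Thread(() -> simulateClient(config, job, id)).start();
}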

Aggregations

IWorkerController (edu.iu.dsc.tws.api.resource.IWorkerController): 3
IWorkerStatusUpdater (edu.iu.dsc.tws.api.resource.IWorkerStatusUpdater): 3
JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI): 3
InetAddress (java.net.InetAddress): 2
UnknownHostException (java.net.UnknownHostException): 2
Any (com.google.protobuf.Any): 1
TimeoutException (edu.iu.dsc.tws.api.exceptions.TimeoutException): 1
IReceiverFromDriver (edu.iu.dsc.tws.api.resource.IReceiverFromDriver): 1
ISenderToDriver (edu.iu.dsc.tws.api.resource.ISenderToDriver): 1
JobAPI (edu.iu.dsc.tws.proto.system.job.JobAPI): 1
K8sPersistentVolume (edu.iu.dsc.tws.rsched.schedulers.k8s.worker.K8sPersistentVolume): 1
IOException (java.io.IOException): 1
MPIException (mpi.MPIException): 1