Search in sources :

Example 6 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class JMWorkerHandler method registerWorkerMessageReceived.

/**
 * Processes a RegisterWorker request.
 *
 * <p>When {@code zkUsed} is set, the request is only acknowledged with a success response
 * and {@code workerMonitor} is asked to inform the driver for the all-joined case.
 * Otherwise the worker is handed to {@code workerMonitor} either as a restarted worker
 * (restart count above zero) or as a newly started one. A newly started worker whose ID
 * is already known to {@code workerMonitor} is answered with a failure response instead.
 * If {@code workerMonitor} reports all-joined after this registration but did not before,
 * the waiting list-workers requests are answered and {@code allJoined()} is invoked.
 *
 * @param id the request ID passed through to the response
 * @param message the received RegisterWorker message
 */
private void registerWorkerMessageReceived(RequestID id, JobMasterAPI.RegisterWorker message) {
    // record the all-connected condition first, if it now holds
    handleAllConnected();

    if (zkUsed) {
        int ignoredWorkerID = message.getWorkerInfo().getWorkerID();
        LOG.fine("Since ZooKeeper is used, ignoring RegisterWorker message for worker: " + ignoredWorkerID);
        sendRegisterWorkerResponse(id, ignoredWorkerID, true, null);
        // with this worker everybody may have connected: let the driver (if any) know.
        // when zk is not used, workerMonitor takes care of this on its own
        workerMonitor.informDriverForAllJoined();
        return;
    }

    LOG.fine("RegisterWorker message received: \n" + message);

    JobMasterAPI.WorkerInfo info = message.getWorkerInfo();
    // remember the state before this registration to detect the transition below
    boolean allJoinedBefore = workerMonitor.isAllJoined();
    int restarts = message.getRestartCount();

    // a positive restart count means the worker is coming back from a failure
    JobMasterAPI.WorkerState firstState;
    if (restarts > 0) {
        firstState = JobMasterAPI.WorkerState.RESTARTED;
    } else {
        firstState = JobMasterAPI.WorkerState.STARTED;
    }
    WorkerWithState worker = new WorkerWithState(info, firstState, restarts);

    if (firstState == JobMasterAPI.WorkerState.RESTARTED) {
        // restarted worker: acknowledge, then hand over to the monitor
        sendRegisterWorkerResponse(id, info.getWorkerID(), true, null);
        workerMonitor.restarted(worker);
    } else if (workerMonitor.existWorker(worker.getWorkerID())) {
        // fresh worker, but its ID is already taken: reject the registration
        String failMessage = "There is an already registered worker with workerID: " + worker.getWorkerID();
        LOG.severe(failMessage);
        sendRegisterWorkerResponse(id, info.getWorkerID(), false, failMessage);
        return;
    } else {
        // fresh worker with an unused ID: acknowledge, then hand over to the monitor
        sendRegisterWorkerResponse(id, info.getWorkerID(), true, null);
        workerMonitor.started(worker);
    }

    // nothing more to do unless this registration completed the set of workers
    if (allJoinedBefore || !workerMonitor.isAllJoined()) {
        return;
    }

    LOG.info("All workers joined the job. Worker IDs: " + workerMonitor.getWorkerIDs());
    sendListWorkersResponseToWaitList();
    allJoined();
}
Also used : JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState)

Example 7 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class ZKMasterController method childZnodeUpdated.

/**
 * Reacts to a status update in the persistent znode of a worker.
 *
 * <p>The worker ID is derived from the znode path and the worker state is decoded from
 * the znode data. {@code workerMonitor} is notified only when the decoded state is
 * COMPLETED or FULLY_FAILED; any other state is just logged.
 *
 * @param event the child-updated event carrying the znode path and data
 */
private void childZnodeUpdated(PathChildrenCacheEvent event) {
    int workerID = ZKUtils.getWorkerIDFromPersPath(event.getData().getPath());
    JobMasterAPI.WorkerState newState = WorkerWithState.decode(event.getData().getData()).getState();

    LOG.fine(String.format("Worker[%s] status changed to: %s ", workerID, newState));

    // only the two terminal states are forwarded to workerMonitor
    if (newState == JobMasterAPI.WorkerState.COMPLETED) {
        workerMonitor.completed(workerID);
    } else if (newState == JobMasterAPI.WorkerState.FULLY_FAILED) {
        workerMonitor.fullyFailed(workerID);
    }
}
Also used : WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState)

Example 8 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class ZKJobLister method listJob.

/**
 * List a single job's info from the zk server.
 *
 * <p>Reads the job znode and the job's workers, then logs the job ID, job state,
 * number of workers and a table with one row per worker (ID, IP, state).
 * If reading fails, the error is logged and nothing is listed. If the job has no
 * workers, only a "0 workers" line is logged.
 *
 * @param jobID ID of the job to list
 */
public static void listJob(String jobID) {
    CuratorFramework client = ZKUtils.connectToServer(ZKContext.serverAddresses(config));
    String rootPath = ZKContext.rootNode(config);
    JobWithState job;
    List<WorkerWithState> workers;
    try {
        job = JobZNodeManager.readJobZNode(client, rootPath, jobID);
        workers = ZKPersStateManager.getWorkers(client, rootPath, jobID);
    } catch (Exception e) {
        LOG.log(Level.SEVERE, "Could not get the job from zookeeper: " + jobID, e);
        return;
    }
    if (workers.isEmpty()) {
        LOG.info("\nNumber of workers in the job: 0");
        return;
    }
    int maxWorkerIPLength = workers.stream().mapToInt(w -> w.getInfo().getWorkerIP().length()).max().orElseThrow(() -> new RuntimeException("No valid workerIP in WorkerInfo"));
    StringBuilder buffer = new StringBuilder();
    Formatter f = new Formatter(buffer);
    f.format("\n\n%s", "JobID: " + job.getJob().getJobId());
    f.format("\n%s", "Job State: " + job.getState());
    f.format("\n%s", "Number of Workers: " + job.getJob().getNumberOfWorkers());
    f.format("\n%s", "");
    f.format("\n%s", "List of Workers: " + "\n");
    int workerIDColumn = "WorkerID".length() + 3;
    // the column must fit its own header too: a width smaller than the value does not
    // truncate, so short IPs (e.g. "::1" or an empty IP) would otherwise push the
    // "WorkerIP" header straight into the "Worker State" header
    int workerIPColumn = Math.max(maxWorkerIPLength, "WorkerIP".length()) + 3;
    // two left-justified, padded columns followed by the state
    String format = "%-" + workerIDColumn + "s%-" + workerIPColumn + "s%s\n";
    int lineWidth = workerIDColumn + workerIPColumn + "Worker State".length();
    String separator = StringUtils.repeat('=', lineWidth);
    f.format(format, "WorkerID", "WorkerIP", "Worker State");
    f.format("%s\n", separator);
    for (WorkerWithState wws : workers) {
        f.format(format, "" + wws.getWorkerID(), wws.getInfo().getWorkerIP(), wws.getState().toString());
    }
    LOG.info(buffer.toString());
}
Also used : CuratorFramework(org.apache.curator.framework.CuratorFramework) HelpFormatter(org.apache.commons.cli.HelpFormatter) Formatter(java.util.Formatter) JobWithState(edu.iu.dsc.tws.common.zk.JobWithState) WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState) ParseException(org.apache.commons.cli.ParseException)

Aggregations

WorkerWithState (edu.iu.dsc.tws.common.zk.WorkerWithState)8 JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI)2 Twister2Exception (edu.iu.dsc.tws.api.exceptions.Twister2Exception)1 JobWithState (edu.iu.dsc.tws.common.zk.JobWithState)1 Formatter (java.util.Formatter)1 LinkedList (java.util.LinkedList)1 HelpFormatter (org.apache.commons.cli.HelpFormatter)1 ParseException (org.apache.commons.cli.ParseException)1 CuratorFramework (org.apache.curator.framework.CuratorFramework)1 ChildData (org.apache.curator.framework.recipes.cache.ChildData)1 PathChildrenCache (org.apache.curator.framework.recipes.cache.PathChildrenCache)1