Search in sources :

Example 1 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class WorkerMonitor method addJoinedWorkers.

/**
 * Registers the workers that had already joined the job before the job master restarted.
 * Each given worker is stored in the worker map under its worker ID.
 *
 * @param joinedWorkers workers that have already joined, together with their states
 * @return true if, after adding them, the worker count matches the expected number of
 *         workers and allWorkersJoined() holds (allJoined becomes true); false otherwise
 */
public boolean addJoinedWorkers(List<WorkerWithState> joinedWorkers) {
    for (WorkerWithState joined : joinedWorkers) {
        workers.put(joined.getWorkerID(), joined);
    }

    // the size check runs first, so allWorkersJoined() is only consulted on a full map
    boolean everyoneJoined = workers.size() == numberOfWorkers && allWorkersJoined();
    if (!everyoneJoined) {
        return false;
    }

    allJoined = true;
    jobState = JobAPI.JobState.STARTED;
    LOG.info("All workers have already joined, before the job master restarted.");
    return true;
}
Also used : WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState)

Example 2 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class WorkerMonitor method failed.

/**
 * Handles the failure of a worker.
 * A failure reported for a worker that is not in the worker map is logged and ignored.
 * Otherwise the worker is marked FAILED, and the dashboard client and the failure
 * listener (when present) and the worker event sender are notified, in that order.
 *
 * @param workerID ID of the worker that failed
 */
public void failed(int workerID) {
    WorkerWithState worker = workers.get(workerID);
    if (worker == null) {
        LOG.warning("The worker[" + workerID + "] that has not joined the job failed. " + "Ignoring this event.");
        return;
    }

    final JobMasterAPI.WorkerState failedState = JobMasterAPI.WorkerState.FAILED;
    worker.setState(failedState);
    LOG.info("Worker: " + workerID + " FAILED.");

    // forward the state change to the dashboard, if one is configured
    if (dashClient != null) {
        dashClient.workerStateChange(workerID, failedState);
    }

    // the failure listener is optional
    if (failureListener != null) {
        failureListener.failed(workerID);
    }

    workerEventSender.workerFailed(workerID);
}
Also used : WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState)

Example 3 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class ZKMasterController method workerZnodeAdded.

/**
 * Handles a new worker znode being added to the ephemeral job directory.
 * The worker ID is extracted from the added znode path and the worker data is fetched
 * with getWorkerWithState. A worker in RESTARTED state is reported to the worker monitor
 * as restarted, a worker in STARTED state as started. Any other initial state is logged
 * and the event is ignored. If this event makes all workers joined, allJoined() is called.
 *
 * @param event the children cache event carrying the added znode
 */
private void workerZnodeAdded(PathChildrenCacheEvent event) {
    // remember the state before this event, so the all-joined transition is detected once
    boolean initialAllJoined = workerMonitor.isAllJoined();
    String addedChildPath = event.getData().getPath();
    int workerID = ZKUtils.getWorkerIDFromEphemPath(addedChildPath);
    WorkerWithState workerWithState = getWorkerWithState(workerID);
    if (workerWithState == null) {
        LOG.severe("worker[" + workerID + "] added, but its data can not be retrieved.");
        return;
    }
    // if the status of joining worker is RESTARTED, it is coming from failure
    if (workerWithState.getState() == JobMasterAPI.WorkerState.RESTARTED) {
        workerMonitor.restarted(workerWithState);
    } else if (workerWithState.getState() == JobMasterAPI.WorkerState.STARTED) {
        workerMonitor.started(workerWithState);
    } else {
        // a worker joined with initial state that is not acceptable
        LOG.warning("Following worker joined with initial state of " + workerWithState.getState()
            + ". Something must be wrong. Ignoring this event. WorkerInfo: "
            + workerWithState.getInfo());
        return;
    }
    // let all workers know that all joined
    if (!initialAllJoined && workerMonitor.isAllJoined()) {
        allJoined();
    }
}
Also used : WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState)

Example 4 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class ZKMasterController method initRestarting.

/**
 * Initializes the job master when it is coming back from a failure.
 * Builds the children caches for the ephemeral and persistent directories, collects
 * the workers whose ephemeral znodes already exist, publishes the job master restarted
 * event, and hands the collected workers to the worker monitor. If that makes all
 * workers joined and the AllJoined event has not been published yet, it is published.
 *
 * @throws Exception if starting one of the children caches fails
 */
private void initRestarting() throws Exception {
    LOG.info("Job Master restarting .... ");

    // ephemeral cache: past worker joins/fails are not delivered as events,
    // so the initial cache content is read directly after the start
    ephemChildrenCache = new PathChildrenCache(client, ephemDir, true);
    addEphemChildrenCacheListener(ephemChildrenCache);
    ephemChildrenCache.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);
    List<ChildData> existingZnodes = ephemChildrenCache.getCurrentData();
    LOG.info("Initially existing workers: " + existingZnodes.size());

    // persistent cache: used to listen for worker status updates
    persChildrenCache = new PathChildrenCache(client, persDir, true);
    addPersChildrenCacheListener(persChildrenCache);
    persChildrenCache.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);

    // resolve every existing ephemeral znode to its worker data
    List<WorkerWithState> recoveredWorkers = new LinkedList<>();
    for (ChildData znode : existingZnodes) {
        String znodePath = znode.getPath();
        int id = ZKUtils.getWorkerIDFromEphemPath(znodePath);
        WorkerWithState recovered = getWorkerWithState(id);
        if (recovered == null) {
            LOG.severe("worker[" + znodePath + "] added, but its data can not be retrieved.");
            continue;
        }
        recoveredWorkers.add(recovered);
    }

    // publish jm restarted event
    jmRestarted();

    // the workers are always handed to the monitor; the published check is only
    // evaluated when the monitor reports that all workers have joined
    if (workerMonitor.addJoinedWorkers(recoveredWorkers) && !allJoinedPublished()) {
        LOG.info("Publishing AllJoined event when restarting, since it is not previously published.");
        allJoined();
    }
}
Also used : PathChildrenCache(org.apache.curator.framework.recipes.cache.PathChildrenCache) ChildData(org.apache.curator.framework.recipes.cache.ChildData) WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState) LinkedList(java.util.LinkedList)

Example 5 with WorkerWithState

use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.

the class ZKMasterController method workerZnodeRemoved.

/**
 * Handles the removal of a worker znode from the ephemeral directory of this job.
 * Possible reasons for the removal:
 * the worker may have completed and deleted its znode,
 * the worker may have failed,
 * the worker may have been removed by scaling down,
 * a failed and restarted worker may have deleted the znode from its previous run.
 * Only the failure case leads to an action: the worker monitor is informed and,
 * unless the persisted state is already FAILED, the persisted state is updated to FAILED.
 *
 * @param event the children cache event carrying the removed znode
 */
private void workerZnodeRemoved(PathChildrenCacheEvent event) {
    String workerPath = event.getData().getPath();
    int removedWorkerID = ZKUtils.getWorkerIDFromEphemPath(workerPath);

    // a scaled down worker needs no further handling
    if (scaledDownWorkers.contains(removedWorkerID)) {
        scaledDownWorkers.remove(Integer.valueOf(removedWorkerID));
        LOG.info("Removed scaled down worker: " + removedWorkerID);
        return;
    }

    // COMPLETED and FULLY_FAILED workers are already handled
    // by the persistent state cache updates
    JobMasterAPI.WorkerState monitoredState = workerMonitor.getWorkerState(removedWorkerID);
    boolean alreadyHandled = monitoredState == JobMasterAPI.WorkerState.COMPLETED
        || monitoredState == JobMasterAPI.WorkerState.FULLY_FAILED;
    if (alreadyHandled) {
        return;
    }

    // read worker info and state from persistent storage
    WorkerWithState persisted;
    try {
        persisted = ZKPersStateManager.getWorkerWithState(client, rootPath, jobID, removedWorkerID);
    } catch (Twister2Exception e) {
        LOG.log(Level.SEVERE, "worker[" + removedWorkerID + "] removed, but its data can not be retrieved.", e);
        return;
    }
    if (persisted == null) {
        LOG.severe("worker[" + removedWorkerID + "] removed, but its data can not be retrieved.");
        return;
    }

    String znodeBody = ZKEphemStateManager.decodeWorkerZnodeBody(event.getData().getData());
    JobMasterAPI.WorkerState persistedState = persisted.getState();

    // removed event received for completed or fullyfailed worker, nothing to do
    if (persistedState == JobMasterAPI.WorkerState.COMPLETED
        || persistedState == JobMasterAPI.WorkerState.FULLY_FAILED) {
        return;
    }

    // restarting worker deleted the previous ephemeral znode;
    // ignore this event, because the worker is already re-joining
    if (ZKEphemStateManager.DELETE_TAG.equals(znodeBody)) {
        LOG.info("Restarting worker deleted znode from previous run: " + workerPath);
        return;
    }

    // none of the above applies: the worker failed
    LOG.info(String.format("Worker[%s] FAILED. Worker last status: %s", removedWorkerID, persistedState));
    workerMonitor.failed(removedWorkerID);

    // workers may mark their status as FAILED before failing; skip the update then
    if (persistedState == JobMasterAPI.WorkerState.FAILED) {
        return;
    }
    try {
        ZKPersStateManager.updateWorkerStatus(client, rootPath, jobID, persisted.getInfo(), persisted.getRestartCount(), JobMasterAPI.WorkerState.FAILED);
    } catch (Twister2Exception e) {
        LOG.log(Level.SEVERE, e.getMessage(), e);
    }
}
Also used : Twister2Exception(edu.iu.dsc.tws.api.exceptions.Twister2Exception) JobMasterAPI(edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI) WorkerWithState(edu.iu.dsc.tws.common.zk.WorkerWithState)

Aggregations

WorkerWithState (edu.iu.dsc.tws.common.zk.WorkerWithState)8 JobMasterAPI (edu.iu.dsc.tws.proto.jobmaster.JobMasterAPI)2 Twister2Exception (edu.iu.dsc.tws.api.exceptions.Twister2Exception)1 JobWithState (edu.iu.dsc.tws.common.zk.JobWithState)1 Formatter (java.util.Formatter)1 LinkedList (java.util.LinkedList)1 HelpFormatter (org.apache.commons.cli.HelpFormatter)1 ParseException (org.apache.commons.cli.ParseException)1 CuratorFramework (org.apache.curator.framework.CuratorFramework)1 ChildData (org.apache.curator.framework.recipes.cache.ChildData)1 PathChildrenCache (org.apache.curator.framework.recipes.cache.PathChildrenCache)1