use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.
the class WorkerMonitor method addJoinedWorkers.
/**
 * Registers the workers that had already joined the job before the job master restarted.
 * Each given worker is stored in the workers map under its worker ID.
 *
 * If, after registration, the map holds exactly numberOfWorkers entries and
 * allWorkersJoined() reports true, the monitor is marked as all-joined and
 * the job state is set to STARTED.
 *
 * @param joinedWorkers workers that were already present when the job master restarted
 * @return true if allJoined becomes true as a result of this call, false otherwise
 */
public boolean addJoinedWorkers(List<WorkerWithState> joinedWorkers) {
  for (WorkerWithState joined : joinedWorkers) {
    workers.put(joined.getWorkerID(), joined);
  }

  // the size check comes first so that allWorkersJoined() is only consulted
  // when the expected number of workers is registered
  boolean everyoneJoined = workers.size() == numberOfWorkers && allWorkersJoined();
  if (!everyoneJoined) {
    return false;
  }

  allJoined = true;
  jobState = JobAPI.JobState.STARTED;
  LOG.info("All workers have already joined, before the job master restarted.");
  return true;
}
use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.
the class WorkerMonitor method failed.
/**
 * Called when a worker FAILED.
 *
 * A failure reported for a worker that is not registered in the workers map
 * is logged and ignored. Otherwise the worker is marked as FAILED and the
 * failure is propagated to the dashboard client and the failure listener
 * (each only when present) and finally to the worker event sender.
 *
 * @param workerID ID of the worker that failed
 */
public void failed(int workerID) {
  WorkerWithState worker = workers.get(workerID);

  // nothing to update for a worker that never joined
  if (worker == null) {
    LOG.warning("The worker[" + workerID + "] that has not joined the job failed. "
        + "Ignoring this event.");
    return;
  }

  JobMasterAPI.WorkerState failedState = JobMasterAPI.WorkerState.FAILED;
  worker.setState(failedState);
  LOG.info("Worker: " + workerID + " FAILED.");

  // the dashboard is optional; report the state change only when a client exists
  if (dashClient != null) {
    dashClient.workerStateChange(workerID, failedState);
  }

  // the failure listener is optional as well
  if (failureListener != null) {
    failureListener.failed(workerID);
  }

  workerEventSender.workerFailed(workerID);
}
use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.
the class ZKMasterController method workerZnodeAdded.
/**
 * Handles the addition of a new worker znode to the ephemeral job directory.
 *
 * The worker ID is extracted from the path of the added znode and the worker
 * data is looked up with getWorkerWithState. Depending on the state of the worker:
 *   RESTARTED: the worker is coming from failure, workerMonitor.restarted is called
 *   STARTED: workerMonitor.started is called
 *   any other state: the event is logged as a warning and ignored
 *
 * If this event is the one that makes workerMonitor report all workers as joined,
 * allJoined() is invoked.
 *
 * @param event the cache event that carries the data of the added znode
 */
private void workerZnodeAdded(PathChildrenCacheEvent event) {
  // remember the state before this event, to detect the transition to all joined
  boolean initialAllJoined = workerMonitor.isAllJoined();

  String addedChildPath = event.getData().getPath();
  int workerID = ZKUtils.getWorkerIDFromEphemPath(addedChildPath);

  WorkerWithState workerWithState = getWorkerWithState(workerID);
  if (workerWithState == null) {
    LOG.severe("worker[" + workerID + "] added, but its data can not be retrieved.");
    return;
  }

  if (workerWithState.getState() == JobMasterAPI.WorkerState.RESTARTED) {
    // if the status of joining worker is RESTARTED, it is coming from failure
    workerMonitor.restarted(workerWithState);
  } else if (workerWithState.getState() == JobMasterAPI.WorkerState.STARTED) {
    workerMonitor.started(workerWithState);
  } else {
    // a worker joined with initial state that is not acceptable
    LOG.warning("Following worker joined with initial state of " + workerWithState.getState()
        + ". Something must be wrong. Ignoring this event. WorkerInfo: "
        + workerWithState.getInfo());
    return;
  }

  // let all workers know that all joined
  if (!initialAllJoined && workerMonitor.isAllJoined()) {
    allJoined();
  }
}
use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.
the class ZKMasterController method initRestarting.
/**
 * Initializes the job master when it is coming back from a failure.
 *
 * Steps, in order:
 *   1. start the children cache on the ephemeral directory and take a snapshot
 *      of the worker znodes that exist at that moment
 *   2. start the children cache on the persistent directory
 *   3. resolve each znode of the snapshot to a WorkerWithState
 *   4. publish the job master restarted event
 *   5. hand the resolved workers to workerMonitor and publish the AllJoined event,
 *      if all workers joined and that event has not been published before
 *
 * @throws Exception if any of the calls made during the initialization fails
 */
private void initRestarting() throws Exception {
  LOG.info("Job Master restarting .... ");

  // build the ephemeral cache
  // we will not get events for the past worker joins/fails,
  // so the current content of the cache is read explicitly
  ephemChildrenCache = new PathChildrenCache(client, ephemDir, true);
  addEphemChildrenCacheListener(ephemChildrenCache);
  ephemChildrenCache.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);
  List<ChildData> existingZnodes = ephemChildrenCache.getCurrentData();
  LOG.info("Initially existing workers: " + existingZnodes.size());

  // status updates are listened on the persistent path
  persChildrenCache = new PathChildrenCache(client, persDir, true);
  addPersChildrenCacheListener(persChildrenCache);
  persChildrenCache.start(PathChildrenCache.StartMode.BUILD_INITIAL_CACHE);

  // collect the workers that are already in the job, to provide them to workerMonitor
  List<WorkerWithState> alreadyJoined = new LinkedList<>();
  for (ChildData znode : existingZnodes) {
    String znodePath = znode.getPath();
    int id = ZKUtils.getWorkerIDFromEphemPath(znodePath);
    WorkerWithState resolved = getWorkerWithState(id);

    // a worker whose data can not be read is reported and left out
    if (resolved == null) {
      LOG.severe("worker[" + znodePath + "] added, but its data can not be retrieved.");
      continue;
    }
    alreadyJoined.add(resolved);
  }

  // publish jm restarted event
  jmRestarted();

  // if all workers joined and allJoined event has not been published, publish it
  boolean everyoneJoined = workerMonitor.addJoinedWorkers(alreadyJoined);
  if (everyoneJoined && !allJoinedPublished()) {
    LOG.info("Publishing AllJoined event when restarting, since it is not previously published.");
    allJoined();
  }
}
use of edu.iu.dsc.tws.common.zk.WorkerWithState in project twister2 by DSC-SPIDAL.
the class ZKMasterController method workerZnodeRemoved.
/**
 * Handles the removal of a worker znode from the ephemeral directory of this job.
 *
 * A znode can disappear for several reasons:
 *   the worker was removed by scaling down
 *   the worker completed and deleted its znode
 *   a failed and restarted worker deleted the znode from its previous run
 *   the worker failed
 *
 * Only the last case is acted upon: workerMonitor is informed about the failure and,
 * unless the persisted state of the worker is already FAILED,
 * the persistent status of the worker is updated to FAILED.
 *
 * @param event the cache event that carries the data of the removed znode
 */
private void workerZnodeRemoved(PathChildrenCacheEvent event) {
  String workerPath = event.getData().getPath();
  int removedWorkerID = ZKUtils.getWorkerIDFromEphemPath(workerPath);

  // a scaled down worker is expected to leave, only the bookkeeping is updated
  if (scaledDownWorkers.contains(removedWorkerID)) {
    scaledDownWorkers.remove(Integer.valueOf(removedWorkerID));
    LOG.info("Removed scaled down worker: " + removedWorkerID);
    return;
  }

  // COMPLETED and FULLY_FAILED workers in workerMonitor are already handled
  // by persistent state cache updates
  JobMasterAPI.WorkerState monitoredState = workerMonitor.getWorkerState(removedWorkerID);
  if (monitoredState == JobMasterAPI.WorkerState.COMPLETED
      || monitoredState == JobMasterAPI.WorkerState.FULLY_FAILED) {
    return;
  }

  // read worker info and the state from persistent storage
  WorkerWithState workerWithState;
  try {
    workerWithState =
        ZKPersStateManager.getWorkerWithState(client, rootPath, jobID, removedWorkerID);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE,
        "worker[" + removedWorkerID + "] removed, but its data can not be retrieved.", e);
    return;
  }
  if (workerWithState == null) {
    LOG.severe("worker[" + removedWorkerID + "] removed, but its data can not be retrieved.");
    return;
  }

  String workerBodyText = ZKEphemStateManager.decodeWorkerZnodeBody(event.getData().getData());

  // removed event received for completed or fullyfailed worker, nothing to do
  JobMasterAPI.WorkerState persistedState = workerWithState.getState();
  if (persistedState == JobMasterAPI.WorkerState.COMPLETED
      || persistedState == JobMasterAPI.WorkerState.FULLY_FAILED) {
    return;
  }

  // restarting worker deleted the previous ephemeral znode
  // ignore this event, because the worker is already re-joining
  if (ZKEphemStateManager.DELETE_TAG.equals(workerBodyText)) {
    LOG.info("Restarting worker deleted znode from previous run: " + workerPath);
    return;
  }

  // none of the above, so the worker failed
  LOG.info(String.format("Worker[%s] FAILED. Worker last status: %s",
      removedWorkerID, workerWithState.getState()));
  workerMonitor.failed(removedWorkerID);

  // workers may mark their status as FAILED before failing,
  // in that case the persistent status does not need an update
  if (workerWithState.getState() == JobMasterAPI.WorkerState.FAILED) {
    return;
  }

  try {
    ZKPersStateManager.updateWorkerStatus(client, rootPath, jobID, workerWithState.getInfo(),
        workerWithState.getRestartCount(), JobMasterAPI.WorkerState.FAILED);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }
}
Aggregations