use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKMasterController method workerZnodeRemoved.
/**
 * Reacts to the removal of a worker's ephemeral znode under this job's znode.
 * <p>
 * A removal event is handled according to the first matching case:
 * <ol>
 *   <li>the worker is listed in {@code scaledDownWorkers}: it is dropped from that list
 *       and nothing else happens,</li>
 *   <li>{@code workerMonitor} already reports the worker as COMPLETED or FULLY_FAILED:
 *       the event is ignored,</li>
 *   <li>the persistent state of the worker can not be read: the problem is logged
 *       and the event is ignored,</li>
 *   <li>the persistent state is COMPLETED or FULLY_FAILED: the event is ignored,</li>
 *   <li>the removed znode body equals {@code ZKEphemStateManager.DELETE_TAG}: a restarting
 *       worker deleted the znode of its previous run, the event is ignored,</li>
 *   <li>otherwise the worker is treated as failed: {@code workerMonitor} is informed and,
 *       unless the persistent state is already FAILED, it is updated to FAILED.</li>
 * </ol>
 *
 * @param event the child-removed event of the ephemeral znode cache
 */
private void workerZnodeRemoved(PathChildrenCacheEvent event) {
  String removedPath = event.getData().getPath();
  int removedWorkerID = ZKUtils.getWorkerIDFromEphemPath(removedPath);

  // case 1: the znode belongs to a worker removed by scaling down
  if (scaledDownWorkers.contains(removedWorkerID)) {
    scaledDownWorkers.remove(Integer.valueOf(removedWorkerID));
    LOG.info("Removed scaled down worker: " + removedWorkerID);
    return;
  }

  // case 2: the monitor already knows the final state of this worker;
  // the original author notes these are handled through persistent state cache updates
  JobMasterAPI.WorkerState monitoredState = workerMonitor.getWorkerState(removedWorkerID);
  boolean finishedInMonitor = monitoredState == JobMasterAPI.WorkerState.COMPLETED
      || monitoredState == JobMasterAPI.WorkerState.FULLY_FAILED;
  if (finishedInMonitor) {
    return;
  }

  // case 3: fetch worker info and state from persistent storage
  WorkerWithState persisted;
  try {
    persisted = ZKPersStateManager.getWorkerWithState(client, rootPath, jobID, removedWorkerID);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, "worker[" + removedWorkerID + "] removed, but its data can not be retrieved.", e);
    return;
  }
  if (persisted == null) {
    LOG.severe("worker[" + removedWorkerID + "] removed, but its data can not be retrieved.");
    return;
  }

  String znodeBody = ZKEphemStateManager.decodeWorkerZnodeBody(event.getData().getData());
  JobMasterAPI.WorkerState lastState = persisted.getState();

  // case 4: removal of a znode of an already completed or fully failed worker
  if (lastState == JobMasterAPI.WorkerState.COMPLETED
      || lastState == JobMasterAPI.WorkerState.FULLY_FAILED) {
    return;
  }

  // case 5: the restarting worker removed the znode of its previous run,
  // it is already re-joining
  if (ZKEphemStateManager.DELETE_TAG.equals(znodeBody)) {
    LOG.info("Restarting worker deleted znode from previous run: " + removedPath);
    return;
  }

  // case 6: the worker failed
  LOG.info(String.format("Worker[%s] FAILED. Worker last status: %s", removedWorkerID, lastState));
  workerMonitor.failed(removedWorkerID);

  // a worker may have marked itself FAILED before going down; nothing to persist then
  if (lastState == JobMasterAPI.WorkerState.FAILED) {
    return;
  }
  try {
    ZKPersStateManager.updateWorkerStatus(client, rootPath, jobID, persisted.getInfo(), persisted.getRestartCount(), JobMasterAPI.WorkerState.FAILED);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKMasterController method workerRestarted.
/**
 * Publishes a "worker restarted" job event for the given worker through
 * {@code ZKEventsManager}.
 *
 * @param workerInfo the worker that has restarted
 * @throws Twister2RuntimeException if publishing the event fails
 */
@Override
public void workerRestarted(JobMasterAPI.WorkerInfo workerInfo) {
  // wrap the worker info in a restarted message and then in a job event
  JobMasterAPI.WorkerRestarted restartedMessage = JobMasterAPI.WorkerRestarted.newBuilder()
      .setWorkerInfo(workerInfo)
      .build();
  JobMasterAPI.JobEvent restartedEvent = JobMasterAPI.JobEvent.newBuilder()
      .setRestarted(restartedMessage)
      .build();

  try {
    ZKEventsManager.publishEvent(client, rootPath, jobID, restartedEvent);
  } catch (Twister2Exception cause) {
    // the overridden signature has no checked exception, rethrow unchecked
    throw new Twister2RuntimeException(cause);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKBarrierHandler method getInitialWorkersAtBarrier.
/**
 * Adds the IDs of all workers whose znodes are currently in the given cache to
 * {@code workersAtBarrier} and returns the barrier timeout read from those znodes.
 * <p>
 * The timeout is read from the worker znodes until a non-zero value is obtained;
 * after that no further reads are made.
 *
 * @param childrenCache the cache whose current children are the worker znodes at the barrier
 * @param workersAtBarrier the set to which the found worker IDs are added
 * @return the timeout read from a worker znode, or 0 if the cache has no children
 *         or no znode provided a non-zero timeout
 * @throws Twister2RuntimeException if the timeout can not be read from a znode
 */
private long getInitialWorkersAtBarrier(PathChildrenCache childrenCache, Set<Integer> workersAtBarrier) {
  long timeout = 0;
  List<ChildData> existingWorkerZnodes = childrenCache.getCurrentData();
  for (ChildData child : existingWorkerZnodes) {
    String fullPath = child.getPath();
    int workerID = ZKUtils.getWorkerIDFromPersPath(fullPath);
    workersAtBarrier.add(workerID);

    if (timeout == 0) {
      try {
        // keep the value that was read; previously the result was discarded and
        // this method always returned 0
        // NOTE(review): assumes readWorkerTimeout returns a long-compatible value - confirm
        timeout = ZKBarrierManager.readWorkerTimeout(client, fullPath);
      } catch (Twister2Exception e) {
        throw new Twister2RuntimeException(e);
      }
    }
  }
  return timeout;
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class JobZNodeManager method updateJob.
/**
 * Overwrites the body of the job znode with the given job and state.
 *
 * @param client the Curator client used to write the znode
 * @param rootPath the root path passed to {@code ZKUtils.jobDir}
 * @param job the job to store; its job ID determines the znode path
 * @param state the job state stored together with the job
 * @return {@code true} when the znode has been written; failures are reported by exception
 * @throws Twister2Exception if serializing the body or writing the znode fails
 */
public static boolean updateJob(CuratorFramework client, String rootPath, JobAPI.Job job, JobAPI.JobState state) throws Twister2Exception {
  final String jobDir = ZKUtils.jobDir(rootPath, job.getJobId());
  final JobWithState znodeContent = new JobWithState(job, state);

  try {
    client.setData().forPath(jobDir, znodeContent.toByteArray());
  } catch (Exception cause) {
    throw new Twister2Exception("Could not update Job in znode: " + jobDir, cause);
  }

  // reaching this point means the write went through
  return true;
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class JobZNodeManager method getJobs.
/**
 * Returns all jobs that have a znode directly under the given root path.
 * <p>
 * Every child of {@code rootPath} is read and its body is decoded with
 * {@code JobWithState.decode}; the results are returned in the order in which
 * the children were listed.
 *
 * @param client the Curator client used to list and read the znodes
 * @param rootPath the znode whose children are the job znodes
 * @return the decoded jobs; empty if {@code rootPath} has no children
 * @throws Twister2Exception if listing the children or reading a job znode fails
 */
public static List<JobWithState> getJobs(CuratorFramework client, String rootPath) throws Twister2Exception {
  try {
    List<String> jobPaths = client.getChildren().forPath(rootPath);
    // parameterized instead of the raw LinkedList, which caused an unchecked warning
    List<JobWithState> jobs = new LinkedList<>();
    for (String jobID : jobPaths) {
      // getChildren returns child names only, build the full path of the job znode
      String childPath = rootPath + "/" + jobID;
      byte[] jobZNodeBody = client.getData().forPath(childPath);
      JobWithState jobWithState = JobWithState.decode(jobZNodeBody);
      jobs.add(jobWithState);
    }
    return jobs;
  } catch (Exception e) {
    throw new Twister2Exception("Could not get job znode data: " + rootPath, e);
  }
}
Aggregations