use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.
the class ZKWorkerController method waitOnBarrier.
/**
 * Wait on the default barrier until the barrier result is published or the local wait expires.
 * <p>
 * All workers create a znode on the barrier directory.
 * Job master watches znode creations/removals on this directory.
 * When the number of znodes on that directory reaches the number of workers in the job,
 * Job master publishes AllArrivedOnBarrier event.
 * Workers proceed when they get this event or when they time out.
 * <p>
 * Workers remove their znodes after they proceed through the barrier
 * so that they can wait on the barrier again.
 * Workers are responsible for creating and removing znodes on the barrier.
 * Job master removes barrier znode after the job completion or scale down.
 * <p>
 * The local wait is capped at twice {@code timeLimit} (saturating at {@code Long.MAX_VALUE}).
 *
 * @param timeLimit barrier time limit in milliseconds; it is passed along when the worker
 *                  znode is created and is doubled to get the local wait limit
 * @throws TimeoutException if the published barrier result is TIMED_OUT, or
 *                          if no result is published within the local wait limit
 * @throws JobFaultyException if the job is already faulty when this method is called, or
 *                            if the published barrier result is JOB_FAULTY
 */
@Override
public void waitOnBarrier(long timeLimit) throws TimeoutException {

  // do not wait on the barrier
  if (JobProgress.isJobFaulty()) {
    throw new JobFaultyException("Can not wait on the barrier, since the job is faulty.");
  }

  defaultBarrierProceeded = false;

  try {
    ZKBarrierManager.createWorkerZNodeAtDefault(
        client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
  } catch (Twister2Exception e) {
    // NOTE(review): the failure is only logged and the method returns normally,
    // so the caller cannot tell this apart from a successful barrier. Confirm this is intended.
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return;
  }

  // wait until all workers joined or time limit is reached
  long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;

  // use the monotonic clock, so that the elapsed time can never be negative
  // and (tl - delay) can not overflow when tl is Long.MAX_VALUE
  long startNanos = System.nanoTime();
  long delay = 0;

  synchronized (defaultBarrierWaitObject) {
    // re-check the flag after every wake-up:
    //   - if the flag is already set, we do not wait at all
    //   - a spurious wake-up does not end the wait prematurely
    while (!defaultBarrierProceeded && delay < tl) {
      try {
        defaultBarrierWaitObject.wait(tl - delay);
      } catch (InterruptedException e) {
        // interruption is ignored on purpose: keep waiting for the remaining time
      }
      delay = (System.nanoTime() - startNanos) / 1_000_000L;
    }
  }

  // delete barrier znode in any case
  try {
    ZKBarrierManager.deleteWorkerZNodeFromDefault(
        client, rootPath, jobID, workerInfo.getWorkerID());
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }

  if (defaultBarrierProceeded) {
    if (defaultBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
      return;
    } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
      throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
    } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
      throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: "
          + timeLimit + "ms.");
    }

    // this should never happen, since we have only these three options
    return;
  } else {
    throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
  }
}
use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.
the class ZKWorkerController method waitOnInitBarrier.
/**
 * Wait on the init barrier.
 * It uses the same algorithm as the default barrier:
 * create the worker znode at the init barrier, wait until the barrier result is published
 * or the local wait expires, then delete the worker znode.
 * <p>
 * The time limit is read from the configuration with
 * {@code ControllerContext.maxWaitTimeOnInitBarrier(config)}.
 * The local wait is capped at twice that value (saturating at {@code Long.MAX_VALUE}).
 *
 * @throws TimeoutException if the published barrier result is TIMED_OUT, or
 *                          if no result is published within the local wait limit
 * @throws JobFaultyException if the published barrier result is JOB_FAULTY
 */
public void waitOnInitBarrier() throws TimeoutException {

  initBarrierProceeded = false;
  long timeLimit = ControllerContext.maxWaitTimeOnInitBarrier(config);

  try {
    ZKBarrierManager.createWorkerZNodeAtInit(
        client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
  } catch (Twister2Exception e) {
    // NOTE(review): the failure is only logged and the method returns normally,
    // so the caller cannot tell this apart from a successful barrier. Confirm this is intended.
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return;
  }

  // wait until all workers joined or the time limit is reached
  long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;

  // use the monotonic clock, so that the elapsed time can never be negative
  // and (tl - delay) can not overflow when tl is Long.MAX_VALUE
  long startNanos = System.nanoTime();
  long delay = 0;

  synchronized (initBarrierWaitObject) {
    // re-check the flag after every wake-up:
    //   - if the flag is already set, we do not wait at all
    //   - a spurious wake-up does not end the wait prematurely
    while (!initBarrierProceeded && delay < tl) {
      try {
        initBarrierWaitObject.wait(tl - delay);
      } catch (InterruptedException e) {
        // interruption is ignored on purpose: keep waiting for the remaining time
      }
      delay = (System.nanoTime() - startNanos) / 1_000_000L;
    }
  }

  // delete barrier znode in any case
  try {
    ZKBarrierManager.deleteWorkerZNodeFromInit(
        client, rootPath, jobID, workerInfo.getWorkerID());
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }

  if (initBarrierProceeded) {
    if (initBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
      return;
    } else if (initBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
      throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
    } else if (initBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
      throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: "
          + timeLimit + "ms.");
    }

    // this should never happen, since we have only these three options
    return;
  } else {
    throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
  }
}
use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.
the class JMWorkerController method sendBarrierRequest.
/**
 * Build a BarrierRequest for this worker, send it through {@code rrClient} and
 * block until the response arrives or the local wait limit expires.
 * <p>
 * Returns normally when the response result is SUCCESS
 * (or when it is none of the three results handled below).
 *
 * @param barrierType type of the barrier to put into the request
 * @param timeLimit timeout value put into the request; the local wait is twice this value
 * @throws TimeoutException if the response result is TIMED_OUT, or
 *                          if sending/waiting fails with a BlockingSendException
 * @throws JobFaultyException if the response result is JOB_FAULTY
 */
private void sendBarrierRequest(JobMasterAPI.BarrierType barrierType, long timeLimit)
    throws TimeoutException {

  JobMasterAPI.BarrierRequest request = JobMasterAPI.BarrierRequest.newBuilder()
      .setWorkerID(workerInfo.getWorkerID())
      .setBarrierType(barrierType)
      .setTimeout(timeLimit)
      .build();

  LOG.fine("Sending BarrierRequest message: \n" + request.toString());

  // the local wait time for the barrier response is (2 * timeLimit);
  // when doubling would overflow, wait for the long max value instead
  long localWaitLimit;
  if (timeLimit > Long.MAX_VALUE / 2) {
    localWaitLimit = Long.MAX_VALUE;
  } else {
    localWaitLimit = timeLimit * 2;
  }

  try {
    Tuple<RequestID, Message> reply = rrClient.sendRequestWaitResponse(request, localWaitLimit);
    JobMasterAPI.BarrierResponse barrierResponse =
        (JobMasterAPI.BarrierResponse) reply.getValue();
    JobMasterAPI.BarrierResult result = barrierResponse.getResult();

    if (result == JobMasterAPI.BarrierResult.SUCCESS) {
      return;
    }

    if (result == JobMasterAPI.BarrierResult.JOB_FAULTY) {
      throw new JobFaultyException("Job became faulty and Default Barrier failed.");
    }

    if (result == JobMasterAPI.BarrierResult.TIMED_OUT) {
      throw new TimeoutException("Barrier timed out. Not all workers arrived at the barrier "
          + "on the time limit: " + timeLimit + "ms");
    }
  } catch (BlockingSendException e) {
    throw new TimeoutException(
        "Not all workers arrived at the barrier on the time limit: " + timeLimit + "ms.", e);
  }
}
use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.
the class MPIWorkerManager method execute.
/**
 * Wait on the init barrier, then run the given IWorker once.
 * <p>
 * Worker ports are released only when the worker execute count is zero,
 * i.e. before the first execution.
 *
 * @return true when {@code managedWorker.execute} completes without throwing
 * @throws Twister2RuntimeException if the init barrier times out
 * @throws JobFaultyException rethrown from the IWorker after the job status is set to FAULTY
 */
public boolean execute(Config config,
                       JobAPI.Job job,
                       IWorkerController workerController,
                       IPersistentVolume persistentVolume,
                       IVolatileVolume volatileVolume,
                       IWorker managedWorker) {

  final int id = workerController.getWorkerInfo().getWorkerID();

  String waitingMessage = "Waiting on the init barrier before starting IWorker: " + id
      + " with restartCount: " + workerController.workerRestartCount()
      + " and with re-executionCount: " + JobProgress.getWorkerExecuteCount();
  LOG.info(waitingMessage);

  try {
    workerController.waitOnInitBarrier();
    firstInitBarrierProceeded = true;
  } catch (TimeoutException timeout) {
    throw new Twister2RuntimeException("Could not pass through the init barrier", timeout);
  }

  // worker ports are released only before the very first execution
  boolean firstExecution = JobProgress.getWorkerExecuteCount() == 0;
  if (firstExecution) {
    NetworkUtils.releaseWorkerPorts();
  }

  JobProgressImpl.setJobStatus(JobProgress.JobStatus.EXECUTING);
  JobProgressImpl.increaseWorkerExecuteCount();

  try {
    managedWorker.execute(config, job, workerController, persistentVolume, volatileVolume);
  } catch (JobFaultyException faulty) {
    // a worker in the cluster should have failed:
    // mark the job as faulty and let the caller handle the exception
    JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
    throw faulty;
  }

  return true;
}
use of edu.iu.dsc.tws.api.exceptions.JobFaultyException in project twister2 by DSC-SPIDAL.
the class WorkerManager method execute.
/**
 * Run the IWorker, re-executing it while the worker execute count is below maxRetries.
 * <p>
 * Each attempt waits on the init barrier, executes the IWorker and,
 * if the job is still healthy afterwards, waits on the final barrier for all other workers.
 *
 * @return true if the IWorker executed and the final barrier was passed,
 *         false if the retry limit is reached without a successful execution
 * @throws Twister2RuntimeException if the init barrier or the final barrier times out
 */
public boolean execute() {

  while (JobProgress.getWorkerExecuteCount() < maxRetries) {

    String waitingMessage = "Waiting on the init barrier before starting IWorker: " + workerID
        + " with restartCount: " + workerController.workerRestartCount()
        + " and with re-executionCount: " + JobProgress.getWorkerExecuteCount();
    LOG.info(waitingMessage);

    try {
      workerController.waitOnInitBarrier();
      firstInitBarrierProceeded = true;
    } catch (TimeoutException timeout) {
      throw new Twister2RuntimeException("Could not pass through the init barrier", timeout);
    }

    LOG.fine("Proceeded through INIT barrier. Starting Worker: " + workerID);

    JobProgressImpl.setJobStatus(JobProgress.JobStatus.EXECUTING);
    JobProgressImpl.increaseWorkerExecuteCount();
    JobProgressImpl.setRestartedWorkers(restartedWorkers.values());

    try {
      managedWorker.execute(config, job, workerController, persistentVolume, volatileVolume);
    } catch (JobFaultyException faulty) {
      // a worker in the cluster should have failed;
      // mark the job faulty, the next loop iteration re-executes this worker
      JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
      LOG.warning("thrown JobFaultyException. Some workers should have failed.");
    }

    // a faulty job skips the final barrier and goes straight to the next attempt
    if (!JobProgress.isJobHealthy()) {
      continue;
    }

    // make sure that all other workers finished successfully as well
    try {
      // wait on the barrier indefinitely until all workers arrive
      // or the barrier is broken with a job fault
      LOG.info("Worker completed, waiting for other workers to finish at the final barrier.");
      workerController.waitOnBarrier(Long.MAX_VALUE);
      LOG.info("Worker finished successfully");
      return true;
    } catch (TimeoutException timeout) {
      // this should never happen
      throw new Twister2RuntimeException("Could not pass through the final barrier", timeout);
    } catch (JobFaultyException faulty) {
      JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
      LOG.warning("thrown JobFaultyException. Some workers failed before finishing.");
    }
  }

  LOG.info(String.format("Re-executed IWorker %d times and failed, we are exiting", maxRetries));
  return false;
}
Aggregations