use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class ZKWorkerController method waitOnBarrier.
/**
 * Blocks this worker on the default barrier until all workers arrive or the wait times out.
 * <p>
 * All workers create a znode on the barrier directory.
 * Job master watches znode creations/removals on this directory.
 * When the number of znodes on that directory reaches the number of workers in the job,
 * Job master publishes AllArrivedOnBarrier event.
 * Workers proceed when they get this event or when they time out.
 * <p>
 * Workers remove their znodes after they proceed through the barrier
 * so that they can wait on the barrier again.
 * Workers are responsible for creating and removing znodes on the barrier.
 * Job master removes barrier znode after the job completion or scale down.
 * <p>
 * The local wait is bounded by twice the requested time limit (saturating at
 * {@code Long.MAX_VALUE}). If the barrier znode can not be created, the failure is logged
 * and the method returns without waiting.
 * <p>
 * If the waiting thread is interrupted, the wait continues until the barrier result arrives
 * or the local limit expires; the interrupt status is restored after the barrier znode
 * has been cleaned up.
 *
 * @param timeLimit the barrier time limit in milliseconds that is sent with the barrier znode
 * @throws TimeoutException if the barrier result is TIMED_OUT or no barrier result is
 *     received within the local wait limit
 * @throws JobFaultyException if the job is already faulty or the barrier result is JOB_FAULTY
 */
@Override
public void waitOnBarrier(long timeLimit) throws TimeoutException {
  // do not wait on the barrier
  if (JobProgress.isJobFaulty()) {
    throw new JobFaultyException("Can not wait on the barrier, since the job is faulty.");
  }

  defaultBarrierProceeded = false;

  try {
    ZKBarrierManager.createWorkerZNodeAtDefault(client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return;
  }

  // wait until all workers joined or time limit is reached
  long startTime = System.currentTimeMillis();
  // local wait limit is (2 * timeLimit), saturated to avoid long overflow
  long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;
  boolean interrupted = false;

  synchronized (defaultBarrierWaitObject) {
    long remaining = tl;
    // Guarded wait: the flag is re-checked after every wakeup, so a result that arrived
    // before we started waiting ends the loop immediately, and a spurious wakeup or an
    // interrupt resumes the wait for the remaining time only.
    // remaining > 0 also protects against wait(0), which would block without a time limit.
    while (!defaultBarrierProceeded && remaining > 0) {
      try {
        defaultBarrierWaitObject.wait(remaining);
      } catch (InterruptedException e) {
        // keep waiting for the barrier; the interrupt status is restored below
        interrupted = true;
      }
      // clamp to zero in case the wall clock moved backwards
      long elapsed = Math.max(0L, System.currentTimeMillis() - startTime);
      remaining = tl - elapsed;
    }
  }

  // delete barrier znode in any case
  try {
    ZKBarrierManager.deleteWorkerZNodeFromDefault(client, rootPath, jobID, workerInfo.getWorkerID());
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }

  // restore the interrupt status only after the znode cleanup,
  // so that the cleanup call is not disturbed by a pending interrupt
  if (interrupted) {
    Thread.currentThread().interrupt();
  }

  if (defaultBarrierProceeded) {
    if (defaultBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
      return;
    } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
      throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
    } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
      throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: " + timeLimit + "ms.");
    }
    // this should never happen, since we have only these three options
    return;
  } else {
    throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
  }
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class ZKWorkerController method waitOnInitBarrier.
/**
 * Blocks this worker on the init barrier until all workers arrive or the wait times out.
 * <p>
 * The algorithm is the same as the default barrier: the worker creates its znode on the
 * init barrier, waits for the barrier result, and removes its znode afterwards.
 * The time limit is read from {@code ControllerContext.maxWaitTimeOnInitBarrier(config)};
 * the local wait is bounded by twice that value (saturating at {@code Long.MAX_VALUE}).
 * <p>
 * If the barrier znode can not be created, the failure is logged and the method returns
 * without waiting. If the waiting thread is interrupted, the wait continues until the
 * barrier result arrives or the local limit expires; the interrupt status is restored after
 * the barrier znode has been cleaned up.
 *
 * @throws TimeoutException if the barrier result is TIMED_OUT or no barrier result is
 *     received within the local wait limit
 * @throws JobFaultyException if the barrier result is JOB_FAULTY
 */
public void waitOnInitBarrier() throws TimeoutException {
  initBarrierProceeded = false;
  long timeLimit = ControllerContext.maxWaitTimeOnInitBarrier(config);

  try {
    ZKBarrierManager.createWorkerZNodeAtInit(client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return;
  }

  // wait until all workers joined or the time limit is reached
  long startTime = System.currentTimeMillis();
  // local wait limit is (2 * timeLimit), saturated to avoid long overflow
  long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;
  boolean interrupted = false;

  synchronized (initBarrierWaitObject) {
    long remaining = tl;
    // Guarded wait: the flag is re-checked after every wakeup, so a result that arrived
    // before we started waiting ends the loop immediately, and a spurious wakeup or an
    // interrupt resumes the wait for the remaining time only.
    // remaining > 0 also protects against wait(0), which would block without a time limit.
    while (!initBarrierProceeded && remaining > 0) {
      try {
        initBarrierWaitObject.wait(remaining);
      } catch (InterruptedException e) {
        // keep waiting for the barrier; the interrupt status is restored below
        interrupted = true;
      }
      // clamp to zero in case the wall clock moved backwards
      long elapsed = Math.max(0L, System.currentTimeMillis() - startTime);
      remaining = tl - elapsed;
    }
  }

  // delete barrier znode in any case
  try {
    ZKBarrierManager.deleteWorkerZNodeFromInit(client, rootPath, jobID, workerInfo.getWorkerID());
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
  }

  // restore the interrupt status only after the znode cleanup,
  // so that the cleanup call is not disturbed by a pending interrupt
  if (interrupted) {
    Thread.currentThread().interrupt();
  }

  if (initBarrierProceeded) {
    if (initBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
      return;
    } else if (initBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
      throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
    } else if (initBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
      throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: " + timeLimit + "ms.");
    }
    // this should never happen, since we have only these three options
    return;
  } else {
    throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
  }
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class TaskUtils method execute.
/**
 * Schedules the given compute graph over all workers with a round-robin task scheduler,
 * builds the execution plan for this worker and runs it.
 * <p>
 * If the worker list can not be retrieved because of a timeout, the error is logged
 * and the method returns without executing the graph.
 *
 * @param config the configuration used by the scheduler, the network channel and the executor
 * @param workerID the ID of this worker
 * @param graph the compute graph to schedule and execute
 * @param workerController the controller used to get the worker list and the checkpointing client
 */
public static void execute(Config config, int workerID, ComputeGraph graph, IWorkerController workerController) {
  RoundRobinTaskScheduler scheduler = new RoundRobinTaskScheduler();
  scheduler.initialize(config);

  // without the worker list there is nothing to schedule on
  final List<JobMasterAPI.WorkerInfo> workers;
  try {
    workers = workerController.getAllWorkers();
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    return;
  }

  // map the tasks of the graph onto the workers
  TaskSchedulePlan schedulePlan = scheduler.schedule(graph, createWorkerPlan(workers));

  // build the execution plan on top of a freshly initialized channel
  TWSChannel channel = Network.initializeChannel(config, workerController);
  Communicator communicator = new Communicator(config, channel);
  ExecutionPlanBuilder planBuilder = new ExecutionPlanBuilder(
      workerID, workers, communicator, workerController.getCheckpointingClient());
  ExecutionPlan executionPlan = planBuilder.build(config, graph, schedulePlan);

  // run the plan with the executor matching the operation mode of the graph
  ExecutorFactory executorFactory = new ExecutorFactory(config, workerID, channel);
  executorFactory.getExecutor(config, executionPlan, graph.getOperationMode()).execute();
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class TaskUtils method executeBatch.
/**
 * Runs the given compute graph on this worker: gets the list of all workers, schedules the
 * tasks with a round-robin task scheduler, builds the execution plan and executes it with
 * the executor selected by the operation mode of the graph.
 * <p>
 * A timeout while getting the worker list is logged and ends the method
 * before anything is scheduled or executed.
 *
 * @param config the configuration used by the scheduler, the network channel and the executor
 * @param workerID the ID of this worker
 * @param graph the compute graph to schedule and execute
 * @param workerController the controller used to get the worker list and the checkpointing client
 */
public static void executeBatch(Config config, int workerID, ComputeGraph graph, IWorkerController workerController) {
  RoundRobinTaskScheduler taskScheduler = new RoundRobinTaskScheduler();
  taskScheduler.initialize(config);

  final List<JobMasterAPI.WorkerInfo> allWorkers;
  try {
    allWorkers = workerController.getAllWorkers();
  } catch (TimeoutException timeoutException) {
    // the worker list is required for scheduling; give up on this execution
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    return;
  }

  final WorkerPlan plannedWorkers = createWorkerPlan(allWorkers);
  final TaskSchedulePlan scheduled = taskScheduler.schedule(graph, plannedWorkers);

  final TWSChannel twsChannel = Network.initializeChannel(config, workerController);
  final Communicator comm = new Communicator(config, twsChannel);
  final ExecutionPlan execPlan =
      new ExecutionPlanBuilder(workerID, allWorkers, comm, workerController.getCheckpointingClient())
          .build(config, graph, scheduled);

  new ExecutorFactory(config, workerID, twsChannel)
      .getExecutor(config, execPlan, graph.getOperationMode())
      .execute();
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class JMWorkerController method sendBarrierRequest.
/**
 * Sends a barrier request for this worker and blocks until the barrier response arrives.
 * <p>
 * The local wait for the response is twice the requested time limit; when the requested
 * limit is more than half of the long max value, the long max value is used instead.
 *
 * @param barrierType the type of the barrier to put into the request
 * @param timeLimit the barrier time limit in milliseconds that is put into the request
 * @throws TimeoutException if the barrier result is TIMED_OUT or the blocking send fails
 * @throws JobFaultyException if the barrier result is JOB_FAULTY
 */
private void sendBarrierRequest(JobMasterAPI.BarrierType barrierType, long timeLimit) throws TimeoutException {
  JobMasterAPI.BarrierRequest request = JobMasterAPI.BarrierRequest.newBuilder()
      .setWorkerID(workerInfo.getWorkerID())
      .setBarrierType(barrierType)
      .setTimeout(timeLimit)
      .build();
  LOG.fine("Sending BarrierRequest message: \n" + request.toString());

  // wait locally for (2 * timeLimit), saturating at the long max value to avoid overflow
  long localWait;
  if (timeLimit > Long.MAX_VALUE / 2) {
    localWait = Long.MAX_VALUE;
  } else {
    localWait = timeLimit * 2;
  }

  try {
    Tuple<RequestID, Message> reply = rrClient.sendRequestWaitResponse(request, localWait);
    JobMasterAPI.BarrierResponse barrierResponse = (JobMasterAPI.BarrierResponse) reply.getValue();
    JobMasterAPI.BarrierResult result = barrierResponse.getResult();

    if (result == JobMasterAPI.BarrierResult.SUCCESS) {
      return;
    }
    if (result == JobMasterAPI.BarrierResult.JOB_FAULTY) {
      throw new JobFaultyException("Job became faulty and Default Barrier failed.");
    }
    if (result == JobMasterAPI.BarrierResult.TIMED_OUT) {
      throw new TimeoutException("Barrier timed out. Not all workers arrived at the barrier on the time limit: " + timeLimit + "ms");
    }
    // any other result falls through and the method returns normally
  } catch (BlockingSendException e) {
    throw new TimeoutException("Not all workers arrived at the barrier on the time limit: " + timeLimit + "ms.", e);
  }
}
Aggregations