use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class MPIWorkerManager method execute.
/**
 * Runs the given IWorker once, after this worker has passed the init barrier.
 *
 * <p>Worker ports are released only when the worker execute count is still zero.
 * The job status is set to EXECUTING before the worker runs, and to FAULTY when the
 * worker throws a JobFaultyException, which is then rethrown to the caller.
 *
 * @return true when {@code managedWorker.execute} returns normally
 * @throws Twister2RuntimeException if waiting on the init barrier times out
 * @throws JobFaultyException rethrown unchanged from the managed worker
 */
public boolean execute(Config config,
                       JobAPI.Job job,
                       IWorkerController workerController,
                       IPersistentVolume persistentVolume,
                       IVolatileVolume volatileVolume,
                       IWorker managedWorker) {
  final int workerID = workerController.getWorkerInfo().getWorkerID();
  final String barrierMessage = "Waiting on the init barrier before starting IWorker: " + workerID
      + " with restartCount: " + workerController.workerRestartCount()
      + " and with re-executionCount: " + JobProgress.getWorkerExecuteCount();
  LOG.info(barrierMessage);

  try {
    workerController.waitOnInitBarrier();
  } catch (TimeoutException timeout) {
    throw new Twister2RuntimeException("Could not pass through the init barrier", timeout);
  }
  // only reached when the barrier wait returned normally
  firstInitBarrierProceeded = true;

  // the ports are held until the very first execution starts
  final boolean firstExecution = JobProgress.getWorkerExecuteCount() == 0;
  if (firstExecution) {
    NetworkUtils.releaseWorkerPorts();
  }

  JobProgressImpl.setJobStatus(JobProgress.JobStatus.EXECUTING);
  JobProgressImpl.increaseWorkerExecuteCount();

  try {
    managedWorker.execute(config, job, workerController, persistentVolume, volatileVolume);
  } catch (JobFaultyException faulty) {
    // a worker in the cluster should have failed; record it and let the caller decide
    JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
    throw faulty;
  }
  return true;
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class WorkerManager method execute.
/**
 * Executes the managed IWorker, re-executing it after job faults.
 *
 * <ul>
 *   <li>returns true once the worker has run and the final barrier has been passed</li>
 *   <li>returns false when the worker execute count reaches maxRetries without success</li>
 *   <li>throws an exception if execution fails and the worker needs to be restarted
 *       from the jvm; in this method that is a Twister2RuntimeException raised when
 *       a barrier wait times out</li>
 * </ul>
 */
public boolean execute() {
  while (JobProgress.getWorkerExecuteCount() < maxRetries) {
    final String initMessage = "Waiting on the init barrier before starting IWorker: " + workerID
        + " with restartCount: " + workerController.workerRestartCount()
        + " and with re-executionCount: " + JobProgress.getWorkerExecuteCount();
    LOG.info(initMessage);

    try {
      workerController.waitOnInitBarrier();
    } catch (TimeoutException timeout) {
      throw new Twister2RuntimeException("Could not pass through the init barrier", timeout);
    }
    // only reached when the barrier wait returned normally
    firstInitBarrierProceeded = true;
    LOG.fine("Proceeded through INIT barrier. Starting Worker: " + workerID);

    JobProgressImpl.setJobStatus(JobProgress.JobStatus.EXECUTING);
    JobProgressImpl.increaseWorkerExecuteCount();
    JobProgressImpl.setRestartedWorkers(restartedWorkers.values());

    try {
      managedWorker.execute(config, job, workerController, persistentVolume, volatileVolume);
    } catch (JobFaultyException faulty) {
      // a worker in the cluster should have failed;
      // mark the job faulty so that the next loop iteration re-executes this worker
      JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
      LOG.warning("thrown JobFaultyException. Some workers should have failed.");
    }

    // an unhealthy job means this attempt did not succeed: go around again
    if (!JobProgress.isJobHealthy()) {
      continue;
    }

    // make sure that all other workers finished successfully as well
    try {
      // wait on the barrier indefinitely until all workers arrive
      // or the barrier is broken with a job fault
      LOG.info("Worker completed, waiting for other workers to finish at the final barrier.");
      workerController.waitOnBarrier(Long.MAX_VALUE);
      LOG.info("Worker finished successfully");
      return true;
    } catch (TimeoutException timeout) {
      // this should never happen
      throw new Twister2RuntimeException("Could not pass through the final barrier", timeout);
    } catch (JobFaultyException faulty) {
      JobProgressImpl.setJobStatus(JobProgress.JobStatus.FAULTY);
      LOG.warning("thrown JobFaultyException. Some workers failed before finishing.");
    }
  }

  LOG.info(String.format("Re-executed IWorker %d times and failed, we are exiting", maxRetries));
  return false;
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class TaskWorker method execute.
/**
 * Stores the supplied runtime resources on this worker, initializes the worker and
 * compute environments, runs the no-argument {@code execute()}, waits on the worker
 * barrier, closes both environments and finally reports a non-failure
 * "Worker Completed" state to the driver.
 *
 * <p>A TimeoutException from the barrier wait is logged at SEVERE level and does not
 * stop the shutdown and reporting steps that follow.
 */
@Override
public void execute(Config cfg,
                    JobAPI.Job job,
                    IWorkerController wController,
                    IPersistentVolume pVolume,
                    IVolatileVolume vVolume) {
  this.config = cfg;
  this.workerId = wController.getWorkerInfo().getWorkerID();
  this.workerController = wController;
  this.persistentVolume = pVolume;
  this.volatileVolume = vVolume;

  // the driver handle is obtained up front and used only for the final status report
  final ISenderToDriver driverSender = JMWorkerAgent.getJMWorkerAgent().getDriverAgent();

  this.workerEnvironment = WorkerEnvironment.init(cfg, job, wController, pVolume, vVolume);
  this.computeEnvironment = ComputeEnvironment.init(this.workerEnvironment);
  // to keep backward compatibility
  this.taskExecutor = this.computeEnvironment.getTaskExecutor();

  // run the user logic
  execute();

  // wait for the sync
  try {
    this.workerEnvironment.getWorkerController().waitOnBarrier();
  } catch (TimeoutException timeout) {
    LOG.log(Level.SEVERE, timeout.getMessage(), timeout);
  }

  this.computeEnvironment.close();
  // lets terminate the network
  this.workerEnvironment.close();

  // we are done executing
  // If the execute returns without any errors we assume that the job completed properly
  final String jobName = this.config.getStringValue(Context.JOB_ID);
  final JobExecutionState.WorkerJobState completedState =
      JobExecutionState.WorkerJobState.newBuilder()
          .setFailure(false)
          .setJobName(jobName)
          .setWorkerMessage("Worker Completed")
          .build();
  driverSender.sendToDriver(completedState);

  LOG.log(Level.FINE, String.format("%d Worker done", this.workerId));
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class NomadWorkerStarter method startWorker.
/**
 * Starts this worker process: creates the worker controller, logs the local IP
 * address, fetches the list of all workers and then runs the job's IWorker through
 * an MPIWorkerManager.
 *
 * <p>If fetching the worker list times out, the failure is logged at SEVERE level
 * and the method returns without starting the IWorker.
 */
private void startWorker() {
  LOG.log(Level.INFO, "A worker process is starting...");
  // lets create the resource plan
  this.workerController = createWorkerController();

  try {
    LOG.log(Level.INFO, "Worker IP..:" + Inet4Address.getLocalHost().getHostAddress());
  } catch (UnknownHostException e) {
    // the address is only logged for diagnostics, so startup continues without it;
    // report through the logger instead of printing the stack trace to stderr
    LOG.log(Level.WARNING, "Could not determine the local host address", e);
  }

  try {
    // the returned list is not used here; the call is kept because a timeout
    // must abort the startup (presumably it waits for the other workers - TODO confirm)
    workerController.getAllWorkers();
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    return;
  }

  IWorker worker = JobUtils.initializeIWorker(job);
  MPIWorkerManager workerManager = new MPIWorkerManager();
  // no persistent or volatile volume is passed to the worker
  workerManager.execute(config, job, workerController, null, null, worker);
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class CDFWRuntime method reinitialize.
/**
 * Closes the current communicator and rebuilds the channel, the communicator and the
 * task executor from the current list of workers.
 *
 * @return true when the channel, communicator and task executor were rebuilt;
 *     false when the worker list could not be obtained before the timeout. In that
 *     case the old communicator has already been closed and nothing was rebuilt,
 *     so the caller must not continue to use it.
 */
private boolean reinitialize() {
  communicator.close();

  final List<JobMasterAPI.WorkerInfo> workerInfoList;
  try {
    workerInfoList = controller.getAllWorkers();
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    // without the worker list a task executor cannot be built correctly;
    // report the failure instead of continuing with a null list and returning true
    return false;
  }

  // create the channel
  channel = Network.initializeChannel(config, controller);
  // kept as a typed local so the Communicator constructor call stays unambiguous
  String persistent = null;
  // create the communicator
  communicator = new Communicator(config, channel, persistent);
  taskExecutor = new TaskExecutor(config, workerId, workerInfoList, communicator, null);
  return true;
}
Aggregations