use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class TWSTCPChannel method reInit.
/**
 * Rebuilds the TCP connections to the given restarted workers.
 *
 * <p>The steps are, in order: close the existing connection of every listed
 * worker, block on the worker controller barrier, start fresh client
 * connections to those workers, and wait for the connections to be established.
 *
 * @param restartedWorkers the workers whose connections have to be re-created
 * @throws Twister2RuntimeException if waiting on the barrier times out
 */
@Override
public void reInit(List<JobMasterAPI.WorkerInfo> restartedWorkers) {
  // drop the old connection of each restarted worker
  restartedWorkers.forEach(stale -> channel.closeConnection(stale.getWorkerID()));

  // synchronize with the other workers before reconnecting
  try {
    workerController.waitOnBarrier();
  } catch (TimeoutException barrierTimeout) {
    LOG.log(Level.SEVERE, barrierTimeout.getMessage(), barrierTimeout);
    throw new Twister2RuntimeException(barrierTimeout);
  }

  // describe the network location (port, host) of every restarted worker
  List<NetworkInfo> targets = new ArrayList<>();
  for (JobMasterAPI.WorkerInfo restarted : restartedWorkers) {
    NetworkInfo target = new NetworkInfo(restarted.getWorkerID());
    target.addProperty(TCPContext.NETWORK_PORT, restarted.getPort());
    target.addProperty(TCPContext.NETWORK_HOSTNAME, restarted.getWorkerIP());
    targets.add(target);
  }

  // open the client connections, then wait for them to be established
  channel.startConnections(targets);
  channel.waitForConnections(maxConnEstTime);
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class TWSUCXChannel method createUXCWorker.
/**
 * Prepares the UCX context, worker and listener of this channel and opens an
 * endpoint to every other joined worker.
 *
 * <p>When the shared value {@code "ucxSocketsForFreePorts"} holds more than two
 * UCX objects, those objects are reused. Otherwise a new context, worker and
 * listener (bound to this worker's IP and port) are created. Every UCX object
 * is pushed to {@code closeables}.
 *
 * <p>All workers synchronize on the barrier before endpoints are created. A
 * barrier timeout aborts the setup instead of continuing, because the peers'
 * listeners can not be assumed to exist in that case.
 *
 * @param iWorkerController controller used to read this worker's address, to
 *                          wait on the barrier and to list the joined workers
 * @throws Twister2RuntimeException if the listener can not be started or the
 *                                  barrier wait times out
 */
private void createUXCWorker(IWorkerController iWorkerController) {
  UcpContext ucpContext = null;
  UcpListener ucpListener = null;
  // if UCX socket is already created, use that
  // this happens in mpi clusters
  Stack<Closeable> ucxObjects = (Stack<Closeable>) WorkerEnvironment.getSharedValue("ucxSocketsForFreePorts");
  if (ucxObjects != null && ucxObjects.size() > 2) {
    // todo: handle the case when there are multiple ucp sockets
    while (!ucxObjects.isEmpty()) {
      Closeable ucxObj = ucxObjects.pop();
      if (ucxObj instanceof UcpListener) {
        ucpListener = (UcpListener) ucxObj;
      } else if (ucxObj instanceof UcpContext) {
        ucpContext = (UcpContext) ucxObj;
      } else if (ucxObj instanceof UcpWorker) {
        ucpWorker = (UcpWorker) ucxObj;
      } else {
        LOG.warning("Unrecognized UCX object: " + ucxObj);
      }
    }
    // add them to closeables
    closeables.push(ucpContext);
    closeables.push(ucpWorker);
    closeables.push(ucpListener);
  } else {
    // create UCX objects
    ucpContext = initUcpContext();
    this.closeables.push(ucpContext);
    this.ucpWorker = ucpContext.newWorker(new UcpWorkerParams().requestThreadSafety());
    this.closeables.push(ucpWorker);
    UcpListenerParams ucpListenerParams = new UcpListenerParams().setSockAddr(
        new InetSocketAddress(iWorkerController.getWorkerInfo().getWorkerIP(),
            iWorkerController.getWorkerInfo().getPort()));
    // start listener
    try {
      ucpListener = ucpWorker.newListener(ucpListenerParams);
      closeables.push(ucpListener);
    } catch (org.openucx.jucx.UcxException ucxEx) {
      throw new Twister2RuntimeException("Can not start TWSUCXChannel.", ucxEx);
    }
  }

  try {
    // wait till everyone add listeners
    iWorkerController.waitOnBarrier();
  } catch (TimeoutException e) {
    LOG.log(Level.SEVERE, "Failed to wait on barrier", e);
    // do not go on to create endpoints: the barrier did not complete, so the
    // other workers' listeners may not be up yet
    throw new Twister2RuntimeException("Failed to wait on barrier", e);
  }

  // create end points to all joined workers except this one
  for (JobMasterAPI.WorkerInfo worker : iWorkerController.getJoinedWorkers()) {
    if (worker.getWorkerID() != workerId) {
      UcpEndpoint ucpEndpoint = ucpWorker.newEndpoint(new UcpEndpointParams().setSocketAddress(
          new InetSocketAddress(worker.getWorkerIP(), worker.getPort())));
      this.endpoints.put(worker.getWorkerID(), ucpEndpoint);
      this.closeables.push(ucpEndpoint);
    }
  }
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class HarpWorker method execute.
/**
 * Starts the Harp server and sync client for this worker, performs the Harp
 * master barrier, runs the user supplied {@code executeHarp} and finally shuts
 * the Harp components down.
 *
 * <p>If the list of all workers can not be obtained within the timeout, the
 * error is logged and the method returns without starting anything.
 *
 * @param config           the job configuration, passed on to {@code executeHarp}
 * @param job              the job description (not read by this method)
 * @param workerController used to obtain this worker's info and the worker list
 * @param persistentVolume passed on to {@code executeHarp}
 * @param volatileVolume   passed on to {@code executeHarp}
 * @throws RuntimeException if the Harp server can not be created or the master
 *                          barrier fails; the original failure is kept as cause
 */
@Override
public void execute(Config config, JobAPI.Job job, IWorkerController workerController, IPersistentVolume persistentVolume, IVolatileVolume volatileVolume) {
  int workerID = workerController.getWorkerInfo().getWorkerID();
  List<JobMasterAPI.WorkerInfo> workersList = null;
  try {
    workersList = workerController.getAllWorkers();
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    return;
  }
  LOG.info(String.format("Worker %s starting with %d workers, " + "after waiting for all to start. \n %s", workerID, workersList.size(), workersList.toString()));
  JobMasterAPI.WorkerInfo workerInfo = workerController.getWorkerInfo();

  // Building Harp Specific parameters
  Map<String, Integer> rackToIntegerMap = this.getRackToIntegerMap(workersList);
  LinkedList<Integer> nodeRackIDs = new LinkedList<>(rackToIntegerMap.values());
  // todo check the suitability
  int noOfPhysicalNodes = nodeRackIDs.size();
  Map<Integer, List<String>> nodesOfRackMap = this.getNodesOfRackMap(workersList, rackToIntegerMap);
  Workers workers = new Workers(nodesOfRackMap, nodeRackIDs, noOfPhysicalNodes, workerID);
  DataMap dataMap = new DataMap();
  // each worker gets its own Harp port: base port offset by the worker id
  int harpPort = Constant.DEFAULT_WORKER_POART_BASE + workerID;

  Server server;
  try {
    server = new Server(workerInfo.getWorkerIP(), harpPort, new EventQueue(), dataMap, workers);
  } catch (Exception e) {
    LOG.log(Level.SEVERE, String.format("Failed to start harp server %s:%d " + "on twister worker %s:%d", workerInfo.getWorkerIP(), harpPort, workerInfo.getWorkerIP(), workerInfo.getPort()), e);
    // keep the cause so callers can see why the server could not be created
    throw new RuntimeException("Failed to start Harp Server", e);
  }

  SyncClient syncClient = new SyncClient(workers);
  LOG.info("Starting Harp Sync client");
  syncClient.start();
  LOG.info(String.format("Starting harp server on port : %d", harpPort));
  server.start();
  LOG.info(String.format("Harp server started. %s:%d " + "on twister worker %s:%d", workerInfo.getWorkerIP(), harpPort, workerInfo.getWorkerIP(), workerInfo.getPort()));

  try {
    LOG.info("Trying master barrier");
    doMasterBarrier("start-worker", "handshake", dataMap, workers);
    LOG.info("Master barrier done");
  } catch (IOException e) {
    LOG.log(Level.SEVERE, "Failed to do master barrier", e);
    server.stop();
    syncClient.stop();
    // keep the cause so callers can see why the barrier failed
    throw new RuntimeException("Failed to do master barrier", e);
  }

  try {
    // call executeHarp that will be coded by user
    this.executeHarp(config, workerID, workerController.getNumberOfWorkers(), workerController, persistentVolume, volatileVolume, dataMap, workers);
    LOG.info("Execution completed. Shutting harp Sync Client down....");
  } finally {
    // stopping servers, releasing resources; done in finally so the sync
    // client and the server are also stopped when the user code throws
    syncClient.stop();
    LOG.info("Harp Sync Client stopped.");
    LOG.info("Shutting harp server down....");
    server.stop(true);
    LOG.info("Harp server stopped.");
  }
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class MesosWorker method launchTask.
/**
 * Runs a twister2 worker for the given Mesos task.
 *
 * <p>The method reports {@code TASK_RUNNING}, reads the worker port from the
 * first range of the task's "ports" resource, creates and initializes a
 * {@code MesosWorkerController}, waits for all workers to join, executes the
 * job's {@code IWorker} and then reports {@code TASK_FINISHED}.
 *
 * <p>If the local host can not be resolved or waiting for the workers times
 * out, the error is logged and the method returns without reporting
 * {@code TASK_FINISHED}, since the worker did not run.
 *
 * @param executorDriver driver used to send task status updates
 * @param taskInfo       the task to launch; its data holds the worker id as a
 *                       UTF-8 encoded integer
 */
@Override
public void launchTask(ExecutorDriver executorDriver, Protos.TaskInfo taskInfo) {
  LOG.info("Task start time(ms):" + System.currentTimeMillis());
  Integer id = Integer.parseInt(taskInfo.getData().toStringUtf8());
  LOG.info("Task " + id + " has started");
  Protos.TaskStatus status = Protos.TaskStatus.newBuilder().setTaskId(taskInfo.getTaskId()).setState(Protos.TaskState.TASK_RUNNING).build();
  executorDriver.sendStatusUpdate(status);
  // jobID = SchedulerContext.jobID(config);
  // System.out.println("job name is " + jobID);

  // take the beginning of the first range of the "ports" resource as the port
  long port = 0;
  for (Protos.Resource r : taskInfo.getResourcesList()) {
    if (r.getName().equals("ports")) {
      port = r.getRanges().getRange(0).getBegin();
      break;
    }
  }

  MesosWorkerController workerController;
  try {
    JobAPI.Job job = JobUtils.readJobFile("twister2-job/" + jobID + ".job");
    workerController = new MesosWorkerController(config, job, InetAddress.getLocalHost().getHostAddress(), toIntExact(port), id);
    LOG.info("Initializing with zookeeper");
    workerController.initializeWithZooKeeper();
    LOG.info("Waiting for all workers to join");
    workerController.getAllWorkers();
    LOG.info("Everyone has joined");
    IWorker worker = JobUtils.initializeIWorker(job);
    worker.execute(config, job, workerController, null, null);
    workerController.close();
  } catch (UnknownHostException e) {
    LOG.log(Level.SEVERE, "Host unkown " + e.getMessage(), e);
    // the worker never ran, so it must not be reported as finished below
    // NOTE(review): consider reporting a failure state to Mesos here and in
    // the timeout branch — confirm against the framework's expectations
    return;
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    return;
  }

  // The below two lines can be used to send a message to the framework
  // String reply = id.toString();
  // executorDriver.sendFrameworkMessage(reply.getBytes());
  LOG.info("Task " + id + " has finished");
  status = Protos.TaskStatus.newBuilder().setTaskId(taskInfo.getTaskId()).setState(Protos.TaskState.TASK_FINISHED).build();
  executorDriver.sendStatusUpdate(status);
}
use of edu.iu.dsc.tws.api.exceptions.TimeoutException in project twister2 by DSC-SPIDAL.
the class JobMasterClientExample method simulateClient.
/**
 * a method to simulate JMWorkerAgent running in workers
 *
 * <p>The simulated worker initializes {@code WorkerRuntime} with a random port
 * in the range 10000-19999, echoes every driver message back to the driver,
 * waits for all workers to join, waits on the barrier, sleeps for a random
 * time and then calls {@code throwException(workerID)}. The worker status is
 * set to FAILED or COMPLETED accordingly and {@code WorkerRuntime} is closed
 * on every exit path.
 *
 * @param config   the configuration used to initialize the worker runtime
 * @param job      the job this simulated worker belongs to
 * @param workerID the id of the simulated worker
 */
public static void simulateClient(Config config, JobAPI.Job job, int workerID) {
  String workerIP = JMWorkerController.convertStringToIP("localhost").getHostAddress();
  int workerPort = 10000 + (int) (Math.random() * 10000);
  JobMasterAPI.NodeInfo nodeInfo = NodeInfoUtils.createNodeInfo("node.ip", "rack01", null);
  JobAPI.ComputeResource computeResource = job.getComputeResource(0);
  Map<String, Integer> additionalPorts = generateAdditionalPorts(config, workerPort);
  JobMasterAPI.WorkerInfo workerInfo = WorkerInfoUtils.createWorkerInfo(workerID, workerIP, workerPort, nodeInfo, computeResource, additionalPorts);
  int restartCount = K8sWorkerUtils.getAndInitRestartCount(config, job.getJobId(), workerInfo);

  // measure how long the runtime initialization takes
  long start = System.currentTimeMillis();
  WorkerRuntime.init(config, job, workerInfo, restartCount);
  long delay = System.currentTimeMillis() - start;
  LOG.severe("worker-" + workerID + " startupDelay " + delay);

  IWorkerStatusUpdater statusUpdater = WorkerRuntime.getWorkerStatusUpdater();
  IWorkerController workerController = WorkerRuntime.getWorkerController();
  ISenderToDriver senderToDriver = WorkerRuntime.getSenderToDriver();

  // echo every message received from the driver back to it
  WorkerRuntime.addReceiverFromDriver(new IReceiverFromDriver() {
    @Override
    public void driverMessageReceived(Any anyMessage) {
      LOG.info("Received message from IDriver: \n" + anyMessage);
      senderToDriver.sendToDriver(anyMessage);
    }
  });

  try {
    List<JobMasterAPI.WorkerInfo> workerList = workerController.getAllWorkers();
    LOG.info("All workers joined... IDs: " + getIDs(workerList));
  } catch (TimeoutException timeoutException) {
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
    // release the runtime before giving up, same as the failure path below,
    // so that the initialized WorkerRuntime is not left open
    statusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
    WorkerRuntime.close();
    return;
  }

  // wait
  sleeeep(2 * 1000);

  try {
    workerController.waitOnBarrier();
    LOG.info("All workers reached the barrier. Proceeding.......");
  } catch (TimeoutException timeoutException) {
    // a barrier timeout is only logged; the simulation continues
    LOG.log(Level.SEVERE, timeoutException.getMessage(), timeoutException);
  }

  // int id = job.getNumberOfWorkers() - 1;
  // JobMasterAPI.WorkerInfo info = workerController.getWorkerInfoForID(id);
  // LOG.info("WorkerInfo for " + id + ": \n" + info);

  // wait up to 10 sec
  sleeeep((long) (Math.random() * 10 * 1000));

  // start the worker
  try {
    throwException(workerID);
  } catch (Throwable t) {
    // update worker status to FAILED
    statusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.FAILED);
    WorkerRuntime.close();
    // System.exit(1);
    throw t;
  }

  statusUpdater.updateWorkerStatus(JobMasterAPI.WorkerState.COMPLETED);
  WorkerRuntime.close();
  System.out.println("Client has finished the computation. Client exiting.");
}
Aggregations