use of edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException in project twister2 by DSC-SPIDAL.
the class ZKPersStateManager method createJobMasterPersState.
/**
 * Creates the persistent znode that stores the job master state at the ZooKeeper server.
 * The znode body is encoded from the given job master address and the JM_STARTED state number.
 * This method must be called if isJobMasterRestarting returns false.
 *
 * @param client Curator client used to create the znode
 * @param rootPath root path handed to ZKUtils.jmPersPath to build the znode path
 * @param jobID job id handed to ZKUtils.jmPersPath to build the znode path
 * @param jmAddress job master address that is encoded into the znode body
 * @throws Twister2RuntimeException if encoding the body or creating the znode fails
 */
public static void createJobMasterPersState(CuratorFramework client,
                                            String rootPath,
                                            String jobID,
                                            String jmAddress) {
  final String persStatePath = ZKUtils.jmPersPath(rootPath, jobID);

  try {
    byte[] body =
        ZKUtils.encodeJobMasterZnode(jmAddress, JobMasterState.JM_STARTED.getNumber());

    client.create()
        .withMode(CreateMode.PERSISTENT)
        .forPath(persStatePath, body);

    LOG.info("JobMaster persistent state znode created: " + persStatePath);
  } catch (Exception e) {
    // any failure is surfaced as an unchecked exception with the cause preserved
    throw new Twister2RuntimeException("Can not initialize job master pers state znode.", e);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException in project twister2 by DSC-SPIDAL.
the class MPIChannel method progressReceives.
/**
 * Advances every pending receive by one step. This method needs to be called
 * repeatedly to progress the receives.
 *
 * <p>When a header receive is posted and its request has completed, the header buffer is
 * read as: index 0 the payload length, index 1 the finish flag, and any further ints the
 * user header. If the finish flag equals TWISTERX_MSG_FIN the receive is marked
 * RECEIVED_FIN; otherwise a data buffer is allocated and the payload receive is posted.
 * In both cases the receive callback is notified about the header.
 *
 * <p>When a payload receive is posted and its request has completed, the next header
 * receive is posted and the received data is handed to the receive callback.
 *
 * @throws Twister2RuntimeException if an MPIException is raised while progressing
 */
public void progressReceives() {
  try {
    for (Map.Entry<Integer, PendingReceive> entry : pendingReceives.entrySet()) {
      PendingReceive receive = entry.getValue();

      if (receive.status == ReceiveStatus.RECEIVE_LENGTH_POSTED) {
        Status headerStatus = receive.request.testStatus();
        if (headerStatus == null) {
          // the header has not completed yet, look at the next receive
          continue;
        }
        receive.request = null;

        int intsReceived = headerStatus.getCount(MPI.INT);
        // the first two header slots carry the payload length and the finish flag
        int payloadLength = receive.headerBuf.get(0);
        int finFlag = receive.headerBuf.get(1);

        // check whether we are at the end
        if (finFlag == TWISTERX_MSG_FIN) {
          if (intsReceived != 2) {
            LOG.log(Level.SEVERE,
                "Un-expected number of bytes expected: 2 received: " + intsReceived);
          }
          // we are not expecting to receive any more
          receive.status = ReceiveStatus.RECEIVED_FIN;
          receiveCallback.receivedHeader(entry.getKey(), finFlag, null, 0);
          continue;
        }

        if (intsReceived > 8) {
          LOG.log(Level.SEVERE,
              "Un-expected number of bytes expected: 8 or less received: " + intsReceived);
        }

        // allocate the payload buffer and post the receive for the payload
        receive.data = allocator.allocate(payloadLength);
        receive.length = payloadLength;
        receive.request = comm.iRecv(
            receive.data.getByteBuffer(), payloadLength, MPI.BYTE, receive.receiveId, edge);
        receive.status = ReceiveStatus.RECEIVE_POSTED;

        // everything after the first two header slots is the user header
        for (int src = 2; src < intsReceived; src++) {
          receive.userHeader[src - 2] = receive.headerBuf.get(src);
        }

        // notify the receiver
        receiveCallback.receivedHeader(
            entry.getKey(), finFlag, receive.userHeader, intsReceived - 2);
      } else if (receive.status == ReceiveStatus.RECEIVE_POSTED) {
        if (receive.request.test()) {
          receive.request = null;
          // reset the header buffer and post the receive for the next header
          receive.headerBuf.clear();
          receive.request = comm.iRecv(
              receive.headerBuf, TWISTERX_CHANNEL_HEADER_SIZE, MPI.INT, receive.receiveId, edge);
          receive.status = ReceiveStatus.RECEIVE_LENGTH_POSTED;
          // hand the completed payload to the receiver
          receiveCallback.receivedData(entry.getKey(), receive.data, receive.length);
        }
      }
      // any other status: we are at the end, nothing to progress
    }
  } catch (MPIException e) {
    LOG.log(Level.SEVERE, "Error in MPI", e);
    throw new Twister2RuntimeException(e);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException in project twister2 by DSC-SPIDAL.
the class TWSTCPChannel method progressSends.
/**
 * Posts every queued send and then checks the sends that are waiting for completion.
 * A send whose partial requests have all completed is reported through its callback and
 * removed from the waiting list.
 *
 * @throws Twister2RuntimeException if a partial request reports an error
 */
@Override
public void progressSends() {
  // we should rate limit here
  while (pendingSends.size() > 0) {
    TCPSendRequests queued = pendingSends.poll();
    // post the message and track it until all of its requests complete
    postMessage(queued);
    waitForCompletionSends.add(queued);
  }

  IterativeLinkedList.ILLIterator inFlightIt = waitForCompletionSends.iterator();
  while (inFlightIt.hasNext()) {
    TCPSendRequests inFlight = (TCPSendRequests) inFlightIt.next();

    IterativeLinkedList.ILLIterator partIt = inFlight.pendingSends.iterator();
    while (partIt.hasNext()) {
      Request part = (Request) partIt.next();
      if (part.request.isComplete()) {
        // this request has finished
        partIt.remove();
        continue;
      }
      if (part.request.isError()) {
        throw new Twister2RuntimeException(
            "Error when sending a message to worker: " + inFlight.rank);
      }
    }

    // ideally we should be able to call for each finish of the buffer
    if (inFlight.pendingSends.size() != 0) {
      continue;
    }
    inFlight.callback.onSendComplete(inFlight.rank, inFlight.edge, inFlight.message);
    inFlightIt.remove();
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException in project twister2 by DSC-SPIDAL.
the class TWSTCPChannel method reInit.
/**
 * Re-establishes the connections to the given restarted workers: closes the existing
 * connection to each of them, waits on the worker barrier, then starts new connections
 * and waits for them to be established.
 *
 * @param restartedWorkers the workers whose connections are closed and re-created
 * @throws Twister2RuntimeException if waiting on the barrier times out
 */
@Override
public void reInit(List<JobMasterAPI.WorkerInfo> restartedWorkers) {
  // close previous connections
  for (JobMasterAPI.WorkerInfo restarted : restartedWorkers) {
    channel.closeConnection(restarted.getWorkerID());
  }

  // do not reconnect before the barrier has been passed
  try {
    workerController.waitOnBarrier();
  } catch (TimeoutException barrierTimeout) {
    LOG.log(Level.SEVERE, barrierTimeout.getMessage(), barrierTimeout);
    throw new Twister2RuntimeException(barrierTimeout);
  }

  // describe the network location of every restarted worker
  List<NetworkInfo> targets = new ArrayList<>();
  for (JobMasterAPI.WorkerInfo restarted : restartedWorkers) {
    NetworkInfo target = new NetworkInfo(restarted.getWorkerID());
    target.addProperty(TCPContext.NETWORK_PORT, restarted.getPort());
    target.addProperty(TCPContext.NETWORK_HOSTNAME, restarted.getWorkerIP());
    targets.add(target);
  }

  // start the connections and wait for them to be established
  channel.startConnections(targets);
  channel.waitForConnections(maxConnEstTime);
}
use of edu.iu.dsc.tws.api.exceptions.Twister2RuntimeException in project twister2 by DSC-SPIDAL.
the class TWSUCXChannel method createUXCWorker.
/**
 * Sets up the UCX context, worker and listener of this channel and creates an endpoint to
 * every other joined worker.
 *
 * <p>If UCX objects were already created and shared under the key
 * "ucxSocketsForFreePorts" (this happens in mpi clusters), those objects are reused;
 * otherwise a new context, worker and listener are created. All UCX objects are registered
 * in {@code closeables}.
 *
 * @param iWorkerController controller that supplies this worker's address, the barrier and
 *                          the list of joined workers
 * @throws Twister2RuntimeException if the shared UCX objects are incomplete, the listener
 *                                  can not be started, or waiting on the barrier times out
 */
private void createUXCWorker(IWorkerController iWorkerController) {
  UcpContext ucpContext = null;
  UcpListener ucpListener = null;
  // if UCX socket is already created, use that
  // this happens in mpi clusters
  Stack<Closeable> ucxObjects = (Stack<Closeable>) WorkerEnvironment.getSharedValue("ucxSocketsForFreePorts");
  if (ucxObjects != null && ucxObjects.size() > 2) {
    // todo: handle the case when there are multiple ucp sockets
    while (!ucxObjects.isEmpty()) {
      Closeable ucxObj = ucxObjects.pop();
      if (ucxObj instanceof UcpListener) {
        ucpListener = (UcpListener) ucxObj;
      } else if (ucxObj instanceof UcpContext) {
        ucpContext = (UcpContext) ucxObj;
      } else if (ucxObj instanceof UcpWorker) {
        ucpWorker = (UcpWorker) ucxObj;
      } else {
        LOG.warning("Unrecognized UCX object: " + ucxObj);
      }
    }
    // fail fast instead of registering null closeables and hitting a
    // NullPointerException later when the worker is used or the channel is closed
    if (ucpContext == null || ucpWorker == null || ucpListener == null) {
      throw new Twister2RuntimeException("Can not start TWSUCXChannel. "
          + "Shared UCX objects are incomplete: context present=" + (ucpContext != null)
          + ", worker present=" + (ucpWorker != null)
          + ", listener present=" + (ucpListener != null));
    }
    // add them to closeables
    closeables.push(ucpContext);
    closeables.push(ucpWorker);
    closeables.push(ucpListener);
  } else {
    // create UCX objects
    ucpContext = initUcpContext();
    this.closeables.push(ucpContext);
    this.ucpWorker = ucpContext.newWorker(new UcpWorkerParams().requestThreadSafety());
    this.closeables.push(ucpWorker);
    UcpListenerParams ucpListenerParams = new UcpListenerParams().setSockAddr(
        new InetSocketAddress(iWorkerController.getWorkerInfo().getWorkerIP(),
            iWorkerController.getWorkerInfo().getPort()));
    // start listener
    try {
      ucpListener = ucpWorker.newListener(ucpListenerParams);
      closeables.push(ucpListener);
    } catch (org.openucx.jucx.UcxException ucxEx) {
      throw new Twister2RuntimeException("Can not start TWSUCXChannel.", ucxEx);
    }
  }
  try {
    // wait till everyone add listeners
    iWorkerController.waitOnBarrier();
  } catch (TimeoutException e) {
    // creating endpoints is pointless when the barrier did not complete;
    // report the failure the same way the TCP channel does on a barrier timeout
    LOG.log(Level.SEVERE, "Failed to wait on barrier", e);
    throw new Twister2RuntimeException("Failed to wait on barrier", e);
  }
  // create end points to every joined worker except this one
  for (JobMasterAPI.WorkerInfo worker : iWorkerController.getJoinedWorkers()) {
    if (worker.getWorkerID() != workerId) {
      UcpEndpoint ucpEndpoint = ucpWorker.newEndpoint(new UcpEndpointParams().setSocketAddress(
          new InetSocketAddress(worker.getWorkerIP(), worker.getPort())));
      this.endpoints.put(worker.getWorkerID(), ucpEndpoint);
      this.closeables.push(ucpEndpoint);
    }
  }
}
Aggregations