use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKPersStateManager method createWorkerPersState.
/**
 * Create the persistent state znode of a worker at the ZooKeeper server.
 * <p>
 * The znode is created in PERSISTENT mode and its body is a WorkerWithState
 * built from the given WorkerInfo with state STARTED and restart count 0.
 * This method must be called only after making sure that
 * there is no pers state znode for this worker at zk.
 *
 * @param client Curator client used to talk to the ZooKeeper server
 * @param rootPath root path handed to ZKUtils.persDir
 * @param jobID job identifier handed to ZKUtils.persDir
 * @param workerInfo worker whose persistent state znode is created
 * @throws Twister2Exception wrapping any exception raised while building or writing the znode
 */
public static void createWorkerPersState(CuratorFramework client, String rootPath, String jobID, WorkerInfo workerInfo) throws Twister2Exception {
  String znodePath = ZKUtils.workerPath(ZKUtils.persDir(rootPath, jobID), workerInfo.getWorkerID());
  try {
    // a freshly started worker: STARTED state, zero restarts
    byte[] znodeBody = new WorkerWithState(workerInfo, WorkerState.STARTED, 0).toByteArray();
    client.create().withMode(CreateMode.PERSISTENT).forPath(znodePath, znodeBody);
  } catch (Exception e) {
    throw new Twister2Exception("Can not initialize pers state znode for the worker.", e);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKPersStateManager method initAndGetRestartCount.
/**
 * Check the persistent state znode of a worker and return its restart count.
 * <p>
 * If a pers state znode already exists for this worker, the worker is restarting:
 * the previous restart count is read from the znode, incremented by one,
 * and the znode body is overwritten with the given WorkerInfo,
 * state RESTARTED and the new restart count.
 * <p>
 * If there is no such znode, 0 is returned (first start) and
 * this method does not write anything to the ZooKeeper server.
 * <p>
 * Each worker must call this method exactly once when they start.
 *
 * @param client Curator client used to talk to the ZooKeeper server
 * @param rootPath root path handed to ZKUtils.persDir
 * @param jobID job identifier handed to ZKUtils.persDir
 * @param workerInfo current info of the calling worker
 * @return restart count: 0 means first start
 * @throws Twister2Exception wrapping any exception raised while reading or updating the znode
 */
public static int initAndGetRestartCount(CuratorFramework client, String rootPath, String jobID, WorkerInfo workerInfo) throws Twister2Exception {
  String znodePath = ZKUtils.workerPath(ZKUtils.persDir(rootPath, jobID), workerInfo.getWorkerID());
  try {
    // no znode yet: the worker is starting for the first time
    if (client.checkExists().forPath(znodePath) == null) {
      return 0;
    }

    LOG.warning("Worker restarting. Worker PersStateZNode exists: " + znodePath);

    // read the previous state and bump the restart count
    WorkerWithState previousState = WorkerWithState.decode(client.getData().forPath(znodePath));
    int restartCount = previousState.getRestartCount() + 1;

    // overwrite the znode body with the fresh WorkerInfo
    WorkerWithState restartedState = new WorkerWithState(workerInfo, WorkerState.RESTARTED, restartCount);
    client.setData().forPath(znodePath, restartedState.toByteArray());
    return restartCount;
  } catch (Exception e) {
    throw new Twister2Exception("Can not initialize pers state znode for the worker.", e);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKWorkerController method waitOnBarrier.
/**
 * Wait on the default barrier until all workers arrive or the time limit is reached.
 * <p>
 * All workers create a znode on the barrier directory.
 * Job master watches znode creations/removals on this directory;
 * when the number of znodes on that directory reaches the number of workers in the job,
 * Job master publishes AllArrivedOnBarrier event.
 * Workers proceed when they get this event or when they time out.
 * <p>
 * Workers remove their znodes after they proceed through the barrier
 * so that they can wait on the barrier again.
 * Workers are responsible for creating and removing znodes on the barrier.
 * Job master removes barrier znode after the job completion or scale down.
 * <p>
 * The worker waits locally for at most twice the given time limit.
 * If the barrier znode can not be created, the failure is logged and
 * the method returns without waiting.
 * If the waiting thread is interrupted, it keeps waiting and
 * its interrupt status is restored before this method exits.
 *
 * @param timeLimit barrier time limit in milliseconds
 * @throws TimeoutException if the barrier result is TIMED_OUT or
 *     the barrier event does not arrive within the local waiting period
 * @throws JobFaultyException if the job is faulty before waiting or
 *     the barrier result is JOB_FAULTY
 */
@Override
public void waitOnBarrier(long timeLimit) throws TimeoutException {
  // do not wait on the barrier
  if (JobProgress.isJobFaulty()) {
    throw new JobFaultyException("Can not wait on the barrier, since the job is faulty.");
  }

  defaultBarrierProceeded = false;
  try {
    ZKBarrierManager.createWorkerZNodeAtDefault(client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return;
  }

  // local waiting period is twice the barrier time limit, guarded against overflow
  long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;
  // monotonic clock: elapsed time never goes negative, so "remaining" can not overflow
  long startNanos = System.nanoTime();
  boolean interrupted = false;

  try {
    // wait until all workers joined or time limit is reached
    synchronized (defaultBarrierWaitObject) {
      long remaining = tl;
      // re-check the flag after every wakeup: handles the event arriving before
      // we start waiting, spurious wakeups and interrupts without busy spinning
      while (!defaultBarrierProceeded && remaining > 0) {
        try {
          defaultBarrierWaitObject.wait(remaining);
        } catch (InterruptedException e) {
          // keep waiting, the interrupt status is restored in the finally block
          interrupted = true;
        }
        remaining = tl - (System.nanoTime() - startNanos) / 1_000_000L;
      }
    }

    // delete barrier znode in any case
    try {
      ZKBarrierManager.deleteWorkerZNodeFromDefault(client, rootPath, jobID, workerInfo.getWorkerID());
    } catch (Twister2Exception e) {
      LOG.log(Level.SEVERE, e.getMessage(), e);
    }

    if (defaultBarrierProceeded) {
      if (defaultBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
        return;
      } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
        throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
      } else if (defaultBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
        throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: " + timeLimit + "ms.");
      }
      // this should never happen, since we have only these three options
      return;
    } else {
      throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
    }
  } finally {
    // restored only after the znode cleanup, so that the cleanup is not disturbed by the flag
    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKWorkerController method waitOnInitBarrier.
/**
 * Wait on the init barrier until all workers arrive or the time limit is reached.
 * <p>
 * The worker creates its znode on the init barrier, waits for the barrier event,
 * and deletes its znode afterwards in any case.
 * The time limit is read from the config with ControllerContext.maxWaitTimeOnInitBarrier
 * and the worker waits locally for at most twice that limit.
 * If the barrier znode can not be created, the failure is logged and
 * the method returns without waiting.
 * If the waiting thread is interrupted, it keeps waiting and
 * its interrupt status is restored before this method exits.
 *
 * @throws TimeoutException if the barrier result is TIMED_OUT or
 *     the barrier event does not arrive within the local waiting period
 * @throws JobFaultyException if the barrier result is JOB_FAULTY
 */
public void waitOnInitBarrier() throws TimeoutException {
  initBarrierProceeded = false;
  long timeLimit = ControllerContext.maxWaitTimeOnInitBarrier(config);
  try {
    ZKBarrierManager.createWorkerZNodeAtInit(client, rootPath, jobID, workerInfo.getWorkerID(), timeLimit);
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return;
  }

  // local waiting period is twice the barrier time limit, guarded against overflow
  long tl = timeLimit > Long.MAX_VALUE / 2 ? Long.MAX_VALUE : timeLimit * 2;
  // monotonic clock: elapsed time never goes negative, so "remaining" can not overflow
  long startNanos = System.nanoTime();
  boolean interrupted = false;

  try {
    // wait until all workers joined or the time limit is reached
    synchronized (initBarrierWaitObject) {
      long remaining = tl;
      // re-check the flag after every wakeup: handles the event arriving before
      // we start waiting, spurious wakeups and interrupts without busy spinning
      while (!initBarrierProceeded && remaining > 0) {
        try {
          initBarrierWaitObject.wait(remaining);
        } catch (InterruptedException e) {
          // keep waiting, the interrupt status is restored in the finally block
          interrupted = true;
        }
        remaining = tl - (System.nanoTime() - startNanos) / 1_000_000L;
      }
    }

    // delete barrier znode in any case
    try {
      ZKBarrierManager.deleteWorkerZNodeFromInit(client, rootPath, jobID, workerInfo.getWorkerID());
    } catch (Twister2Exception e) {
      LOG.log(Level.SEVERE, e.getMessage(), e);
    }

    if (initBarrierProceeded) {
      if (initBarrierResult == JobMasterAPI.BarrierResult.SUCCESS) {
        return;
      } else if (initBarrierResult == JobMasterAPI.BarrierResult.JOB_FAULTY) {
        throw new JobFaultyException("Barrier broken since a fault occurred in the job.");
      } else if (initBarrierResult == JobMasterAPI.BarrierResult.TIMED_OUT) {
        throw new TimeoutException("Barrier timed out. Not all workers arrived on the time limit: " + timeLimit + "ms.");
      }
      // this should never happen, since we have only these three options
      return;
    } else {
      throw new TimeoutException("Barrier timed out on the worker. " + tl + "ms.");
    }
  } finally {
    // restored only after the znode cleanup, so that the cleanup is not disturbed by the flag
    if (interrupted) {
      Thread.currentThread().interrupt();
    }
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKMasterController method getWorkerWithState.
/**
 * Look up the WorkerWithState of the given worker.
 * <p>
 * The local children cache is consulted first; only when it has no data
 * for the worker znode, the state is fetched from the server.
 *
 * @param workerID id of the worker to look up
 * @return the WorkerWithState of the worker, or null if fetching it
 *     from the server fails (the failure is logged at SEVERE level)
 */
private WorkerWithState getWorkerWithState(int workerID) {
  final String znodePath = ZKUtils.workerPath(persDir, workerID);
  final ChildData cachedBody = persChildrenCache.getCurrentData(znodePath);

  // cache miss: fall back to the server
  if (cachedBody == null) {
    WorkerWithState fetched = null;
    try {
      fetched = ZKPersStateManager.getWorkerWithState(client, znodePath);
    } catch (Twister2Exception e) {
      LOG.log(Level.SEVERE, e.getMessage(), e);
    }
    return fetched;
  }

  return WorkerWithState.decode(cachedBody.getData());
}
Aggregations