use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKBarrierManager method getNumberOfWorkersAtBarrier.
/**
 * Count the znodes that currently exist under the default barrier directory of a job.
 *
 * @param client Curator client used to query the children
 * @param rootPath root path handed to {@code ZKUtils.defaultBarrierDir}
 * @param jobID job ID handed to {@code ZKUtils.defaultBarrierDir}
 * @return number of children found under the barrier directory
 * @throws Twister2Exception if the children can not be listed; the original exception is the cause
 */
public static int getNumberOfWorkersAtBarrier(CuratorFramework client, String rootPath, String jobID) throws Twister2Exception {
  final String barrierPath = ZKUtils.defaultBarrierDir(rootPath, jobID);

  try {
    final int waitingWorkers = client.getChildren().forPath(barrierPath).size();
    LOG.info("Number of workers at the barrier: " + waitingWorkers);
    return waitingWorkers;
  } catch (Exception cause) {
    // any failure is reported to the caller as a Twister2Exception
    throw new Twister2Exception("Could not get children of barrier directory: " + barrierPath, cause);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKBarrierManager method removeScaledDownZNodes.
/**
 * When a job is scaled down, the barrier znodes of the killed workers must be deleted.
 * Worker IDs in the range [minID, maxID) are processed in increasing order.
 * A znode that does not exist is skipped silently.
 *
 * @param client Curator client used for the existence check and the delete
 * @param barrierDir barrier directory handed to {@code ZKUtils.workerPath}
 * @param minID first worker ID to remove, inclusive
 * @param maxID end of the worker ID range, exclusive
 * @throws Twister2Exception on the first znode that can not be checked or deleted;
 *     znodes of the remaining worker IDs are not processed in that case
 */
public static void removeScaledDownZNodes(CuratorFramework client, String barrierDir, int minID, int maxID) throws Twister2Exception {
  for (int id = minID; id < maxID; id++) {
    final String znodePath = ZKUtils.workerPath(barrierDir, id);
    try {
      // NOTE(review): check and delete are two separate calls; if the znode disappears
      // in between, the delete presumably fails and is reported as an error - confirm
      if (client.checkExists().forPath(znodePath) == null) {
        continue;
      }
      client.delete().forPath(znodePath);
      LOG.info("Worker Barrier Znode deleted: " + znodePath);
    } catch (Exception cause) {
      throw new Twister2Exception("Worker Barrier Znode cannot be deleted: " + znodePath, cause);
    }
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKEventsManager method getAllEvents.
/**
 * Return all registered events of a job.
 * Each child znode of the events directory is one event: the child name is parsed as
 * the integer event index and the znode body is decoded with {@code decodeJobEvent}.
 *
 * @param client Curator client used to list the children and read their data
 * @param rootPath root path handed to {@code ZKUtils.eventsDir}
 * @param jobID job ID handed to {@code ZKUtils.eventsDir}
 * @return events keyed by event index, sorted in descending index order
 * @throws Twister2Exception if listing, reading, parsing the index or decoding fails;
 *     the original exception is the cause
 */
public static TreeMap<Integer, JobMasterAPI.JobEvent> getAllEvents(CuratorFramework client, String rootPath, String jobID) throws Twister2Exception {
  final String eventsDir = ZKUtils.eventsDir(rootPath, jobID);

  try {
    // reverse natural ordering: the highest event index comes first
    final TreeMap<Integer, JobMasterAPI.JobEvent> eventsByIndex = new TreeMap<>(Collections.reverseOrder());

    for (String znodeName : client.getChildren().forPath(eventsDir)) {
      final int index = Integer.parseInt(znodeName);
      final byte[] body = client.getData().forPath(eventsDir + "/" + znodeName);
      eventsByIndex.put(index, decodeJobEvent(body));
    }

    return eventsByIndex;
  } catch (Exception cause) {
    throw new Twister2Exception("Could not get event znode data: " + eventsDir, cause);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class JobMaster method init.
/**
 * Initialize the Job Master.
 * The steps are executed in this order:
 * <ol>
 *   <li>create the looper and, if a Dashboard client exists, register the job with it;
 *       the Dashboard client is dropped when the registration fails</li>
 *   <li>create the request/response server</li>
 *   <li>initialize the Driver, then the WorkerMonitor and the worker handler</li>
 *   <li>initialize the BarrierMonitor with a ZooKeeper based or a Job Master based
 *       barrier responder, depending on whether a ZooKeeper server is used</li>
 *   <li>initialize the ZooKeeper master controller</li>
 *   <li>initialize the checkpoint manager if checkpointing is enabled</li>
 *   <li>start the server and call {@code looper.loop()}</li>
 * </ol>
 *
 * @throws Twister2Exception propagated unchanged from {@code initZKMasterController}
 */
private void init() throws Twister2Exception {
  looper = new Progress();

  // if Dashboard is used, register this job with it
  if (dashClient != null) {
    boolean registered = dashClient.registerJob(job, nodeInfo);
    if (!registered) {
      LOG.warning("Not using Dashboard since it can not register with it.");
      // continue without Dashboard; later null checks treat it as not used
      dashClient = null;
    }
  }

  ServerConnectHandler connectHandler = new ServerConnectHandler();
  int backLog = Math.min(job.getNumberOfWorkers() / 2, MAX_BACK_LOG);
  rrServer = new RRServer(config, jmAddress, masterPort, looper, JOB_MASTER_ID, connectHandler, backLog);

  // init Driver if it exists
  // this has to be done before WorkerMonitor initialization
  initDriver();

  JobFailureWatcher jobFailureWatcher = new JobFailureWatcher();
  workerMonitor = new WorkerMonitor(this, rrServer, dashClient, zkJobUpdater, job, driver, jobFailureWatcher);

  // read the setting once; it selects the worker event sender and the barrier responder
  boolean zkUsed = ZKContext.isZooKeeperServerUsed(config);

  workerHandler = new JMWorkerHandler(workerMonitor, rrServer, zkUsed);
  if (!zkUsed) {
    workerMonitor.setWorkerEventSender(workerHandler);
  }

  // initialize BarrierMonitor
  barrierMonitor = new BarrierMonitor(workerMonitor, jobFailureWatcher);
  if (zkUsed) {
    zkBarrierHandler = new ZKBarrierHandler(barrierMonitor, config, job.getJobId(), job.getNumberOfWorkers());
    barrierMonitor.setBarrierResponder(zkBarrierHandler);
    zkBarrierHandler.initialize(initialState);
  } else {
    JMBarrierHandler jmBarrierHandler = new JMBarrierHandler(rrServer, barrierMonitor);
    barrierMonitor.setBarrierResponder(jmBarrierHandler);
  }
  jobFailureWatcher.addJobFaultListener(barrierMonitor);

  // if ZooKeeper server is used for this job, initialize that
  // a Twister2Exception thrown here propagates to the caller as is
  initZKMasterController(workerMonitor);

  // initialize checkpoint manager
  if (CheckpointingContext.isCheckpointingEnabled(config)) {
    StateStore stateStore = CheckpointUtils.getStateStore(config);
    stateStore.init(config, "checkpoint-manager");
    this.checkpointManager = new CheckpointManager(this.rrServer, stateStore, job.getJobId());
    jobFailureWatcher.addJobFaultListener(this.checkpointManager);
    LOG.info("Checkpoint manager initialized");
    this.checkpointManager.init();
  }
  // done initializing checkpoint manager

  rrServer.start();
  looper.loop();
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKJobUpdater method removeInitialStateZNodes.
/**
 * Remove InitialState worker znodes after scaling down.
 * Both the persistent state znodes and the barrier znodes of the given worker ID range
 * are removed, in that order. If removing the persistent state znodes fails,
 * the barrier znodes are not attempted.
 *
 * @param minWorkerID lower bound of the worker ID range, passed on to both managers
 * @param maxWorkerID upper bound of the worker ID range, passed on to both managers
 * @return true if ZooKeeper is not used for this job or both removals succeeded,
 *     false if a removal failed; the failure is logged at SEVERE level
 */
public boolean removeInitialStateZNodes(int minWorkerID, int maxWorkerID) {
  // without a ZooKeeper server there are no znodes, so there is nothing to remove
  if (!ZKContext.isZooKeeperServerUsed(config)) {
    return true;
  }

  final CuratorFramework zkClient = ZKUtils.connectToServer(ZKContext.serverAddresses(config));
  final String zkRoot = ZKContext.rootNode(config);

  boolean removed;
  try {
    ZKPersStateManager.removeScaledDownZNodes(zkClient, zkRoot, jobID, minWorkerID, maxWorkerID);
    ZKBarrierManager.removeScaledDownZNodes(zkClient, zkRoot, jobID, minWorkerID, maxWorkerID);
    removed = true;
  } catch (Twister2Exception failure) {
    LOG.log(Level.SEVERE, failure.getMessage(), failure);
    removed = false;
  }
  return removed;
}
Aggregations