use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class JobZNodeManager method checkJstZNodeWaitIfNeeded.
/**
 * Waits until the job submission time (jst) znode of the job exists at the ZK server
 * and holds the expected submission time.
 * <p>
 * Job master creates the jst znode under the job znode as the last action
 * when creating job related znodes at the ZK server.
 * Workers wait for the job master to create this znode and
 * proceed after seeing that it is created.
 * <p>
 * The jst znode may exist from previous runs in the case of restarting from a checkpoint.
 * Because of this, existence alone is not sufficient: the value stored in the znode
 * is compared with {@code jsTime}.
 * <p>
 * The znode is checked every 300 ms for at most 100 seconds.
 * An interruption during the wait does not abort the polling; it is logged and
 * the interrupt status of the thread is restored when this method exits,
 * so that callers can still observe it.
 *
 * @param client Curator client used to query the ZK server
 * @param rootPath root path handed to {@code ZKUtils.jobSubmisionTimePath}
 * @param jobID job ID handed to {@code ZKUtils.jobSubmisionTimePath}
 * @param jsTime job submission time the znode value has to match
 * @return {@code true} when the znode holds {@code jsTime}; this method never returns false
 * @throws Twister2Exception if the znode can not be queried or read,
 * or if no matching value is seen within the time limit
 */
public static boolean checkJstZNodeWaitIfNeeded(CuratorFramework client, String rootPath, String jobID, long jsTime) throws Twister2Exception {
  String jstPath = ZKUtils.jobSubmisionTimePath(rootPath, jobID);
  // 100 seconds
  long timeLimit = 100000;
  long sleepInterval = 300;
  long duration = 0;
  long startTime = System.currentTimeMillis();
  // log interval in milliseconds
  long logInterval = 3000;
  long nextLogTime = logInterval;
  int checkCount = 1;
  // Thread.sleep clears the interrupt status when it throws;
  // remember the interruption so that it can be restored on exit
  boolean interrupted = false;
  try {
    while (duration < timeLimit) {
      try {
        if (client.checkExists().forPath(jstPath) != null) {
          byte[] jstBytes = client.getData().forPath(jstPath);
          long jstAtZK = Longs.fromByteArray(jstBytes);
          // a znode left over from a previous run holds a different submission time
          if (jstAtZK == jsTime) {
            LOG.info("matched job submission times. Proceeding. checkCount: " + checkCount);
            return true;
          }
        }
      } catch (Exception e) {
        throw new Twister2Exception("Can not get job submission znode data.", e);
      }
      try {
        Thread.sleep(sleepInterval);
      } catch (InterruptedException e) {
        LOG.warning("Sleeping thread interrupted.");
        interrupted = true;
      }
      duration = System.currentTimeMillis() - startTime;
      checkCount++;
      if (duration > nextLogTime) {
        LOG.info("Still waiting for job submission time znode to be created: " + jstPath);
        nextLogTime += logInterval;
      }
    }
    throw new Twister2Exception("Job Submission Time znode is not created by job master " + "on the time limit: " + timeLimit + " ms");
  } finally {
    if (interrupted) {
      // hand the interruption back to the caller instead of silently dropping it
      Thread.currentThread().interrupt();
    }
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKBarrierManager method createWorkerZNode.
/**
 * Creates the znode of a worker at the barrier directory.
 * <p>
 * The znode is created in PERSISTENT mode, missing parent znodes are created as needed,
 * and the given timeout is stored in the znode as a byte array.
 *
 * @param client Curator client used to create the znode
 * @param workerPath path of the worker znode to create
 * @param timeout timeout value written into the znode
 * @throws Twister2Exception if the znode can not be created
 */
public static void createWorkerZNode(CuratorFramework client, String workerPath, long timeout) throws Twister2Exception {
  try {
    byte[] timeoutBytes = Longs.toByteArray(timeout);
    client.create()
        .creatingParentsIfNeeded()
        .withMode(CreateMode.PERSISTENT)
        .forPath(workerPath, timeoutBytes);
    LOG.info("Worker Barrier Znode created: " + workerPath);
  } catch (Exception cause) {
    throw new Twister2Exception("Worker Barrier Znode can not be created for the path: " + workerPath, cause);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKBarrierManager method deleteWorkerZNode.
/**
 * Deletes the znode of a worker at a barrier directory.
 *
 * @param client Curator client used to delete the znode
 * @param workerPath path of the worker znode to delete
 * @throws Twister2Exception if the znode can not be deleted
 */
public static void deleteWorkerZNode(CuratorFramework client, String workerPath) throws Twister2Exception {
  try {
    client.delete()
        .forPath(workerPath);
    LOG.info("Worker Barrier Znode deleted: " + workerPath);
  } catch (Exception cause) {
    // any failure is reported to the caller with the original cause attached
    throw new Twister2Exception("Worker Barrier Znode can not be deleted for the path: " + workerPath, cause);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class ZKEphemStateManager method removeEphemZNode.
/**
 * Removes the ephemeral znode of a worker that is left over from a previous run, if there is any.
 * <p>
 * All children of the ephemeral directory of the job are examined. Each child whose
 * worker ID equals {@code workerID} is first overwritten with DELETE_TAG and then deleted.
 * NOTE(review): the tag is presumably there so that watchers can tell this deliberate
 * removal from a worker failure - confirm against the watcher code.
 *
 * @param client Curator client used to access the ZK server
 * @param rootPath root path handed to {@code ZKUtils.ephemDir}
 * @param jobID job ID handed to {@code ZKUtils.ephemDir}
 * @param workerID ID of the worker whose ephemeral znode is removed
 * @throws Twister2Exception if listing, tagging or deleting the znodes fails
 */
public static void removeEphemZNode(CuratorFramework client, String rootPath, String jobID, int workerID) throws Twister2Exception {
  String ephemDirPath = ZKUtils.ephemDir(rootPath, jobID);
  try {
    for (String znodeName : client.getChildren().forPath(ephemDirPath)) {
      // skip znodes that belong to other workers
      if (ZKUtils.getWorkerIDFromEphemPath(znodeName) != workerID) {
        continue;
      }
      String znodePath = ephemDirPath + "/" + znodeName;
      // tag the znode before deleting it
      client.setData().forPath(znodePath, DELETE_TAG.getBytes(StandardCharsets.UTF_8));
      client.delete().forPath(znodePath);
      LOG.info("EphemeralWorkerZnode deleted from previous run: " + znodePath);
    }
  } catch (Exception cause) {
    throw new Twister2Exception("Can not remove ephemeral worker znode.", cause);
  }
}
use of edu.iu.dsc.tws.api.exceptions.Twister2Exception in project twister2 by DSC-SPIDAL.
the class KubernetesLauncher method startJobMasterOnClient.
/**
 * Starts the JobMaster locally on the submitting client.
 * <p>
 * This is a blocking call; it finishes after the job has completed.
 * <p>
 * When the configured dashboard host is a Kubernetes service name
 * (ends with "svc.cluster.local"), it is replaced in the config by an HTTP URL
 * built from the IP address of the service. If the IP address can not be determined,
 * the dashboard host is set to null in the config.
 *
 * @param job the job for which the JobMaster is started
 * @return true if the blocking JobMaster call returned without a Twister2Exception,
 * false if it threw one
 */
private boolean startJobMasterOnClient(JobAPI.Job job) {
  // the dashboard host may be given as a service name rather than an address
  String dashAddress = JobMasterContext.dashboardHost(config);

  if (dashAddress.endsWith("svc.cluster.local")) {
    // resolve the service name to an IP address
    String dashIP = getDashboardIP(dashAddress);
    String dashURL;
    if (dashIP == null) {
      LOG.warning("Could not get Dashboard server IP address from dashboard service name: " + dashAddress + " will not connect to Dashboard. *****");
      dashURL = null;
    } else {
      dashURL = "http://" + dashIP;
    }
    config = JobMasterContext.updateDashboardHost(config, dashURL);
    LOG.info("Dashboard server HTTP URL: " + dashURL);
  }

  String jobMasterIP = RequestObjectBuilder.getJobMasterIP();
  JobMasterAPI.NodeInfo nodeInfo = NodeInfoUtils.createNodeInfo(jobMasterIP, null, null);
  K8sScaler scaler = new K8sScaler(config, job, controller);
  JobTerminator terminator = new JobTerminator(config, controller);
  JobMaster jobMaster = new JobMaster(
      config, jobMasterIP, terminator, job, nodeInfo, scaler, JobMasterAPI.JobMasterState.JM_STARTED);
  jobMaster.addShutdownHook(true);

  try {
    // blocks until the JobMaster finishes
    jobMaster.startJobMasterBlocking();
    return true;
  } catch (Twister2Exception e) {
    LOG.log(Level.SEVERE, e.getMessage(), e);
    return false;
  }
}
Aggregations