use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.
the class JobManager method launchJob.
/**
 * Creates the in-progress ZooKeeper nodes for a job, asks the resource allocator for partition
 * assignments and, when assignments are returned, requests partitions from every OrbTracker member.
 *
 * <p>If no assignments are obtained (the allocator returned {@code null} or threw
 * {@code InvalidJobConfException}), the job is passed to {@code jobComplete}. An {@code OrbZKFailure}
 * raised anywhere in the sequence is printed, logged and re-published as an {@code OrbExceptionEvent};
 * it is not propagated to the caller.
 *
 * @param job
 *          the job to launch
 */
private void launchJob(OrbJob job) {
  try {
    final String jobNodePath = jobsInProgressPath + "/" + job.getJobNumber();
    final String messagesPath = jobNodePath + "/messages";

    ZookeeperUtils.notExistCreateNode(zk, jobNodePath);
    ZookeeperUtils.notExistCreateNode(zk, jobNodePath + "/OrbPartitionLeaderGroup");
    ZookeeperUtils.notExistCreateNode(zk, messagesPath);
    ZookeeperUtils.tryToCreateNode(zk, messagesPath + "/heartbeat", new LongWritable(0), CreateMode.PERSISTENT);

    // Ask the allocator how many partitions each tracker can host for this job.
    logger.info("checking for available OrbTracker resources");
    Map<M, Integer[]> assignments = null;
    try {
      assignments = resourceAllocator.assignResources(job.getOrbConf());
    } catch (InvalidJobConfException e) {
      // An invalid configuration leaves assignments null and is handled as "no capacity" below.
      logger.error(e.getMessage());
    }

    logger.info("Starting Job");
    logger.info("********** Job {} started: {}", job.getJobNumber(), new Date().getTime());

    if (assignments == null) {
      logger.error("not enough capacity for this job");
      jobComplete(job);
      return;
    }

    logger.info("Allocating partitions");
    int nextBasePartitionId = 0;
    for (M member : orbTrackerMembers) {
      logger.debug("OrbTracker - " + member.getHostname() + ":" + member.getPort());
      Integer[] memberAssignment = assignments.get(member);
      member.initProxy(getOrbConf());

      try {
        logger.debug("jobConf().getHDFSdistributedFiles(): {}", job.getOrbConf().getHDFSdistributedFiles());
        member.getRequiredFiles(job.getOrbConf());
      } catch (OrbZKFailure e) {
        logger.error("EXCEPTION : An OrbTrackerMember failed to copy files from HDFS to local machine");
        logger.error(e.getMessage());
        // Rethrown so the outer handler reports it and the remaining trackers are skipped.
        throw e;
      }

      PartitionRequest partitionRequest = new PartitionRequest();
      partitionRequest.setActivePartitions(memberAssignment[ResourceAllocator.TRACKER_AVAILABLE]);
      partitionRequest.setReservedPartitions(memberAssignment[ResourceAllocator.TRACKER_RESERVED]);
      partitionRequest.setJobID(job.getJobNumber());
      partitionRequest.setBasePartitionID(nextBasePartitionId);
      partitionRequest.setJobConf(job.getOrbConf());
      // The next tracker's partition IDs start after this tracker's active partitions.
      nextBasePartitionId += memberAssignment[ResourceAllocator.TRACKER_AVAILABLE];

      logger.debug("requesting partitions");
      member.requestPartitions(partitionRequest);
      logger.info(partitionRequest.toString());

      // NOTE(review): the monitoring below runs once per tracker, so a new JobStillActiveCheck thread is
      // started (and the job's interface replaced) on every iteration -- confirm this is intended rather
      // than something that should happen once per job.
      JobStillActiveCheck stillActiveCheck = new JobStillActiveCheck(job);
      job.setJobStillActiveInterface(stillActiveCheck);
      new Thread(stillActiveCheck).start();
      activeJobs.add(job.getJobNumber());
      checkForDeathComplete(job);
      heartbeat(job);
    }
  } catch (OrbZKFailure e) {
    e.printStackTrace();
    logger.error(e.getMessage());
    fireEvent(new OrbExceptionEvent(e));
  }
}
use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.
the class JobManager method getJobsInQueue.
/**
 * Reconciles the in-memory {@code jobs} table with the children of the job queue node in ZooKeeper and
 * then calls {@code tryToLaunchJob()}.
 *
 * <p>Jobs that are no longer listed under {@code jobQueuePath} are removed from {@code jobs} and
 * {@code activeJobs}; newly listed children whose node data can be read as an {@code OrbConfiguration}
 * are added to {@code jobs}.
 *
 * <p>If the children cannot be listed, an {@code OrbExceptionEvent} is fired and the method returns
 * without modifying {@code jobs} and without attempting a launch. (Previously this path continued with a
 * {@code null} child list and failed with a {@code NullPointerException}.)
 */
private void getJobsInQueue() {
  logger.info("getting jobs in queue.");
  synchronized (jobs) {
    List<String> jobQueueChildren;
    try {
      jobQueueChildren = zk.getChildren(jobQueuePath, jobsInQueueWatcher);
    } catch (KeeperException e) {
      fireEvent(new OrbExceptionEvent(e));
      // Without a child list there is nothing to reconcile against.
      return;
    } catch (InterruptedException e) {
      fireEvent(new OrbExceptionEvent(e));
      // Preserve the interrupt for whoever owns this thread.
      Thread.currentThread().interrupt();
      return;
    }

    // Collect first, remove afterwards, so the key set is not modified while it is being iterated.
    List<String> jobsToRemove = new ArrayList<String>();
    for (String jobPath : jobs.keySet()) {
      if (!jobQueueChildren.contains(jobPath)) {
        // Either the job completed or its node was removed by someone else.
        jobsToRemove.add(jobPath);
      }
    }
    for (String job : jobsToRemove) {
      logger.debug("Removing job: " + job);
      jobs.remove(job);
      activeJobs.remove(job);
    }

    for (String jobPath : jobQueueChildren) {
      try {
        OrbConfiguration jobConf = (OrbConfiguration) ZookeeperUtils.getNodeWritable(zk, jobQueuePath + "/" + jobPath, OrbConfiguration.class, orbConf);
        if (jobConf == null) {
          logger.debug("Job is not a valid job.");
        } else if (!jobs.containsKey(jobPath)) {
          logger.debug("Adding job: " + jobPath);
          jobs.put(jobPath, new OrbJob(jobPath, jobConf));
        }
      } catch (OrbZKFailure e) {
        // A failure on one node is reported but does not stop the remaining nodes from being read.
        fireEvent(new OrbExceptionEvent(e));
      }
    }
  }
  tryToLaunchJob();
}
use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.
the class JobManager method checkForDeathComplete.
/**
 * Restarts the job's death-and-complete watcher (creating it first if the job has none), lists the
 * children of the job's {@code messages} node with that watcher, and reacts to the children found:
 * {@code "death"} triggers {@code jobDeath} and {@code "complete"} triggers {@code jobComplete}.
 *
 * @param job
 *          the job whose {@code messages} node is inspected
 * @throws OrbZKFailure
 *           wrapping any {@code KeeperException} or {@code InterruptedException} raised while listing the
 *           children
 */
private void checkForDeathComplete(OrbJob job) throws OrbZKFailure {
  // Lazily attach a watcher the first time this job is checked.
  if (job.getDeathAndCompleteWatcher() == null) {
    job.setDeathAndCompleteWatcher(new DeathAndCompleteWatcher(job));
  }

  try {
    job.getDeathAndCompleteWatcher().restart();

    final String messagesPath = jobsInProgressPath + "/" + job.getJobNumber() + "/messages";
    final Watcher messagesWatcher = (Watcher) job.getDeathAndCompleteWatcher();
    final List<String> messageNodes = zk.getChildren(messagesPath, messagesWatcher);

    // Both markers are checked independently; death is handled before complete.
    if (messageNodes.contains("death")) {
      jobDeath(job);
    }
    if (messageNodes.contains("complete")) {
      jobComplete(job);
    }
  } catch (KeeperException zkError) {
    throw new OrbZKFailure(zkError);
  } catch (InterruptedException interrupted) {
    throw new OrbZKFailure(interrupted);
  }
}
use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.
the class OrbPartition method enterInitializationBarrier.
/**
 * Builds an {@code OrbFastBarrier} under the job's in-progress path and enters it, using this
 * partition's ID as the member name.
 *
 * <p>An {@code OrbZKFailure} from {@code enter()} is logged and its stack trace printed; it is not
 * propagated, so the caller is not told whether the barrier was actually entered.
 *
 * @param barrierName
 *          name of the barrier node, appended to the job-in-progress path
 * @param countToWaitFor
 *          the count handed to the barrier constructor (logged as the number of partitions expected to
 *          join)
 */
private void enterInitializationBarrier(String barrierName, int countToWaitFor) {
  LOG.debug("p{} creating barrier {}", getPartitionID(), barrierName);
  LOG.debug("{} will wait for {} partitions to join", barrierName, countToWaitFor);

  final String barrierPath = jobInProgressPath + "/" + barrierName;
  final String memberName = Integer.toString(getPartitionID());
  final Barrier initBarrier = new OrbFastBarrier(getOrbConf(), barrierPath, countToWaitFor, memberName, zk);

  try {
    initBarrier.enter();
    LOG.debug("p{} entered {}", getPartitionID(), barrierName);
  } catch (OrbZKFailure failure) {
    final String failureFormat = "p{} failed to complete barrier {}: " + failure.getMessage();
    LOG.error(failureFormat, getPartitionID(), barrierName);
    failure.printStackTrace();
  }
}
use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.
the class OrbPartition method enterAllDoneBarrier.
/**
 * Builds an {@code OrbFastAllDoneBarrier} under the job's in-progress path, sized to the current number
 * of leader-group members, and enters it with this partition's done flag.
 *
 * <p>An {@code OrbZKFailure} from {@code enter} is logged and its stack trace printed; it is not
 * propagated, and the method returns {@code false} in that case.
 *
 * @param barrierName
 *          name of the barrier node, appended to the job-in-progress path
 * @param iAmDone
 *          the done flag passed to the barrier for this partition
 * @return the value returned by the barrier's {@code enter(iAmDone)}, or {@code false} if entering the
 *         barrier failed
 */
private boolean enterAllDoneBarrier(String barrierName, boolean iAmDone) {
  LOG.debug("p{} creating barrier {}", getPartitionID(), barrierName);
  AllDoneBarrier barrier = null;
  // The member count is read under the leaderGroupMembers lock, as in the original code.
  synchronized (leaderGroupMembers) {
    barrier = new OrbFastAllDoneBarrier(getOrbConf(), jobInProgressPath + "/" + barrierName, leaderGroupMembers.size(), Integer.toString(getPartitionID()), zk);
  }
  try {
    boolean entered = barrier.enter(iAmDone);
    LOG.debug("p{} entered {}", getPartitionID(), barrierName);
    return entered;
  } catch (OrbZKFailure e) {
    // Fully parameterized so the exception text is separated from the barrier name
    // (the old format appended it directly after the name with no delimiter).
    LOG.error("p{} failed to complete barrier {}: {}", new Object[] {getPartitionID(), barrierName, e.getMessage()});
    e.printStackTrace();
  }
  return false;
}
Aggregations