Search in sources :

Example 1 with OrbZKFailure

use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.

the class JobManager method launchJob.

/**
 * Launches {@code job}: creates its bookkeeping nodes in ZooKeeper, asks the resource
 * allocator for partition assignments and, when assignments are returned, sends a
 * {@link PartitionRequest} to each OrbTracker member.
 *
 * <p>If the allocator returns no assignments (including when it rejects the job
 * configuration), the job is passed to {@code jobComplete(job)} instead of being started.
 * Any {@link OrbZKFailure} raised along the way is logged and published as an
 * {@link OrbExceptionEvent}; it is not rethrown.
 *
 * @param job the job to launch
 */
private void launchJob(OrbJob job) {
    // Every node created for this job lives under <jobsInProgressPath>/<jobNumber>.
    String jobPath = jobsInProgressPath + "/" + job.getJobNumber();
    try {
        ZookeeperUtils.notExistCreateNode(zk, jobPath);
        ZookeeperUtils.notExistCreateNode(zk, jobPath + "/OrbPartitionLeaderGroup");
        ZookeeperUtils.notExistCreateNode(zk, jobPath + "/messages");
        ZookeeperUtils.tryToCreateNode(zk, jobPath + "/messages/heartbeat", new LongWritable(0), CreateMode.PERSISTENT);
        // allocate resources and if enough, start the job
        logger.info("checking for available OrbTracker resources");
        Map<M, Integer[]> assignments = null;
        try {
            assignments = resourceAllocator.assignResources(job.getOrbConf());
        } catch (InvalidJobConfException e) {
            // Leave assignments null so the job goes down the "not enough capacity" path below.
            logger.error(e.getMessage(), e);
        }
        if (assignments != null) {
            // Only announce the start once we know the job can actually be started.
            logger.info("Starting Job");
            logger.info("********** Job {} started: {}", job.getJobNumber(), new Date().getTime());
            logger.info("Allocating partitions");
            int basePartitionID = 0;
            for (M tracker : orbTrackerMembers) {
                logger.debug("OrbTracker - {}:{}", tracker.getHostname(), tracker.getPort());
                Integer[] assignment = assignments.get(tracker);
                if (assignment == null) {
                    // Without an assignment there is nothing to request from this tracker;
                    // dereferencing it below would throw a NullPointerException.
                    logger.warn("No partition assignment for OrbTracker {}:{}; skipping it", tracker.getHostname(), tracker.getPort());
                    continue;
                }
                tracker.initProxy(getOrbConf());
                try {
                    logger.debug("jobConf().getHDFSdistributedFiles(): {}", job.getOrbConf().getHDFSdistributedFiles());
                    tracker.getRequiredFiles(job.getOrbConf());
                } catch (OrbZKFailure e) {
                    logger.error("EXCEPTION : An OrbTrackerMember failed to copy files from HDFS to local machine");
                    logger.error(e.getMessage());
                    // Abort the launch; the outer handler logs the stack trace and fires the event.
                    throw e;
                }
                PartitionRequest request = new PartitionRequest();
                request.setActivePartitions(assignment[ResourceAllocator.TRACKER_AVAILABLE]);
                request.setReservedPartitions(assignment[ResourceAllocator.TRACKER_RESERVED]);
                request.setJobID(job.getJobNumber());
                request.setBasePartitionID(basePartitionID);
                request.setJobConf(job.getOrbConf());
                // The next tracker's partition IDs start after this tracker's active partitions.
                basePartitionID += assignment[ResourceAllocator.TRACKER_AVAILABLE];
                logger.debug("requesting partitions");
                tracker.requestPartitions(request);
                logger.info(request.toString());
                // NOTE(review): the monitoring set-up below runs once per tracker, so a new
                // JobStillActiveCheck thread is started (and the previous one replaced on the
                // job) on every iteration. It looks like it may have been meant to run once
                // per job after the loop - confirm before moving it.
                JobStillActiveCheck jobStillActiveCheck = new JobStillActiveCheck(job);
                job.setJobStillActiveInterface(jobStillActiveCheck);
                new Thread(jobStillActiveCheck).start();
                activeJobs.add(job.getJobNumber());
                checkForDeathComplete(job);
                heartbeat(job);
            }
        } else {
            logger.error("not enough capacity for this job");
            jobComplete(job);
        }
    } catch (OrbZKFailure e) {
        // Route the stack trace through the logger instead of printing it to stderr.
        logger.error(e.getMessage(), e);
        fireEvent(new OrbExceptionEvent(e));
    }
}
Also used : OrbExceptionEvent(org.goldenorb.event.OrbExceptionEvent) PartitionRequest(org.goldenorb.jet.PartitionRequest) InvalidJobConfException(org.apache.hadoop.mapred.InvalidJobConfException) Date(java.util.Date) OrbZKFailure(org.goldenorb.zookeeper.OrbZKFailure) LongWritable(org.apache.hadoop.io.LongWritable)

Example 2 with OrbZKFailure

use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.

the class JobManager method getJobsInQueue.

/**
 * Synchronizes the in-memory {@code jobs} map with the children of the job queue node in
 * ZooKeeper and then calls {@code tryToLaunchJob()}.
 *
 * <p>Jobs that are no longer listed under {@code jobQueuePath} are removed from {@code jobs}
 * and {@code activeJobs}; newly listed jobs whose configuration can be read are added to
 * {@code jobs}. The watcher {@code jobsInQueueWatcher} is passed to the children lookup.
 *
 * <p>If the children cannot be read, an {@link OrbExceptionEvent} is fired and the method
 * returns without modifying {@code jobs} and without attempting a launch.
 */
private void getJobsInQueue() {
    logger.info("getting jobs in queue.");
    synchronized (jobs) {
        List<String> jobQueueChildren;
        try {
            jobQueueChildren = zk.getChildren(jobQueuePath, jobsInQueueWatcher);
        } catch (KeeperException e) {
            // Without the child list there is nothing to reconcile against; continuing
            // would dereference null below.
            fireEvent(new OrbExceptionEvent(e));
            return;
        } catch (InterruptedException e) {
            // Restore the interrupt status so code further up the stack can react to it.
            Thread.currentThread().interrupt();
            fireEvent(new OrbExceptionEvent(e));
            return;
        }
        // Collect first, remove afterwards: removing while iterating over keySet() would
        // invalidate the iterator.
        List<String> jobsToRemove = new ArrayList<String>();
        for (String jobPath : jobs.keySet()) {
            if (!jobQueueChildren.contains(jobPath)) {
                // Either a job has completed or been removed by someone else this should fire an event.
                // This should really not occur since it should only be removed by the JobManager itself.
                // In reality does an event really even need to be thrown?
                jobsToRemove.add(jobPath);
            }
        }
        for (String job : jobsToRemove) {
            logger.debug("Removing job: " + job);
            jobs.remove(job);
            activeJobs.remove(job);
        }
        for (String jobPath : jobQueueChildren) {
            try {
                OrbConfiguration jobConf = (OrbConfiguration) ZookeeperUtils.getNodeWritable(zk, jobQueuePath + "/" + jobPath, OrbConfiguration.class, orbConf);
                if (jobConf != null) {
                    if (!jobs.containsKey(jobPath)) {
                        // Here we have a new job--once again an event should be fired.
                        // Although I am not sure that an event really needs to be fired at this point. We will see.
                        logger.debug("Adding job: " + jobPath);
                        jobs.put(jobPath, new OrbJob(jobPath, jobConf));
                    }
                } else {
                    logger.debug("Job is not a valid job.");
                }
            } catch (OrbZKFailure e) {
                // A single unreadable job must not stop the remaining jobs from being picked up.
                fireEvent(new OrbExceptionEvent(e));
            }
        }
    }
    tryToLaunchJob();
}
Also used : OrbExceptionEvent(org.goldenorb.event.OrbExceptionEvent) OrbConfiguration(org.goldenorb.conf.OrbConfiguration) ArrayList(java.util.ArrayList) OrbZKFailure(org.goldenorb.zookeeper.OrbZKFailure) KeeperException(org.apache.zookeeper.KeeperException)

Example 3 with OrbZKFailure

use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.

the class JobManager method checkForDeathComplete.

/**
 * Restarts the job's death/complete watcher, reads the children of the job's
 * {@code messages} node with that watcher attached, and reacts to what it finds:
 * a {@code "death"} child triggers {@code jobDeath(job)} and a {@code "complete"} child
 * triggers {@code jobComplete(job)}. The two checks are independent, so both handlers run
 * if both children are present.
 *
 * <p>A {@code DeathAndCompleteWatcher} is created and attached to the job on first use.
 *
 * @param job the job whose messages node is inspected
 * @throws OrbZKFailure if the ZooKeeper lookup fails or the calling thread is interrupted
 */
private void checkForDeathComplete(OrbJob job) throws OrbZKFailure {
    if (job.getDeathAndCompleteWatcher() == null) {
        job.setDeathAndCompleteWatcher(new DeathAndCompleteWatcher(job));
    }
    String messagesPath = jobsInProgressPath + "/" + job.getJobNumber() + "/messages";
    try {
        job.getDeathAndCompleteWatcher().restart();
        List<String> messages = zk.getChildren(messagesPath, (Watcher) job.getDeathAndCompleteWatcher());
        if (messages.contains("death")) {
            jobDeath(job);
        }
        if (messages.contains("complete")) {
            jobComplete(job);
        }
    } catch (KeeperException e) {
        throw new OrbZKFailure(e);
    } catch (InterruptedException e) {
        // Restore the interrupt status before translating the exception, otherwise the
        // interruption is invisible to everything above this call.
        Thread.currentThread().interrupt();
        throw new OrbZKFailure(e);
    }
}
Also used : OrbZKFailure(org.goldenorb.zookeeper.OrbZKFailure) KeeperException(org.apache.zookeeper.KeeperException)

Example 4 with OrbZKFailure

use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.

the class OrbPartition method enterInitializationBarrier.

/**
 * Creates an {@link OrbFastBarrier} named {@code barrierName} under the job-in-progress
 * path and enters it on behalf of this partition.
 *
 * <p>A failure to enter the barrier is logged but not propagated, so the caller cannot
 * distinguish a completed barrier from a failed one.
 *
 * @param barrierName    name of the barrier node, appended to {@code jobInProgressPath}
 * @param countToWaitFor member count handed to the barrier; logged as the number of
 *                       partitions the barrier will wait for
 */
private void enterInitializationBarrier(String barrierName, int countToWaitFor) {
    LOG.debug("p{} creating barrier {}", getPartitionID(), barrierName);
    LOG.debug("{} will wait for {} partitions to join", barrierName, countToWaitFor);
    Barrier barrier = new OrbFastBarrier(getOrbConf(), jobInProgressPath + "/" + barrierName, countToWaitFor, Integer.toString(getPartitionID()), zk);
    try {
        barrier.enter();
        LOG.debug("p{} entered {}", getPartitionID(), barrierName);
    } catch (OrbZKFailure e) {
        // Build the message eagerly and pass the exception as the throwable argument: the
        // exception text must not be part of a format string (it may contain "{}"), and the
        // stack trace belongs in the log rather than on stderr.
        LOG.error("p" + getPartitionID() + " failed to complete barrier " + barrierName + ": " + e.getMessage(), e);
    }
}
Also used : OrbFastBarrier(org.goldenorb.zookeeper.OrbFastBarrier) OrbZKFailure(org.goldenorb.zookeeper.OrbZKFailure) OrbFastAllDoneBarrier(org.goldenorb.zookeeper.OrbFastAllDoneBarrier) AllDoneBarrier(org.goldenorb.zookeeper.AllDoneBarrier) OrbFastBarrier(org.goldenorb.zookeeper.OrbFastBarrier) Barrier(org.goldenorb.zookeeper.Barrier)

Example 5 with OrbZKFailure

use of org.goldenorb.zookeeper.OrbZKFailure in project goldenorb by jzachr.

the class OrbPartition method enterAllDoneBarrier.

/**
 * Creates an {@link OrbFastAllDoneBarrier} named {@code barrierName} under the
 * job-in-progress path, sized to the current number of leader group members, and enters it.
 *
 * @param barrierName name of the barrier node, appended to {@code jobInProgressPath}
 * @param iAmDone     this partition's "done" flag, passed through to the barrier
 * @return the value returned by the barrier's {@code enter(iAmDone)}, or {@code false} if
 *         entering the barrier failed with an {@link OrbZKFailure}
 */
private boolean enterAllDoneBarrier(String barrierName, boolean iAmDone) {
    LOG.debug("p{} creating barrier {}", getPartitionID(), barrierName);
    AllDoneBarrier barrier = null;
    // Read the member count under the same lock that guards leaderGroupMembers here.
    synchronized (leaderGroupMembers) {
        barrier = new OrbFastAllDoneBarrier(getOrbConf(), jobInProgressPath + "/" + barrierName, leaderGroupMembers.size(), Integer.toString(getPartitionID()), zk);
    }
    try {
        boolean entered = barrier.enter(iAmDone);
        LOG.debug("p{} entered {}", getPartitionID(), barrierName);
        return entered;
    } catch (OrbZKFailure e) {
        // The previous format string appended the exception text directly after the last
        // placeholder, gluing it to the barrier name. Build the message explicitly and hand
        // the exception to the logger so the stack trace is logged instead of printed.
        LOG.error("p" + getPartitionID() + " failed to complete barrier " + barrierName + ": " + e.getMessage(), e);
    }
    return false;
}
Also used : OrbFastAllDoneBarrier(org.goldenorb.zookeeper.OrbFastAllDoneBarrier) OrbZKFailure(org.goldenorb.zookeeper.OrbZKFailure) OrbFastAllDoneBarrier(org.goldenorb.zookeeper.OrbFastAllDoneBarrier) AllDoneBarrier(org.goldenorb.zookeeper.AllDoneBarrier)

Aggregations

OrbZKFailure (org.goldenorb.zookeeper.OrbZKFailure)8 AllDoneBarrier (org.goldenorb.zookeeper.AllDoneBarrier)3 OrbFastAllDoneBarrier (org.goldenorb.zookeeper.OrbFastAllDoneBarrier)3 IOException (java.io.IOException)2 KeeperException (org.apache.zookeeper.KeeperException)2 OrbConfiguration (org.goldenorb.conf.OrbConfiguration)2 OrbExceptionEvent (org.goldenorb.event.OrbExceptionEvent)2 Barrier (org.goldenorb.zookeeper.Barrier)2 OrbFastBarrier (org.goldenorb.zookeeper.OrbFastBarrier)2 UnknownHostException (java.net.UnknownHostException)1 ArrayList (java.util.ArrayList)1 Date (java.util.Date)1 LongWritable (org.apache.hadoop.io.LongWritable)1 InvalidJobConfException (org.apache.hadoop.mapred.InvalidJobConfException)1 NodeDoesNotExistException (org.goldenorb.client.NodeDoesNotExistException)1 OrbTrackerMemberData (org.goldenorb.client.OrbTrackerMemberData)1 WatcherException (org.goldenorb.client.WatcherException)1 ZooKeeperConnectionException (org.goldenorb.client.ZooKeeperConnectionException)1 PartitionRequest (org.goldenorb.jet.PartitionRequest)1