Search in sources:

Example 1 with NamedThreadFactory

Use of org.objectweb.proactive.utils.NamedThreadFactory in project scheduling by ow2-proactive.

In the class NodesRecoveryManager, method recoverNodes.

/**
 * Recovers the nodes of the given node source from the database.
 * <p>
 * Every {@link RMNodeData} stored for the node source is handed to
 * {@code recoverNode} on a dedicated bounded thread pool. Each recovered
 * (non-null) node is announced with a {@code NODE_ADDED} event, and the nodes
 * accepted by {@code isEligible} are passed to the core in one batch at the end.
 * <p>
 * If waiting for any recovery task fails, the error is logged and the method
 * returns immediately: the eligible nodes collected so far are NOT handed to
 * the core and no summary is logged. The thread pool is shut down on every
 * exit path.
 *
 * @param nodeSource the node source whose nodes must be recovered
 */
public void recoverNodes(NodeSource nodeSource) {
    // this log line is important for performance tests
    logger.info(START_TO_RECOVER_NODES);
    String nodeSourceName = nodeSource.getName();
    Collection<RMNodeData> nodesData = this.rmCore.getDbManager().getNodesByNodeSource(nodeSourceName);
    logger.info("Number of nodes found in database for node source " + nodeSourceName + ": " + nodesData.size());
    List<RMNode> recoveredEligibleNodes = Collections.synchronizedList(new ArrayList<>());
    // NOTE(review): this plain HashMap is shared with the tasks running on the
    // pool threads; confirm that recoverNode synchronizes its updates to it.
    Map<NodeState, Integer> recoveredNodeStatesCounter = new HashMap<>();
    // for each node found in database, try to lookup node or recover it
    // as down node
    ExecutorService nodeRecoveryThreadPool = PAExecutors.newCachedBoundedThreadPool(1, PAResourceManagerProperties.RM_NODESOURCE_MAX_THREAD_NUMBER.getValueAsInt(), 120L, TimeUnit.SECONDS, new NamedThreadFactory("NodeRecoveryThreadPool"));
    try {
        List<Future<RMNode>> nodesFutures = new ArrayList<>(nodesData.size());
        for (RMNodeData rmNodeData : nodesData) {
            nodesFutures.add(nodeRecoveryThreadPool.submit(() -> this.recoverNode(rmNodeData, nodeSource, recoveredNodeStatesCounter)));
        }
        for (Future<RMNode> rmNodeFuture : nodesFutures) {
            RMNode node;
            try {
                node = rmNodeFuture.get();
            } catch (InterruptedException e) {
                // restore the interrupt flag so that callers can observe the interruption
                Thread.currentThread().interrupt();
                logger.error("Unexpected error occurred while recovering node source " + nodeSource.getName(), e);
                return;
            } catch (Exception e) {
                logger.error("Unexpected error occurred while recovering node source " + nodeSource.getName(), e);
                return;
            }
            // isEligible is called before the null check, exactly as before;
            // it is expected to cope with a null node
            if (this.isEligible(node)) {
                recoveredEligibleNodes.add(node);
            }
            if (node != null) {
                final RMNodeEvent event = node.createNodeEvent(RMEventType.NODE_ADDED, null, node.getProvider().getName());
                this.rmCore.registerAndEmitNodeEvent(event);
            }
        }
    } finally {
        // always release the pool threads, including when event emission
        // or eligibility checking throws unexpectedly
        nodeRecoveryThreadPool.shutdownNow();
    }
    this.rmCore.addEligibleNodesToRecover(recoveredEligibleNodes);
    this.logNodeRecoverySummary(nodeSourceName, recoveredNodeStatesCounter, recoveredEligibleNodes.size());
}
Also used : NodeState(org.ow2.proactive.resourcemanager.common.NodeState) HashMap(java.util.HashMap) NamedThreadFactory(org.objectweb.proactive.utils.NamedThreadFactory) ArrayList(java.util.ArrayList) AddingNodesException(org.ow2.proactive.resourcemanager.exception.AddingNodesException) RMNode(org.ow2.proactive.resourcemanager.rmnode.RMNode) RMNodeData(org.ow2.proactive.resourcemanager.db.RMNodeData) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) RMNodeEvent(org.ow2.proactive.resourcemanager.common.event.RMNodeEvent)

Example 2 with NamedThreadFactory

Use of org.objectweb.proactive.utils.NamedThreadFactory in project scheduling by ow2-proactive.

In the class SchedulerStateRecoverHelper, method recover.

/**
 * Rebuilds the scheduler state from the database.
 * <p>
 * Not-finished jobs are loaded and dispatched to {@code recoverJob}, which sorts
 * them into pending and running jobs using a dedicated thread pool. Once that
 * pool has terminated, job updates are applied and every running job replays
 * its tasks. A running job whose replay fails is removed from the running jobs,
 * marked as canceled, persisted, and reported among the finished jobs.
 *
 * @param loadJobPeriod   period passed to {@code loadFinishedJobs} when loading finished jobs
 * @param rmProxy         resource manager proxy forwarded to {@code recoverJob}
 * @param schedulerStatus status stored in the returned state
 * @return the recovered pending, running and finished jobs together with the given status
 * @throws SchedulerStateNotRecoveredException if the thread is interrupted while
 *         waiting for the recovery thread pool to terminate
 */
public RecoveredSchedulerState recover(long loadJobPeriod, RMProxy rmProxy, SchedulerStatus schedulerStatus) {
    dbManager.setTaskDataOwnerIfNull();

    final List<InternalJob> unfinished = dbManager.loadNotFinishedJobs(true);
    final Vector<InternalJob> pending = new Vector<>();
    final Vector<InternalJob> running = new Vector<>();

    final ExecutorService taskRecoveryPool = PAExecutors.newCachedBoundedThreadPool(
            1,
            PASchedulerProperties.SCHEDULER_PARALLEL_SCHEDULER_STATE_RECOVER_NBTHREAD.getValueAsInt(),
            60L,
            TimeUnit.SECONDS,
            new NamedThreadFactory("TaskRecoverThreadPool"));
    for (InternalJob unfinishedJob : unfinished) {
        recoverJob(rmProxy, pending, running, unfinishedJob, taskRecoveryPool);
    }

    // no more submissions: wait for the already submitted recovery work to complete
    taskRecoveryPool.shutdown();
    final boolean completedInTime;
    try {
        completedInTime = taskRecoveryPool.awaitTermination(
                PASchedulerProperties.SCHEDULER_PARALLEL_SCHEDULER_STATE_RECOVER_TIMEOUT.getValueAsInt(),
                TimeUnit.MINUTES);
    } catch (InterruptedException interrupted) {
        logger.error("Interrupted while waiting for the Scheduler state to be recovered", interrupted);
        Thread.currentThread().interrupt();
        throw new SchedulerStateNotRecoveredException(interrupted);
    }
    failIfSchedulerStateRecoveryTimeout(completedInTime);

    applyJobUpdates(unfinished);

    final Vector<InternalJob> finished = new Vector<>();
    final Iterator<InternalJob> runningCursor = running.iterator();
    while (runningCursor.hasNext()) {
        final InternalJob current = runningCursor.next();
        try {
            // replay the execution task by task to rebuild the job's task tree
            for (InternalTask replayed : copyAndSort(current.getITasks())) {
                current.recoverTask(replayed.getId());
            }
            if (current.getStatus() == JobStatus.PAUSED) {
                current.setStatus(JobStatus.STALLED);
                current.setPaused();
                // tasks that were running are counted as pending again
                current.setNumberOfPendingTasks(current.getNumberOfPendingTasks() + current.getNumberOfRunningTasks());
                current.setNumberOfRunningTasks(0);
            }
        } catch (Throwable failure) {
            logger.error("Failed to recover job " + current.getId() + " " + current.getName() + " job might be in a inconsistent state", failure);
            jobLogger.error(current.getId(), "Failed to recover job, job might be in an inconsistent state", failure);
            // cancel the job itself (not its tasks) and file it under finished jobs so it never runs
            runningCursor.remove();
            current.setStatus(JobStatus.CANCELED);
            finished.add(current);
            dbManager.updateJobAndTasksState(current);
        }
    }

    finished.addAll(dbManager.loadFinishedJobs(false, loadJobPeriod));
    logger.info("[Recovering counters] " + " Pending: " + pending.size() + " Running: " + running.size() + " Finished: " + finished.size());
    return new RecoveredSchedulerState(pending, running, finished, schedulerStatus);
}
Also used : InternalJob(org.ow2.proactive.scheduler.job.InternalJob) InternalTask(org.ow2.proactive.scheduler.task.internal.InternalTask) NamedThreadFactory(org.objectweb.proactive.utils.NamedThreadFactory) ExecutorService(java.util.concurrent.ExecutorService) Vector(java.util.Vector)

Example 3 with NamedThreadFactory

Use of org.objectweb.proactive.utils.NamedThreadFactory in project scheduling by ow2-proactive.

In the class SchedulerFrontend, method initActivity.

/**
 * Boots the scheduler front-end when this active object starts.
 * <p>
 * In order, this method: initializes the client security policy, creates the
 * authentication active object, starts the data space naming service, creates
 * the thread pools, waits for the resource manager, boots JMX, starts the
 * synchronization service, recovers the scheduler state from the database,
 * builds the scheduling infrastructure and service, registers the
 * authentication interface under the default scheduler name and, if
 * configured, starts the periodic memory monitoring.
 * <p>
 * Any exception raised during this sequence is logged as fatal, its stack
 * trace is printed, and the JVM exits with status 1.
 *
 * @param body the body passed by the active object runtime (not used by this method)
 * @see org.objectweb.proactive.InitActive#initActivity(org.objectweb.proactive.Body)
 */
@Override
public void initActivity(Body body) {
    try {
        // setting up the policy
        logger.debug("Setting up scheduler security policy");
        ClientsPolicy.init();
        // creating the scheduler authentication interface.
        // if this fails then it will not continue.
        logger.debug("Creating scheduler authentication interface...");
        authentication = PAActiveObject.newActive(SchedulerAuthentication.class, new Object[] { PAActiveObject.getStubOnThis() });
        // start the data space naming service before building the scheduler core
        DataSpaceServiceStarter dsServiceStarter = DataSpaceServiceStarter.getDataSpaceServiceStarter();
        dsServiceStarter.startNamingService();
        // thread pools handed to the scheduling infrastructure below; their
        // maximum sizes are read from the scheduler properties
        ExecutorService clientThreadPool = PAExecutors.newCachedBoundedThreadPool(1, PASchedulerProperties.SCHEDULER_CLIENT_POOL_NBTHREAD.getValueAsInt(), 120L, TimeUnit.SECONDS, new NamedThreadFactory("ClientRequestsThreadPool", false, 3));
        ExecutorService internalThreadPool = PAExecutors.newCachedBoundedThreadPool(1, PASchedulerProperties.SCHEDULER_INTERNAL_POOL_NBTHREAD.getValueAsInt(), 120L, TimeUnit.SECONDS, new NamedThreadFactory("InternalOperationsThreadPool", false, 7));
        ExecutorService taskPingerThreadPool = PAExecutors.newCachedBoundedThreadPool(1, PASchedulerProperties.SCHEDULER_TASK_PINGER_POOL_NBTHREAD.getValueAsInt(), 120L, TimeUnit.SECONDS, new NamedThreadFactory("TaskPingerThreadPool", false, 2));
        ScheduledExecutorService scheduledThreadPool = new ScheduledThreadPoolExecutor(PASchedulerProperties.SCHEDULER_SCHEDULED_POOL_NBTHREAD.getValueAsInt(), new NamedThreadFactory("SchedulingServiceTimerThread", false, 2));
        // at this point we must wait the resource manager
        RMConnection.waitAndJoin(rmURL.toString());
        RMProxiesManager rmProxiesManager = RMProxiesManager.createRMProxiesManager(rmURL);
        RMProxy rmProxy = rmProxiesManager.getRmProxy();
        // -1 is kept when the property is unset, empty or cannot be parsed
        long loadJobPeriod = -1;
        if (PASchedulerProperties.SCHEDULER_DB_LOAD_JOB_PERIOD.isSet()) {
            String periodStr = PASchedulerProperties.SCHEDULER_DB_LOAD_JOB_PERIOD.getValueAsString();
            if (periodStr != null && !periodStr.isEmpty()) {
                try {
                    loadJobPeriod = Tools.parsePeriod(periodStr);
                } catch (IllegalArgumentException e) {
                    // an invalid period is not fatal: warn and keep the default
                    logger.warn("Invalid load job period string: " + periodStr + ", this setting is ignored", e);
                }
            }
        }
        logger.debug("Booting jmx...");
        this.jmxHelper.boot(authentication);
        publicStore = startSynchronizationService();
        // recover jobs from the database, then build the front-end state and the scheduling service on top of them
        RecoveredSchedulerState recoveredState = new SchedulerStateRecoverHelper(dbManager).recover(loadJobPeriod, rmProxy, initialStatus);
        this.frontendState = new SchedulerFrontendState(recoveredState.getSchedulerState(), jmxHelper, dbManager);
        SchedulingInfrastructure infrastructure = new SchedulingInfrastructureImpl(dbManager, rmProxiesManager, dsServiceStarter, clientThreadPool, internalThreadPool, taskPingerThreadPool, scheduledThreadPool);
        this.spacesSupport = infrastructure.getSpacesSupport();
        ServerJobAndTaskLogs.getInstance().setSpacesSupport(this.spacesSupport);
        this.corePublicKey = Credentials.getPublicKey(PASchedulerProperties.getAbsolutePath(PASchedulerProperties.SCHEDULER_AUTH_PUBKEY_PATH.getValueAsString()));
        this.schedulingService = new SchedulingService(infrastructure, frontendState, recoveredState, policyFullName, null, publicStore);
        recoveredState.enableLiveLogsForRunningTasks(schedulingService);
        releaseBusyNodesWithNoRunningTask(rmProxy, recoveredState);
        // the authentication interface is registered and activated only once the scheduling service has been created
        logger.debug("Registering scheduler...");
        PAActiveObject.registerByName(authentication, SchedulerConstants.SCHEDULER_DEFAULT_NAME);
        authentication.setActivated(true);
        Tools.logAvailableScriptEngines(logger);
        // optional monitoring: both runners are scheduled with the same cron expression read from the property
        if (PASchedulerProperties.SCHEDULER_MEM_MONITORING_FREQ.isSet()) {
            logger.debug("Starting the memory monitoring process...");
            metricsMonitorScheduler = new it.sauronsoftware.cron4j.Scheduler();
            String cronExpr = PASchedulerProperties.SCHEDULER_MEM_MONITORING_FREQ.getValueAsString();
            metricsMonitorScheduler.schedule(cronExpr, new TableSizeMonitorRunner(dbManager.getTransactionHelper()));
            metricsMonitorScheduler.schedule(cronExpr, new JobsMemoryMonitorRunner(dbManager.getSessionFactory().getStatistics(), recoveredState.getSchedulerState()));
            metricsMonitorScheduler.start();
        }
    } catch (Exception e) {
        // any start-up failure is fatal: log it, print the stack trace and terminate the JVM
        logger.fatal("Failed to start Scheduler", e);
        e.printStackTrace();
        System.exit(1);
    }
}
Also used : ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) NamedThreadFactory(org.objectweb.proactive.utils.NamedThreadFactory) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) TimeUnit(java.util.concurrent.TimeUnit) SchedulerAuthentication(org.ow2.proactive.scheduler.authentication.SchedulerAuthentication) FileSystemException(org.objectweb.proactive.extensions.dataspaces.exceptions.FileSystemException) KeyException(java.security.KeyException) UnknownJobException(org.ow2.proactive.scheduler.common.exception.UnknownJobException) TaskCouldNotRestartException(org.ow2.proactive.scheduler.common.exception.TaskCouldNotRestartException) InvalidChannelException(org.ow2.proactive.scheduler.synchronization.InvalidChannelException) JobCreationException(org.ow2.proactive.scheduler.common.exception.JobCreationException) PermissionException(org.ow2.proactive.scheduler.common.exception.PermissionException) NotConnectedException(org.ow2.proactive.scheduler.common.exception.NotConnectedException) AlreadyConnectedException(org.ow2.proactive.scheduler.common.exception.AlreadyConnectedException) UnknownTaskException(org.ow2.proactive.scheduler.common.exception.UnknownTaskException) TaskCouldNotStartException(org.ow2.proactive.scheduler.common.exception.TaskCouldNotStartException) JobValidationException(org.ow2.proactive.scheduler.common.exception.JobValidationException) JobAlreadyFinishedException(org.ow2.proactive.scheduler.common.exception.JobAlreadyFinishedException) SubmissionClosedException(org.ow2.proactive.scheduler.common.exception.SubmissionClosedException) DatabaseManagerException(org.ow2.proactive.db.DatabaseManagerException) TaskSkippedException(org.ow2.proactive.scheduler.common.exception.TaskSkippedException) ProActiveException(org.objectweb.proactive.core.ProActiveException) SignalApiException(org.ow2.proactive.scheduler.signal.SignalApiException) 
RecoveredSchedulerState(org.ow2.proactive.scheduler.core.db.RecoveredSchedulerState) RMProxy(org.ow2.proactive.scheduler.core.rmproxies.RMProxy) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutorService(java.util.concurrent.ExecutorService) SchedulerStateRecoverHelper(org.ow2.proactive.scheduler.core.db.SchedulerStateRecoverHelper) TableSizeMonitorRunner(org.ow2.proactive.scheduler.core.helpers.TableSizeMonitorRunner) DataSpacesFileObject(org.objectweb.proactive.extensions.dataspaces.api.DataSpacesFileObject) ActiveObject(org.objectweb.proactive.extensions.annotation.ActiveObject) PAActiveObject(org.objectweb.proactive.api.PAActiveObject) JobsMemoryMonitorRunner(org.ow2.proactive.scheduler.core.helpers.JobsMemoryMonitorRunner) RMProxiesManager(org.ow2.proactive.scheduler.core.rmproxies.RMProxiesManager)

Aggregations

ExecutorService (java.util.concurrent.ExecutorService)3 NamedThreadFactory (org.objectweb.proactive.utils.NamedThreadFactory)3 KeyException (java.security.KeyException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 Vector (java.util.Vector)1 Future (java.util.concurrent.Future)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 ScheduledThreadPoolExecutor (java.util.concurrent.ScheduledThreadPoolExecutor)1 TimeUnit (java.util.concurrent.TimeUnit)1 PAActiveObject (org.objectweb.proactive.api.PAActiveObject)1 ProActiveException (org.objectweb.proactive.core.ProActiveException)1 ActiveObject (org.objectweb.proactive.extensions.annotation.ActiveObject)1 DataSpacesFileObject (org.objectweb.proactive.extensions.dataspaces.api.DataSpacesFileObject)1 FileSystemException (org.objectweb.proactive.extensions.dataspaces.exceptions.FileSystemException)1 DatabaseManagerException (org.ow2.proactive.db.DatabaseManagerException)1 NodeState (org.ow2.proactive.resourcemanager.common.NodeState)1 RMNodeEvent (org.ow2.proactive.resourcemanager.common.event.RMNodeEvent)1 RMNodeData (org.ow2.proactive.resourcemanager.db.RMNodeData)1 AddingNodesException (org.ow2.proactive.resourcemanager.exception.AddingNodesException)1