Search in sources :

Example 1 with Pinger

use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.

the class RMCore method setDownNode.

/**
 * Marks the node identified by the given URL as down, keeps the internal
 * structures of the rm core consistent with that change and emits a
 * {@code NODE_STATE_CHANGED} event carrying the node's previous state.
 * <p>
 * Only a debug message is logged when the URL does not match a known node,
 * and the call returns silently when the node is already down.
 *
 * @param nodeUrl URL of the node to mark as down
 */
public void setDownNode(String nodeUrl) {
    final RMNode downNode = getNodebyUrl(nodeUrl);
    if (downNode == null) {
        // The node has been removed from the core asynchronously (e.g. while
        // a pinger or the selection manager was accessing it): nothing to do.
        logger.debug("setDownNode returned immediately because the node " + nodeUrl + " was not known");
        return;
    }
    if (downNode.isDown()) {
        // Already down: there is no state transition to record.
        return;
    }
    logger.info("The node " + downNode.getNodeURL() + " provided by " + downNode.getProvider() + " is down");
    // Remember the state before the transition; the emitted event reports it.
    final NodeState stateBeforeDown = downNode.getState();
    if (downNode.isFree()) {
        // A free node is removed from the eligible set before going down.
        eligibleNodes.remove(downNode);
    }
    downNode.setDown();
    persistUpdatedRMNodeIfRecoveryEnabled(downNode);
    // Build and publish the state-change event.
    this.registerAndEmitNodeEvent(downNode.createNodeEvent(RMEventType.NODE_STATE_CHANGED,
                                                           stateBeforeDown,
                                                           downNode.getProvider().getName()));
}
Also used : RMNode(org.ow2.proactive.resourcemanager.rmnode.RMNode) NodeState(org.ow2.proactive.resourcemanager.common.NodeState)

Example 2 with Pinger

use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.

the class TopologyManager method pingNode.

/**
 * Launches the pinging process from a new host. A pinger active object is
 * created on the given node and asked to ping the given set of nodes
 * according to the pinger logic.
 * <p>
 * The pinger active object is always terminated once it has been created,
 * including when the ping itself fails with an exception.
 *
 * @param node  node on which the pinger active object is created
 * @param nodes nodes handed to the pinger
 * @return the map returned by the pinger (logged as per-host distances), or
 *         {@code null} when the pinger could not be created
 */
private HashMap<InetAddress, Long> pingNode(Node node, NodeSet nodes) {
    try {
        logger.debug("Launching ping process on node " + node.getNodeInformation().getURL());
        long timeStamp = System.currentTimeMillis();
        Pinger pinger = PAActiveObject.newActive(pingerClass, null, node);
        try {
            HashMap<InetAddress, Long> result = pinger.ping(nodes);
            // Block until the result is actually available before using it.
            PAFuture.waitFor(result);
            logger.debug(result.size() + " hosts were pinged from " + node.getNodeInformation().getURL() + " in " + (System.currentTimeMillis() - timeStamp) + " ms");
            if (logger.isDebugEnabled()) {
                logger.debug("Distances are:");
                for (InetAddress host : result.keySet()) {
                    logger.debug(result.get(host) + " to " + host);
                }
            }
            return result;
        } finally {
            // Terminate the pinger whatever the outcome of the ping, so that a
            // failing ping does not leave the active object behind on the node.
            try {
                PAActiveObject.terminateActiveObject(pinger, true);
            } catch (RuntimeException e) {
                // Best effort: a failed termination must not hide the ping outcome.
                logger.error("Cannot kill the pinger active object", e);
            }
        }
    } catch (ActiveObjectCreationException e) {
        logger.warn(e.getMessage(), e);
    } catch (NodeException e) {
        logger.warn(e.getMessage(), e);
    }
    // The pinger could not be created: no result to report.
    return null;
}
Also used : Pinger(org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger) NodeException(org.objectweb.proactive.core.node.NodeException) InetAddress(java.net.InetAddress) ActiveObjectCreationException(org.objectweb.proactive.ActiveObjectCreationException)

Example 3 with Pinger

use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.

the class TestTaskRestartOnNodeFailure method testTaskKillNode.

/**
 * Submits a job, waits until its task runs on a node, kills the first started
 * node, starts a replacement node and checks that the job finishes and that
 * the replacement node goes through the BUSY and then FREE states.
 *
 * @param fileLock       lock whose file path is passed to the submitted job;
 *                       it is unlocked to let the task finish
 * @param waitBeforeKill when {@code true}, sleeps 5 seconds between the task
 *                       start and the node kill
 * @throws Exception if any step of the scenario fails
 */
private void testTaskKillNode(FileLock fileLock, boolean waitBeforeKill) throws Exception {
    final Path lockFile = fileLock.lock();
    final TestNode victimNode = startNode();

    log("Submit job");
    final JobId jobId = schedulerHelper.submitJob(createJob(lockFile.toString()));

    log("Wait when node becomes busy");
    // Consume node state events until one of them reports a BUSY node.
    RMNodeEvent nodeEvent = schedulerHelper.waitForAnyNodeEvent(RMEventType.NODE_STATE_CHANGED, TIMEOUT);
    while (!nodeEvent.getNodeState().equals(NodeState.BUSY)) {
        nodeEvent = schedulerHelper.waitForAnyNodeEvent(RMEventType.NODE_STATE_CHANGED, TIMEOUT);
    }

    log("Wait when task starts");
    schedulerHelper.waitForEventTaskRunning(jobId, "Test task");

    // Two cases are covered (they existed at the time of the original writing):
    //  - waiting some time before killing the node: the node failure is
    //    detected by the pinger thread;
    //  - killing the node immediately: the node failure is detected by the
    //    thread calling TaskLauncher.doTask.
    if (waitBeforeKill) {
        log("Wait some time");
        Thread.sleep(5000);
    }

    log("Stop task node process (node " + victimNode.getNode().getNodeInformation().getURL() + ")");
    victimNode.kill();

    final TestNode replacementNode = startNode();

    log("Let task finish");
    fileLock.unlock();

    log("Wait when job finish");
    schedulerHelper.waitForEventJobFinished(jobId, TIMEOUT);

    // The replacement node is expected to become BUSY and then FREE again.
    final String replacementUrl = replacementNode.getNode().getNodeInformation().getURL();
    nodeEvent = schedulerHelper.waitForNodeEvent(RMEventType.NODE_STATE_CHANGED, replacementUrl, TIMEOUT);
    assertEquals(NodeState.BUSY, nodeEvent.getNodeState());
    nodeEvent = schedulerHelper.waitForNodeEvent(RMEventType.NODE_STATE_CHANGED, replacementUrl, TIMEOUT);
    assertEquals(NodeState.FREE, nodeEvent.getNodeState());

    log("Check job result");
    checkJobResult(schedulerHelper.getSchedulerInterface(), jobId);

    // Clean up the replacement node.
    schedulerHelper.getResourceManager().removeNode(replacementNode.getNodeURL(), true);
    replacementNode.kill();
}
Also used : Path(java.nio.file.Path) TestNode(functionaltests.utils.TestNode) RMNodeEvent(org.ow2.proactive.resourcemanager.common.event.RMNodeEvent) JobId(org.ow2.proactive.scheduler.common.job.JobId)

Example 4 with Pinger

use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.

the class TopologyManager method addNode.

/**
 * Updates the topology for a new node. Executes the pinger on the new node
 * when this node belongs to an unknown host and distance computation is
 * enabled; otherwise every already registered host is recorded with a
 * distance of {@link Long#MAX_VALUE}.
 * <p>
 * Does nothing when the topology feature is disabled. The whole update runs
 * under the write lock.
 *
 * @param node node to add to the topology
 */
public void addNode(Node node) {
    // Acquire the lock BEFORE entering the try block: if lock() itself failed
    // inside the try, the finally clause would call unlock() on a lock that is
    // not held and mask the original failure.
    rwLock.writeLock().lock();
    try {
        if (!PAResourceManagerProperties.RM_TOPOLOGY_ENABLED.getValueAsBoolean()) {
            // do not do anything if topology disabled
            return;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Adding Node " + node.getNodeInformation().getURL() + " to topology");
        }
        InetAddress host = node.getVMInformation().getInetAddress();
        if (topology.knownHost(host)) {
            // host topology is already known: only register the node on its host
            if (logger.isDebugEnabled()) {
                logger.debug("The topology information has been already added for node " + node.getNodeInformation().getURL());
            }
            nodesOnHost.get(host).add(node);
            return;
        }
        // unknown host => start pinging process
        NodeSet toPing = new NodeSet();
        HashMap<InetAddress, Long> hostsTopology = new HashMap<>();
        // adding one node from each host
        for (InetAddress h : nodesOnHost.keySet()) {
            // always have at least one node on each host
            if (nodesOnHost.get(h) != null && !nodesOnHost.get(h).isEmpty()) {
                toPing.add(nodesOnHost.get(h).iterator().next());
                // default distance, kept when distance computation is disabled
                hostsTopology.put(h, Long.MAX_VALUE);
            }
        }
        if (PAResourceManagerProperties.RM_TOPOLOGY_DISTANCE_ENABLED.getValueAsBoolean()) {
            // NOTE(review): pingNode returns null when the pinger cannot be
            // created, which replaces the default distances prepared above;
            // confirm that addHostTopology accepts a null map.
            hostsTopology = pingNode(node, toPing);
        }
        topology.addHostTopology(node.getVMInformation().getHostName(), host, hostsTopology);
        Set<Node> nodesList = new LinkedHashSet<>();
        nodesList.add(node);
        // first node registered on this host
        nodesOnHost.put(host, nodesList);
    } finally {
        rwLock.writeLock().unlock();
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Node " + node.getNodeInformation().getURL() + " added.");
    }
}
Also used : NodeSet(org.ow2.proactive.utils.NodeSet) LinkedHashSet(java.util.LinkedHashSet) HashMap(java.util.HashMap) Node(org.objectweb.proactive.core.node.Node) InetAddress(java.net.InetAddress)

Example 5 with Pinger

use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.

the class RMCore method releaseNodes.

/**
 * {@inheritDoc}
 * <p>
 * Extra nodes attached to the node set are released as well. Each node is
 * processed independently: a node whose URL cannot be read, an unknown node,
 * an already free node, a down node, or a node the caller is not permitted
 * to free is recorded as failed and the remaining nodes are still processed.
 *
 * @param nodes nodes to release; its extra nodes, if any, are added to it
 * @return a wrapper around {@code true} when no problem was recorded
 * @throws RuntimeException the latest exception recorded while processing the
 *         nodes (unreadable node, unknown node or missing permission)
 */
public BooleanWrapper releaseNodes(NodeSet nodes) {
    if (nodes.getExtraNodes() != null) {
        // do not forget to release extra nodes
        nodes.addAll(nodes.getExtraNodes());
    }
    // exception to throw in case of problems
    RuntimeException exception = null;
    NodeSet nodesReleased = new NodeSet();
    NodeSet nodesFailedToRelease = new NodeSet();
    for (Node node : nodes) {
        String nodeURL;
        try {
            nodeURL = node.getNodeInformation().getURL();
            logger.debug("Releasing node " + nodeURL);
        } catch (RuntimeException e) {
            logger.debug("A Runtime exception occurred while obtaining information on the node," + "the node must be down (it will be detected later)", e);
            // node is down, will be detected by pinger
            exception = new IllegalStateException(e.getMessage(), e);
            nodesFailedToRelease.add(node);
            // Without a URL the node cannot be looked up: skip it instead of
            // falling through with a null key, which would record the node as
            // failed a second time and replace the exception above (which
            // carries the cause) by an "unknown node null" error.
            continue;
        }
        // verify whether the node has not been removed from the RM
        if (this.allNodes.containsKey(nodeURL)) {
            RMNode rmnode = this.getNodebyUrl(nodeURL);
            if (rmnode.isFree()) {
                // releasing a free node is reported but is not an error
                logger.warn("Client " + caller + " tries to release the already free node " + nodeURL);
                nodesFailedToRelease.add(node);
            } else if (rmnode.isDown()) {
                logger.warn("Node was down, it cannot be released");
                nodesFailedToRelease.add(node);
            } else {
                // only a caller matching the owner permission may free the node
                Set<? extends IdentityPrincipal> userPrincipal = rmnode.getOwner().getSubject().getPrincipals(UserNamePrincipal.class);
                Permission ownerPermission = new PrincipalPermission(rmnode.getOwner().getName(), userPrincipal);
                try {
                    // Reuse the URL read above rather than querying the node
                    // again outside of the guarded lookup.
                    caller.checkPermission(ownerPermission, caller + " is not authorized to free node " + nodeURL);
                    if (rmnode.isToRemove()) {
                        // the node was marked for removal: remove it now
                        removeNodeFromCoreAndSource(rmnode, caller);
                    } else {
                        internalSetFree(rmnode);
                    }
                    nodesReleased.add(node);
                } catch (SecurityException ex) {
                    logger.error(ex.getMessage(), ex);
                    nodesFailedToRelease.add(node);
                    exception = ex;
                }
            }
        } else {
            logger.warn("Cannot release unknown node " + nodeURL);
            nodesFailedToRelease.add(node);
            exception = new IllegalArgumentException("Cannot release unknown node " + nodeURL);
        }
    }
    logger.info("Nodes released : " + nodesReleased);
    if (!nodesFailedToRelease.isEmpty()) {
        logger.warn("Nodes failed to release : " + nodesFailedToRelease);
    }
    if (exception != null) {
        // throwing the latest exception we had
        throw exception;
    }
    return new BooleanWrapper(true);
}
Also used : NodeSet(org.ow2.proactive.utils.NodeSet) NodeSet(org.ow2.proactive.utils.NodeSet) Set(java.util.Set) ImmutableSet(com.google.common.collect.ImmutableSet) HashSet(java.util.HashSet) RMDeployingNode(org.ow2.proactive.resourcemanager.rmnode.RMDeployingNode) Node(org.objectweb.proactive.core.node.Node) RMNode(org.ow2.proactive.resourcemanager.rmnode.RMNode) PrincipalPermission(org.ow2.proactive.permissions.PrincipalPermission) UserNamePrincipal(org.ow2.proactive.authentication.principals.UserNamePrincipal) BooleanWrapper(org.objectweb.proactive.core.util.wrapper.BooleanWrapper) RMNode(org.ow2.proactive.resourcemanager.rmnode.RMNode) Permission(java.security.Permission) MethodCallPermission(org.ow2.proactive.permissions.MethodCallPermission) PrincipalPermission(org.ow2.proactive.permissions.PrincipalPermission) IdentityPrincipal(org.ow2.proactive.authentication.principals.IdentityPrincipal)

Aggregations

InetAddress (java.net.InetAddress)2 Node (org.objectweb.proactive.core.node.Node)2 PrincipalPermission (org.ow2.proactive.permissions.PrincipalPermission)2 RMNode (org.ow2.proactive.resourcemanager.rmnode.RMNode)2 NodeSet (org.ow2.proactive.utils.NodeSet)2 ImmutableSet (com.google.common.collect.ImmutableSet)1 TestNode (functionaltests.utils.TestNode)1 Path (java.nio.file.Path)1 Permission (java.security.Permission)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 LinkedHashSet (java.util.LinkedHashSet)1 Set (java.util.Set)1 ActiveObjectCreationException (org.objectweb.proactive.ActiveObjectCreationException)1 NodeException (org.objectweb.proactive.core.node.NodeException)1 BooleanWrapper (org.objectweb.proactive.core.util.wrapper.BooleanWrapper)1 IdentityPrincipal (org.ow2.proactive.authentication.principals.IdentityPrincipal)1 UserNamePrincipal (org.ow2.proactive.authentication.principals.UserNamePrincipal)1 MethodCallPermission (org.ow2.proactive.permissions.MethodCallPermission)1 NodeState (org.ow2.proactive.resourcemanager.common.NodeState)1