use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.
the class RMCore method setDownNode.
/**
 * Marks the node identified by the given URL as down, brings the core's
 * internal bookkeeping in line with that state and emits a
 * {@code NODE_STATE_CHANGED} event carrying the node's previous state.
 * Nothing happens when the node is unknown to the core or already down.
 *
 * @param nodeUrl URL of the node to flag as down
 */
public void setDownNode(String nodeUrl) {
    RMNode downNode = getNodebyUrl(nodeUrl);

    if (downNode == null) {
        // The node was removed from the core asynchronously while the pinger
        // or the selection manager was trying to reach it: nothing to update.
        logger.debug("setDownNode returned immediately because the node " + nodeUrl + " was not known");
        return;
    }

    if (downNode.isDown()) {
        // Already flagged as down, so there is no state transition to report.
        return;
    }

    logger.info("The node " + downNode.getNodeURL() + " provided by " + downNode.getProvider() + " is down");

    // Capture the state before the transition: the emitted event reports it.
    final NodeState stateBeforeDown = downNode.getState();

    // A free node is a candidate for selection; withdraw it before marking it down.
    if (downNode.isFree()) {
        eligibleNodes.remove(downNode);
    }

    downNode.setDown();
    persistUpdatedRMNodeIfRecoveryEnabled(downNode);

    // Build and publish the state-change event.
    String providerName = downNode.getProvider().getName();
    this.registerAndEmitNodeEvent(downNode.createNodeEvent(RMEventType.NODE_STATE_CHANGED,
                                                           stateBeforeDown,
                                                           providerName));
}
use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.
the class TopologyManager method pingNode.
/**
 * Launches the pinging process from a new host. A {@link Pinger} active object
 * is created on the given node and asked to ping the supplied nodes according
 * to the pinger logic.
 * <p>
 * The pinger active object is always terminated once it has been created,
 * whether the ping completed normally or failed with a runtime exception.
 *
 * @param node  node on which the pinger active object is created
 * @param nodes nodes to be pinged from {@code node}
 * @return the distances reported by the pinger, keyed by host address, or
 *         {@code null} when the pinger active object could not be created
 */
private HashMap<InetAddress, Long> pingNode(Node node, NodeSet nodes) {
    try {
        logger.debug("Launching ping process on node " + node.getNodeInformation().getURL());
        long timeStamp = System.currentTimeMillis();
        Pinger pinger = PAActiveObject.newActive(pingerClass, null, node);
        try {
            HashMap<InetAddress, Long> result = pinger.ping(nodes);
            // Block until the asynchronous result is actually available.
            PAFuture.waitFor(result);
            if (logger.isDebugEnabled()) {
                // Built only when debug is on: the message needs a node URL lookup.
                logger.debug(result.size() + " hosts were pinged from " + node.getNodeInformation().getURL() +
                             " in " + (System.currentTimeMillis() - timeStamp) + " ms");
                logger.debug("Distances are:");
                for (InetAddress host : result.keySet()) {
                    logger.debug(result.get(host) + " to " + host);
                }
            }
            return result;
        } finally {
            // Terminate the pinger on every exit path so that a failed ping
            // does not leave an orphan active object on the remote node.
            try {
                PAActiveObject.terminateActiveObject(pinger, true);
            } catch (RuntimeException e) {
                logger.error("Cannot kill the pinger active object", e);
            }
        }
    } catch (ActiveObjectCreationException e) {
        logger.warn(e.getMessage(), e);
    } catch (NodeException e) {
        logger.warn(e.getMessage(), e);
    }
    return null;
}
use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.
the class TestTaskRestartOnNodeFailure method testTaskKillNode.
/**
 * Scenario: a job is submitted while a single freshly started node is
 * available, the node running the task is killed, a replacement node is
 * started, and the job is then expected to finish with a valid result.
 *
 * @param fileLock       lock that is taken at the start and released only after
 *                       the first node has been killed; its path is handed to
 *                       the job so the task can be held until then
 * @param waitBeforeKill when {@code true}, sleeps 5 seconds after the task has
 *                       started before killing the node
 * @throws Exception if any step of the scenario fails
 */
private void testTaskKillNode(FileLock fileLock, boolean waitBeforeKill) throws Exception {
// Take the lock first so the submitted task cannot complete on its own.
Path fileLockPath = fileLock.lock();
TestNode nodeToKill = startNode();
log("Submit job");
final JobId jobId = schedulerHelper.submitJob(createJob(fileLockPath.toString()));
log("Wait when node becomes busy");
RMNodeEvent event;
// Consume state-change events until one reports a BUSY node.
do {
event = schedulerHelper.waitForAnyNodeEvent(RMEventType.NODE_STATE_CHANGED, TIMEOUT);
} while (!event.getNodeState().equals(NodeState.BUSY));
log("Wait when task starts");
schedulerHelper.waitForEventTaskRunning(jobId, "Test task");
/*
 * Two cases are covered (as they existed when this test was written):
 * - waiting some time before killing the node: the node failure is detected by the
 *   pinger thread;
 * - killing the node immediately: the node failure is detected by the thread calling
 *   TaskLauncher.doTask.
 */
if (waitBeforeKill) {
log("Wait some time");
Thread.sleep(5000);
}
log("Stop task node process (node " + nodeToKill.getNode().getNodeInformation().getURL() + ")");
nodeToKill.kill();
// Start a replacement node for the task to run on.
TestNode newNode = startNode();
log("Let task finish");
// Releasing the lock lets the task complete.
fileLock.unlock();
log("Wait when job finish");
schedulerHelper.waitForEventJobFinished(jobId, TIMEOUT);
// The replacement node must first have gone BUSY and then back to FREE.
event = schedulerHelper.waitForNodeEvent(RMEventType.NODE_STATE_CHANGED, newNode.getNode().getNodeInformation().getURL(), TIMEOUT);
assertEquals(NodeState.BUSY, event.getNodeState());
event = schedulerHelper.waitForNodeEvent(RMEventType.NODE_STATE_CHANGED, newNode.getNode().getNodeInformation().getURL(), TIMEOUT);
assertEquals(NodeState.FREE, event.getNodeState());
log("Check job result");
checkJobResult(schedulerHelper.getSchedulerInterface(), jobId);
// Clean up the replacement node.
// NOTE(review): the 'true' flag presumably requests immediate (preemptive) removal - confirm.
schedulerHelper.getResourceManager().removeNode(newNode.getNodeURL(), true);
newNode.kill();
}
use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.
the class TopologyManager method addNode.
/**
 * Updates the topology for a new node. When the node belongs to an unknown
 * host, the host is registered in the topology; if distance computation is
 * enabled, the pinger is executed on the new node against one node of every
 * already registered host.
 * <p>
 * Does nothing when the topology feature is disabled. The whole update runs
 * under the write lock.
 *
 * @param node node to add to the topology
 */
public void addNode(Node node) {
    // Acquire the lock BEFORE the try block: if acquisition fails, the finally
    // clause must not attempt to release a lock this thread does not hold.
    rwLock.writeLock().lock();
    try {
        if (!PAResourceManagerProperties.RM_TOPOLOGY_ENABLED.getValueAsBoolean()) {
            // do not do anything if topology disabled
            return;
        }

        if (logger.isDebugEnabled()) {
            logger.debug("Adding Node " + node.getNodeInformation().getURL() + " to topology");
        }

        InetAddress host = node.getVMInformation().getInetAddress();
        if (topology.knownHost(host)) {
            // host topology is already known
            if (logger.isDebugEnabled()) {
                logger.debug("The topology information has been already added for node " +
                             node.getNodeInformation().getURL());
            }
            nodesOnHost.get(host).add(node);
            return;
        }

        // unknown host => start pinging process
        NodeSet toPing = new NodeSet();
        // Default distances: every registered host is "infinitely" far away.
        HashMap<InetAddress, Long> hostsTopology = new HashMap<>();

        // adding one node from each host
        for (InetAddress h : nodesOnHost.keySet()) {
            Set<Node> nodesOfHost = nodesOnHost.get(h);
            // always have at least one node on each host
            if (nodesOfHost != null && !nodesOfHost.isEmpty()) {
                toPing.add(nodesOfHost.iterator().next());
                hostsTopology.put(h, Long.MAX_VALUE);
            }
        }

        if (PAResourceManagerProperties.RM_TOPOLOGY_DISTANCE_ENABLED.getValueAsBoolean()) {
            HashMap<InetAddress, Long> measuredDistances = pingNode(node, toPing);
            if (measuredDistances != null) {
                hostsTopology = measuredDistances;
            } else {
                // pingNode returns null when the pinger could not be created;
                // keep the default distances rather than handing null to the topology.
                logger.warn("Unable to measure distances from node " + node.getNodeInformation().getURL() +
                            ", using default distances");
            }
        }

        topology.addHostTopology(node.getVMInformation().getHostName(), host, hostsTopology);

        Set<Node> nodesList = new LinkedHashSet<>();
        nodesList.add(node);
        nodesOnHost.put(host, nodesList);
    } finally {
        rwLock.writeLock().unlock();
    }

    if (logger.isDebugEnabled()) {
        logger.debug("Node " + node.getNodeInformation().getURL() + " added.");
    }
}
use of org.ow2.proactive.resourcemanager.frontend.topology.pinging.Pinger in project scheduling by ow2-proactive.
the class RMCore method releaseNodes.
/**
 * {@inheritDoc}
 * <p>
 * Extra nodes attached to the set are appended to {@code nodes} and released
 * as well. Each node is handled independently:
 * <ul>
 * <li>a node whose URL cannot be read is recorded as failed and skipped;</li>
 * <li>a node unknown to the core, already free or down is recorded as failed;</li>
 * <li>otherwise the caller's permission is checked and the node is either
 * removed (if it is flagged "to remove") or set free.</li>
 * </ul>
 *
 * @param nodes nodes to release; extra nodes, if any, are added to this set
 * @return a wrapper around {@code true} when no exception was recorded
 * @throws RuntimeException the most recent exception recorded while processing
 *         the nodes: {@link IllegalStateException} for an unreachable node,
 *         {@link IllegalArgumentException} for an unknown node,
 *         {@link SecurityException} when the caller is not allowed to free a node
 */
public BooleanWrapper releaseNodes(NodeSet nodes) {
    if (nodes.getExtraNodes() != null) {
        // do not forget to release extra nodes
        nodes.addAll(nodes.getExtraNodes());
    }

    // exception to throw in case of problems
    RuntimeException exception = null;

    NodeSet nodesReleased = new NodeSet();
    NodeSet nodesFailedToRelease = new NodeSet();

    for (Node node : nodes) {
        String nodeURL = null;
        try {
            nodeURL = node.getNodeInformation().getURL();
            logger.debug("Releasing node " + nodeURL);
        } catch (RuntimeException e) {
            logger.debug("A Runtime exception occurred while obtaining information on the node," +
                         "the node must be down (it will be detected later)", e);
            // node is down, will be detected by pinger
            exception = new IllegalStateException(e.getMessage(), e);
            nodesFailedToRelease.add(node);
            // Without a URL the node cannot be looked up: skip it so that it is
            // not recorded a second time as an "unknown node null" and so that
            // the exception recorded above (with its cause) is not overwritten.
            continue;
        }

        // verify whether the node has not been removed from the RM
        if (this.allNodes.containsKey(nodeURL)) {
            RMNode rmnode = this.getNodebyUrl(nodeURL);

            if (rmnode.isFree()) {
                logger.warn("Client " + caller + " tries to release the already free node " + nodeURL);
                nodesFailedToRelease.add(node);
            } else if (rmnode.isDown()) {
                logger.warn("Node was down, it cannot be released");
                nodesFailedToRelease.add(node);
            } else {
                Set<? extends IdentityPrincipal> userPrincipal = rmnode.getOwner()
                                                                       .getSubject()
                                                                       .getPrincipals(UserNamePrincipal.class);
                Permission ownerPermission = new PrincipalPermission(rmnode.getOwner().getName(), userPrincipal);
                try {
                    // Reuse the URL read above instead of querying the node again.
                    caller.checkPermission(ownerPermission,
                                           caller + " is not authorized to free node " + nodeURL);

                    if (rmnode.isToRemove()) {
                        removeNodeFromCoreAndSource(rmnode, caller);
                    } else {
                        internalSetFree(rmnode);
                    }
                    nodesReleased.add(node);
                } catch (SecurityException ex) {
                    logger.error(ex.getMessage(), ex);
                    nodesFailedToRelease.add(node);
                    exception = ex;
                }
            }
        } else {
            logger.warn("Cannot release unknown node " + nodeURL);
            nodesFailedToRelease.add(node);
            exception = new IllegalArgumentException("Cannot release unknown node " + nodeURL);
        }
    }

    logger.info("Nodes released : " + nodesReleased);
    if (!nodesFailedToRelease.isEmpty()) {
        logger.warn("Nodes failed to release : " + nodesFailedToRelease);
    }

    if (exception != null) {
        // throwing the latest exception we had
        throw exception;
    }

    return new BooleanWrapper(true);
}
Aggregations