use of org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException in project ozone by apache.
the class NodeDecommissionManager method decommissionNodes.
/**
 * Starts decommissioning every datanode resolved from the given host strings.
 *
 * <p>A failure for one datanode does not stop the request: the failure is
 * recorded and the remaining datanodes are still processed.
 *
 * @param nodes host strings identifying the datanodes to decommission
 * @return one {@link DatanodeAdminError} per datanode that could not be
 *         decommissioned; empty when no datanode failed
 * @throws InvalidHostStringException declared by this method; presumably
 *         raised while resolving the host strings (confirm against
 *         mapHostnamesToDatanodes)
 */
public synchronized List<DatanodeAdminError> decommissionNodes(List<String> nodes)
    throws InvalidHostStringException {
  final List<DatanodeDetails> targets = mapHostnamesToDatanodes(nodes);
  final List<DatanodeAdminError> failures = new ArrayList<>();
  for (DatanodeDetails target : targets) {
    try {
      startDecommission(target);
    } catch (NodeNotFoundException notFound) {
      // The host strings were already validated and the DatanodeDetails came
      // from the node manager, so this can only happen if the node is removed
      // in the very short window between validation and starting
      // decommission. Log a warning, report the node as failed and carry on
      // with the remaining nodes.
      LOG.warn("The host {} was not found in SCM. Ignoring the request to " + "decommission it",
          target.getHostName());
      failures.add(
          new DatanodeAdminError(target.getHostName(), "The host was not found in SCM"));
    } catch (InvalidNodeStateException badState) {
      // The node is in a state that does not allow decommissioning; surface
      // the reason to the caller instead of failing the whole request.
      failures.add(new DatanodeAdminError(target.getHostName(), badState.getMessage()));
    }
  }
  return failures;
}
use of org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException in project ozone by apache.
the class DeadNodeHandler method onMessage.
/**
 * Reacts to a dead-datanode event.
 *
 * <p>All pipelines on this datanode should already have been destroyed when
 * it was marked as stale, and destroying them should also have closed all
 * containers on it, so ideally no pipeline or OPEN container is left by now.
 * To be on the safe side this handler double checks and takes the
 * appropriate action anyway.
 *
 * @param datanodeDetails the datanode that was detected as dead
 * @param publisher       event publisher handed on to the container-closing
 *                        step
 */
@Override
public void onMessage(final DatanodeDetails datanodeDetails, final EventPublisher publisher) {
  try {
    LOG.info("A dead datanode is detected. {}", datanodeDetails);
    destroyPipelines(datanodeDetails);
    closeContainers(datanodeDetails, publisher);

    // Container replicas are dropped only when the node is not
    // IN_MAINTENANCE.
    final boolean inMaintenance = nodeManager.getNodeStatus(datanodeDetails).isInMaintenance();
    if (!inMaintenance) {
      removeContainerReplicas(datanodeDetails);
    }

    // Take the dead datanode out of the cluster network topology.
    final NetworkTopology topology = nodeManager.getClusterNetworkTopologyMap();
    if (!topology.contains(datanodeDetails)) {
      return;
    }
    topology.remove(datanodeDetails);

    // Once the datanode has left the topology, the DatanodeDetails instance
    // held by the node manager must no longer have a parent.
    final String uuid = datanodeDetails.getUuidString();
    Preconditions.checkState(nodeManager.getNodeByUuid(uuid).getParent() == null);
  } catch (NodeNotFoundException ex) {
    // This should not happen: a dead node event cannot be raised for an
    // unregistered datanode.
    LOG.error("DeadNode event for a unregistered node: {}!", datanodeDetails);
  }
}
use of org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException in project ozone by apache.
the class NodeStateManager method addNode.
/**
 * Registers a new datanode with the state manager.
 *
 * <p>The node is added to the node state map with a freshly created status,
 * after which its last known layout version is updated. If that update
 * reports the node as missing, the inconsistency is logged and the call
 * still returns normally.
 *
 * @param datanodeDetails the datanode to register
 * @param layoutInfo      layout version reported for the datanode
 *
 * @throws NodeAlreadyExistsException if the node is already present
 */
public void addNode(DatanodeDetails datanodeDetails, LayoutVersionProto layoutInfo)
    throws NodeAlreadyExistsException {
  final NodeStatus initialStatus = newNodeStatus(datanodeDetails, layoutInfo);
  nodeStateMap.addNode(datanodeDetails, initialStatus, layoutInfo);
  final UUID nodeId = datanodeDetails.getUuid();
  try {
    updateLastKnownLayoutVersion(datanodeDetails, layoutInfo);
  } catch (NodeNotFoundException notFound) {
    // The node was added just above, so not finding it here means the map
    // is in an inconsistent state.
    LOG.error("Inconsistent NodeStateMap! Datanode with ID {} was " + "added but not found in map: {}",
        nodeId, nodeStateMap);
  }
}
use of org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException in project ozone by apache.
the class DatanodeAdminMonitorImpl method processCancelledNodes.
/**
 * Drains the queue of cancelled admin requests.
 *
 * <p>Each polled datanode stops being tracked and is then put back in
 * service. A datanode that cannot be found is logged and skipped so that the
 * rest of the queue is still processed.
 */
private void processCancelledNodes() {
  while (!cancelledNodes.isEmpty()) {
    final DatanodeDetails node = cancelledNodes.poll();
    try {
      stopTrackingNode(node);
      putNodeBackInService(node);
      LOG.info("Recommissioned node {}", node);
    } catch (NodeNotFoundException notFound) {
      LOG.warn("Failed processing the cancel admin request for {}", node, notFound);
    }
  }
}
use of org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException in project ozone by apache.
the class ReplicationManager method updateInflightAction.
/**
 * Reconciles the InflightActions for a given container.
 *
 * <p>An in-flight action is dropped from the list when it has completed,
 * has timed out, or targets a datanode that is not HEALTHY or not
 * IN_SERVICE. When the list for the container becomes empty, the container's
 * entry is removed from {@code inflightActions}.
 *
 * @param container Container to update
 * @param inflightActions inflightReplication (or) inflightDeletion
 * @param filter filter to check if the operation is completed
 * @param timeoutCounter update timeout metrics
 * @param completedCounter update completed metrics
 */
private void updateInflightAction(final ContainerInfo container, final Map<ContainerID, List<InflightAction>> inflightActions, final Predicate<InflightAction> filter, final Runnable timeoutCounter, final Consumer<InflightAction> completedCounter) {
  final ContainerID id = container.containerID();
  // Actions started before this instant are considered timed out.
  final long deadline = clock.millis() - rmConf.getEventTimeout();
  if (inflightActions.containsKey(id)) {
    final List<InflightAction> actions = inflightActions.get(id);
    final Iterator<InflightAction> iter = actions.iterator();
    while (iter.hasNext()) {
      final InflightAction a = iter.next();
      // Tracks whether the current element has already been removed through
      // the iterator. Iterator.remove() may be called only once per next();
      // a second call throws IllegalStateException.
      boolean removed = false;
      try {
        final NodeStatus status = nodeManager.getNodeStatus(a.datanode);
        final boolean isUnhealthy = status.getHealth() != NodeState.HEALTHY;
        final boolean isCompleted = filter.test(a);
        final boolean isTimeout = a.time < deadline;
        final boolean isNotInService = status.getOperationalState() != NodeOperationalState.IN_SERVICE;
        if (isCompleted || isUnhealthy || isTimeout || isNotInService) {
          iter.remove();
          removed = true;
          // A timeout takes precedence over completion for metrics purposes.
          if (isTimeout) {
            timeoutCounter.run();
          } else if (isCompleted) {
            completedCounter.accept(a);
          }
          updateMoveIfNeeded(isUnhealthy, isCompleted, isTimeout, isNotInService, container, a.datanode, inflightActions);
        }
      } catch (NodeNotFoundException | ContainerNotFoundException e) {
        // Should not happen, but if it does, just drop the action as the
        // node (or container) somehow does not exist. The exception can be
        // raised after the action was already removed above, so only remove
        // it here if that has not happened yet.
        if (!removed) {
          iter.remove();
        }
      }
    }
    if (actions.isEmpty()) {
      inflightActions.remove(id);
    }
  }
}
Aggregations