use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.
the class NodeStateMap method updateNodeHealthState.
/**
 * Updates the node health state.
 *
 * <p>The node's current operational state is kept; only the health
 * component of its status is replaced. The read-modify-write of the
 * status is done while holding the write lock.
 *
 * @param nodeId Node Id
 * @param newHealth new health state
 * @return the new status that has been stored for the node
 *
 * @throws NodeNotFoundException if the node is not present
 */
public NodeStatus updateNodeHealthState(UUID nodeId, NodeState newHealth) throws NodeNotFoundException {
  // Acquire the lock before entering the try block: if acquisition itself
  // fails, the finally clause must not try to release a lock that this
  // thread does not hold.
  lock.writeLock().lock();
  try {
    DatanodeInfo dn = getNodeInfo(nodeId);
    NodeStatus oldStatus = dn.getNodeStatus();
    // Carry over the operational state unchanged, swap in the new health.
    NodeStatus newStatus = new NodeStatus(oldStatus.getOperationalState(), newHealth);
    dn.setNodeStatus(newStatus);
    return newStatus;
  } finally {
    lock.writeLock().unlock();
  }
}
use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.
the class ReplicationManager method move.
/**
 * Adds a move action for a given container.
 *
 * <p>Every precondition failure is reported through the returned future
 * rather than by throwing: the future is completed immediately with the
 * matching {@link MoveResult}. When all checks pass, the move is recorded
 * via the move scheduler, the future is offered to
 * {@code inflightMoveFuture}, and a replicate command towards the target
 * datanode is sent.
 *
 * @param cid Container to move
 * @param mp MoveDataNodePair which contains source and target datanodes
 * @return a future that is already completed with a failure result when a
 *         precondition is not met or the move could not be recorded;
 *         otherwise a future that this method leaves uncompleted
 * @throws ContainerNotFoundException propagated from the container lookups
 *         performed here (presumably when {@code cid} is unknown)
 * @throws NodeNotFoundException propagated from the node status lookups
 *         performed here (presumably when a datanode is unknown)
 */
public CompletableFuture<MoveResult> move(ContainerID cid, MoveDataNodePair mp) throws ContainerNotFoundException, NodeNotFoundException {
  CompletableFuture<MoveResult> ret = new CompletableFuture<>();
  if (!isRunning()) {
    ret.complete(MoveResult.FAIL_NOT_RUNNING);
    return ret;
  }
  if (!scmContext.isLeader()) {
    ret.complete(MoveResult.FAIL_NOT_LEADER);
    return ret;
  }
  /*
   * make sure the following conditions are met:
   * 1 the given two datanodes are in healthy state
   * 2 the given container exists on the given source datanode
   * 3 the given container does not exist on the given target datanode
   * 4 the given container is in closed state
   * 5 the given container is not taking any inflight action
   * 6 the given two datanodes are in IN_SERVICE state
   * 7 {Existing replicas + Target_Dn - Source_Dn} satisfies
   *   the placement policy
   *
   * move is a combination of two steps : replication and deletion.
   * if the conditions above are all met, then we take a conservative
   * strategy here : replication can always be executed, but the execution
   * of deletion always depends on placement policy
   */
  DatanodeDetails srcDn = mp.getSrc();
  DatanodeDetails targetDn = mp.getTgt();

  // Source datanode must be HEALTHY and IN_SERVICE.
  NodeStatus currentNodeStat = nodeManager.getNodeStatus(srcDn);
  NodeState healthStat = currentNodeStat.getHealth();
  NodeOperationalState operationalState = currentNodeStat.getOperationalState();
  if (healthStat != NodeState.HEALTHY) {
    ret.complete(MoveResult.REPLICATION_FAIL_NODE_UNHEALTHY);
    return ret;
  }
  if (operationalState != NodeOperationalState.IN_SERVICE) {
    ret.complete(MoveResult.REPLICATION_FAIL_NODE_NOT_IN_SERVICE);
    return ret;
  }

  // Target datanode must be HEALTHY and IN_SERVICE as well.
  currentNodeStat = nodeManager.getNodeStatus(targetDn);
  healthStat = currentNodeStat.getHealth();
  operationalState = currentNodeStat.getOperationalState();
  if (healthStat != NodeState.HEALTHY) {
    ret.complete(MoveResult.REPLICATION_FAIL_NODE_UNHEALTHY);
    return ret;
  }
  if (operationalState != NodeOperationalState.IN_SERVICE) {
    ret.complete(MoveResult.REPLICATION_FAIL_NODE_NOT_IN_SERVICE);
    return ret;
  }

  // we need to synchronize on ContainerInfo, since it is
  // shared by ICR/FCR handler and this.processContainer
  // TODO: use a Read lock after introducing a RW lock into ContainerInfo
  ContainerInfo cif = containerManager.getContainer(cid);
  synchronized (cif) {
    final Set<ContainerReplica> currentReplicas = containerManager.getContainerReplicas(cid);
    final Set<DatanodeDetails> replicas = currentReplicas.stream().map(ContainerReplica::getDatanodeDetails).collect(Collectors.toSet());
    if (replicas.contains(targetDn)) {
      ret.complete(MoveResult.REPLICATION_FAIL_EXIST_IN_TARGET);
      return ret;
    }
    if (!replicas.contains(srcDn)) {
      ret.complete(MoveResult.REPLICATION_FAIL_NOT_EXIST_IN_SOURCE);
      return ret;
    }
    if (inflightReplication.containsKey(cid)) {
      ret.complete(MoveResult.REPLICATION_FAIL_INFLIGHT_REPLICATION);
      return ret;
    }
    if (inflightDeletion.containsKey(cid)) {
      ret.complete(MoveResult.REPLICATION_FAIL_INFLIGHT_DELETION);
      return ret;
    }
    /*
     * here, no need to see whether cid is in inflightMove, because
     * these three maps are all synchronized on ContainerInfo; if cid
     * is in inflightMove, it must now be being replicated or deleted,
     * so it must be in inflightReplication or in inflightDeletion.
     * thus, if we can not find cid in both of them, this cid must
     * not be in inflightMove.
     */
    LifeCycleState currentContainerStat = cif.getState();
    if (currentContainerStat != LifeCycleState.CLOSED) {
      ret.complete(MoveResult.REPLICATION_FAIL_CONTAINER_NOT_CLOSED);
      return ret;
    }
    // satisfies current placement policy
    if (!isPolicySatisfiedAfterMove(cif, srcDn, targetDn, currentReplicas.stream().collect(Collectors.toList()))) {
      ret.complete(MoveResult.PLACEMENT_POLICY_NOT_SATISFIED);
      return ret;
    }
    try {
      moveScheduler.startMove(cid.getProtobuf(), mp.getProtobufMessage(CURRENT_VERSION));
    } catch (IOException e) {
      // Pass the exception as the trailing argument so the cause and its
      // stack trace are logged instead of being silently dropped.
      LOG.warn("Exception while starting move {}", cid, e);
      ret.complete(MoveResult.FAIL_CAN_NOT_RECORD_TO_DB);
      return ret;
    }
    inflightMoveFuture.putIfAbsent(cid, ret);
    sendReplicateCommand(cif, targetDn, Collections.singletonList(srcDn));
  }
  LOG.info("receive a move request about container {} , from {} to {}", cid, srcDn.getUuid(), targetDn.getUuid());
  return ret;
}
use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.
the class ReplicationManager method updateInflightAction.
/**
 * Reconciles the InflightActions for a given container.
 *
 * <p>Each pending action of the container is dropped when it has completed
 * (per {@code filter}), has timed out, or targets a datanode that is not
 * HEALTHY or not IN_SERVICE. When no actions remain, the container's entry
 * is removed from {@code inflightActions}.
 *
 * @param container Container to update
 * @param inflightActions inflightReplication (or) inflightDeletion
 * @param filter filter to check if the operation is completed
 * @param timeoutCounter update timeout metrics
 * @param completedCounter update completed metrics
 */
private void updateInflightAction(final ContainerInfo container, final Map<ContainerID, List<InflightAction>> inflightActions, final Predicate<InflightAction> filter, final Runnable timeoutCounter, final Consumer<InflightAction> completedCounter) {
  final ContainerID id = container.containerID();
  // Actions started before this instant are considered timed out.
  final long deadline = clock.millis() - rmConf.getEventTimeout();
  if (inflightActions.containsKey(id)) {
    final List<InflightAction> actions = inflightActions.get(id);
    Iterator<InflightAction> iter = actions.iterator();
    while (iter.hasNext()) {
      // Iterator.remove() may be called at most once per next(); remember
      // whether the current element is already gone so the catch block
      // below does not remove it a second time (IllegalStateException).
      boolean removed = false;
      try {
        InflightAction a = iter.next();
        NodeStatus status = nodeManager.getNodeStatus(a.datanode);
        boolean isUnhealthy = status.getHealth() != NodeState.HEALTHY;
        boolean isCompleted = filter.test(a);
        boolean isTimeout = a.time < deadline;
        boolean isNotInService = status.getOperationalState() != NodeOperationalState.IN_SERVICE;
        if (isCompleted || isUnhealthy || isTimeout || isNotInService) {
          iter.remove();
          removed = true;
          // A timeout takes precedence over completion for metrics.
          if (isTimeout) {
            timeoutCounter.run();
          } else if (isCompleted) {
            completedCounter.accept(a);
          }
          updateMoveIfNeeded(isUnhealthy, isCompleted, isTimeout, isNotInService, container, a.datanode, inflightActions);
        }
      } catch (NodeNotFoundException | ContainerNotFoundException e) {
        // Should not happen, but if it does, just remove the action as the
        // node somehow does not exist;
        if (!removed) {
          iter.remove();
        }
      }
    }
    if (actions.isEmpty()) {
      inflightActions.remove(id);
    }
  }
}
use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.
the class TestNodeStateMap method testGetNodeMethodsReturnCorrectCountsAndStates.
/**
 * Verifies the counting and listing methods of the node state map after
 * adding exactly one node for every (operational state, health) pair.
 */
@Test
public void testGetNodeMethodsReturnCorrectCountsAndStates() throws NodeAlreadyExistsException {
  // Add one node for all possible states
  int nodeCount = 0;
  for (NodeOperationalState op : NodeOperationalState.values()) {
    for (NodeState health : NodeState.values()) {
      addRandomNodeWithState(op, health);
      nodeCount++;
    }
  }
  // With one node per combination, a single health value is shared by one
  // node per operational state, and vice versa. Deriving the expectations
  // from the enums keeps the test valid when a new state is introduced.
  final int operationalStateCount = NodeOperationalState.values().length;
  final int healthStateCount = NodeState.values().length;

  NodeStatus requestedState = NodeStatus.inServiceStale();
  List<UUID> nodes = map.getNodes(requestedState);
  assertEquals(1, nodes.size());
  assertEquals(1, map.getNodeCount(requestedState));
  assertEquals(nodeCount, map.getTotalNodeCount());
  assertEquals(nodeCount, map.getAllNodes().size());
  assertEquals(nodeCount, map.getAllDatanodeInfos().size());
  // Checks for the getNodeCount(opstate, health) method
  assertEquals(nodeCount, map.getNodeCount(null, null));
  assertEquals(1, map.getNodeCount(NodeOperationalState.DECOMMISSIONING, NodeState.STALE));
  assertEquals(operationalStateCount, map.getNodeCount(null, NodeState.HEALTHY));
  assertEquals(healthStateCount, map.getNodeCount(NodeOperationalState.DECOMMISSIONING, null));
}
use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.
the class TestNodeStateMap method testNodeHealthStateCanBeUpdated.
/**
 * Checks that updating a node's health returns the new status and that the
 * same status is subsequently reported by the map for that node.
 */
@Test
public void testNodeHealthStateCanBeUpdated() throws NodeAlreadyExistsException, NodeNotFoundException {
  // Register a fresh datanode with the status from inServiceHealthy().
  final DatanodeDetails datanode = generateDatanode();
  map.addNode(datanode, NodeStatus.inServiceHealthy(), null);

  // Apply only the health component of the inServiceStale() status.
  final NodeStatus wanted = NodeStatus.inServiceStale();
  final NodeStatus actual = map.updateNodeHealthState(datanode.getUuid(), wanted.getHealth());

  // The call must hand back the expected status ...
  assertEquals(wanted, actual);
  // ... and a later lookup must agree with what was returned.
  assertEquals(actual, map.getNodeStatus(datanode.getUuid()));
}
Aggregations