Search in sources :

Example 1 with NodeStatus

use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.

the class NodeStateMap method updateNodeHealthState.

/**
 * Updates the node health state.
 *
 * <p>The operational state currently recorded for the node is carried over
 * unchanged; only the health component of its status is replaced. The whole
 * read-modify-write sequence runs while holding the write lock.
 *
 * @param nodeId Node Id
 * @param newHealth new health state
 * @return the new status that was stored for the node
 *
 * @throws NodeNotFoundException if the node is not present
 */
public NodeStatus updateNodeHealthState(UUID nodeId, NodeState newHealth) throws NodeNotFoundException {
    // Acquire the lock BEFORE entering the try block: if acquisition itself
    // fails, the finally clause must not try to release a lock this thread
    // never obtained (that would raise IllegalMonitorStateException and mask
    // the original failure).
    lock.writeLock().lock();
    try {
        DatanodeInfo dn = getNodeInfo(nodeId);
        NodeStatus oldStatus = dn.getNodeStatus();
        // Keep the existing operational state, swap in the new health value.
        NodeStatus newStatus = new NodeStatus(oldStatus.getOperationalState(), newHealth);
        dn.setNodeStatus(newStatus);
        return newStatus;
    } finally {
        lock.writeLock().unlock();
    }
}
Also used : DatanodeInfo(org.apache.hadoop.hdds.scm.node.DatanodeInfo) NodeStatus(org.apache.hadoop.hdds.scm.node.NodeStatus)

Example 2 with NodeStatus

use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.

the class ReplicationManager method move.

/**
 * Adds a move action for a given container.
 *
 * <p>Every precondition failure is reported by completing the returned
 * future immediately with the matching failure {@link MoveResult}; no
 * exception is thrown for those cases. When all preconditions hold, the move
 * is recorded through {@code moveScheduler}, the future is registered in
 * {@code inflightMoveFuture}, and a replicate command towards the target
 * datanode is sent. In that case this method returns the future without
 * completing it.
 *
 * @param cid Container to move
 * @param mp MoveDataNodePair which contains source and target datanodes
 * @return a future carrying the outcome of the move request
 * @throws ContainerNotFoundException declared by this method; presumably
 *     raised by the container manager lookups when {@code cid} is unknown
 *     (verify against ContainerManager)
 * @throws NodeNotFoundException declared by this method; presumably raised
 *     by {@code nodeManager.getNodeStatus} when a datanode is unknown
 *     (verify against NodeManager)
 */
public CompletableFuture<MoveResult> move(ContainerID cid, MoveDataNodePair mp) throws ContainerNotFoundException, NodeNotFoundException {
    CompletableFuture<MoveResult> ret = new CompletableFuture<>();
    if (!isRunning()) {
        ret.complete(MoveResult.FAIL_NOT_RUNNING);
        return ret;
    }
    if (!scmContext.isLeader()) {
        ret.complete(MoveResult.FAIL_NOT_LEADER);
        return ret;
    }
    /*
     * make sure the following conditions are met:
     *  1 the given two datanodes are in healthy state
     *  2 the given container exists on the given source datanode
     *  3 the given container does not exist on the given target datanode
     *  4 the given container is in closed state
     *  5 the given container is not taking any inflight action
     *  6 the given two datanodes are in IN_SERVICE state
     *  7 {Existing replicas + Target_Dn - Source_Dn} satisfies
     *     the placement policy
     *
     * move is a combination of two steps : replication and deletion.
     * if the conditions above are all met, then we take a conservative
     * strategy here : replication can always be executed, but the execution
     * of deletion always depends on placement policy
     */
    DatanodeDetails srcDn = mp.getSrc();
    DatanodeDetails targetDn = mp.getTgt();
    // Conditions 1 and 6. The source is validated completely before the
    // target is looked up, and health is checked before operational state.
    for (DatanodeDetails dn : new DatanodeDetails[] {srcDn, targetDn}) {
        NodeStatus currentNodeStat = nodeManager.getNodeStatus(dn);
        if (currentNodeStat.getHealth() != NodeState.HEALTHY) {
            ret.complete(MoveResult.REPLICATION_FAIL_NODE_UNHEALTHY);
            return ret;
        }
        if (currentNodeStat.getOperationalState() != NodeOperationalState.IN_SERVICE) {
            ret.complete(MoveResult.REPLICATION_FAIL_NODE_NOT_IN_SERVICE);
            return ret;
        }
    }
    // we need to synchronize on ContainerInfo, since it is
    // shared by ICR/FCR handler and this.processContainer
    // TODO: use a Read lock after introducing a RW lock into ContainerInfo
    ContainerInfo cif = containerManager.getContainer(cid);
    synchronized (cif) {
        final Set<ContainerReplica> currentReplicas = containerManager.getContainerReplicas(cid);
        final Set<DatanodeDetails> replicas = currentReplicas.stream().map(ContainerReplica::getDatanodeDetails).collect(Collectors.toSet());
        // Condition 3: the target must not already hold a replica.
        if (replicas.contains(targetDn)) {
            ret.complete(MoveResult.REPLICATION_FAIL_EXIST_IN_TARGET);
            return ret;
        }
        // Condition 2: the source must hold a replica.
        if (!replicas.contains(srcDn)) {
            ret.complete(MoveResult.REPLICATION_FAIL_NOT_EXIST_IN_SOURCE);
            return ret;
        }
        // Condition 5: no replication or deletion already in flight.
        if (inflightReplication.containsKey(cid)) {
            ret.complete(MoveResult.REPLICATION_FAIL_INFLIGHT_REPLICATION);
            return ret;
        }
        if (inflightDeletion.containsKey(cid)) {
            ret.complete(MoveResult.REPLICATION_FAIL_INFLIGHT_DELETION);
            return ret;
        }
        /*
         * here, no need to see whether cid is in inflightMove, because
         * these three maps are all synchronized on ContainerInfo. if cid
         * is in inflightMove, it must currently be being replicated or
         * deleted, so it must be in inflightReplication or in
         * inflightDeletion. thus, if we can not find cid in either of them,
         * this cid must not be in inflightMove.
         */
        // Condition 4: only closed containers may be moved.
        LifeCycleState currentContainerStat = cif.getState();
        if (currentContainerStat != LifeCycleState.CLOSED) {
            ret.complete(MoveResult.REPLICATION_FAIL_CONTAINER_NOT_CLOSED);
            return ret;
        }
        // Condition 7: the replica set after the move must still satisfy
        // the current placement policy.
        if (!isPolicySatisfiedAfterMove(cif, srcDn, targetDn, currentReplicas.stream().collect(Collectors.toList()))) {
            ret.complete(MoveResult.PLACEMENT_POLICY_NOT_SATISFIED);
            return ret;
        }
        try {
            moveScheduler.startMove(cid.getProtobuf(), mp.getProtobufMessage(CURRENT_VERSION));
        } catch (IOException e) {
            // Pass the exception as the trailing argument so the cause and
            // stack trace end up in the log instead of being dropped.
            LOG.warn("Exception while starting move {}", cid, e);
            ret.complete(MoveResult.FAIL_CAN_NOT_RECORD_TO_DB);
            return ret;
        }
        inflightMoveFuture.putIfAbsent(cid, ret);
        sendReplicateCommand(cif, targetDn, Collections.singletonList(srcDn));
    }
    LOG.info("receive a move request about container {} , from {} to {}", cid, srcDn.getUuid(), targetDn.getUuid());
    return ret;
}
Also used : NodeState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState) IOException(java.io.IOException) CompletableFuture(java.util.concurrent.CompletableFuture) DatanodeDetails(org.apache.hadoop.hdds.protocol.DatanodeDetails) NodeOperationalState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState) LifeCycleState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState) NodeStatus(org.apache.hadoop.hdds.scm.node.NodeStatus)

Example 3 with NodeStatus

use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.

the class ReplicationManager method updateInflightAction.

/**
 * Reconciles the InflightActions for a given container.
 *
 * <p>An action is dropped from the container's list when any of the
 * following holds: the filter reports it completed, its datanode is not
 * HEALTHY, its datanode is not IN_SERVICE, or its timestamp is older than
 * the event timeout. For a dropped action the timeout counter runs if it
 * timed out; otherwise the completed counter runs if it completed. Once the
 * list becomes empty the container's entry is removed from the map.
 *
 * @param container Container to update
 * @param inflightActions inflightReplication (or) inflightDeletion
 * @param filter filter to check if the operation is completed
 * @param timeoutCounter update timeout metrics
 * @param completedCounter update completed metrics
 */
private void updateInflightAction(final ContainerInfo container, final Map<ContainerID, List<InflightAction>> inflightActions, final Predicate<InflightAction> filter, final Runnable timeoutCounter, final Consumer<InflightAction> completedCounter) {
    final ContainerID id = container.containerID();
    final long deadline = clock.millis() - rmConf.getEventTimeout();
    // Single lookup instead of containsKey + get; a missing (or null) entry
    // means there is nothing to reconcile for this container.
    final List<InflightAction> actions = inflightActions.get(id);
    if (actions == null) {
        return;
    }
    Iterator<InflightAction> iter = actions.iterator();
    while (iter.hasNext()) {
        // Tracks whether the current element was already removed, so the
        // catch block below never calls Iterator.remove() twice for the
        // same element (which would throw IllegalStateException).
        boolean removed = false;
        try {
            InflightAction a = iter.next();
            NodeStatus status = nodeManager.getNodeStatus(a.datanode);
            boolean isUnhealthy = status.getHealth() != NodeState.HEALTHY;
            boolean isCompleted = filter.test(a);
            boolean isTimeout = a.time < deadline;
            boolean isNotInService = status.getOperationalState() != NodeOperationalState.IN_SERVICE;
            if (isCompleted || isUnhealthy || isTimeout || isNotInService) {
                iter.remove();
                removed = true;
                if (isTimeout) {
                    timeoutCounter.run();
                } else if (isCompleted) {
                    completedCounter.accept(a);
                }
                updateMoveIfNeeded(isUnhealthy, isCompleted, isTimeout, isNotInService, container, a.datanode, inflightActions);
            }
        } catch (NodeNotFoundException | ContainerNotFoundException e) {
            // Should not happen, but if it does, just remove the action as the
            // node somehow does not exist. Skip the removal when the action
            // was already removed before the exception was raised.
            if (!removed) {
                iter.remove();
            }
        }
    }
    if (actions.isEmpty()) {
        inflightActions.remove(id);
    }
}
Also used : NodeNotFoundException(org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException) NodeStatus(org.apache.hadoop.hdds.scm.node.NodeStatus)

Example 4 with NodeStatus

use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.

the class TestNodeStateMap method testGetNodeMethodsReturnCorrectCountsAndStates.

@Test
public void testGetNodeMethodsReturnCorrectCountsAndStates() throws NodeAlreadyExistsException {
    // Register exactly one node for every (operational state, health) pair
    // and remember how many were added in total.
    int expectedTotal = 0;
    for (NodeOperationalState opState : NodeOperationalState.values()) {
        for (NodeState healthState : NodeState.values()) {
            addRandomNodeWithState(opState, healthState);
            expectedTotal += 1;
        }
    }

    // Lookups keyed by a complete NodeStatus must match a single node.
    final NodeStatus staleInService = NodeStatus.inServiceStale();
    final List<UUID> matchingNodes = map.getNodes(staleInService);
    assertEquals(1, matchingNodes.size());
    assertEquals(1, map.getNodeCount(staleInService));

    // Aggregate accessors must see every node that was added.
    assertEquals(expectedTotal, map.getTotalNodeCount());
    assertEquals(expectedTotal, map.getAllNodes().size());
    assertEquals(expectedTotal, map.getAllDatanodeInfos().size());

    // getNodeCount(opState, health): a null argument acts as a wildcard
    // for that dimension.
    assertEquals(expectedTotal, map.getNodeCount(null, null));
    assertEquals(1, map.getNodeCount(NodeOperationalState.DECOMMISSIONING, NodeState.STALE));
    assertEquals(5, map.getNodeCount(null, NodeState.HEALTHY));
    assertEquals(4, map.getNodeCount(NodeOperationalState.DECOMMISSIONING, null));
}
Also used : NodeState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState) NodeOperationalState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState) UUID(java.util.UUID) NodeStatus(org.apache.hadoop.hdds.scm.node.NodeStatus) Test(org.junit.Test)

Example 5 with NodeStatus

use of org.apache.hadoop.hdds.scm.node.NodeStatus in project ozone by apache.

the class TestNodeStateMap method testNodeHealthStateCanBeUpdated.

@Test
public void testNodeHealthStateCanBeUpdated() throws NodeAlreadyExistsException, NodeNotFoundException {
    // Start with a datanode registered under the in-service/healthy status.
    final DatanodeDetails datanode = generateDatanode();
    map.addNode(datanode, NodeStatus.inServiceHealthy(), null);

    // Update only the health, using the health value of the in-service/stale
    // status as the target.
    final NodeStatus wanted = NodeStatus.inServiceStale();
    final NodeStatus actual = map.updateNodeHealthState(datanode.getUuid(), wanted.getHealth());

    // The returned status must equal the target, and a later lookup must
    // return the same status.
    assertEquals(wanted, actual);
    assertEquals(actual, map.getNodeStatus(datanode.getUuid()));
}
Also used : DatanodeDetails(org.apache.hadoop.hdds.protocol.DatanodeDetails) MockDatanodeDetails(org.apache.hadoop.hdds.protocol.MockDatanodeDetails) NodeStatus(org.apache.hadoop.hdds.scm.node.NodeStatus) Test(org.junit.Test)

Aggregations

NodeStatus (org.apache.hadoop.hdds.scm.node.NodeStatus)33 Test (org.junit.Test)25 DatanodeDetails (org.apache.hadoop.hdds.protocol.DatanodeDetails)15 MockDatanodeDetails.randomDatanodeDetails (org.apache.hadoop.hdds.protocol.MockDatanodeDetails.randomDatanodeDetails)7 MockDatanodeDetails.createDatanodeDetails (org.apache.hadoop.hdds.protocol.MockDatanodeDetails.createDatanodeDetails)6 MoveDataNodePair (org.apache.hadoop.hdds.scm.container.common.helpers.MoveDataNodePair)6 MockDatanodeDetails (org.apache.hadoop.hdds.protocol.MockDatanodeDetails)5 MoveResult (org.apache.hadoop.hdds.scm.container.ReplicationManager.MoveResult)5 ReplicationManagerConfiguration (org.apache.hadoop.hdds.scm.container.ReplicationManager.ReplicationManagerConfiguration)5 IOException (java.io.IOException)4 NodeNotFoundException (org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException)4 HddsProtos (org.apache.hadoop.hdds.protocol.proto.HddsProtos)3 NodeOperationalState (org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState)3 DatanodeInfo (org.apache.hadoop.hdds.scm.node.DatanodeInfo)3 ArrayList (java.util.ArrayList)2 UUID (java.util.UUID)2 CompletableFuture (java.util.concurrent.CompletableFuture)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 LifeCycleState (org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState)2 NodeState (org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState)2