Search in sources :

Example 1 with SCM

Use of org.apache.hadoop.hdds.conf.ConfigTag.SCM in the Apache Ozone project.

The method processContainer of the class ReplicationManager.

/**
 * Processes the given container: records its lifecycle state (and any
 * detected health state) in the report and, depending on that state and on
 * the container's current replicas, triggers the matching follow-up action
 * (close, force-close, delete, or one of the under-/over-replication and
 * unstable-container handlers).
 *
 * <p>Does nothing if {@code shouldRun()} returns false. The whole evaluation
 * runs while holding the monitor of {@code container}. Any {@link Exception}
 * raised while processing is caught and logged at WARN level, so none
 * propagates to the caller.
 *
 * @param container the container to evaluate; also used as the lock object
 * @param report report updated with the container's lifecycle state and
 *               with every health state detected for it
 */
@SuppressWarnings("checkstyle:methodlength")
private void processContainer(ContainerInfo container, ReplicationManagerReport report) {
    if (!shouldRun()) {
        return;
    }
    final ContainerID id = container.containerID();
    try {
        // Synchronize on the container object to avoid race conditions
        // with ICR/FCR handlers.
        synchronized (container) {
            final Set<ContainerReplica> replicas = containerManager.getContainerReplicas(id);
            final LifeCycleState state = container.getState();
            // Every processed container is counted under its lifecycle state,
            // regardless of which branch below handles it.
            report.increment(state);
            /*
             * We don't take any action if the container is in OPEN state and
             * the container is healthy. If the container is not healthy, i.e.
             * the replicas are not in OPEN state, fire a CLOSE_CONTAINER event.
             * Either way, processing of an OPEN container ends here.
             */
            if (state == LifeCycleState.OPEN) {
                if (!isOpenContainerHealthy(container, replicas)) {
                    report.incrementAndSample(HealthState.OPEN_UNHEALTHY, container.containerID());
                    eventPublisher.fireEvent(SCMEvents.CLOSE_CONTAINER, id);
                }
                return;
            }
            /*
             * If the container is in CLOSING state, the replicas can either
             * be in OPEN or in CLOSING state. In both cases we have to resend
             * the close container command to the datanodes. Replicas in
             * UNHEALTHY state are skipped.
             */
            if (state == LifeCycleState.CLOSING) {
                for (ContainerReplica replica : replicas) {
                    if (replica.getState() != State.UNHEALTHY) {
                        sendCloseCommand(container, replica.getDatanodeDetails(), false);
                    }
                }
                return;
            }
            /*
             * If the container is in QUASI_CLOSED state, check and close the
             * container if possible. If it cannot be force-closed, it is
             * recorded as QUASI_CLOSED_STUCK and processing continues with
             * the replication checks below (there is no return in that case).
             */
            if (state == LifeCycleState.QUASI_CLOSED) {
                if (canForceCloseContainer(container, replicas)) {
                    forceCloseContainer(container, replicas);
                    return;
                } else {
                    report.incrementAndSample(HealthState.QUASI_CLOSED_STUCK, container.containerID());
                }
            }
            /*
             * Before processing the container we have to reconcile the
             * inflightReplication and inflightDeletion actions.
             *
             * We remove the entry from inflightReplication and inflightDeletion
             * list, if the operation is completed or if it has timed out.
             *
             * The predicate passed for replication is true once some replica
             * is reported on the action's datanode; the predicate passed for
             * deletion is true once no replica is reported on it. The other
             * two arguments update the timeout and completion metrics.
             */
            updateInflightAction(container, inflightReplication, action -> replicas.stream().anyMatch(r -> r.getDatanodeDetails().equals(action.datanode)), () -> metrics.incrNumReplicationCmdsTimeout(), action -> updateCompletedReplicationMetrics(container, action));
            updateInflightAction(container, inflightDeletion, action -> replicas.stream().noneMatch(r -> r.getDatanodeDetails().equals(action.datanode)), () -> metrics.incrNumDeletionCmdsTimeout(), action -> updateCompletedDeletionMetrics(container, action));
            /*
             * If the container is being deleted and all its replicas are
             * deleted, then mark the container as CLEANED, or resend the
             * delete replica command if needed.
             */
            if (state == LifeCycleState.DELETING) {
                handleContainerUnderDelete(container, replicas);
                return;
            }
            /*
             * We don't need to take any action for a DELETED container -
             * eventually it will be removed from SCM.
             */
            if (state == LifeCycleState.DELETED) {
                return;
            }
            // NOTE(review): both values are computed before the empty-container
            // check below and are unused when that check returns early -
            // confirm whether this ordering is intentional.
            ContainerReplicaCount replicaSet = getContainerReplicaCount(container, replicas);
            ContainerPlacementStatus placementStatus = getPlacementStatus(replicas, container.getReplicationConfig().getRequiredNodes());
            /*
             * If the container is empty, record it as EMPTY and stop here;
             * none of the replication checks below are applied to it.
             */
            if (isContainerEmpty(container, replicas)) {
                report.incrementAndSample(HealthState.EMPTY, container.containerID());
                /*
                 * If container is empty, schedule task to delete the container.
                 */
                deleteContainerReplicas(container, replicas);
                return;
            }
            /*
             * Check if the container is under replicated or mis-replicated
             * (placement policy not satisfied) and take appropriate action.
             * An under replicated container is additionally recorded as
             * MISSING when replicaSet.isMissing() is true. Both conditions
             * are handled by handleUnderReplicatedContainer.
             */
            boolean sufficientlyReplicated = replicaSet.isSufficientlyReplicated();
            boolean placementSatisfied = placementStatus.isPolicySatisfied();
            if (!sufficientlyReplicated || !placementSatisfied) {
                if (!sufficientlyReplicated) {
                    report.incrementAndSample(HealthState.UNDER_REPLICATED, container.containerID());
                    if (replicaSet.isMissing()) {
                        report.incrementAndSample(HealthState.MISSING, container.containerID());
                    }
                }
                if (!placementSatisfied) {
                    report.incrementAndSample(HealthState.MIS_REPLICATED, container.containerID());
                }
                handleUnderReplicatedContainer(container, replicaSet, placementStatus);
                return;
            }
            /*
             * Check if the container is over replicated and take appropriate
             * action.
             */
            if (replicaSet.isOverReplicated()) {
                report.incrementAndSample(HealthState.OVER_REPLICATED, container.containerID());
                handleOverReplicatedContainer(container, replicaSet);
                return;
            }
            /*
             * If we get here, the container is not over replicated or under
             * replicated but it may be "unhealthy", which means it has one or
             * more replicas which are not in the same state as the container
             * itself.
             */
            if (!replicaSet.isHealthy()) {
                report.incrementAndSample(HealthState.UNHEALTHY, container.containerID());
                handleUnstableContainer(container, replicas);
            }
        }
    } catch (ContainerNotFoundException ex) {
        // Only the container id is logged here; the exception itself is not
        // passed to the logger.
        LOG.warn("Missing container {}.", id);
    } catch (Exception ex) {
        // Catch-all so that a failure on one container is logged rather than
        // thrown to the caller.
        LOG.warn("Process container {} error: ", id, ex);
    }
}
Also used : ConfigGroup(org.apache.hadoop.hdds.conf.ConfigGroup) ScmConfigKeys(org.apache.hadoop.hdds.scm.ScmConfigKeys) HddsProtos(org.apache.hadoop.hdds.protocol.proto.HddsProtos) NodeStatus(org.apache.hadoop.hdds.scm.node.NodeStatus) DeleteContainerCommand(org.apache.hadoop.ozone.protocol.commands.DeleteContainerCommand) LoggerFactory(org.slf4j.LoggerFactory) ConfigurationSource(org.apache.hadoop.hdds.conf.ConfigurationSource) EventPublisher(org.apache.hadoop.hdds.server.events.EventPublisher) Duration(java.time.Duration) Map(java.util.Map) SCMHAManager(org.apache.hadoop.hdds.scm.ha.SCMHAManager) ReplicateContainerCommand(org.apache.hadoop.ozone.protocol.commands.ReplicateContainerCommand) HddsConfigKeys(org.apache.hadoop.hdds.HddsConfigKeys) ConfigType(org.apache.hadoop.hdds.conf.ConfigType) Predicate(java.util.function.Predicate) MOVE(org.apache.hadoop.hdds.protocol.proto.SCMRatisProtocol.RequestType.MOVE) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) ExitUtil(org.apache.hadoop.util.ExitUtil) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) ContainerPlacementStatus(org.apache.hadoop.hdds.scm.ContainerPlacementStatus) CommandForDatanode(org.apache.hadoop.ozone.protocol.commands.CommandForDatanode) List(java.util.List) StorageUnit(org.apache.hadoop.hdds.conf.StorageUnit) PlacementPolicy(org.apache.hadoop.hdds.scm.PlacementPolicy) Config(org.apache.hadoop.hdds.conf.Config) MoveDataNodePair(org.apache.hadoop.hdds.scm.container.common.helpers.MoveDataNodePair) SCMServiceManager(org.apache.hadoop.hdds.scm.ha.SCMServiceManager) SCMHAInvocationHandler(org.apache.hadoop.hdds.scm.ha.SCMHAInvocationHandler) InvalidStateTransitionException(org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException) Proxy(java.lang.reflect.Proxy) NodeManager(org.apache.hadoop.hdds.scm.node.NodeManager) HealthState(org.apache.hadoop.hdds.scm.container.ReplicationManagerReport.HealthState) 
CURRENT_VERSION(org.apache.hadoop.ozone.ClientVersions.CURRENT_VERSION) Preconditions(org.apache.ratis.util.Preconditions) Replicate(org.apache.hadoop.hdds.scm.metadata.Replicate) NodeOperationalState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeOperationalState) CompletableFuture(java.util.concurrent.CompletableFuture) SCMContext(org.apache.hadoop.hdds.scm.ha.SCMContext) SCMRatisServer(org.apache.hadoop.hdds.scm.ha.SCMRatisServer) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) LifeCycleState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState) NotLeaderException(org.apache.ratis.protocol.exceptions.NotLeaderException) SCMService(org.apache.hadoop.hdds.scm.ha.SCMService) NodeNotFoundException(org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException) CloseContainerCommand(org.apache.hadoop.ozone.protocol.commands.CloseContainerCommand) GeneratedMessage(com.google.protobuf.GeneratedMessage) LinkedList(java.util.LinkedList) StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) DBTransactionBuffer(org.apache.hadoop.hdds.scm.metadata.DBTransactionBuffer) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) ReentrantLock(java.util.concurrent.locks.ReentrantLock) State(org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReplicaProto.State) DatanodeDetails(org.apache.hadoop.hdds.protocol.DatanodeDetails) IOException(java.io.IOException) SCMEvents(org.apache.hadoop.hdds.scm.events.SCMEvents) NodeState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.NodeState) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) Lock(java.util.concurrent.locks.Lock) OZONE(org.apache.hadoop.hdds.conf.ConfigTag.OZONE) Table(org.apache.hadoop.hdds.utils.db.Table) SCM(org.apache.hadoop.hdds.conf.ConfigTag.SCM) Clock(java.time.Clock) ReplicationManagerMetrics(org.apache.hadoop.hdds.scm.container.replication.ReplicationManagerMetrics) 
VisibleForTesting(com.google.common.annotations.VisibleForTesting) Comparator(java.util.Comparator) TableIterator(org.apache.hadoop.hdds.utils.db.TableIterator) Collections(java.util.Collections) SCMCommand(org.apache.hadoop.ozone.protocol.commands.SCMCommand) LifeCycleState(org.apache.hadoop.hdds.protocol.proto.HddsProtos.LifeCycleState) ContainerPlacementStatus(org.apache.hadoop.hdds.scm.ContainerPlacementStatus) InvalidStateTransitionException(org.apache.hadoop.ozone.common.statemachine.InvalidStateTransitionException) NotLeaderException(org.apache.ratis.protocol.exceptions.NotLeaderException) NodeNotFoundException(org.apache.hadoop.hdds.scm.node.states.NodeNotFoundException) IOException(java.io.IOException)

Aggregations

VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 GeneratedMessage (com.google.protobuf.GeneratedMessage)1 IOException (java.io.IOException)1 Proxy (java.lang.reflect.Proxy)1 Clock (java.time.Clock)1 Duration (java.time.Duration)1 ArrayList (java.util.ArrayList)1 Collections (java.util.Collections)1 Comparator (java.util.Comparator)1 HashSet (java.util.HashSet)1 Iterator (java.util.Iterator)1 LinkedHashMap (java.util.LinkedHashMap)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 UUID (java.util.UUID)1 CompletableFuture (java.util.concurrent.CompletableFuture)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 TimeUnit (java.util.concurrent.TimeUnit)1