Search in sources :

Example 1 with FiCaSchedulerNode

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode in project hadoop by apache.

the class QueuePriorityContainerCandidateSelector method tryToMakeBetterReservationPlacement.

/**
 * Walks every scheduler node and, for each one that passes the pre-checks and
 * on which enough resource can be preempted for the reservation, asks the
 * scheduler to move the reserved container there.
 *
 * <p>The loop does not stop after a successful move; every node in
 * {@code allSchedulerNodes} is still examined.
 *
 * @param reservedContainer the reserved container to relocate
 * @param allSchedulerNodes candidate nodes to consider as the new placement
 */
private void tryToMakeBetterReservationPlacement(RMContainer reservedContainer, List<FiCaSchedulerNode> allSchedulerNodes) {
    for (FiCaSchedulerNode candidate : allSchedulerNodes) {
        // Skip nodes the reservation may not be moved to at all.
        boolean passesPreChecks = preChecksForMovingReservedContainerToNode(reservedContainer, candidate);
        if (!passesPreChecks) {
            continue;
        }
        // Skip nodes on which we could not free up enough resource.
        boolean enoughPreemptable = canPreemptEnoughResourceForAsked(reservedContainer.getReservedResource(), reservedContainer.getQueueName(), candidate, true, null);
        if (!enoughPreemptable) {
            continue;
        }
        // Remember where the reservation lived before the move, for logging.
        NodeId previousNode = reservedContainer.getNodeId();
        // Ask the scheduler to perform the move; it may refuse.
        boolean moved = preemptionContext.getScheduler().moveReservedContainer(reservedContainer, candidate);
        if (!moved) {
            continue;
        }
        LOG.info("Successfully moved reserved container=" + reservedContainer.getContainerId() + " from targetNode=" + previousNode + " to targetNode=" + candidate.getNodeID());
        touchedNodes.add(candidate.getNodeID());
    }
}
Also used : FiCaSchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode) NodeId(org.apache.hadoop.yarn.api.records.NodeId)

Example 2 with FiCaSchedulerNode

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode in project hadoop by apache.

the class QueuePriorityContainerCandidateSelector method selectCandidates.

/**
 * Selects containers to preempt so that long-pending reserved containers of
 * queues present in the priority digraph can be allocated.
 *
 * <p>Steps, as visible in this method:
 * <ol>
 *   <li>Rebuild {@code priorityDigraph}; if it is empty, return the input map
 *       untouched.</li>
 *   <li>Stash the arguments in fields and reset per-invocation state.</li>
 *   <li>Collect the reserved container of every node whose queue name is a
 *       row of the digraph, then sort them with
 *       {@code CONTAINER_CREATION_TIME_COMPARATOR}.</li>
 *   <li>For each reserved container older than {@code minTimeout}: if its
 *       queue is not yet satisfied, try to preempt on the reserved node; if
 *       that is not possible and {@code allowMoveReservation} is set, try to
 *       move the reservation to another node.</li>
 * </ol>
 *
 * @param selectedCandidates containers already selected, keyed by application
 *        attempt; newly selected containers are added to this same map
 * @param clusterResource total cluster resource (stored for use by helpers)
 * @param totalPreemptedResourceAllowed remaining preemption budget; reduced
 *        via {@code Resources.subtractFrom} for every container selected here
 * @return the same {@code selectedCandidates} instance that was passed in
 */
@Override
public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource clusterResource, Resource totalPreemptedResourceAllowed) {
    // Initialize digraph from queues
    // TODO (wangda): only do this when queue refreshed.
    priorityDigraph.clear();
    intializePriorityDigraph();
    // Nothing in the digraph means there is nothing for this selector to do:
    // return the candidates exactly as received.
    if (priorityDigraph.isEmpty()) {
        return selectedCandidates;
    }
    // Save parameters to be shared by other methods
    this.selectedCandidates = selectedCandidates;
    this.clusterResource = clusterResource;
    this.totalPreemptionAllowed = totalPreemptedResourceAllowed;
    // Reset state left over from the previous invocation.
    toPreemptedFromOtherQueues.clear();
    reservedContainers = new ArrayList<>();
    // Clear temp-scheduler-node-map every time when doing selection of
    // containers.
    tempSchedulerNodeMap.clear();
    touchedNodes = new HashSet<>();
    // Add all reserved containers for analysis
    List<FiCaSchedulerNode> allSchedulerNodes = preemptionContext.getScheduler().getAllNodes();
    for (FiCaSchedulerNode node : allSchedulerNodes) {
        RMContainer reservedContainer = node.getReservedContainer();
        if (null != reservedContainer) {
            // Only keep reservations whose queue appears as a row in the
            // digraph (presumably: the queue has higher priority than at least
            // one other queue -- confirm against intializePriorityDigraph).
            if (priorityDigraph.containsRow(reservedContainer.getQueueName())) {
                reservedContainers.add(reservedContainer);
            }
        }
    }
    // Sort reserved container by creation time
    Collections.sort(reservedContainers, CONTAINER_CREATION_TIME_COMPARATOR);
    // Single timestamp so every reservation is aged against the same "now".
    long currentTime = System.currentTimeMillis();
    // From the beginning of the list
    for (RMContainer reservedContainer : reservedContainers) {
        // Skip reservations younger than minTimeout; only ones that have been
        // pending at least that long are considered.
        if (currentTime - reservedContainer.getCreationTime() < minTimeout) {
            continue;
        }
        FiCaSchedulerNode node = preemptionContext.getScheduler().getNode(reservedContainer.getReservedNode());
        if (null == node) {
            // Something is wrong, ignore
            continue;
        }
        // Handed to canPreemptEnoughResourceForAsked below and iterated
        // afterwards; presumably that helper fills it with the containers it
        // picked.
        List<RMContainer> newlySelectedToBePreemptContainers = new ArrayList<>();
        // Check if we can preempt for this queue
        // We will skip if the demanding queue is already satisfied.
        String demandingQueueName = reservedContainer.getQueueName();
        boolean demandingQueueSatisfied = isQueueSatisfied(demandingQueueName, node.getPartition());
        // We will continue check if it is possible to preempt reserved container
        // from the node.
        boolean canPreempt = false;
        if (!demandingQueueSatisfied) {
            canPreempt = canPreemptEnoughResourceForAsked(reservedContainer.getReservedResource(), demandingQueueName, node, false, newlySelectedToBePreemptContainers);
        }
        // Enough can be freed on the reserved node: record the selection.
        if (canPreempt) {
            touchedNodes.add(node.getNodeID());
            if (LOG.isDebugEnabled()) {
                LOG.debug("Trying to preempt following containers to make reserved " + "container=" + reservedContainer.getContainerId() + " on node=" + node.getNodeID() + " can be allocated:");
            }
            // Update to-be-preempt
            incToPreempt(demandingQueueName, node.getPartition(), reservedContainer.getReservedResource());
            for (RMContainer c : newlySelectedToBePreemptContainers) {
                if (LOG.isDebugEnabled()) {
                    // NOTE(review): this logs getReservedResource() while the
                    // budget below is reduced by getAllocatedResource() --
                    // confirm which one the message is meant to show.
                    LOG.debug(" --container=" + c.getContainerId() + " resource=" + c.getReservedResource());
                }
                // Get-or-create the per-attempt set in the caller's map.
                Set<RMContainer> containers = selectedCandidates.get(c.getApplicationAttemptId());
                if (null == containers) {
                    containers = new HashSet<>();
                    selectedCandidates.put(c.getApplicationAttemptId(), containers);
                }
                containers.add(c);
                // Update totalPreemptionResourceAllowed
                Resources.subtractFrom(totalPreemptedResourceAllowed, c.getAllocatedResource());
            }
        } else if (!demandingQueueSatisfied) {
            // The queue still needs resource but preempting on the reserved
            // node is not possible: if enabled, try moving the reservation to
            // a different node instead.
            if (allowMoveReservation) {
                tryToMakeBetterReservationPlacement(reservedContainer, allSchedulerNodes);
            }
        }
    }
    return selectedCandidates;
}
Also used : FiCaSchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode) ArrayList(java.util.ArrayList) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)

Example 3 with FiCaSchedulerNode

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode in project hadoop by apache.

the class ReservedContainerCandidatesSelector method getNodesForPreemption.

/**
 * Builds the list of nodes that hold a reserved container and on which
 * preemption candidates could be found, ordered by ascending
 * {@code preemptionCost}.
 *
 * @param cluster total cluster resource, forwarded to the per-node evaluation
 * @param queueToPreemptableResourceByPartition preemptable resource per queue
 *        and partition, forwarded to the per-node evaluation
 * @param selectedCandidates containers already selected for preemption
 * @param totalPreemptionAllowed remaining preemption budget
 * @return a new, sorted list; empty when no node qualifies
 */
private List<NodeForPreemption> getNodesForPreemption(Resource cluster, Map<String, Map<String, Resource>> queueToPreemptableResourceByPartition, Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource totalPreemptionAllowed) {
    List<NodeForPreemption> preemptableNodes = new ArrayList<>();
    for (FiCaSchedulerNode schedulerNode : preemptionContext.getScheduler().getAllNodes()) {
        // Only nodes that currently carry a reservation are of interest.
        if (node(schedulerNode) == null) {
            continue;
        }
        NodeForPreemption evaluated = getPreemptionCandidatesOnNode(schedulerNode, cluster, queueToPreemptableResourceByPartition, selectedCandidates, totalPreemptionAllowed, true);
        // A null result means we cannot preempt containers on the node to
        // satisfy its reserved container.
        if (evaluated == null) {
            continue;
        }
        preemptableNodes.add(evaluated);
    }
    // Cheapest-to-preempt nodes come first.
    Comparator<NodeForPreemption> byAscendingCost = new Comparator<NodeForPreemption>() {

        @Override
        public int compare(NodeForPreemption left, NodeForPreemption right) {
            return Float.compare(left.preemptionCost, right.preemptionCost);
        }
    };
    Collections.sort(preemptableNodes, byAscendingCost);
    return preemptableNodes;
}

/** Returns the container reserved on {@code schedulerNode}, or null if none. */
private RMContainer node(FiCaSchedulerNode schedulerNode) {
    return schedulerNode.getReservedContainer();
}
Also used : FiCaSchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode) ArrayList(java.util.ArrayList)

Example 4 with FiCaSchedulerNode

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode in project hadoop by apache.

the class AbstractCSQueue method accept.

/**
 * Decides whether this queue can accept the given commit request without
 * exceeding its resource limit for the target node partition.
 *
 * <p>When the proposal allocates from an existing reserved container, no limit
 * check is performed here and the parent queue is not consulted. Otherwise the
 * queue's current usage plus the net new allocation (requested minus released)
 * must fit in the applicable limit, and on success the decision is delegated
 * to the parent queue when there is one.
 *
 * @param cluster total cluster resource
 * @param request the commit request carrying the allocation/reservation
 *        proposal and the resources it releases
 * @return {@code false} if the limit would be exceeded here or the parent
 *         rejects the request; {@code true} otherwise
 */
@Override
public boolean accept(Resource cluster, ResourceCommitRequest<FiCaSchedulerApp, FiCaSchedulerNode> request) {
    // Do we need to check parent queue before making this decision?
    boolean checkParentQueue = false;
    ContainerAllocationProposal<FiCaSchedulerApp, FiCaSchedulerNode> allocation = request.getFirstAllocatedOrReservedContainer();
    SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode> schedulerContainer = allocation.getAllocatedOrReservedContainer();
    // Do not check when allocating new container from a reserved container
    if (allocation.getAllocateFromReservedContainer() == null) {
        Resource required = allocation.getAllocatedOrReservedResource();
        Resource netAllocated = Resources.subtract(required, request.getTotalReleasedResource());
        // Acquire the lock BEFORE entering the try block: if lock() itself
        // fails, the finally clause must not call unlock() on a lock that was
        // never taken (that would mask the original failure).
        readLock.lock();
        try {
            String partition = schedulerContainer.getNodePartition();
            Resource maxResourceLimit;
            if (allocation.getSchedulingMode() == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY) {
                // Limit is this queue's own maximum for the partition.
                maxResourceLimit = getQueueMaxResource(partition, cluster);
            } else {
                // Otherwise the limit is the whole resource of the partition.
                maxResourceLimit = labelManager.getResourceByLabel(partition, cluster);
            }
            if (!Resources.fitsIn(resourceCalculator, cluster, Resources.add(queueUsage.getUsed(partition), netAllocated), maxResourceLimit)) {
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Used resource=" + queueUsage.getUsed(partition) + " exceeded maxResourceLimit of the queue =" + maxResourceLimit);
                }
                return false;
            }
        } finally {
            readLock.unlock();
        }
        // Only check parent queue when something new allocated or reserved.
        checkParentQueue = true;
    }
    if (parent != null && checkParentQueue) {
        return parent.accept(cluster, request);
    }
    return true;
}
Also used : FiCaSchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) Resource(org.apache.hadoop.yarn.api.records.Resource)

Example 5 with FiCaSchedulerNode

use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode in project hadoop by apache.

the class ParentQueue method killContainersToEnforceMaxQueueCapacity.

/**
 * Kills killable containers, one at a time, while this queue's usage in the
 * given partition is greater than its absolute maximum capacity for that
 * partition, or until no killable containers remain.
 *
 * @param partition node partition whose usage is being enforced
 * @param clusterResource total cluster resource, passed through to
 *        {@code completedContainer}
 */
private void killContainersToEnforceMaxQueueCapacity(String partition, Resource clusterResource) {
    Iterator<RMContainer> victims = getKillableContainers(partition);
    boolean moreVictims = victims.hasNext();
    if (!moreVictims) {
        // Nothing can be killed, so there is nothing to enforce with.
        return;
    }
    Resource partitionTotal = labelManager.getResourceByLabel(partition, null);
    Resource queueCeiling = Resources.multiply(partitionTotal, getQueueCapacities().getAbsoluteMaximumCapacity(partition));
    // Usage is re-read on every pass so the effect of each kill is observed.
    while (moreVictims && Resources.greaterThan(resourceCalculator, partitionTotal, queueUsage.getUsed(partition), queueCeiling)) {
        RMContainer victim = victims.next();
        FiCaSchedulerApp ownerAttempt = csContext.getApplicationAttempt(victim.getContainerId().getApplicationAttemptId());
        FiCaSchedulerNode hostNode = csContext.getNode(victim.getAllocatedNode());
        // A container whose attempt or node can no longer be resolved is
        // skipped rather than killed.
        if (ownerAttempt != null && hostNode != null) {
            LeafQueue owningLeaf = ownerAttempt.getCSLeafQueue();
            owningLeaf.completedContainer(
                clusterResource,
                ownerAttempt,
                hostNode,
                victim,
                SchedulerUtils.createPreemptedContainerStatus(victim.getContainerId(), SchedulerUtils.PREEMPTED_CONTAINER),
                RMContainerEventType.KILL,
                null,
                false);
            LOG.info("Killed container=" + victim.getContainerId() + " from queue=" + owningLeaf.getQueueName() + " to make queue=" + this.getQueueName() + "'s max-capacity enforced");
        }
        moreVictims = victims.hasNext();
    }
}
Also used : FiCaSchedulerNode(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) Resource(org.apache.hadoop.yarn.api.records.Resource) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)

Aggregations

FiCaSchedulerNode (org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode)79 FiCaSchedulerApp (org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp)47 Resource (org.apache.hadoop.yarn.api.records.Resource)46 Test (org.junit.Test)39 ResourceLimits (org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits)37 NodeId (org.apache.hadoop.yarn.api.records.NodeId)35 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)34 Priority (org.apache.hadoop.yarn.api.records.Priority)34 RMContainer (org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)21 ActiveUsersManager (org.apache.hadoop.yarn.server.resourcemanager.scheduler.ActiveUsersManager)20 ArrayList (java.util.ArrayList)14 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)11 ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest)10 SchedulerRequestKey (org.apache.hadoop.yarn.server.scheduler.SchedulerRequestKey)9 HashMap (java.util.HashMap)8 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)7 AMState (org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerApplicationAttempt.AMState)7 Container (org.apache.hadoop.yarn.api.records.Container)6 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)5 RMContext (org.apache.hadoop.yarn.server.resourcemanager.RMContext)5