
Example 56 with Resource

use of org.apache.hadoop.yarn.api.records.Resource in project hadoop by apache.

the class TempQueuePerPartition method updatePreemptableExtras.

public void updatePreemptableExtras(ResourceCalculator rc) {
    // Reset untouchableExtra and preemptableExtra
    untouchableExtra = Resources.none();
    preemptableExtra = Resources.none();
    Resource extra = Resources.subtract(getUsed(), getGuaranteed());
    if (Resources.lessThan(rc, totalPartitionResource, extra, Resources.none())) {
        extra = Resources.none();
    }
    if (null == children || children.isEmpty()) {
        // If it is a leaf queue
        if (preemptionDisabled) {
            untouchableExtra = extra;
        } else {
            preemptableExtra = extra;
        }
    } else {
        // If it is a parent queue
        Resource childrensPreemptable = Resource.newInstance(0, 0);
        for (TempQueuePerPartition child : children) {
            Resources.addTo(childrensPreemptable, child.preemptableExtra);
        }
        // untouchableExtra = max(extra - childrensPreemptable, 0)
        if (Resources.greaterThanOrEqual(rc, totalPartitionResource, childrensPreemptable, extra)) {
            untouchableExtra = Resource.newInstance(0, 0);
        } else {
            untouchableExtra = Resources.subtract(extra, childrensPreemptable);
        }
        preemptableExtra = Resources.min(rc, totalPartitionResource, childrensPreemptable, extra);
    }
}
Also used : Resource(org.apache.hadoop.yarn.api.records.Resource)
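
For orientation, here is a minimal standalone sketch of the parent-queue arithmetic above, i.e. untouchableExtra = max(extra - childrensPreemptable, 0). The queue sizes are hypothetical, and DefaultResourceCalculator is an assumption made only for the sketch; it uses just the Resource/Resources calls shown in the example.

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.util.resource.DefaultResourceCalculator;
import org.apache.hadoop.yarn.util.resource.ResourceCalculator;
import org.apache.hadoop.yarn.util.resource.Resources;

public class UntouchableExtraSketch {
    public static void main(String[] args) {
        ResourceCalculator rc = new DefaultResourceCalculator(); // assumption for the sketch
        Resource totalPartition = Resource.newInstance(100 * 1024, 100); // hypothetical partition size
        Resource used = Resource.newInstance(60 * 1024, 60);             // hypothetical queue usage
        Resource guaranteed = Resource.newInstance(40 * 1024, 40);       // hypothetical queue guarantee
        Resource childrensPreemptable = Resource.newInstance(15 * 1024, 15);

        // extra = used - guaranteed (the method above clamps negatives to none())
        Resource extra = Resources.subtract(used, guaranteed);

        // untouchableExtra = max(extra - childrensPreemptable, 0)
        Resource untouchableExtra;
        if (Resources.greaterThanOrEqual(rc, totalPartition, childrensPreemptable, extra)) {
            untouchableExtra = Resource.newInstance(0, 0);
        } else {
            untouchableExtra = Resources.subtract(extra, childrensPreemptable);
        }
        // Prints <memory:5120, vCores:5>: 20 units of extra minus 15 preemptable in children.
        System.out.println(untouchableExtra);
    }
}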

Example 57 with Resource

use of org.apache.hadoop.yarn.api.records.Resource in project hadoop by apache.

the class AbstractPreemptableResourceCalculator method computeFixpointAllocation.

/**
   * Given a set of queues, compute the fix-point distribution of unassigned
   * resources among them. As a queue's pending requests are exhausted, the
   * queue is removed from the set and the remaining capacity is redistributed
   * among the remaining queues. The distribution is weighted by guaranteed
   * capacity, unless asked to ignoreGuarantee, in which case resources are
   * distributed uniformly.
   *
   * @param totGuarant
   *          total guaranteed resource
   * @param qAlloc
   *          List of child queues
   * @param unassigned
   *          total unassigned resource to distribute
   * @param ignoreGuarantee
   *          ignore guarantee per queue.
   */
protected void computeFixpointAllocation(Resource totGuarant, Collection<TempQueuePerPartition> qAlloc, Resource unassigned, boolean ignoreGuarantee) {
    // Prior to assigning the unused resources, process each queue as follows:
    // If current > guaranteed, idealAssigned = guaranteed + untouchable extra
    // Else idealAssigned = current;
    // Subtract idealAssigned resources from unassigned.
    // If the queue has all of its needs met (that is, if
    // idealAssigned >= current + pending), remove it from consideration.
    // Sort queues from most under-guaranteed to most over-guaranteed.
    TQComparator tqComparator = new TQComparator(rc, totGuarant);
    PriorityQueue<TempQueuePerPartition> orderedByNeed = new PriorityQueue<>(10, tqComparator);
    for (Iterator<TempQueuePerPartition> i = qAlloc.iterator(); i.hasNext(); ) {
        TempQueuePerPartition q = i.next();
        Resource used = q.getUsed();
        if (Resources.greaterThan(rc, totGuarant, used, q.getGuaranteed())) {
            q.idealAssigned = Resources.add(q.getGuaranteed(), q.untouchableExtra);
        } else {
            q.idealAssigned = Resources.clone(used);
        }
        Resources.subtractFrom(unassigned, q.idealAssigned);
        // If idealAssigned < (used + pending), q needs more resources, so
        // add it to the list of underserved queues, ordered by need.
        Resource curPlusPend = Resources.add(q.getUsed(), q.pending);
        if (Resources.lessThan(rc, totGuarant, q.idealAssigned, curPlusPend)) {
            orderedByNeed.add(q);
        }
    }
    // Assign cluster resources until there is no more demand, or no
    // resources are left.
    while (!orderedByNeed.isEmpty() && Resources.greaterThan(rc, totGuarant, unassigned, Resources.none())) {
        Resource wQassigned = Resource.newInstance(0, 0);
        // Recompute normalizedGuarantee values based on the queues that are
        // still active.
        resetCapacity(unassigned, orderedByNeed, ignoreGuarantee);
        // For each underserved queue (or set of queues if multiple are equally
        // underserved), offer its share of the unassigned resources based on its
        // normalized guarantee. After the offer, if the queue is not satisfied,
        // place it back in the ordered list of queues, recalculating its place
        // in the order of most under-guaranteed to most over-guaranteed. In this
        // way, the most underserved queue(s) are always given resources first.
        Collection<TempQueuePerPartition> underserved = getMostUnderservedQueues(orderedByNeed, tqComparator);
        for (Iterator<TempQueuePerPartition> i = underserved.iterator(); i.hasNext(); ) {
            TempQueuePerPartition sub = i.next();
            Resource wQavail = Resources.multiplyAndNormalizeUp(rc, unassigned, sub.normalizedGuarantee, Resource.newInstance(1, 1));
            Resource wQidle = sub.offer(wQavail, rc, totGuarant, isReservedPreemptionCandidatesSelector);
            Resource wQdone = Resources.subtract(wQavail, wQidle);
            if (Resources.greaterThan(rc, totGuarant, wQdone, Resources.none())) {
                // The queue is still asking for more. Put it back in the priority
                // queue, recalculating its order based on need.
                orderedByNeed.add(sub);
            }
            Resources.addTo(wQassigned, wQdone);
        }
        Resources.subtractFrom(unassigned, wQassigned);
    }
    // Queues that still have demand after the fix-point loop are recorded as
    // under-served, so intra-queue preemption can later address imbalances
    // within them.
    while (!orderedByNeed.isEmpty()) {
        TempQueuePerPartition q1 = orderedByNeed.remove();
        context.addPartitionToUnderServedQueues(q1.queueName, q1.partition);
    }
}
Also used : Resource(org.apache.hadoop.yarn.api.records.Resource) PriorityQueue(java.util.PriorityQueue)
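
To make the fix-point iteration concrete, here is a toy, self-contained model of the loop above. It uses plain longs and hypothetical queue demands instead of the YARN Resource machinery, and reduces offer() to min(share, remaining need):

import java.util.ArrayList;
import java.util.List;

public class FixpointSketch {
    static final class Q {
        final String name;
        final long guaranteed;
        long need;      // remaining pending demand
        long assigned;  // idealAssigned so far
        Q(String name, long guaranteed, long need) {
            this.name = name;
            this.guaranteed = guaranteed;
            this.need = need;
        }
    }

    public static void main(String[] args) {
        List<Q> all = List.of(new Q("a", 40, 50), new Q("b", 40, 10), new Q("c", 20, 100));
        List<Q> active = new ArrayList<>(all);
        long unassigned = 100;

        // Fix-point loop: keep offering guarantee-weighted shares until the
        // unassigned resource or the demand is exhausted.
        while (unassigned > 0 && !active.isEmpty()) {
            long totalGuarantee = active.stream().mapToLong(q -> q.guaranteed).sum();
            if (totalGuarantee == 0) {
                break; // resetCapacity above would fall back to an even split
            }
            long assignedThisRound = 0;
            List<Q> stillNeedy = new ArrayList<>();
            for (Q q : active) {
                // Weighted share, analogous to multiplyAndNormalizeUp above.
                long share = unassigned * q.guaranteed / totalGuarantee;
                long taken = Math.min(share, q.need); // offer(): accept what fits
                q.assigned += taken;
                q.need -= taken;
                assignedThisRound += taken;
                if (q.need > 0) {
                    stillNeedy.add(q); // re-enters the next round, like orderedByNeed
                }
            }
            unassigned -= assignedThisRound;
            if (assignedThisRound == 0) {
                break; // no progress is possible; avoid spinning
            }
            active = stillNeedy;
        }
        // Prints a=50, b=10, c=40 for the numbers above.
        all.forEach(q -> System.out.println(q.name + "=" + q.assigned));
    }
}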

Example 58 with Resource

use of org.apache.hadoop.yarn.api.records.Resource in project hadoop by apache.

the class CapacitySchedulerPreemptionUtils method tryPreemptContainerAndDeductResToObtain.

/**
   * Invoke this method to preempt a container based on resToObtain.
   *
   * @param rc
   *          resource calculator
   * @param context
   *          preemption context
   * @param resourceToObtainByPartitions
   *          map to hold resource to obtain per partition
   * @param rmContainer
   *          container
   * @param clusterResource
   *          total resource
   * @param preemptMap
   *          map to hold preempted containers
   * @param totalPreemptionAllowed
   *          total preemption allowed per round
   * @return true if rmContainer should be preempted; if so, its allocation is
   *         deducted from <code>resourceToObtainByPartitions</code>
   */
public static boolean tryPreemptContainerAndDeductResToObtain(ResourceCalculator rc, CapacitySchedulerPreemptionContext context, Map<String, Resource> resourceToObtainByPartitions, RMContainer rmContainer, Resource clusterResource, Map<ApplicationAttemptId, Set<RMContainer>> preemptMap, Resource totalPreemptionAllowed) {
    ApplicationAttemptId attemptId = rmContainer.getApplicationAttemptId();
    // Do not account a container's resource more than once.
    if (preemptMapContains(preemptMap, attemptId, rmContainer)) {
        return false;
    }
    String nodePartition = getPartitionByNodeId(context, rmContainer.getAllocatedNode());
    Resource toObtainByPartition = resourceToObtainByPartitions.get(nodePartition);
    if (null != toObtainByPartition && Resources.greaterThan(rc, clusterResource, toObtainByPartition, Resources.none()) && Resources.fitsIn(rc, clusterResource, rmContainer.getAllocatedResource(), totalPreemptionAllowed)) {
        Resources.subtractFrom(toObtainByPartition, rmContainer.getAllocatedResource());
        Resources.subtractFrom(totalPreemptionAllowed, rmContainer.getAllocatedResource());
        // When there is no more resource to obtain for this partition, remove it from the map.
        if (Resources.lessThanOrEqual(rc, clusterResource, toObtainByPartition, Resources.none())) {
            resourceToObtainByPartitions.remove(nodePartition);
        }
        // Add to preemptMap
        addToPreemptMap(preemptMap, attemptId, rmContainer);
        return true;
    }
    return false;
}
Also used : Resource(org.apache.hadoop.yarn.api.records.Resource) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId)
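
The deduct-and-remove pattern above can be sketched in isolation. A minimal example with hypothetical sizes, which compares the target directly instead of going through the ResourceCalculator as the real method does:

import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.util.resource.Resources;

public class DeductSketch {
    public static void main(String[] args) {
        // Hypothetical target: obtain 4GB / 4 vcores on the default partition "".
        Map<String, Resource> resourceToObtainByPartitions = new HashMap<>();
        resourceToObtainByPartitions.put("", Resource.newInstance(4 * 1024, 4));

        // A candidate container's allocation.
        Resource allocated = Resource.newInstance(4 * 1024, 4);

        Resource toObtain = resourceToObtainByPartitions.get("");
        Resources.subtractFrom(toObtain, allocated); // mutates toObtain in place
        if (toObtain.getMemorySize() <= 0 && toObtain.getVirtualCores() <= 0) {
            resourceToObtainByPartitions.remove(""); // target met; stop looking here
        }
        System.out.println(resourceToObtainByPartitions); // {} once the target is met
    }
}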

Example 59 with Resource

use of org.apache.hadoop.yarn.api.records.Resource in project hadoop by apache.

the class FifoCandidatesSelector method selectCandidates.

@Override
public Map<ApplicationAttemptId, Set<RMContainer>> selectCandidates(Map<ApplicationAttemptId, Set<RMContainer>> selectedCandidates, Resource clusterResource, Resource totalPreemptionAllowed) {
    // Calculate how much resources we need to preempt
    preemptableAmountCalculator.computeIdealAllocation(clusterResource, totalPreemptionAllowed);
    // Previous selectors (with higher priority) could have already
    // selected containers. We need to deduct preemptable resources
    // based on already selected candidates.
    CapacitySchedulerPreemptionUtils.deductPreemptableResourcesBasedSelectedCandidates(preemptionContext, selectedCandidates);
    List<RMContainer> skippedAMContainerlist = new ArrayList<>();
    // Loop all leaf queues
    for (String queueName : preemptionContext.getLeafQueueNames()) {
        // check if preemption disabled for the queue
        if (preemptionContext.getQueueByPartition(queueName, RMNodeLabelsManager.NO_LABEL).preemptionDisabled) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("skipping from queue=" + queueName + " because it's a non-preemptable queue");
            }
            continue;
        }
        // Compute resToObtainByPartition, considering inter-queue preemption.
        LeafQueue leafQueue = preemptionContext.getQueueByPartition(queueName, RMNodeLabelsManager.NO_LABEL).leafQueue;
        Map<String, Resource> resToObtainByPartition = CapacitySchedulerPreemptionUtils.getResToObtainByPartitionForLeafQueue(preemptionContext, queueName, clusterResource);
        try {
            leafQueue.getReadLock().lock();
            // Go through all ignore-partition-exclusivity containers first to
            // make sure such containers are selected as preemption candidates
            // first.
            Map<String, TreeSet<RMContainer>> ignorePartitionExclusivityContainers = leafQueue.getIgnoreExclusivityRMContainers();
            for (String partition : resToObtainByPartition.keySet()) {
                if (ignorePartitionExclusivityContainers.containsKey(partition)) {
                    TreeSet<RMContainer> rmContainers = ignorePartitionExclusivityContainers.get(partition);
                    // Walk the containers in reverse order so that containers
                    // of later-submitted applications become preemption
                    // candidates first.
                    for (RMContainer c : rmContainers.descendingSet()) {
                        if (CapacitySchedulerPreemptionUtils.isContainerAlreadySelected(c, selectedCandidates)) {
                            // Skip already selected containers
                            continue;
                        }
                        CapacitySchedulerPreemptionUtils.tryPreemptContainerAndDeductResToObtain(rc, preemptionContext, resToObtainByPartition, c, clusterResource, selectedCandidates, totalPreemptionAllowed);
                    }
                }
            }
            // preempt other containers
            Resource skippedAMSize = Resource.newInstance(0, 0);
            Iterator<FiCaSchedulerApp> desc = leafQueue.getOrderingPolicy().getPreemptionIterator();
            while (desc.hasNext()) {
                FiCaSchedulerApp fc = desc.next();
                // Once a partition's target is met it is removed from
                // resToObtainByPartition, so an empty map means no more
                // preemption is needed.
                if (resToObtainByPartition.isEmpty()) {
                    break;
                }
                preemptFrom(fc, clusterResource, resToObtainByPartition, skippedAMContainerlist, skippedAMSize, selectedCandidates, totalPreemptionAllowed);
            }
            // Try preempting AM containers (still preserving at most
            // maxAMCapacityForThisQueue of AM resources) if more resources
            // still need to be preempted from this queue.
            Resource maxAMCapacityForThisQueue = Resources.multiply(Resources.multiply(clusterResource, leafQueue.getAbsoluteCapacity()), leafQueue.getMaxAMResourcePerQueuePercent());
            preemptAMContainers(clusterResource, selectedCandidates, skippedAMContainerlist, resToObtainByPartition, skippedAMSize, maxAMCapacityForThisQueue, totalPreemptionAllowed);
        } finally {
            leafQueue.getReadLock().unlock();
        }
    }
    return selectedCandidates;
}
Also used : ArrayList(java.util.ArrayList) Resource(org.apache.hadoop.yarn.api.records.Resource) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) LeafQueue(org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue) TreeSet(java.util.TreeSet) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp)
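
The AM-preemption cap computed near the end, maxAMCapacityForThisQueue = clusterResource * absoluteCapacity * maxAMResourcePerQueuePercent, is easy to check with a worked example. The figures below are hypothetical:

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.util.resource.Resources;

public class AmCapSketch {
    public static void main(String[] args) {
        Resource clusterResource = Resource.newInstance(100 * 1024, 100);
        float absoluteCapacity = 0.4f;             // queue owns 40% of the cluster
        float maxAMResourcePerQueuePercent = 0.1f; // at most 10% of that for AMs

        Resource maxAMCapacityForThisQueue = Resources.multiply(
            Resources.multiply(clusterResource, absoluteCapacity),
            maxAMResourcePerQueuePercent);

        // Prints <memory:4096, vCores:4>, the AM share preserved for this queue.
        System.out.println(maxAMCapacityForThisQueue);
    }
}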

Example 60 with Resource

use of org.apache.hadoop.yarn.api.records.Resource in project hadoop by apache.

the class FifoIntraQueuePreemptionPlugin method createTempAppForResCalculation.

private PriorityQueue<TempAppPerPartition> createTempAppForResCalculation(String partition, Collection<FiCaSchedulerApp> apps, TAPriorityComparator taComparator) {
    PriorityQueue<TempAppPerPartition> orderedByPriority = new PriorityQueue<>(100, taComparator);
    // Use an internal temp app structure to store intermediate data (priority).
    for (FiCaSchedulerApp app : apps) {
        Resource used = app.getAppAttemptResourceUsage().getUsed(partition);
        Resource amUsed = null;
        if (!app.isWaitingForAMContainer()) {
            amUsed = app.getAMResource(partition);
        }
        Resource pending = app.getTotalPendingRequestsPerPartition().get(partition);
        Resource reserved = app.getAppAttemptResourceUsage().getReserved(partition);
        used = (used == null) ? Resources.createResource(0, 0) : used;
        amUsed = (amUsed == null) ? Resources.createResource(0, 0) : amUsed;
        pending = (pending == null) ? Resources.createResource(0, 0) : pending;
        reserved = (reserved == null) ? Resources.createResource(0, 0) : reserved;
        HashSet<String> partitions = new HashSet<String>(app.getAppAttemptResourceUsage().getNodePartitionsSet());
        partitions.addAll(app.getTotalPendingRequestsPerPartition().keySet());
        // Create a TempAppPerPartition for further calculation.
        TempAppPerPartition tmpApp = new TempAppPerPartition(app, Resources.clone(used), Resources.clone(amUsed), Resources.clone(reserved), Resources.clone(pending));
        // Set the app's ideal allocation to 0.
        tmpApp.idealAssigned = Resources.createResource(0, 0);
        orderedByPriority.add(tmpApp);
    }
    return orderedByPriority;
}
Also used : FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) Resource(org.apache.hadoop.yarn.api.records.Resource) PriorityQueue(java.util.PriorityQueue) HashSet(java.util.HashSet)
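
One detail worth noting above is the consistent use of Resources.clone(...) when populating TempAppPerPartition: Resources.addTo and Resources.subtractFrom mutate their first argument in place, so snapshotting app usage without cloning would modify the scheduler's live Resource objects. A minimal demonstration with hypothetical values:

import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.util.resource.Resources;

public class CloneSketch {
    public static void main(String[] args) {
        Resource live = Resource.newInstance(8 * 1024, 8);
        Resource snapshot = Resources.clone(live); // independent copy

        // Mutating the snapshot leaves the live object untouched.
        Resources.subtractFrom(snapshot, Resource.newInstance(2 * 1024, 2));

        System.out.println(live);     // <memory:8192, vCores:8>
        System.out.println(snapshot); // <memory:6144, vCores:6>
    }
}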

Aggregations

Resource (org.apache.hadoop.yarn.api.records.Resource): 500 usages
Test (org.junit.Test): 190 usages
NodeId (org.apache.hadoop.yarn.api.records.NodeId): 89 usages
ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 82 usages
Priority (org.apache.hadoop.yarn.api.records.Priority): 80 usages
ContainerId (org.apache.hadoop.yarn.api.records.ContainerId): 67 usages
HashMap (java.util.HashMap): 62 usages
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 57 usages
RMContainer (org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer): 55 usages
FiCaSchedulerApp (org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp): 53 usages
ArrayList (java.util.ArrayList): 49 usages
ResourceLimits (org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceLimits): 48 usages
FiCaSchedulerNode (org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode): 45 usages
YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 43 usages
Container (org.apache.hadoop.yarn.api.records.Container): 42 usages
ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest): 42 usages
Configuration (org.apache.hadoop.conf.Configuration): 34 usages
IOException (java.io.IOException): 33 usages
LocalResource (org.apache.hadoop.yarn.api.records.LocalResource): 33 usages
Map (java.util.Map): 29 usages