use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue in project hadoop by apache.
the class TestWorkPreservingRMRestart method checkCSQueue.
private void checkCSQueue(MockRM rm, SchedulerApplication<SchedulerApplicationAttempt> app, Resource clusterResource, Resource queueResource, Resource usedResource, int numContainers) throws Exception {
checkCSLeafQueue(rm, app, clusterResource, queueResource, usedResource, numContainers);
LeafQueue queue = (LeafQueue) app.getQueue();
Resource availableResources = Resources.subtract(queueResource, usedResource);
// ************ check app headroom ****************
SchedulerApplicationAttempt schedulerAttempt = app.getCurrentAppAttempt();
assertEquals(availableResources, schedulerAttempt.getHeadroom());
// ************* check Queue metrics ************
QueueMetrics queueMetrics = queue.getMetrics();
assertMetrics(queueMetrics, 1, 0, 1, 0, 2, availableResources.getMemorySize(), availableResources.getVirtualCores(), usedResource.getMemorySize(), usedResource.getVirtualCores());
// ************ check user metrics ***********
QueueMetrics userMetrics = queueMetrics.getUserMetrics(app.getUser());
assertMetrics(userMetrics, 1, 0, 1, 0, 2, availableResources.getMemorySize(), availableResources.getVirtualCores(), usedResource.getMemorySize(), usedResource.getVirtualCores());
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue in project hadoop by apache.
the class TestWorkPreservingRMRestart method testCapacitySchedulerRecovery.
// Test CS recovery with multi-level queues and multi-users:
// 1. setup 2 NMs each with 8GB memory;
// 2. setup 2 level queues: Default -> (QueueA, QueueB)
// 3. User1 submits 2 apps on QueueA
// 4. User2 submits 1 app on QueueB
// 5. AM and each container has 1GB memory
// 6. Restart RM.
// 7. nm1 re-syncs back containers belong to user1
// 8. nm2 re-syncs back containers belong to user2.
// 9. Assert the parent queue and 2 leaf queues state and the metrics.
// 10. Assert each user's consumption inside the queue.
@Test(timeout = 30000)
public void testCapacitySchedulerRecovery() throws Exception {
if (getSchedulerType() != SchedulerType.CAPACITY) {
return;
}
conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration(conf);
setupQueueConfiguration(csConf);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(csConf);
rm1 = new MockRM(csConf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
MockNM nm2 = new MockNM("127.1.1.1:4321", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
nm2.registerNode();
RMApp app1_1 = rm1.submitApp(1024, "app1_1", USER_1, null, A);
MockAM am1_1 = MockRM.launchAndRegisterAM(app1_1, rm1, nm1);
RMApp app1_2 = rm1.submitApp(1024, "app1_2", USER_1, null, A);
MockAM am1_2 = MockRM.launchAndRegisterAM(app1_2, rm1, nm2);
RMApp app2 = rm1.submitApp(1024, "app2", USER_2, null, B);
MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
// clear queue metrics
rm1.clearQueueMetrics(app1_1);
rm1.clearQueueMetrics(app1_2);
rm1.clearQueueMetrics(app2);
csConf.set(PREFIX + "root.Default.QueueB.state", "STOPPED");
// Re-start RM
rm2 = new MockRM(csConf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
nm2.setResourceTrackerService(rm2.getResourceTrackerService());
List<NMContainerStatus> am1_1Containers = createNMContainerStatusForApp(am1_1);
List<NMContainerStatus> am1_2Containers = createNMContainerStatusForApp(am1_2);
am1_1Containers.addAll(am1_2Containers);
nm1.registerNode(am1_1Containers, null);
List<NMContainerStatus> am2Containers = createNMContainerStatusForApp(am2);
nm2.registerNode(am2Containers, null);
// Wait for RM to settle down on recovering containers;
waitForNumContainersToRecover(2, rm2, am1_1.getApplicationAttemptId());
waitForNumContainersToRecover(2, rm2, am1_2.getApplicationAttemptId());
waitForNumContainersToRecover(2, rm2, am2.getApplicationAttemptId());
// Calculate each queue's resource usage.
Resource containerResource = Resource.newInstance(1024, 1);
Resource nmResource = Resource.newInstance(nm1.getMemory(), nm1.getvCores());
Resource clusterResource = Resources.multiply(nmResource, 2);
Resource q1Resource = Resources.multiply(clusterResource, 0.5);
Resource q2Resource = Resources.multiply(clusterResource, 0.5);
Resource q1UsedResource = Resources.multiply(containerResource, 4);
Resource q2UsedResource = Resources.multiply(containerResource, 2);
Resource totalUsedResource = Resources.add(q1UsedResource, q2UsedResource);
Resource q1availableResources = Resources.subtract(q1Resource, q1UsedResource);
Resource q2availableResources = Resources.subtract(q2Resource, q2UsedResource);
Resource totalAvailableResource = Resources.add(q1availableResources, q2availableResources);
Map<ApplicationId, SchedulerApplication> schedulerApps = ((AbstractYarnScheduler) rm2.getResourceScheduler()).getSchedulerApplications();
SchedulerApplication schedulerApp1_1 = schedulerApps.get(app1_1.getApplicationId());
// assert queue A state.
checkCSLeafQueue(rm2, schedulerApp1_1, clusterResource, q1Resource, q1UsedResource, 4);
QueueMetrics queue1Metrics = schedulerApp1_1.getQueue().getMetrics();
assertMetrics(queue1Metrics, 2, 0, 2, 0, 4, q1availableResources.getMemorySize(), q1availableResources.getVirtualCores(), q1UsedResource.getMemorySize(), q1UsedResource.getVirtualCores());
// assert queue B state.
SchedulerApplication schedulerApp2 = schedulerApps.get(app2.getApplicationId());
checkCSLeafQueue(rm2, schedulerApp2, clusterResource, q2Resource, q2UsedResource, 2);
QueueMetrics queue2Metrics = schedulerApp2.getQueue().getMetrics();
assertMetrics(queue2Metrics, 1, 0, 1, 0, 2, q2availableResources.getMemorySize(), q2availableResources.getVirtualCores(), q2UsedResource.getMemorySize(), q2UsedResource.getVirtualCores());
// assert parent queue state.
LeafQueue leafQueue = (LeafQueue) schedulerApp2.getQueue();
ParentQueue parentQueue = (ParentQueue) leafQueue.getParent();
checkParentQueue(parentQueue, 6, totalUsedResource, (float) 6 / 16, (float) 6 / 16);
assertMetrics(parentQueue.getMetrics(), 3, 0, 3, 0, 6, totalAvailableResource.getMemorySize(), totalAvailableResource.getVirtualCores(), totalUsedResource.getMemorySize(), totalUsedResource.getVirtualCores());
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue in project hadoop by apache.
the class FiCaSchedulerApp method getActivedAppDiagnosticMessage.
protected void getActivedAppDiagnosticMessage(StringBuilder diagnosticMessage) {
LeafQueue queue = getCSLeafQueue();
QueueCapacities queueCapacities = queue.getQueueCapacities();
diagnosticMessage.append(" Details : AM Partition = ");
diagnosticMessage.append(appAMNodePartitionName.isEmpty() ? NodeLabel.DEFAULT_NODE_LABEL_PARTITION : appAMNodePartitionName);
diagnosticMessage.append(" ; ");
diagnosticMessage.append("Partition Resource = ");
diagnosticMessage.append(rmContext.getNodeLabelManager().getResourceByLabel(appAMNodePartitionName, Resources.none()));
diagnosticMessage.append(" ; ");
diagnosticMessage.append("Queue's Absolute capacity = ");
diagnosticMessage.append(queueCapacities.getAbsoluteCapacity(appAMNodePartitionName) * 100);
diagnosticMessage.append(" % ; ");
diagnosticMessage.append("Queue's Absolute used capacity = ");
diagnosticMessage.append(queueCapacities.getAbsoluteUsedCapacity(appAMNodePartitionName) * 100);
diagnosticMessage.append(" % ; ");
diagnosticMessage.append("Queue's Absolute max capacity = ");
diagnosticMessage.append(queueCapacities.getAbsoluteMaximumCapacity(appAMNodePartitionName) * 100);
diagnosticMessage.append(" % ; ");
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue in project hadoop by apache.
the class CapacitySchedulerInfo method getQueues.
protected CapacitySchedulerQueueInfoList getQueues(CSQueue parent) {
CapacitySchedulerQueueInfoList queuesInfo = new CapacitySchedulerQueueInfoList();
// JAXB marashalling leads to situation where the "type" field injected
// for JSON changes from string to array depending on order of printing
// Issue gets fixed if all the leaf queues are marshalled before the
// non-leaf queues. See YARN-4785 for more details.
List<CSQueue> childQueues = new ArrayList<>();
List<CSQueue> childLeafQueues = new ArrayList<>();
List<CSQueue> childNonLeafQueues = new ArrayList<>();
for (CSQueue queue : parent.getChildQueues()) {
if (queue instanceof LeafQueue) {
childLeafQueues.add(queue);
} else {
childNonLeafQueues.add(queue);
}
}
childQueues.addAll(childLeafQueues);
childQueues.addAll(childNonLeafQueues);
for (CSQueue queue : childQueues) {
CapacitySchedulerQueueInfo info;
if (queue instanceof LeafQueue) {
info = new CapacitySchedulerLeafQueueInfo((LeafQueue) queue);
} else {
info = new CapacitySchedulerQueueInfo(queue);
info.queues = getQueues(queue);
}
queuesInfo.addToQueueInfoList(info);
}
return queuesInfo;
}
use of org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.LeafQueue in project hadoop by apache.
the class FiCaSchedulerApp method getPendingAppDiagnosticMessage.
protected void getPendingAppDiagnosticMessage(StringBuilder diagnosticMessage) {
LeafQueue queue = getCSLeafQueue();
diagnosticMessage.append(" Details : AM Partition = ");
diagnosticMessage.append(appAMNodePartitionName.isEmpty() ? NodeLabel.DEFAULT_NODE_LABEL_PARTITION : appAMNodePartitionName);
diagnosticMessage.append("; ");
diagnosticMessage.append("AM Resource Request = ");
diagnosticMessage.append(getAMResource(appAMNodePartitionName));
diagnosticMessage.append("; ");
diagnosticMessage.append("Queue Resource Limit for AM = ");
diagnosticMessage.append(queue.getAMResourceLimitPerPartition(appAMNodePartitionName));
diagnosticMessage.append("; ");
diagnosticMessage.append("User AM Resource Limit of the queue = ");
diagnosticMessage.append(queue.getUserAMResourceLimitPerPartition(appAMNodePartitionName));
diagnosticMessage.append("; ");
diagnosticMessage.append("Queue AM Resource Usage = ");
diagnosticMessage.append(queue.getQueueResourceUsage().getAMUsed(appAMNodePartitionName));
diagnosticMessage.append("; ");
}
Aggregations