use of com.microsoft.frameworklauncher.common.exceptions.NotAvailableException in project pai by Microsoft.
the class SelectionManager method select.
/**
 * Narrows the known nodes down to candidates for one request and picks nodes and ports from them.
 *
 * <p>Filters are applied in this order: node label, GPU type, (optionally) the none-GPU-job filter,
 * resource, and rack selection policy. The request resource is deep-copied first so that the
 * caller's descriptor is never modified; the copy carries the reused or newly selected ports and
 * is attached to the returned result.
 *
 * @param requestResource resource asked for by the request; not modified by this method
 * @param requestNodeLabel node label to filter by
 * @param requestNodeGpuType GPU type to filter by; when non-null the request cannot be relaxed
 * @param startStatesTaskCount passed through to rack policy filtering and node selection
 * @param reusePorts previously selected ports; used as the port ranges when they hold any value
 * @param configuredNodes node configurations consulted by the GPU type filter
 * @return the selection result, with the optimized resource (including port ranges) set on it
 * @throws NotAvailableException if no node survives filtering and the request specifies a GPU
 *     type or a positive port number
 */
@VisibleForTesting
public synchronized SelectionResult select(ResourceDescriptor requestResource, String requestNodeLabel, String requestNodeGpuType, int startStatesTaskCount, List<ValueRange> reusePorts, Map<String, NodeConfiguration> configuredNodes) throws NotAvailableException {
    LOGGER.logInfo(
        "select: Request: Resource: [%s], NodeLabel: [%s], NodeGpuType: [%s], StartStatesTaskCount: [%d], ReusePorts: [%s]",
        requestResource, requestNodeLabel, requestNodeGpuType, startStatesTaskCount,
        ValueRangeUtils.toString(reusePorts));

    // Start from the full node set and narrow it step by step.
    initFilteredNodes();
    filterNodesByNodeLabel(requestNodeLabel);
    filterNodesByGpuType(configuredNodes, requestNodeGpuType);
    if (!conf.getAmAllowNoneGpuJobOnGpuNode()) {
        filterNodesForNoneGpuJob(requestManager.getTotalGpuCount());
    }

    // Work on a private copy so port decisions do not leak into the caller's descriptor.
    ResourceDescriptor effectiveResource = YamlUtils.deepCopy(requestResource, ResourceDescriptor.class);
    boolean hasReusablePorts = ValueRangeUtils.getValueNumber(reusePorts) > 0;
    if (hasReusablePorts) {
        LOGGER.logInfo("select: reuse pre-selected ports: [%s]", ValueRangeUtils.toString(reusePorts));
        effectiveResource.setPortRanges(reusePorts);
    }

    filterNodesByResource(effectiveResource, conf.getAmSkipLocalTriedResource());
    filterNodesByRackSelectionPolicy(effectiveResource, startStatesTaskCount);

    // GpuType and port relax are not supported in yarn: with no candidate node left, a request
    // that specifies either one is aborted here so it can be tried again later.
    boolean noCandidateNodes = filteredNodes.size() < 1;
    if (noCandidateNodes && (requestNodeGpuType != null || requestResource.getPortNumber() > 0)) {
        String reason = String.format(
            "Don't have enough nodes to meet request: optimizedRequestResource: [%s], NodeGpuType: [%s], NodeLabel: [%s]",
            effectiveResource, requestNodeGpuType, requestNodeLabel);
        throw new NotAvailableException(reason);
    }

    SelectionResult selection = selectNodes(effectiveResource, startStatesTaskCount);
    List<ValueRange> selectedPorts = selectPorts(selection, effectiveResource);
    effectiveResource.setPortRanges(selectedPorts);
    selection.setOptimizedResource(effectiveResource);
    return selection;
}
use of com.microsoft.frameworklauncher.common.exceptions.NotAvailableException in project pai by Microsoft.
the class ApplicationMaster method addContainerRequest.
/**
 * Sets up a container request for the given task and submits it, scheduling follow-up work on
 * {@code transitionTaskStateQueue} for the two ways the attempt can stall.
 *
 * <ol>
 *   <li>If {@code setupContainerRequest} throws {@link NotAvailableException}, a delayed task is
 *       queued that calls this method again with a deep copy of the task status, provided the
 *       status manager still contains that task; this call then returns.</li>
 *   <li>Otherwise the request is added to the RM client and the selection manager, the task is
 *       moved to {@code CONTAINER_REQUESTED}, and a delayed task is queued which, if the request
 *       priority is still known to the status manager, removes the request, moves the task back
 *       to {@code TASK_WAITING} and requests again.</li>
 * </ol>
 *
 * @param taskStatus status of the task a container is requested for
 * @throws Exception propagated from the setup, status transition and queueing calls
 */
private void addContainerRequest(TaskStatus taskStatus) throws Exception {
    TaskStatusLocator taskLocator = new TaskStatusLocator(taskStatus.getTaskRoleName(), taskStatus.getTaskIndex());
    String logPrefix = String.format("%s: addContainerRequest: ", taskLocator);
    LOGGER.logInfo(logPrefix + "Start");

    // 1. setupContainerRequest, retry later if request is not available.
    // The retry delay comes from getRandomNumber over the configured min/max interval.
    Integer retryIntervalSec = CommonUtils.getRandomNumber(
        conf.getLauncherConfig().getAmSetupContainerRequestMinRetryIntervalSec(),
        conf.getLauncherConfig().getAmSetupContainerRequestMaxRetryIntervalSec());
    ContainerRequest request;
    try {
        request = setupContainerRequest(taskStatus);
    } catch (NotAvailableException e) {
        LOGGER.logWarning(e,
            logPrefix + "Failed to setupContainerRequest: ContainerRequest may be temporarily not available. Will retry after %ss.",
            retryIntervalSec);
        // The retry works on a snapshot taken now rather than on the live status object.
        TaskStatus retryTaskStatus = YamlUtils.deepCopy(taskStatus, TaskStatus.class);
        transitionTaskStateQueue.queueSystemTaskDelayed(() -> {
            if (!statusManager.containsTask(retryTaskStatus)) {
                LOGGER.logWarning(logPrefix + "Task not found in Status. Ignore it.");
            } else {
                addContainerRequest(retryTaskStatus);
            }
        }, retryIntervalSec * 1000);
        return;
    }

    // 2. addContainerRequest, retry later if request is timeout.
    Integer requestTimeoutSec = CommonUtils.getRandomNumber(
        conf.getLauncherConfig().getAmContainerRequestMinTimeoutSec(),
        conf.getLauncherConfig().getAmContainerRequestMaxTimeoutSec());
    LOGGER.logInfo(
        logPrefix + "Send ContainerRequest to RM with timeout %ss. ContainerRequest: [%s]",
        requestTimeoutSec, HadoopExts.toString(request));
    rmClient.addContainerRequest(request);
    selectionManager.addContainerRequest(request);
    statusManager.transitionTaskState(taskLocator, TaskState.CONTAINER_REQUESTED, new TaskEvent().setContainerRequest(request));

    transitionTaskStateQueue.queueSystemTaskDelayed(() -> {
        // Nothing to do when the status manager no longer tracks this request's priority.
        if (!statusManager.containsTask(request.getPriority())) {
            return;
        }
        LOGGER.logWarning(
            logPrefix + "ContainerRequest cannot be satisfied within timeout %ss. Cancel it and Request again. ContainerRequest: [%s]",
            requestTimeoutSec, HadoopExts.toString(request));
        removeContainerRequest(taskStatus);
        statusManager.transitionTaskState(taskLocator, TaskState.TASK_WAITING);
        addContainerRequest(taskStatus);
    }, requestTimeoutSec * 1000);
}
use of com.microsoft.frameworklauncher.common.exceptions.NotAvailableException in project pai by Microsoft.
the class SelectionManagerTest method testSelectionManager.
/**
 * Exercises {@code SelectionManager} against hand-built nodes in four stages:
 * candidate GPU attribute selection, plain resource selection while nodes are added and
 * requests are recorded, selection restricted by node label, and selection restricted by GPU
 * type using a node configuration map.
 *
 * <p>All {@code assertEquals} calls use JUnit's (expected, actual) argument order so that
 * failure messages label the two values correctly.
 */
@Test
public void testSelectionManager() throws Exception {
    Node node1 = new Node("node1", null, ResourceDescriptor.newInstance(200, 200, 2, 3L), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    Node node2 = new Node("node2", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    Node node3 = new Node("node3", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    Node node4 = new Node("node4", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    Node node6 = new Node("node6", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    MockApplicationMaster am = new MockApplicationMaster();
    am.initialize();
    SelectionManager sm = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);

    // Stage 1: candidate GPU attribute selection on individual nodes.
    long candidateGPU = sm.selectCandidateGpuAttribute(node1, 1);
    Assert.assertEquals(1L, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node1, 2);
    Assert.assertEquals(3L, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node3, 2);
    Assert.assertEquals(3L, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node3, 4);
    Assert.assertEquals(0xFL, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node3, 8);
    Assert.assertEquals(0xFFL, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node4, 2);
    Assert.assertEquals(0x30L, candidateGPU);

    // Stage 2: selection without node label or GPU type.
    SelectionResult result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    // Empty allocation failed;
    Assert.assertEquals(0, result.getNodeHosts().size());
    sm.addNode(node1);
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 3, 0L), null, null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals("node1", result.getNodeHosts().get(0));
    Assert.assertEquals(3L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 2, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    sm.addNode(node3);
    sm.addNode(node4);
    ResourceDescriptor resourceDescriptor = ResourceDescriptor.newInstance(1, 1, 8, 0L);
    result = sm.select(resourceDescriptor, null, null, 1, null, null);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xFF, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 8, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals("node4", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    sm.addNode(node2);
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    Assert.assertEquals("node2", result.getNodeHosts().get(0));
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 1, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals("node2", result.getNodeHosts().get(0));
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 2, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    sm.addNode(new Node("node5", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L)));
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    Assert.assertEquals(2, result.getNodeHosts().size());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 1, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals("node5", result.getNodeHosts().get(0));
    // NOTE(review): the select above asked for (1, 1, 2, ...) but the request recorded here is
    // (1, 2, 1, ...) - possibly transposed arguments. Kept unchanged because the assertions
    // below were written against this state; confirm the intent before altering.
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 2, 1, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    Assert.assertEquals(2, result.getNodeHosts().size());
    sm.addNode(new Node("node6", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L)));
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals("node6", result.getNodeHosts().get(0));
    sm.addNode(node6);

    // Stage 3: Allocation with Gpu type label
    Set<String> tag = new HashSet<>();
    // Case for node label only
    tag.add("K40");
    node3 = new Node("node3", tag, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    node4 = new Node("node4", tag, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 8, 0xFFL));
    node6 = new Node("node6", tag, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    SelectionManager sm2 = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);
    sm2.addNode(node3);
    sm2.addNode(node4);
    sm2.addNode(node6);
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), "K40", null, 1, null, null);
    Assert.assertEquals(2, result.getNodeHosts().size());
    // The order of the two returned hosts is not fixed, so both orders are accepted.
    if (result.getNodeHosts().get(0).equals("node6")) {
        Assert.assertEquals(240, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
        Assert.assertEquals(15, result.getGpuAttribute(result.getNodeHosts().get(1)).longValue());
    }
    if (result.getNodeHosts().get(0).equals("node3")) {
        Assert.assertEquals(15, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
        Assert.assertEquals(240, result.getGpuAttribute(result.getNodeHosts().get(1)).longValue());
    }
    // Record one container request per returned host, each with that host's GPU attribute.
    List<String> nodeList = new ArrayList<>();
    nodeList.add(result.getNodeHosts().get(0));
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), nodeList);
    nodeList.clear();
    nodeList.add(result.getNodeHosts().get(1));
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(1))), nodeList);
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), "K40", null, 1, null, null);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    // Node label not match
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), "M40", null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    Node node7 = new Node("node7", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    sm2.addNode(node7);
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals("node7", result.getNodeHosts().get(0));
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());

    // Stage 4: Case for gpu type config only
    node3 = new Node("node3", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    node4 = new Node("node4", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    Map<String, NodeConfiguration> gpuNodeConfig = createClusterTestNodes();
    SelectionManager sm3 = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);
    sm3.addNode(node3);
    sm3.addNode(node4);
    result = sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "K40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm3.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "T40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node4", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm3.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    try {
        result = sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "L40", 1, null, gpuNodeConfig);
        Assert.fail("NodeGpuType should not be relaxed to RM");
    } catch (NotAvailableException ignored) {
        // Expected: this GPU type request must be rejected rather than relaxed.
    }
    result = sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "L40,T40,K40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());

    // Requests that carry an explicit GPU attribute, on nodes with small total resources.
    SelectionManager sm4 = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);
    node6 = new Node("node6", null, ResourceDescriptor.newInstance(2, 2, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    node7 = new Node("node7", null, ResourceDescriptor.newInstance(2, 2, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    sm4.addNode(node6);
    sm4.addNode(node7);
    try {
        sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0x33L), null, "K40", 1, null, gpuNodeConfig);
        Assert.fail("NodeGpuType should not be relaxed to RM");
    } catch (NotAvailableException ignored) {
        // Expected: this GPU type request must be rejected rather than relaxed.
    }
    result = sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0x33L), null, "M40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node6", result.getNodeHosts().get(0));
    Assert.assertEquals(0x33, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    result = sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0xFL), null, null, 1, null, gpuNodeConfig);
    Assert.assertEquals("node6", result.getNodeHosts().get(0));
    Assert.assertEquals(0xFL, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm4.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0xF0L), null, "K40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node7", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
}
Aggregations