Search in sources :

Example 1 with NotAvailableException

use of com.microsoft.frameworklauncher.common.exceptions.NotAvailableException in project pai by Microsoft.

the class SelectionManager method select.

/**
 * Selects nodes (and ports) for one container request.
 *
 * <p>The candidate node set is re-initialized and then narrowed step by step: by node label,
 * by GPU type, optionally by the "none GPU job on GPU node" policy, by resource and finally by
 * the rack selection policy. When {@code reusePorts} carries at least one value, those ports
 * replace the port ranges of the (deep-copied) request before the resource filter runs.
 *
 * @param requestResource      the requested resource; it is deep-copied, never modified here
 * @param requestNodeLabel     node label handed to the label filter
 * @param requestNodeGpuType   GPU type handed to the GPU type filter; a non-null value makes an
 *                             empty candidate set fatal for this call
 * @param startStatesTaskCount task count forwarded to the rack policy filter and node selection
 * @param reusePorts           previously selected ports to reuse, if any
 * @param configuredNodes      node configurations consulted by the GPU type filter
 * @return the selection result, with the optimized resource (including selected ports) attached
 * @throws NotAvailableException if no candidate node is left and either a GPU type was
 *                               requested or the request asks for a positive port number
 */
@VisibleForTesting
public synchronized SelectionResult select(
        ResourceDescriptor requestResource,
        String requestNodeLabel,
        String requestNodeGpuType,
        int startStatesTaskCount,
        List<ValueRange> reusePorts,
        Map<String, NodeConfiguration> configuredNodes) throws NotAvailableException {
    LOGGER.logInfo("select: Request: Resource: [%s], NodeLabel: [%s], NodeGpuType: [%s], StartStatesTaskCount: [%d], ReusePorts: [%s]", requestResource, requestNodeLabel, requestNodeGpuType, startStatesTaskCount, ValueRangeUtils.toString(reusePorts));

    // Narrow the candidate set by label and GPU type first.
    initFilteredNodes();
    filterNodesByNodeLabel(requestNodeLabel);
    filterNodesByGpuType(configuredNodes, requestNodeGpuType);
    if (!conf.getAmAllowNoneGpuJobOnGpuNode()) {
        filterNodesForNoneGpuJob(requestManager.getTotalGpuCount());
    }

    // Work on a private copy so the caller's descriptor is left untouched.
    ResourceDescriptor effectiveRequest = YamlUtils.deepCopy(requestResource, ResourceDescriptor.class);
    if (ValueRangeUtils.getValueNumber(reusePorts) > 0) {
        LOGGER.logInfo("select: reuse pre-selected ports: [%s]", ValueRangeUtils.toString(reusePorts));
        effectiveRequest.setPortRanges(reusePorts);
    }

    filterNodesByResource(effectiveRequest, conf.getAmSkipLocalTriedResource());
    filterNodesByRackSelectionPolicy(effectiveRequest, startStatesTaskCount);

    // No candidate node is left for this request. Relaxing GpuType or ports is not supported
    // in YARN, so if either one was specified, abort this request and let the caller try later.
    // Otherwise selection simply proceeds with the empty candidate set.
    if (filteredNodes.size() < 1
            && (requestNodeGpuType != null || requestResource.getPortNumber() > 0)) {
        throw new NotAvailableException(String.format("Don't have enough nodes to meet request: optimizedRequestResource: [%s], NodeGpuType: [%s], NodeLabel: [%s]", effectiveRequest, requestNodeGpuType, requestNodeLabel));
    }

    SelectionResult selected = selectNodes(effectiveRequest, startStatesTaskCount);
    effectiveRequest.setPortRanges(selectPorts(selected, effectiveRequest));
    selected.setOptimizedResource(effectiveRequest);
    return selected;
}
Also used : ValueRange(com.microsoft.frameworklauncher.common.model.ValueRange) NotAvailableException(com.microsoft.frameworklauncher.common.exceptions.NotAvailableException) ResourceDescriptor(com.microsoft.frameworklauncher.common.model.ResourceDescriptor) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 2 with NotAvailableException

use of com.microsoft.frameworklauncher.common.exceptions.NotAvailableException in project pai by Microsoft.

the class ApplicationMaster method addContainerRequest.

/**
 * Builds a container request for the given task and submits it to the RM.
 *
 * <p>Step 1: the request is set up; if that raises {@link NotAvailableException}, a delayed
 * system task is queued that calls this method again with a deep-copied snapshot of the task
 * status (provided the task is still known to the status manager), and this call returns.
 *
 * <p>Step 2: the request is registered with the RM client and the selection manager, the task
 * is transitioned to {@code CONTAINER_REQUESTED}, and a delayed system task is queued which,
 * if the request's priority is still tracked by the status manager when it fires, cancels the
 * request, moves the task back to {@code TASK_WAITING} and requests again.
 *
 * @param taskStatus status of the task a container is requested for
 * @throws Exception propagated from the setup, RM client or status manager calls
 */
private void addContainerRequest(TaskStatus taskStatus) throws Exception {
    TaskStatusLocator locator = new TaskStatusLocator(taskStatus.getTaskRoleName(), taskStatus.getTaskIndex());
    String prefix = String.format("%s: addContainerRequest: ", locator);
    LOGGER.logInfo(prefix + "Start");

    // 1. setupContainerRequest, retry later if request is not available.
    // The retry interval is drawn between the configured min and max.
    Integer retryIntervalSec = CommonUtils.getRandomNumber(
            conf.getLauncherConfig().getAmSetupContainerRequestMinRetryIntervalSec(),
            conf.getLauncherConfig().getAmSetupContainerRequestMaxRetryIntervalSec());
    ContainerRequest request;
    try {
        request = setupContainerRequest(taskStatus);
    } catch (NotAvailableException notAvailable) {
        LOGGER.logWarning(notAvailable, prefix + "Failed to setupContainerRequest: ContainerRequest may be temporarily not available. Will retry after %ss.", retryIntervalSec);
        // The retry runs later, so it works on a copy taken now.
        TaskStatus snapshot = YamlUtils.deepCopy(taskStatus, TaskStatus.class);
        transitionTaskStateQueue.queueSystemTaskDelayed(() -> {
            if (!statusManager.containsTask(snapshot)) {
                LOGGER.logWarning(prefix + "Task not found in Status. Ignore it.");
            } else {
                addContainerRequest(snapshot);
            }
        }, retryIntervalSec * 1000);
        return;
    }

    // 2. addContainerRequest, retry later if request is timeout.
    Integer timeoutSec = CommonUtils.getRandomNumber(
            conf.getLauncherConfig().getAmContainerRequestMinTimeoutSec(),
            conf.getLauncherConfig().getAmContainerRequestMaxTimeoutSec());
    LOGGER.logInfo(prefix + "Send ContainerRequest to RM with timeout %ss. ContainerRequest: [%s]", timeoutSec, HadoopExts.toString(request));
    rmClient.addContainerRequest(request);
    selectionManager.addContainerRequest(request);
    statusManager.transitionTaskState(locator, TaskState.CONTAINER_REQUESTED, new TaskEvent().setContainerRequest(request));

    transitionTaskStateQueue.queueSystemTaskDelayed(() -> {
        // Nothing to do unless the request's priority is still tracked when the timeout fires.
        if (!statusManager.containsTask(request.getPriority())) {
            return;
        }
        LOGGER.logWarning(prefix + "ContainerRequest cannot be satisfied within timeout %ss. Cancel it and Request again. ContainerRequest: [%s]", timeoutSec, HadoopExts.toString(request));
        removeContainerRequest(taskStatus);
        statusManager.transitionTaskState(locator, TaskState.TASK_WAITING);
        addContainerRequest(taskStatus);
    }, timeoutSec * 1000);
}
Also used : ContainerRequest(org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest) NotAvailableException(com.microsoft.frameworklauncher.common.exceptions.NotAvailableException)

Example 3 with NotAvailableException

use of com.microsoft.frameworklauncher.common.exceptions.NotAvailableException in project pai by Microsoft.

the class SelectionManagerTest method testSelectionManager.

/**
 * End-to-end scenario test for {@code SelectionManager}: candidate GPU attribute selection,
 * plain node selection as nodes are added and requests are registered, selection by node
 * label, selection by configured GPU type, and selection with an explicitly requested GPU
 * attribute.
 *
 * <p>Reading aid (assumption, confirm against the model classes): the arguments of
 * {@code ResourceDescriptor.newInstance} appear to be (memory, cpu, gpu count, gpu attribute
 * bitmask), and the two descriptors passed to {@code Node} appear to be the node's total and
 * already used resources.
 *
 * @throws Exception if the mock application master or any selection call fails unexpectedly
 */
@Test
public void testSelectionManager() throws Exception {
    Node node1 = new Node("node1", null, ResourceDescriptor.newInstance(200, 200, 2, 3L), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    Node node2 = new Node("node2", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    Node node3 = new Node("node3", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    Node node4 = new Node("node4", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    Node node6 = new Node("node6", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    MockApplicationMaster am = new MockApplicationMaster();
    am.initialize();
    SelectionManager sm = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);

    // Candidate GPU attribute selection on individual nodes.
    long candidateGPU = sm.selectCandidateGpuAttribute(node1, 1);
    Assert.assertEquals(1L, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node1, 2);
    Assert.assertEquals(3L, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node3, 2);
    Assert.assertEquals(3L, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node3, 4);
    Assert.assertEquals(0xFL, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node3, 8);
    Assert.assertEquals(0xFFL, candidateGPU);
    candidateGPU = sm.selectCandidateGpuAttribute(node4, 2);
    Assert.assertEquals(0x30L, candidateGPU);

    // Selection with no nodes registered yields no hosts.
    SelectionResult result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());

    // Plain selection while nodes are added and requests are registered.
    sm.addNode(node1);
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 3, 0L), null, null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals("node1", result.getNodeHosts().get(0));
    Assert.assertEquals(3L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 2, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    sm.addNode(node3);
    sm.addNode(node4);
    ResourceDescriptor resourceDescriptor = ResourceDescriptor.newInstance(1, 1, 8, 0L);
    result = sm.select(resourceDescriptor, null, null, 1, null, null);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xFFL, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 8, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals("node4", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    sm.addNode(node2);
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    Assert.assertEquals("node2", result.getNodeHosts().get(0));
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 1, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals("node2", result.getNodeHosts().get(0));
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 2, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    sm.addNode(new Node("node5", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L)));
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    Assert.assertEquals(2, result.getNodeHosts().size());
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 1, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 2, 0L), null, null, 1, null, null);
    Assert.assertEquals("node5", result.getNodeHosts().get(0));
    // NOTE(review): the descriptor below is (1, 2, 1, ...) although the preceding select asked
    // for (1, 1, 2, ...); kept as-is because later assertions may depend on it - confirm intent.
    sm.addContainerRequest(ResourceDescriptor.newInstance(1, 2, 1, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 1, 0L), null, null, 1, null, null);
    Assert.assertEquals(2, result.getNodeHosts().size());
    sm.addNode(new Node("node6", null, ResourceDescriptor.newInstance(200, 200, 4, 0xFL), ResourceDescriptor.newInstance(0, 0, 0, 0L)));
    result = sm.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals("node6", result.getNodeHosts().get(0));
    // NOTE(review): "node6" is added a second time here using the local declared at the top;
    // nothing below asserts on sm any more - confirm whether this call is still needed.
    sm.addNode(node6);

    // Allocation with Gpu type label
    Set<String> tag = new HashSet<>();
    // Case for node label only
    tag.add("K40");
    node3 = new Node("node3", tag, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    node4 = new Node("node4", tag, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 8, 0xFFL));
    node6 = new Node("node6", tag, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    SelectionManager sm2 = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);
    sm2.addNode(node3);
    sm2.addNode(node4);
    sm2.addNode(node6);
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), "K40", null, 1, null, null);
    Assert.assertEquals(2, result.getNodeHosts().size());
    // The order of the two hosts is not fixed, so accept either order, but fail loudly on
    // anything else instead of silently skipping the GPU attribute checks.
    String firstHost = result.getNodeHosts().get(0);
    if ("node6".equals(firstHost)) {
        Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
        Assert.assertEquals(0xFL, result.getGpuAttribute(result.getNodeHosts().get(1)).longValue());
    } else if ("node3".equals(firstHost)) {
        Assert.assertEquals(0xFL, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
        Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(1)).longValue());
    } else {
        Assert.fail("Unexpected first host for K40 label selection: " + firstHost);
    }
    List<String> nodeList = new ArrayList<>();
    nodeList.add(result.getNodeHosts().get(0));
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), nodeList);
    nodeList.clear();
    nodeList.add(result.getNodeHosts().get(1));
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(1))), nodeList);
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), "K40", null, 1, null, null);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    // Node label not match
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), "M40", null, 1, null, null);
    Assert.assertEquals(0, result.getNodeHosts().size());
    Node node7 = new Node("node7", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    sm2.addNode(node7);
    result = sm2.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, null, 1, null, null);
    Assert.assertEquals("node7", result.getNodeHosts().get(0));
    sm2.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());

    // Case for gpu type config only
    node3 = new Node("node3", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    node4 = new Node("node4", null, ResourceDescriptor.newInstance(200, 200, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    Map<String, NodeConfiguration> gpuNodeConfig = createClusterTestNodes();
    SelectionManager sm3 = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);
    sm3.addNode(node3);
    sm3.addNode(node4);
    result = sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "K40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xFL, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm3.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "T40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node4", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm3.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    try {
        sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "L40", 1, null, gpuNodeConfig);
        Assert.fail("NodeGpuType should not be relaxed to RM");
    } catch (NotAvailableException ignored) {
        // Expected: a request with a GPU type and no matching node must be rejected.
    }
    result = sm3.select(ResourceDescriptor.newInstance(1, 1, 4, 0L), null, "L40,T40,K40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node3", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());

    // Selection with an explicitly requested GPU attribute.
    SelectionManager sm4 = new SelectionManager(am.conf.getLauncherConfig(), am.statusManager, am.requestManager);
    node6 = new Node("node6", null, ResourceDescriptor.newInstance(2, 2, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 0, 0L));
    node7 = new Node("node7", null, ResourceDescriptor.newInstance(2, 2, 8, 0xFFL), ResourceDescriptor.newInstance(0, 0, 4, 0xFL));
    sm4.addNode(node6);
    sm4.addNode(node7);
    try {
        sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0x33L), null, "K40", 1, null, gpuNodeConfig);
        Assert.fail("NodeGpuType should not be relaxed to RM");
    } catch (NotAvailableException ignored) {
        // Expected: a request with a GPU type and no matching node must be rejected.
    }
    result = sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0x33L), null, "M40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node6", result.getNodeHosts().get(0));
    Assert.assertEquals(0x33L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    result = sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0xFL), null, null, 1, null, gpuNodeConfig);
    Assert.assertEquals("node6", result.getNodeHosts().get(0));
    Assert.assertEquals(0xFL, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
    sm4.addContainerRequest(ResourceDescriptor.newInstance(1, 1, 4, result.getGpuAttribute(result.getNodeHosts().get(0))), result.getNodeHosts());
    result = sm4.select(ResourceDescriptor.newInstance(1, 1, 4, 0xF0L), null, "K40", 1, null, gpuNodeConfig);
    Assert.assertEquals("node7", result.getNodeHosts().get(0));
    Assert.assertEquals(0xF0L, result.getGpuAttribute(result.getNodeHosts().get(0)).longValue());
}
Also used : NodeConfiguration(com.microsoft.frameworklauncher.common.model.NodeConfiguration) NotAvailableException(com.microsoft.frameworklauncher.common.exceptions.NotAvailableException) ResourceDescriptor(com.microsoft.frameworklauncher.common.model.ResourceDescriptor) Test(org.junit.Test)

Aggregations

NotAvailableException (com.microsoft.frameworklauncher.common.exceptions.NotAvailableException)3 ResourceDescriptor (com.microsoft.frameworklauncher.common.model.ResourceDescriptor)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 NodeConfiguration (com.microsoft.frameworklauncher.common.model.NodeConfiguration)1 ValueRange (com.microsoft.frameworklauncher.common.model.ValueRange)1 ContainerRequest (org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest)1 Test (org.junit.Test)1