use of com.microsoft.frameworklauncher.common.model.ResourceDescriptor in project pai by Microsoft.
the class Node method compareTo.
/**
 * Orders two nodes by their available resource. The keys are compared in the
 * order Gpu number, Cpu number, memory (MB); the first key that differs decides.
 *
 * @param other the node to compare against
 * @return 1 if this node has more of the first differing resource, -1 if it has
 *     less, 0 if all three values are equal
 */
@Override
public int compareTo(Node other) {
  ResourceDescriptor mine = this.getAvailableResource();
  ResourceDescriptor theirs = other.getAvailableResource();

  // Primary key: Gpu number.
  int byGpu = mine.getGpuNumber() > theirs.getGpuNumber() ? 1
      : (mine.getGpuNumber() < theirs.getGpuNumber() ? -1 : 0);
  if (byGpu != 0) {
    return byGpu;
  }

  // Secondary key: Cpu number.
  int byCpu = mine.getCpuNumber() > theirs.getCpuNumber() ? 1
      : (mine.getCpuNumber() < theirs.getCpuNumber() ? -1 : 0);
  if (byCpu != 0) {
    return byCpu;
  }

  // Last key: memory in MB; a tie here means the nodes compare as equal.
  return mine.getMemoryMB() > theirs.getMemoryMB() ? 1
      : (mine.getMemoryMB() < theirs.getMemoryMB() ? -1 : 0);
}
use of com.microsoft.frameworklauncher.common.model.ResourceDescriptor in project pai by Microsoft.
the class SelectionManager method selectCandidateGpuAttribute.
/**
 * Picks {@code requestGpuNumber} bits out of the node's available Gpu attribute,
 * always taking the lowest bit that is still set, and returns them combined
 * into one value.
 *
 * <p>This is a simple sequential pick; the communication cost among Gpus is not
 * taken into account. If fewer bits are set than requested, the extra
 * iterations contribute nothing to the result.
 *
 * @param node the node whose available resource is inspected
 * @param requestGpuNumber how many Gpus to select
 * @return the combined value of the selected bits
 */
@VisibleForTesting
public synchronized Long selectCandidateGpuAttribute(Node node, Integer requestGpuNumber) {
  ResourceDescriptor available = node.getAvailableResource();
  // Only enforced when the JVM runs with assertions enabled.
  assert (requestGpuNumber <= available.getGpuNumber());

  Long remaining = available.getGpuAttribute();
  long picked = 0L;
  for (int taken = 0; taken < requestGpuNumber; taken++) {
    // Isolate the lowest set bit, add it to the selection, then clear it.
    long lowestBit = Long.lowestOneBit(remaining);
    picked += lowestBit;
    remaining ^= lowestBit;
  }
  return picked;
}
use of com.microsoft.frameworklauncher.common.model.ResourceDescriptor in project pai by Microsoft.
the class SelectionManager method filterNodesForNoneGpuJob.
/**
 * Removes every node whose total resource reports at least one Gpu from
 * {@code filteredNodes}, but only when the job requests no Gpu at all.
 *
 * @param jobTotalRequestGpu total Gpu number requested by the job; any non-zero
 *     value makes this method a no-op
 */
private void filterNodesForNoneGpuJob(int jobTotalRequestGpu) {
  if (jobTotalRequestGpu != 0) {
    return;
  }

  // Walk from the tail so that removing by index never shifts an entry that
  // has not been visited yet.
  int idx = filteredNodes.size();
  while (--idx >= 0) {
    Node candidate = allNodes.get(filteredNodes.get(idx));
    ResourceDescriptor nodeTotal = candidate.getTotalResource();
    if (nodeTotal.getGpuNumber() <= 0) {
      continue;
    }
    LOGGER.logDebug("skip gpu node for none gpu job: Node [%s], Node total resource: [%s]", candidate.getHost(), nodeTotal);
    filteredNodes.remove(idx);
  }
}
use of com.microsoft.frameworklauncher.common.model.ResourceDescriptor in project pai by Microsoft.
the class SelectionManager method select.
/**
 * Selects a node for one task of the given task role.
 *
 * <p>Looks up the role's requested resource, node label and Gpu type, works out
 * which ports (if any) should be reused, and delegates to the detailed
 * {@code select} overload. When the role uses the same ports for all tasks, the
 * ports chosen here may be cached in {@code previousRequestedPorts} for the next
 * task of the same role.
 *
 * @param taskRoleName name of the task role to select for
 * @return the result produced by the detailed {@code select} overload
 * @throws NotAvailableException declared by this method; presumably raised by the
 *     delegated selection — confirm against the overload
 */
public synchronized SelectionResult select(String taskRoleName) throws NotAvailableException {
  ResourceDescriptor roleResource = requestManager.getTaskResources().get(taskRoleName);
  LOGGER.logInfo("Select: TaskRole: [%s] Resource: [%s]", taskRoleName, roleResource);
  String nodeLabel = requestManager.getTaskPlatParams().get(taskRoleName).getTaskNodeLabel();
  String nodeGpuType = requestManager.getTaskPlatParams().get(taskRoleName).getTaskNodeGpuType();
  Map<String, NodeConfiguration> clusterNodes = requestManager.getClusterConfiguration().getNodes();
  int pendingTaskCount = statusManager.getStartStatesTaskCount(taskRoleName);

  // Ports of live associated containers are preferred; when there are none,
  // fall back to the ports cached from the previous request of this role.
  List<ValueRange> portsToReuse = null;
  if (requestManager.getTaskRoles().get(taskRoleName).getUseTheSamePorts()) {
    portsToReuse = statusManager.getLiveAssociatedContainerPorts(taskRoleName);
    boolean noLivePorts = ValueRangeUtils.getValueNumber(portsToReuse) <= 0;
    if (noLivePorts && previousRequestedPorts.containsKey(taskRoleName)) {
      // The cache only guides the very next task, so the entry is consumed here.
      portsToReuse = previousRequestedPorts.remove(taskRoleName);
    }
  }

  SelectionResult selection =
      select(roleResource, nodeLabel, nodeGpuType, pendingTaskCount, portsToReuse, clusterNodes);

  if (requestManager.getTaskRoles().get(taskRoleName).getUseTheSamePorts() && pendingTaskCount > 1) {
    // reusePortsTimes is a countdown kept separately from pendingTaskCount, because
    // pendingTaskCount may not decrease when timed-out tasks return to the start states.
    if (reusePortsTimes == 0) {
      reusePortsTimes = pendingTaskCount;
    }
    // Other tasks are still waiting: remember the ports just chosen for them.
    if (reusePortsTimes > 1) {
      previousRequestedPorts.put(taskRoleName, selection.getOptimizedResource().getPortRanges());
    }
    reusePortsTimes--;
  }
  return selection;
}
use of com.microsoft.frameworklauncher.common.model.ResourceDescriptor in project pai by Microsoft.
the class SelectionManager method filterNodesByResource.
/**
 * Removes from {@code filteredNodes} every node whose available resource cannot
 * hold the requested resource.
 *
 * @param requestResource the resource that must fit on a node; when {@code null}
 *     no filtering is done
 * @param skipLocalTriedResource when true, the resource recorded for a node in
 *     {@code localTriedResource} is subtracted from that node's available
 *     resource before the fit check
 */
private void filterNodesByResource(ResourceDescriptor requestResource, Boolean skipLocalTriedResource) {
  if (requestResource == null) {
    return;
  }

  // Iterate backwards so index-based removal does not disturb unvisited entries.
  for (int idx = filteredNodes.size() - 1; idx >= 0; idx--) {
    Node candidate = allNodes.get(filteredNodes.get(idx));
    // Work on a copy of the node's available resource rather than the node's own object.
    ResourceDescriptor usable = YamlUtils.deepCopy(candidate.getAvailableResource(), ResourceDescriptor.class);

    if (skipLocalTriedResource && localTriedResource.containsKey(candidate.getHost())) {
      LOGGER.logDebug("Skip local tried resources: [%s] on Node : [%s]", localTriedResource.get(candidate.getHost()), candidate.getHost());
      usable = ResourceDescriptor.subtract(usable, localTriedResource.get(candidate.getHost()));
    }

    if (ResourceDescriptor.fitsIn(requestResource, usable)) {
      continue;
    }
    LOGGER.logDebug("Resource does not fit in: Node: [%s] Request Resource: [%s], Available Resource: [%s]", candidate.getHost(), requestResource, usable);
    filteredNodes.remove(idx);
  }
}
Aggregations