use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.
the class TestIncreaseAllocationExpirer method testConsecutiveContainerIncreaseAllocationExpiration.
/**
 * Exercises two consecutive increase requests on the same container where
 * only the first increase token is ever used on the NM.
 *
 * <ol>
 *   <li>Allocate 1 container: containerId2 (1G)</li>
 *   <li>Increase resource of containerId2 from 1G to 3G</li>
 *   <li>AM acquires the token</li>
 *   <li>Increase resource of containerId2 again, from 3G to 5G</li>
 *   <li>AM acquires the token</li>
 *   <li>AM uses the first token to increase the container in NM to 3G</li>
 *   <li>AM NEVER uses the second token</li>
 *   <li>Verify containerId2 eventually is allocated 3G after token expires</li>
 *   <li>Verify NM eventually uses 3G for containerId2</li>
 * </ol>
 *
 * @throws Exception if any RM/NM/AM interaction in the scenario fails
 */
@Test
public void testConsecutiveContainerIncreaseAllocationExpiration() throws Exception {
  // Set the allocation expiration to 5 seconds
  conf.setLong(YarnConfiguration.RM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS, 5000);
  MockRM rm1 = new MockRM(conf);
  rm1.start();
  // Stop the RM in a finally block so a failed assertion or wait does not
  // leave it running after this test.
  try {
    // Register a 20GB node and submit an application
    MockNM nm1 = rm1.registerNode("127.0.0.1:1234", 20 * GB);
    RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "default");
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    nm1.nodeHeartbeat(app1.getCurrentAppAttempt().getAppAttemptId(), 1,
        ContainerState.RUNNING);
    // AM request a new container
    am1.allocate("127.0.0.1", 1 * GB, 1, new ArrayList<ContainerId>());
    ContainerId containerId2 =
        ContainerId.newContainerId(am1.getApplicationAttemptId(), 2);
    rm1.waitForState(nm1, containerId2, RMContainerState.ALLOCATED);
    // AM acquire a new container to start container allocation expirer
    am1.allocate(null, null).getAllocatedContainers();
    // Report container status
    nm1.nodeHeartbeat(app1.getCurrentAppAttempt().getAppAttemptId(), 2,
        ContainerState.RUNNING);
    // Wait until container status is RUNNING, and is removed from
    // allocation expirer
    rm1.waitForState(nm1, containerId2, RMContainerState.RUNNING);
    // am1 asks to change containerId2 from 1GB to 3GB
    am1.sendContainerResizingRequest(Collections.singletonList(
        UpdateContainerRequest.newInstance(0, containerId2,
            ContainerUpdateType.INCREASE_RESOURCE,
            Resources.createResource(3 * GB), null)));
    // Kick off scheduling and sleep for 1 second to
    // make sure the allocation is done
    nm1.nodeHeartbeat(true);
    Thread.sleep(1000);
    // Start container increase allocation expirer
    am1.allocate(null, null);
    // Remember the resource (3G) in order to report status
    Resource resource1 = Resources.clone(rm1.getResourceScheduler()
        .getRMContainer(containerId2).getAllocatedResource());
    // This should not work, since the container version is wrong
    // (the request still carries version 0)
    AllocateResponse response = am1.sendContainerResizingRequest(
        Collections.singletonList(UpdateContainerRequest.newInstance(0,
            containerId2, ContainerUpdateType.INCREASE_RESOURCE,
            Resources.createResource(5 * GB), null)));
    List<UpdateContainerError> updateErrors = response.getUpdateErrors();
    Assert.assertEquals(1, updateErrors.size());
    Assert.assertEquals("INCORRECT_CONTAINER_VERSION_ERROR",
        updateErrors.get(0).getReason());
    Assert.assertEquals(1, updateErrors.get(0).getCurrentContainerVersion());
    // am1 asks to change containerId2 from 3GB to 5GB, this time with the
    // current container version (1)
    am1.sendContainerResizingRequest(Collections.singletonList(
        UpdateContainerRequest.newInstance(1, containerId2,
            ContainerUpdateType.INCREASE_RESOURCE,
            Resources.createResource(5 * GB), null)));
    // Kick off scheduling and sleep for 1 second to
    // make sure the allocation is done
    nm1.nodeHeartbeat(true);
    Thread.sleep(1000);
    // Reset container increase allocation expirer
    am1.allocate(null, null);
    // Verify current resource allocation in RM
    checkUsedResource(rm1, "default", 6 * GB, null);
    FiCaSchedulerApp app =
        TestUtils.getFiCaSchedulerApp(rm1, app1.getApplicationId());
    Assert.assertEquals(6 * GB,
        app.getAppAttemptResourceUsage().getUsed().getMemorySize());
    // Verify available resource is now reduced to 14GB (20GB node - 6GB used)
    verifyAvailableResourceOfSchedulerNode(rm1, nm1.getNodeId(), 14 * GB);
    // Use the first token (3G)
    nm1.containerIncreaseStatus(getContainer(rm1, containerId2, resource1));
    // Wait long enough for the second token (5G) to expire, and verify that
    // the roll back action is completed as expected
    Thread.sleep(10000);
    am1.allocate(null, null);
    Thread.sleep(2000);
    // Verify container size is rolled back to 3G
    Assert.assertEquals(3 * GB, rm1.getResourceScheduler()
        .getRMContainer(containerId2).getAllocatedResource().getMemorySize());
    // Verify total resource usage is 4G
    checkUsedResource(rm1, "default", 4 * GB, null);
    Assert.assertEquals(4 * GB,
        app.getAppAttemptResourceUsage().getUsed().getMemorySize());
    // Verify available resource is restored to 16GB (20GB node - 4GB used)
    verifyAvailableResourceOfSchedulerNode(rm1, nm1.getNodeId(), 16 * GB);
    // Verify NM receives the decrease message (3G)
    List<Container> containersToDecrease =
        nm1.nodeHeartbeat(true).getContainersToDecrease();
    Assert.assertEquals(1, containersToDecrease.size());
    Assert.assertEquals(3 * GB,
        containersToDecrease.get(0).getResource().getMemorySize());
  } finally {
    rm1.stop();
  }
}
use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.
the class AllocateResponsePBImpl method initLocalUpdateErrorsList.
/**
 * Fills the local {@code updateErrors} list on first use.
 *
 * If {@code updateErrors} is already set this is a no-op. Otherwise the
 * update-error protos are read from {@code proto} or {@code builder}
 * (whichever {@code viaProto} selects) and each one is converted through
 * {@code ProtoUtils.convertFromProtoFormat} into the new list.
 */
private synchronized void initLocalUpdateErrorsList() {
  if (updateErrors == null) {
    AllocateResponseProtoOrBuilder source = viaProto ? proto : builder;
    List<YarnServiceProtos.UpdateContainerErrorProto> errorProtos =
        source.getUpdateErrorsList();
    // Publish the (initially empty) list first, then fill it in place.
    this.updateErrors = new ArrayList<UpdateContainerError>();
    for (YarnServiceProtos.UpdateContainerErrorProto errorProto : errorProtos) {
      updateErrors.add(ProtoUtils.convertFromProtoFormat(errorProto));
    }
  }
}
use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.
the class ApplicationMasterService method allocateInternal.
/**
 * Processes one allocate call from an application attempt and fills in the
 * supplied response.
 *
 * The method clamps the reported progress into [0, 1], forwards it to the
 * app attempt, validates the asks, blacklist and release lists, splits and
 * validates container update requests, hands everything to the scheduler
 * (unless the attempt is finishing or the app's final state is stored) and
 * then copies the resulting allocation, node updates, completed containers
 * and update errors into {@code allocateResponse}.
 *
 * @param appAttemptId the attempt issuing the allocate call
 * @param request the allocate request; its progress value and the label
 *     expressions of its asks may be modified in place
 * @param allocateResponse the response object populated by this method
 * @throws YarnException declared by the signature; exceptions caught from
 *     resource-ask, blacklist and container-release validation are logged
 *     and rethrown
 */
protected void allocateInternal(ApplicationAttemptId appAttemptId, AllocateRequest request, AllocateResponse allocateResponse) throws YarnException {
  //filter illegal progress values
  float filteredProgress = request.getProgress();
  if (Float.isNaN(filteredProgress)
      || filteredProgress == Float.NEGATIVE_INFINITY
      || filteredProgress < 0) {
    request.setProgress(0);
  } else if (filteredProgress > 1
      || filteredProgress == Float.POSITIVE_INFINITY) {
    request.setProgress(1);
  }
  // Send the status update to the appAttempt.
  this.rmContext.getDispatcher().getEventHandler().handle(
      new RMAppAttemptStatusupdateEvent(appAttemptId, request.getProgress()));
  List<ResourceRequest> ask = request.getAskList();
  List<ContainerId> release = request.getReleaseList();
  ResourceBlacklistRequest blacklistRequest =
      request.getResourceBlacklistRequest();
  // Typed empty lists instead of the raw Collections.EMPTY_LIST constant,
  // which would be an unchecked conversion to List<String>.
  List<String> blacklistAdditions = (blacklistRequest != null)
      ? blacklistRequest.getBlacklistAdditions()
      : Collections.<String>emptyList();
  List<String> blacklistRemovals = (blacklistRequest != null)
      ? blacklistRequest.getBlacklistRemovals()
      : Collections.<String>emptyList();
  RMApp app = this.rmContext.getRMApps().get(appAttemptId.getApplicationId());
  // set label expression for Resource Requests if resourceName=ANY
  ApplicationSubmissionContext asc = app.getApplicationSubmissionContext();
  for (ResourceRequest req : ask) {
    if (null == req.getNodeLabelExpression()
        && ResourceRequest.ANY.equals(req.getResourceName())) {
      req.setNodeLabelExpression(asc.getNodeLabelExpression());
    }
  }
  Resource maximumCapacity = rScheduler.getMaximumResourceCapability();
  // sanity check
  try {
    RMServerUtils.normalizeAndValidateRequests(ask, maximumCapacity,
        app.getQueue(), rScheduler, rmContext);
  } catch (InvalidResourceRequestException e) {
    LOG.warn("Invalid resource ask by application " + appAttemptId, e);
    throw e;
  }
  try {
    RMServerUtils.validateBlacklistRequest(blacklistRequest);
  } catch (InvalidResourceBlacklistRequestException e) {
    LOG.warn("Invalid blacklist request by application " + appAttemptId, e);
    throw e;
  }
  // AM to release containers from the earlier attempt.
  // The release list is only validated against this attempt when the app
  // does not keep containers across application attempts.
  if (!app.getApplicationSubmissionContext()
      .getKeepContainersAcrossApplicationAttempts()) {
    try {
      RMServerUtils.validateContainerReleaseRequest(release, appAttemptId);
    } catch (InvalidContainerReleaseException e) {
      LOG.warn("Invalid container release by application " + appAttemptId, e);
      throw e;
    }
  }
  // Split Update Resource Requests into increase and decrease.
  // No Exceptions are thrown here. All update errors are aggregated
  // and returned to the AM.
  List<UpdateContainerError> updateErrors = new ArrayList<>();
  ContainerUpdates containerUpdateRequests =
      RMServerUtils.validateAndSplitUpdateResourceRequests(rmContext, request,
          maximumCapacity, updateErrors);
  // Send new requests to appAttempt.
  Allocation allocation;
  RMAppAttemptState state =
      app.getRMAppAttempt(appAttemptId).getAppAttemptState();
  if (state.equals(RMAppAttemptState.FINAL_SAVING)
      || state.equals(RMAppAttemptState.FINISHING)
      || app.isAppFinalStateStored()) {
    // The scheduler is not consulted in this case; an empty allocation is
    // reported instead.
    LOG.warn(appAttemptId + " is in " + state + " state, ignore container allocate request.");
    allocation = EMPTY_ALLOCATION;
  } else {
    allocation = this.rScheduler.allocate(appAttemptId, ask, release,
        blacklistAdditions, blacklistRemovals, containerUpdateRequests);
  }
  if (!blacklistAdditions.isEmpty() || !blacklistRemovals.isEmpty()) {
    LOG.info("blacklist are updated in Scheduler." + "blacklistAdditions: " + blacklistAdditions + ", " + "blacklistRemovals: " + blacklistRemovals);
  }
  RMAppAttempt appAttempt = app.getRMAppAttempt(appAttemptId);
  if (allocation.getNMTokens() != null && !allocation.getNMTokens().isEmpty()) {
    allocateResponse.setNMTokens(allocation.getNMTokens());
  }
  // Notify the AM of container update errors
  addToUpdateContainerErrors(allocateResponse, updateErrors);
  // update the response with the deltas of node status changes
  List<RMNode> updatedNodes = new ArrayList<RMNode>();
  if (app.pullRMNodeUpdates(updatedNodes) > 0) {
    List<NodeReport> updatedNodeReports = new ArrayList<NodeReport>();
    for (RMNode rmNode : updatedNodes) {
      SchedulerNodeReport schedulerNodeReport =
          rScheduler.getNodeReport(rmNode.getNodeID());
      // Defaults used when the scheduler has no report for this node.
      Resource used = BuilderUtils.newResource(0, 0);
      int numContainers = 0;
      if (schedulerNodeReport != null) {
        used = schedulerNodeReport.getUsedResource();
        numContainers = schedulerNodeReport.getNumContainers();
      }
      NodeId nodeId = rmNode.getNodeID();
      NodeReport report = BuilderUtils.newNodeReport(nodeId,
          rmNode.getState(), rmNode.getHttpAddress(), rmNode.getRackName(),
          used, rmNode.getTotalCapability(), numContainers,
          rmNode.getHealthReport(), rmNode.getLastHealthReportTime(),
          rmNode.getNodeLabels());
      updatedNodeReports.add(report);
    }
    allocateResponse.setUpdatedNodes(updatedNodeReports);
  }
  addToAllocatedContainers(allocateResponse, allocation.getContainers());
  allocateResponse.setCompletedContainersStatuses(
      appAttempt.pullJustFinishedContainers());
  allocateResponse.setAvailableResources(allocation.getResourceLimit());
  addToContainerUpdates(appAttemptId, allocateResponse, allocation);
  allocateResponse.setNumClusterNodes(this.rScheduler.getNumClusterNodes());
  // add collector address for this application
  if (YarnConfiguration.timelineServiceV2Enabled(getConfig())) {
    allocateResponse.setCollectorAddr(this.rmContext.getRMApps()
        .get(appAttemptId.getApplicationId()).getCollectorAddr());
  }
  // add preemption to the allocateResponse message (if any)
  allocateResponse.setPreemptionMessage(generatePreemptionMessage(allocation));
  // Set application priority
  allocateResponse.setApplicationPriority(app.getApplicationPriority());
}
use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.
the class RMServerUtils method checkAndcreateUpdateError.
/**
 * Appends an {@link UpdateContainerError} to {@code errors} when an error
 * message is present; does nothing when {@code msg} is {@code null}.
 *
 * @param errors list that receives the newly created error
 * @param updateReq the update request the error refers to
 * @param rmContainer the container whose current version is reported, or
 *     {@code null}, in which case the version is recorded as -1
 * @param msg the error reason, or {@code null} if there is no error
 */
private static void checkAndcreateUpdateError(List<UpdateContainerError> errors, UpdateContainerRequest updateReq, RMContainer rmContainer, String msg) {
  if (msg == null) {
    return;
  }
  UpdateContainerError error =
      RECORD_FACTORY.newRecordInstance(UpdateContainerError.class);
  error.setReason(msg);
  error.setUpdateContainerRequest(updateReq);
  // -1 stands in for the version when no RMContainer is available.
  int currentVersion = (rmContainer == null)
      ? -1
      : rmContainer.getContainer().getVersion();
  error.setCurrentContainerVersion(currentVersion);
  errors.add(error);
}
Aggregations