Search in sources :

Example 1 with UpdateContainerError

use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.

the class TestIncreaseAllocationExpirer method testConsecutiveContainerIncreaseAllocationExpiration.

/**
 * Exercises two consecutive increase requests on one container where only
 * the first increase token is ever used.
 *
 * <ol>
 *   <li>Allocate 1 container: containerId2 (1G)</li>
 *   <li>Increase resource of containerId2: 1G -> 3G</li>
 *   <li>AM acquires the token</li>
 *   <li>Increase resource of containerId2 again: 3G -> 5G</li>
 *   <li>AM acquires the token</li>
 *   <li>AM uses the first token to increase the container in NM to 3G</li>
 *   <li>AM NEVER uses the second token</li>
 *   <li>Verify containerId2 eventually is allocated 3G after token expires</li>
 *   <li>Verify NM eventually uses 3G for containerId2</li>
 * </ol>
 *
 * @throws Exception if any RM/NM/AM interaction fails
 */
@Test
public void testConsecutiveContainerIncreaseAllocationExpiration() throws Exception {
    // Set the allocation expiration to 5 seconds
    conf.setLong(YarnConfiguration.RM_CONTAINER_ALLOC_EXPIRY_INTERVAL_MS, 5000);
    MockRM rm1 = new MockRM(conf);
    rm1.start();
    // Stop the RM in a finally block so that a failed assertion or wait does
    // not leave it running for the tests that follow.
    try {
        // Submit an application
        MockNM nm1 = rm1.registerNode("127.0.0.1:1234", 20 * GB);
        RMApp app1 = rm1.submitApp(1 * GB, "app", "user", null, "default");
        MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
        nm1.nodeHeartbeat(app1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.RUNNING);
        // AM request a new container
        am1.allocate("127.0.0.1", 1 * GB, 1, new ArrayList<ContainerId>());
        ContainerId containerId2 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 2);
        rm1.waitForState(nm1, containerId2, RMContainerState.ALLOCATED);
        // AM acquire a new container to start container allocation expirer
        am1.allocate(null, null).getAllocatedContainers();
        // Report container status
        nm1.nodeHeartbeat(app1.getCurrentAppAttempt().getAppAttemptId(), 2, ContainerState.RUNNING);
        // Wait until container status is RUNNING, and is removed from
        // allocation expirer
        rm1.waitForState(nm1, containerId2, RMContainerState.RUNNING);
        // am1 asks to change containerId2 from 1GB to 3GB
        am1.sendContainerResizingRequest(Collections.singletonList(UpdateContainerRequest.newInstance(0, containerId2, ContainerUpdateType.INCREASE_RESOURCE, Resources.createResource(3 * GB), null)));
        // Kick off scheduling and sleep for 1 second to
        // make sure the allocation is done
        nm1.nodeHeartbeat(true);
        Thread.sleep(1000);
        // Start container increase allocation expirer
        am1.allocate(null, null);
        // Remember the resource (3G) in order to report status
        Resource resource1 = Resources.clone(rm1.getResourceScheduler().getRMContainer(containerId2).getAllocatedResource());
        // This should not work, since the container version is wrong
        // (the request carries version 0, the RM reports version 1)
        AllocateResponse response = am1.sendContainerResizingRequest(Collections.singletonList(UpdateContainerRequest.newInstance(0, containerId2, ContainerUpdateType.INCREASE_RESOURCE, Resources.createResource(5 * GB), null)));
        List<UpdateContainerError> updateErrors = response.getUpdateErrors();
        Assert.assertEquals(1, updateErrors.size());
        Assert.assertEquals("INCORRECT_CONTAINER_VERSION_ERROR", updateErrors.get(0).getReason());
        Assert.assertEquals(1, updateErrors.get(0).getCurrentContainerVersion());
        // am1 asks to change containerId2 from 3GB to 5GB
        am1.sendContainerResizingRequest(Collections.singletonList(UpdateContainerRequest.newInstance(1, containerId2, ContainerUpdateType.INCREASE_RESOURCE, Resources.createResource(5 * GB), null)));
        // Kick off scheduling and sleep for 1 second to
        // make sure the allocation is done
        nm1.nodeHeartbeat(true);
        Thread.sleep(1000);
        // Reset container increase allocation expirer
        am1.allocate(null, null);
        // Verify current resource allocation in RM
        checkUsedResource(rm1, "default", 6 * GB, null);
        FiCaSchedulerApp app = TestUtils.getFiCaSchedulerApp(rm1, app1.getApplicationId());
        Assert.assertEquals(6 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize());
        // Verify available resource is now reduced to 14GB
        verifyAvailableResourceOfSchedulerNode(rm1, nm1.getNodeId(), 14 * GB);
        // Use the first token (3G)
        nm1.containerIncreaseStatus(getContainer(rm1, containerId2, resource1));
        // Wait long enough for the second token (5G) to expire, and verify that
        // the roll back action is completed as expected
        Thread.sleep(10000);
        am1.allocate(null, null);
        Thread.sleep(2000);
        // Verify container size is rolled back to 3G
        Assert.assertEquals(3 * GB, rm1.getResourceScheduler().getRMContainer(containerId2).getAllocatedResource().getMemorySize());
        // Verify total resource usage is 4G
        checkUsedResource(rm1, "default", 4 * GB, null);
        Assert.assertEquals(4 * GB, app.getAppAttemptResourceUsage().getUsed().getMemorySize());
        // Verify available resource is rolled back to 16GB
        verifyAvailableResourceOfSchedulerNode(rm1, nm1.getNodeId(), 16 * GB);
        // Verify NM receives the decrease message (3G)
        List<Container> containersToDecrease = nm1.nodeHeartbeat(true).getContainersToDecrease();
        Assert.assertEquals(1, containersToDecrease.size());
        Assert.assertEquals(3 * GB, containersToDecrease.get(0).getResource().getMemorySize());
    } finally {
        rm1.stop();
    }
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) Resource(org.apache.hadoop.yarn.api.records.Resource) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) UpdateContainerError(org.apache.hadoop.yarn.api.records.UpdateContainerError) RMContainer(org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer) Container(org.apache.hadoop.yarn.api.records.Container) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) Test(org.junit.Test)

Example 2 with UpdateContainerError

use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.

the class AllocateResponsePBImpl method initLocalUpdateErrorsList.

/**
 * Lazily fills the local {@code updateErrors} list from the protobuf
 * message. Does nothing when the list has already been initialized.
 */
private synchronized void initLocalUpdateErrorsList() {
    if (updateErrors == null) {
        // Read from the immutable proto when it is current, otherwise from
        // the builder.
        AllocateResponseProtoOrBuilder source;
        if (viaProto) {
            source = proto;
        } else {
            source = builder;
        }
        List<YarnServiceProtos.UpdateContainerErrorProto> errorProtos = source.getUpdateErrorsList();
        this.updateErrors = new ArrayList<UpdateContainerError>();
        for (YarnServiceProtos.UpdateContainerErrorProto errorProto : errorProtos) {
            updateErrors.add(ProtoUtils.convertFromProtoFormat(errorProto));
        }
    }
}
Also used : UpdateContainerError(org.apache.hadoop.yarn.api.records.UpdateContainerError) AllocateResponseProtoOrBuilder(org.apache.hadoop.yarn.proto.YarnServiceProtos.AllocateResponseProtoOrBuilder) YarnServiceProtos(org.apache.hadoop.yarn.proto.YarnServiceProtos)

Example 3 with UpdateContainerError

use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.

the class ApplicationMasterService method allocateInternal.

/**
 * Processes one allocate call for an application attempt: validates the
 * request, forwards it to the scheduler and fills in
 * {@code allocateResponse} with the outcome.
 *
 * @param appAttemptId the attempt issuing the allocate call
 * @param request the allocate request; its progress value is clamped to
 *     the range [0, 1] in place
 * @param allocateResponse the response object populated by this method
 * @throws YarnException if the resource asks, the blacklist request or the
 *     release list fail validation
 */
protected void allocateInternal(ApplicationAttemptId appAttemptId, AllocateRequest request, AllocateResponse allocateResponse) throws YarnException {
    // Filter illegal progress values. The comparisons below also cover the
    // infinities (-Infinity < 0 and +Infinity > 1), so only NaN needs an
    // explicit check.
    float filteredProgress = request.getProgress();
    if (Float.isNaN(filteredProgress) || filteredProgress < 0) {
        request.setProgress(0);
    } else if (filteredProgress > 1) {
        request.setProgress(1);
    }
    // Send the status update to the appAttempt.
    this.rmContext.getDispatcher().getEventHandler().handle(new RMAppAttemptStatusupdateEvent(appAttemptId, request.getProgress()));
    List<ResourceRequest> ask = request.getAskList();
    List<ContainerId> release = request.getReleaseList();
    ResourceBlacklistRequest blacklistRequest = request.getResourceBlacklistRequest();
    // Use the typed empty list rather than the raw Collections.EMPTY_LIST
    // to avoid an unchecked conversion.
    List<String> blacklistAdditions = (blacklistRequest != null) ? blacklistRequest.getBlacklistAdditions() : Collections.<String>emptyList();
    List<String> blacklistRemovals = (blacklistRequest != null) ? blacklistRequest.getBlacklistRemovals() : Collections.<String>emptyList();
    RMApp app = this.rmContext.getRMApps().get(appAttemptId.getApplicationId());
    // set label expression for Resource Requests if resourceName=ANY
    ApplicationSubmissionContext asc = app.getApplicationSubmissionContext();
    for (ResourceRequest req : ask) {
        if (null == req.getNodeLabelExpression() && ResourceRequest.ANY.equals(req.getResourceName())) {
            req.setNodeLabelExpression(asc.getNodeLabelExpression());
        }
    }
    Resource maximumCapacity = rScheduler.getMaximumResourceCapability();
    // sanity check
    try {
        RMServerUtils.normalizeAndValidateRequests(ask, maximumCapacity, app.getQueue(), rScheduler, rmContext);
    } catch (InvalidResourceRequestException e) {
        LOG.warn("Invalid resource ask by application " + appAttemptId, e);
        throw e;
    }
    try {
        RMServerUtils.validateBlacklistRequest(blacklistRequest);
    } catch (InvalidResourceBlacklistRequestException e) {
        LOG.warn("Invalid blacklist request by application " + appAttemptId, e);
        throw e;
    }
    // The release list is validated against this attempt only when
    // containers are NOT kept across application attempts; otherwise the AM
    // may release containers from the earlier attempt.
    if (!asc.getKeepContainersAcrossApplicationAttempts()) {
        try {
            RMServerUtils.validateContainerReleaseRequest(release, appAttemptId);
        } catch (InvalidContainerReleaseException e) {
            LOG.warn("Invalid container release by application " + appAttemptId, e);
            throw e;
        }
    }
    // Split Update Resource Requests into increase and decrease.
    // No Exceptions are thrown here. All update errors are aggregated
    // and returned to the AM.
    List<UpdateContainerError> updateErrors = new ArrayList<>();
    ContainerUpdates containerUpdateRequests = RMServerUtils.validateAndSplitUpdateResourceRequests(rmContext, request, maximumCapacity, updateErrors);
    // Send new requests to appAttempt.
    Allocation allocation;
    RMAppAttempt appAttempt = app.getRMAppAttempt(appAttemptId);
    RMAppAttemptState state = appAttempt.getAppAttemptState();
    if (state.equals(RMAppAttemptState.FINAL_SAVING) || state.equals(RMAppAttemptState.FINISHING) || app.isAppFinalStateStored()) {
        LOG.warn(appAttemptId + " is in " + state + " state, ignore container allocate request.");
        allocation = EMPTY_ALLOCATION;
    } else {
        allocation = this.rScheduler.allocate(appAttemptId, ask, release, blacklistAdditions, blacklistRemovals, containerUpdateRequests);
    }
    if (!blacklistAdditions.isEmpty() || !blacklistRemovals.isEmpty()) {
        LOG.info("blacklist are updated in Scheduler." + "blacklistAdditions: " + blacklistAdditions + ", " + "blacklistRemovals: " + blacklistRemovals);
    }
    if (allocation.getNMTokens() != null && !allocation.getNMTokens().isEmpty()) {
        allocateResponse.setNMTokens(allocation.getNMTokens());
    }
    // Notify the AM of container update errors
    addToUpdateContainerErrors(allocateResponse, updateErrors);
    // update the response with the deltas of node status changes
    List<RMNode> updatedNodes = new ArrayList<RMNode>();
    if (app.pullRMNodeUpdates(updatedNodes) > 0) {
        List<NodeReport> updatedNodeReports = new ArrayList<NodeReport>();
        for (RMNode rmNode : updatedNodes) {
            SchedulerNodeReport schedulerNodeReport = rScheduler.getNodeReport(rmNode.getNodeID());
            // Default to zero usage when the scheduler has no report for
            // the node.
            Resource used = BuilderUtils.newResource(0, 0);
            int numContainers = 0;
            if (schedulerNodeReport != null) {
                used = schedulerNodeReport.getUsedResource();
                numContainers = schedulerNodeReport.getNumContainers();
            }
            NodeId nodeId = rmNode.getNodeID();
            NodeReport report = BuilderUtils.newNodeReport(nodeId, rmNode.getState(), rmNode.getHttpAddress(), rmNode.getRackName(), used, rmNode.getTotalCapability(), numContainers, rmNode.getHealthReport(), rmNode.getLastHealthReportTime(), rmNode.getNodeLabels());
            updatedNodeReports.add(report);
        }
        allocateResponse.setUpdatedNodes(updatedNodeReports);
    }
    addToAllocatedContainers(allocateResponse, allocation.getContainers());
    allocateResponse.setCompletedContainersStatuses(appAttempt.pullJustFinishedContainers());
    allocateResponse.setAvailableResources(allocation.getResourceLimit());
    addToContainerUpdates(appAttemptId, allocateResponse, allocation);
    allocateResponse.setNumClusterNodes(this.rScheduler.getNumClusterNodes());
    // add collector address for this application
    if (YarnConfiguration.timelineServiceV2Enabled(getConfig())) {
        allocateResponse.setCollectorAddr(app.getCollectorAddr());
    }
    // add preemption to the allocateResponse message (if any)
    allocateResponse.setPreemptionMessage(generatePreemptionMessage(allocation));
    // Set application priority
    allocateResponse.setApplicationPriority(app.getApplicationPriority());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) ResourceBlacklistRequest(org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest) ContainerUpdates(org.apache.hadoop.yarn.server.resourcemanager.scheduler.ContainerUpdates) ArrayList(java.util.ArrayList) InvalidResourceRequestException(org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) Allocation(org.apache.hadoop.yarn.server.resourcemanager.scheduler.Allocation) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) InvalidContainerReleaseException(org.apache.hadoop.yarn.exceptions.InvalidContainerReleaseException) SchedulerNodeReport(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport) ApplicationSubmissionContext(org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext) RMAppAttemptStatusupdateEvent(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.event.RMAppAttemptStatusupdateEvent) Resource(org.apache.hadoop.yarn.api.records.Resource) UpdateContainerError(org.apache.hadoop.yarn.api.records.UpdateContainerError) NodeId(org.apache.hadoop.yarn.api.records.NodeId) InvalidResourceBlacklistRequestException(org.apache.hadoop.yarn.exceptions.InvalidResourceBlacklistRequestException) RMAppAttemptState(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttemptState) PreemptionResourceRequest(org.apache.hadoop.yarn.api.records.PreemptionResourceRequest) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) NodeReport(org.apache.hadoop.yarn.api.records.NodeReport) SchedulerNodeReport(org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport)

Example 4 with UpdateContainerError

use of org.apache.hadoop.yarn.api.records.UpdateContainerError in project hadoop by apache.

the class RMServerUtils method checkAndcreateUpdateError.

/**
 * Appends an {@link UpdateContainerError} to {@code errors} when
 * {@code msg} is non-null; otherwise does nothing.
 *
 * @param errors list that receives the new error record
 * @param updateReq the update request the error refers to
 * @param rmContainer the container whose version is reported, or
 *     {@code null}, in which case the version is recorded as -1
 * @param msg the error reason, or {@code null} when there is no error
 */
private static void checkAndcreateUpdateError(List<UpdateContainerError> errors, UpdateContainerRequest updateReq, RMContainer rmContainer, String msg) {
    if (msg == null) {
        return;
    }
    UpdateContainerError error = RECORD_FACTORY.newRecordInstance(UpdateContainerError.class);
    error.setReason(msg);
    error.setUpdateContainerRequest(updateReq);
    // -1 marks "no known container" for the reported version.
    int currentVersion = (rmContainer == null) ? -1 : rmContainer.getContainer().getVersion();
    error.setCurrentContainerVersion(currentVersion);
    errors.add(error);
}
Also used : UpdateContainerError(org.apache.hadoop.yarn.api.records.UpdateContainerError)

Aggregations

UpdateContainerError (org.apache.hadoop.yarn.api.records.UpdateContainerError)4 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)2 Resource (org.apache.hadoop.yarn.api.records.Resource)2 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)2 ArrayList (java.util.ArrayList)1 AllocateResponse (org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse)1 ApplicationSubmissionContext (org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext)1 Container (org.apache.hadoop.yarn.api.records.Container)1 NodeId (org.apache.hadoop.yarn.api.records.NodeId)1 NodeReport (org.apache.hadoop.yarn.api.records.NodeReport)1 PreemptionResourceRequest (org.apache.hadoop.yarn.api.records.PreemptionResourceRequest)1 ResourceBlacklistRequest (org.apache.hadoop.yarn.api.records.ResourceBlacklistRequest)1 ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest)1 InvalidContainerReleaseException (org.apache.hadoop.yarn.exceptions.InvalidContainerReleaseException)1 InvalidResourceBlacklistRequestException (org.apache.hadoop.yarn.exceptions.InvalidResourceBlacklistRequestException)1 InvalidResourceRequestException (org.apache.hadoop.yarn.exceptions.InvalidResourceRequestException)1 YarnServiceProtos (org.apache.hadoop.yarn.proto.YarnServiceProtos)1 AllocateResponseProtoOrBuilder (org.apache.hadoop.yarn.proto.YarnServiceProtos.AllocateResponseProtoOrBuilder)1 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)1 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)1