Search in sources :

Example 11 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestWorkPreservingRMRestart method testRecoverSchedulerAppAndAttemptSynchronously.

/**
 * Restarts the RM on top of the same in-memory state store and checks that
 * the scheduler already exposes the application attempt before the NM has
 * re-registered with the new RM.
 */
@Test(timeout = 20000)
public void testRecoverSchedulerAppAndAttemptSynchronously() throws Exception {
    // Bring up the first RM, backed by an in-memory state store.
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.init(conf);
    rm1 = new MockRM(conf, stateStore);
    rm1.start();
    MockNM node = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    node.registerNode();

    // Submit an application, then launch and register its AM.
    RMApp submittedApp = rm1.submitApp(200);
    MockAM appMaster = MockRM.launchAndRegisterAM(submittedApp, rm1, node);

    // Start a second RM over the same store and point the NM at it.
    rm2 = new MockRM(conf, stateStore);
    rm2.start();
    node.setResourceTrackerService(rm2.getResourceTrackerService());

    // The scheduler app/attempt must be available right after the restart,
    // i.e. before the NM re-registers below.
    Assert.assertNotNull(rm2.getResourceScheduler().getSchedulerAppInfo(appMaster.getApplicationAttemptId()));
    // getTransferredContainers should not throw NPE.
    rm2.getResourceScheduler().getTransferredContainers(appMaster.getApplicationAttemptId());

    // Re-register the NM with the app's container statuses and wait for the
    // expected number of containers (2) to show up on the new RM.
    List<NMContainerStatus> reportedStatuses = createNMContainerStatusForApp(appMaster);
    node.registerNode(reportedStatuses, null);
    waitForNumContainersToRecover(2, rm2, appMaster.getApplicationAttemptId());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) Test(org.junit.Test)

Example 12 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestAMRestart method testRMAppAttemptFailuresValidityInterval.

/**
 * Exercises the AM attempt-failures validity interval with
 * {@code RM_AM_MAX_ATTEMPTS} set to 2.
 *
 * <ul>
 *   <li>An app submitted with a 60s interval must end up FAILED after two
 *       consecutive attempt failures.</li>
 *   <li>An app submitted with a 10s interval, driven by a
 *       {@link ControlledClock}, must be able to start a 3rd attempt, and
 *       after an RM restart a 4th and 5th, before it finally fails. The
 *       attempt count recorded in the state store is asserted to stay at 2.</li>
 * </ul>
 *
 * Both RMs are stopped in a {@code finally} block so that a failing
 * assertion or wait does not leave them running.
 */
@Test(timeout = 120000)
public void testRMAppAttemptFailuresValidityInterval() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    // explicitly set max-am-retry count as 2.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    // Created later, after the simulated restart; stays null if we fail first.
    MockRM rm2 = null;
    try {
        rm1.start();
        MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
        nm1.registerNode();
        // set window size to a larger number : 60s
        // we will verify the app should be failed if
        // two continuous attempts failed in 60s.
        RMApp app = rm1.submitApp(200, 60000, false);
        MockAM am = MockRM.launchAM(app, rm1, nm1);
        // Fail current attempt normally
        nm1.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
        rm1.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FAILED);
        // launch the second attempt
        rm1.waitForState(app.getApplicationId(), RMAppState.ACCEPTED);
        Assert.assertEquals(2, app.getAppAttempts().size());
        MockAM am_2 = MockRM.launchAndRegisterAM(app, rm1, nm1);
        rm1.waitForState(am_2.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
        nm1.nodeHeartbeat(am_2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
        rm1.waitForState(am_2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
        // current app should be failed.
        rm1.waitForState(app.getApplicationId(), RMAppState.FAILED);
        ControlledClock clock = new ControlledClock();
        // set window size to 10s
        RMAppImpl app1 = (RMAppImpl) rm1.submitApp(200, 10000, false);
        app1.setSystemClock(clock);
        MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
        // Fail attempt1 normally
        nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
        rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
        // Wait to make sure attempt1 is removed from the state store
        // TODO explore a better way than sleeping for a while (YARN-4929)
        Thread.sleep(15 * 1000);
        // launch the second attempt
        rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
        Assert.assertEquals(2, app1.getAppAttempts().size());
        MockAM am2 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
        rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
        // move the controlled clock 10 seconds past the current wall-clock
        // time (no real waiting happens here)
        clock.setTime(System.currentTimeMillis() + 10 * 1000);
        // Fail attempt2 normally
        nm1.nodeHeartbeat(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
        rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
        // can launch the third attempt successfully
        rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
        Assert.assertEquals(3, app1.getAppAttempts().size());
        RMAppAttempt attempt3 = app1.getCurrentAppAttempt();
        clock.reset();
        MockAM am3 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
        rm1.waitForState(am3.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
        // Restart rm: a second RM is started on the same state store while
        // rm1 is still up.
        rm2 = new MockRM(conf, memStore);
        rm2.start();
        ApplicationStateData app1State = memStore.getState().getApplicationState().get(app1.getApplicationId());
        Assert.assertEquals(1, app1State.getFirstAttemptId());
        // re-register the NM, reporting attempt3's master container as
        // completed (killed by the RM)
        nm1.setResourceTrackerService(rm2.getResourceTrackerService());
        NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
        status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
        status.setContainerId(attempt3.getMasterContainer().getId());
        status.setContainerState(ContainerState.COMPLETE);
        status.setDiagnostics("");
        nm1.registerNode(Collections.singletonList(status), null);
        rm2.waitForState(attempt3.getAppAttemptId(), RMAppAttemptState.FAILED);
        // Wait to make sure attempt3 is removed from the state store
        // TODO explore a better way than sleeping for a while (YARN-4929)
        Thread.sleep(15 * 1000);
        Assert.assertEquals(2, app1State.getAttemptCount());
        rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
        // Launch attempt 4
        MockAM am4 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 4, nm1);
        // move the controlled clock 10 seconds past the current wall-clock time
        clock.setTime(System.currentTimeMillis() + 10 * 1000);
        // Fail attempt4 normally
        nm1.nodeHeartbeat(am4.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
        rm2.waitForState(am4.getApplicationAttemptId(), RMAppAttemptState.FAILED);
        Assert.assertEquals(2, app1State.getAttemptCount());
        // can launch the 5th attempt successfully
        rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
        MockAM am5 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 5, nm1);
        clock.reset();
        rm2.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
        // Fail attempt5 normally
        nm1.nodeHeartbeat(am5.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
        rm2.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.FAILED);
        Assert.assertEquals(2, app1State.getAttemptCount());
        rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
    } finally {
        // Same stop order as before (rm1, then rm2), but now also executed
        // when the test body throws; rm2 is still stopped if rm1.stop() fails.
        try {
            rm1.stop();
        } finally {
            if (rm2 != null) {
                rm2.stop();
            }
        }
    }
}
Also used : RMAppImpl(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Test(org.junit.Test)

Example 13 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class TestAMRMClientOnRMRestart method testAMRMClientForUnregisterAMOnRMRestart.

// Test verifies:
// 1. AM tries to unregister without registering
// 2. AM registers with the RM, and tries to unregister immediately after the
//    RM restart
// The AM client and both RMs are stopped in a finally block so that a failing
// assertion or wait does not leave them running.
@Test(timeout = 60000)
public void testAMRMClientForUnregisterAMOnRMRestart() throws Exception {
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // Phase-1 Start 1st RM
    MyResourceManager rm1 = new MyResourceManager(conf, memStore);
    // Both are created later in the test; they stay null if we fail first.
    MyResourceManager rm2 = null;
    AMRMClient<ContainerRequest> amClient = null;
    try {
        rm1.start();
        DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();
        // Submit the application
        RMApp app = rm1.submitApp(1024);
        dispatcher.await();
        MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
        nm1.registerNode();
        // Node heartbeat
        nm1.nodeHeartbeat(true);
        dispatcher.await();
        ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
        rm1.sendAMLaunched(appAttemptId);
        dispatcher.await();
        // Make the attempt's AMRM token identifier known to the current user
        // before the client registers.
        org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> token = rm1.getRMContext().getRMApps().get(appAttemptId.getApplicationId()).getRMAppAttempt(appAttemptId).getAMRMToken();
        UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
        ugi.addTokenIdentifier(token.decodeIdentifier());
        amClient = new MyAMRMClientImpl(rm1);
        amClient.init(conf);
        amClient.start();
        amClient.registerApplicationMaster("h1", 10000, "");
        amClient.allocate(0.1f);
        // Phase-2 start 2nd RM
        rm2 = new MyResourceManager(conf, memStore);
        rm2.start();
        nm1.setResourceTrackerService(rm2.getResourceTrackerService());
        ((MyAMRMClientImpl) amClient).updateRMProxy(rm2);
        dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
        // nm1's first heartbeat to the new RM must be answered with RESYNC
        NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
        Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
        // new NM to represent NM re-register
        nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
        ContainerId containerId = ContainerId.newContainerId(appAttemptId, 1);
        NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, ContainerState.RUNNING, Resource.newInstance(1024, 1), "recover container", 0, Priority.newInstance(0), 0);
        nm1.registerNode(Arrays.asList(containerReport), null);
        nm1.nodeHeartbeat(true);
        dispatcher.await();
        // Unregister right after the restart, then report container 1 of the
        // attempt as COMPLETE and wait for the attempt and app to finish.
        amClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, null, null);
        rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHING);
        nm1.nodeHeartbeat(appAttemptId, 1, ContainerState.COMPLETE);
        rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHED);
        rm2.waitForState(app.getApplicationId(), RMAppState.FINISHED);
    } finally {
        // Same stop order as before (client, rm1, rm2), but now also executed
        // when the test body throws; each stop runs even if an earlier one fails.
        try {
            if (amClient != null) {
                amClient.stop();
            }
        } finally {
            try {
                rm1.stop();
            } finally {
                if (rm2 != null) {
                    rm2.stop();
                }
            }
        }
    }
}
Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerRequest(org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest) UpdateContainerRequest(org.apache.hadoop.yarn.api.records.UpdateContainerRequest) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) Test(org.junit.Test)

Example 14 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class ResourceTrackerService method registerNodeManager.

/**
 * Processes a NodeManager registration request.
 *
 * <p>The request is answered with {@code NodeAction.SHUTDOWN} (and a
 * diagnostics message) when any of these checks fails:
 * <ul>
 *   <li>the reported NM version is missing or lower than the configured
 *       minimum (skipped when the minimum is {@code "NONE"});</li>
 *   <li>the host is neither a valid node nor a node in decommissioning;</li>
 *   <li>the node's capability is below the minimum allocation.</li>
 * </ul>
 *
 * <p>Otherwise the node is put into the RM context, a started or reconnect
 * event is dispatched, the node is registered with the liveliness monitor,
 * reported container statuses are handled when work-preserving recovery is
 * disabled, node labels are updated according to the configured mode, and a
 * {@code NodeAction.NORMAL} response is returned.
 *
 * @param request the registration request sent by the NodeManager
 * @return the response carrying either SHUTDOWN with diagnostics, or NORMAL
 *         together with the master keys, RM identifier and RM version
 * @throws YarnException declared by the overridden method
 * @throws IOException declared by the overridden method
 */
@SuppressWarnings("unchecked")
@Override
public RegisterNodeManagerResponse registerNodeManager(RegisterNodeManagerRequest request) throws YarnException, IOException {
    final NodeId nmId = request.getNodeId();
    final String nmHost = nmId.getHost();
    final int rpcPort = nmId.getPort();
    final int webPort = request.getHttpPort();
    Resource nodeResource = request.getResource();
    final String nmVersion = request.getNMVersion();
    final Resource physical = request.getPhysicalResource();
    final RegisterNodeManagerResponse reply =
        recordFactory.newRecordInstance(RegisterNodeManagerResponse.class);

    // Version gate; a configured minimum of "NONE" switches it off.
    if (!minimumNodeManagerVersion.equals("NONE")) {
        if (minimumNodeManagerVersion.equals("EqualToRM")) {
            // Replace the placeholder with the RM's own version (kept in the field).
            minimumNodeManagerVersion = YarnVersionInfo.getVersion();
        }
        final boolean versionTooLow = nmVersion == null
            || VersionUtil.compareVersions(nmVersion, minimumNodeManagerVersion) < 0;
        if (versionTooLow) {
            final String reason = "Disallowed NodeManager Version " + nmVersion
                + ", is less than the minimum version " + minimumNodeManagerVersion
                + " sending SHUTDOWN signal to NodeManager.";
            LOG.info(reason);
            reply.setDiagnosticsMessage(reason);
            reply.setNodeAction(NodeAction.SHUTDOWN);
            return reply;
        }
    }

    // The host must be a valid node, or the node must be in decommissioning.
    if (!(this.nodesListManager.isValidNode(nmHost) || isNodeInDecommissioning(nmId))) {
        final String reason = "Disallowed NodeManager from  " + nmHost
            + ", Sending SHUTDOWN signal to the NodeManager.";
        LOG.info(reason);
        reply.setDiagnosticsMessage(reason);
        reply.setNodeAction(NodeAction.SHUTDOWN);
        return reply;
    }

    // A capacity loaded from dynamic-resources.xml overrides the reported one.
    final String nmIdText = nmId.toString();
    final Resource overrideResource = loadNodeResourceFromDRConfiguration(nmIdText);
    if (overrideResource != null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Resource for node: " + nmIdText + " is adjusted from: " + nodeResource
                + " to: " + overrideResource + " due to settings in dynamic-resources.xml.");
        }
        nodeResource = overrideResource;
        // Report the adjusted resource back in the response.
        reply.setResource(nodeResource);
    }

    // The node must offer at least the minimum allocation.
    final boolean belowMinimum = nodeResource.getMemorySize() < minAllocMb
        || nodeResource.getVirtualCores() < minAllocVcores;
    if (belowMinimum) {
        final String reason = "NodeManager from  " + nmHost
            + " doesn't satisfy minimum allocations, Sending SHUTDOWN signal to the NodeManager.";
        LOG.info(reason);
        reply.setDiagnosticsMessage(reason);
        reply.setNodeAction(NodeAction.SHUTDOWN);
        return reply;
    }

    reply.setContainerTokenMasterKey(containerTokenSecretManager.getCurrentKey());
    reply.setNMTokenMasterKey(nmTokenSecretManager.getCurrentKey());

    final RMNode freshNode = new RMNodeImpl(nmId, rmContext, nmHost, rpcPort, webPort,
        resolve(nmHost), nodeResource, nmVersion, physical);
    final RMNode knownNode = this.rmContext.getRMNodes().putIfAbsent(nmId, freshNode);
    if (knownNode != null) {
        // A node with this id was already present: treat this as a reconnect.
        LOG.info("Reconnect from the node at: " + nmHost);
        this.nmLivelinessMonitor.unregister(nmId);
        // Reset heartbeat ID since node just restarted.
        knownNode.resetLastNodeHeartBeatResponse();
        this.rmContext.getDispatcher().getEventHandler().handle(
            new RMNodeReconnectEvent(nmId, freshNode, request.getRunningApplications(),
                request.getNMContainerStatuses()));
    } else {
        this.rmContext.getDispatcher().getEventHandler().handle(
            new RMNodeStartedEvent(nmId, request.getNMContainerStatuses(),
                request.getRunningApplications()));
    }

    // On every node manager register we will be clearing NMToken keys if
    // present for any running application.
    this.nmTokenSecretManager.removeNodeKey(nmId);
    this.nmLivelinessMonitor.register(nmId);

    // Without work-preserving recovery, handle each reported container status here.
    if (!rmContext.isWorkPreservingRecoveryEnabled()
        && !request.getNMContainerStatuses().isEmpty()) {
        LOG.info("received container statuses on node manager register :"
            + request.getNMContainerStatuses());
        for (NMContainerStatus reported : request.getNMContainerStatuses()) {
            handleNMContainerStatus(reported, nmId);
        }
    }

    // Update node's labels to RM's NodeLabelManager.
    final Set<String> reportedLabels = NodeLabelsUtils.convertToStringSet(request.getNodeLabels());
    if (isDistributedNodeLabelsConf && reportedLabels != null) {
        try {
            updateNodeLabelsFromNMReport(reportedLabels, nmId);
            reply.setAreNodeLabelsAcceptedByRM(true);
        } catch (IOException labelFailure) {
            // Ensure the exception is captured in the response
            reply.setDiagnosticsMessage(labelFailure.getMessage());
            reply.setAreNodeLabelsAcceptedByRM(false);
        }
    } else if (isDelegatedCentralizedNodeLabelsConf) {
        this.rmContext.getRMDelegatedNodeLabelsUpdater().updateNodeLabels(nmId);
    }

    // One summary log line for the accepted registration.
    final StringBuilder summary = new StringBuilder()
        .append("NodeManager from node ").append(nmHost)
        .append("(cmPort: ").append(rpcPort)
        .append(" httpPort: ").append(webPort).append(") ")
        .append("registered with capability: ").append(nodeResource)
        .append(", assigned nodeId ").append(nmId);
    if (reply.getAreNodeLabelsAcceptedByRM()) {
        summary.append(", node labels { ")
            .append(StringUtils.join(",", reportedLabels))
            .append(" } ");
    }
    LOG.info(summary.toString());

    reply.setNodeAction(NodeAction.NORMAL);
    reply.setRMIdentifier(ResourceManager.getClusterTimeStamp());
    reply.setRMVersion(YarnVersionInfo.getVersion());
    return reply;
}
Also used : RMNodeReconnectEvent(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeReconnectEvent) Resource(org.apache.hadoop.yarn.api.records.Resource) IOException(java.io.IOException) RMNodeStartedEvent(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStartedEvent) RMNode(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) NodeId(org.apache.hadoop.yarn.api.records.NodeId) RegisterNodeManagerResponse(org.apache.hadoop.yarn.server.api.protocolrecords.RegisterNodeManagerResponse) UnRegisterNodeManagerResponse(org.apache.hadoop.yarn.server.api.protocolrecords.UnRegisterNodeManagerResponse) RMNodeImpl(org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl)

Example 15 with NMContainerStatus

use of org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus in project hadoop by apache.

the class RegisterNodeManagerRequestPBImpl method addNMContainerStatusesToProto.

/**
 * Replaces the container statuses held by the protobuf builder with the
 * proto form of every entry in the local {@code containerStatuses}.
 */
private synchronized void addNMContainerStatusesToProto() {
    maybeInitBuilder();
    // Clear what the builder currently holds before re-adding everything.
    builder.clearContainerStatuses();
    final List<NMContainerStatusProto> protoStatuses = new ArrayList<>();
    for (final NMContainerStatus localStatus : this.containerStatuses) {
        final NMContainerStatusProto asProto = convertToProtoFormat(localStatus);
        protoStatuses.add(asProto);
    }
    builder.addAllContainerStatuses(protoStatuses);
}
Also used : NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ArrayList(java.util.ArrayList) NMContainerStatusProto(org.apache.hadoop.yarn.proto.YarnServerCommonServiceProtos.NMContainerStatusProto)

Aggregations

NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus): 39; RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp): 24; Test (org.junit.Test): 24; MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore): 22; ContainerId (org.apache.hadoop.yarn.api.records.ContainerId): 17; ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 11; TestSecurityMockRM (org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM): 11; ArrayList (java.util.ArrayList): 10; RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt): 10; AbstractYarnScheduler (org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler): 9; ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId): 8; Container (org.apache.hadoop.yarn.api.records.Container): 8; Resource (org.apache.hadoop.yarn.api.records.Resource): 7; ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus): 6; YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 6; MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM): 6; ResourceRequest (org.apache.hadoop.yarn.api.records.ResourceRequest): 4; NodeHeartbeatResponse (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse): 4; ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData): 4; RMNodeImpl (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeImpl): 4