Search in sources :

Example 1 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestRMContainerAllocator method testRMContainerAllocatorResendsRequestsOnRMRestart.

// Step-1 : AM send allocate request for 2 ContainerRequests and 1
// blackListeNode
// Step-2 : 2 containers are allocated by RM.
// Step-3 : AM Send 1 containerRequest(event3) and 1 releaseRequests to
// RM
// Step-4 : On RM restart, AM(does not know RM is restarted) sends
// additional containerRequest(event4) and blacklisted nodes.
// Intern RM send resync command
// Step-5 : On Resync,AM sends all outstanding
// asks,release,blacklistAaddition
// and another containerRequest(event5)
// Step-6 : RM allocates containers i.e event3,event4 and cRequest5
@Test
public void testRMContainerAllocatorResendsRequestsOnRMRestart() throws Exception {
    Configuration conf = new Configuration();
    conf.set(YarnConfiguration.RECOVERY_ENABLED, "true");
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
    conf.setLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, 0);
    conf.setBoolean(MRJobConfig.MR_AM_JOB_NODE_BLACKLISTING_ENABLE, true);
    conf.setInt(MRJobConfig.MAX_TASK_FAILURES_PER_TRACKER, 1);
    conf.setInt(MRJobConfig.MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERECENT, -1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MyResourceManager rm1 = new MyResourceManager(conf, memStore);
    rm1.start();
    DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();
    // Submit the application
    RMApp app = rm1.submitApp(1024);
    dispatcher.await();
    MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    // Node heartbeat
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
    rm1.sendAMLaunched(appAttemptId);
    dispatcher.await();
    JobId jobId = MRBuilderUtils.newJobId(appAttemptId.getApplicationId(), 0);
    Job mockJob = mock(Job.class);
    when(mockJob.getReport()).thenReturn(MRBuilderUtils.newJobReport(jobId, "job", "user", JobState.RUNNING, 0, 0, 0, 0, 0, 0, 0, "jobfile", null, false, ""));
    MyContainerAllocator allocator = new MyContainerAllocator(rm1, conf, appAttemptId, mockJob);
    // Step-1 : AM send allocate request for 2 ContainerRequests and 1
    // blackListeNode
    // create the container request
    // send MAP request
    ContainerRequestEvent event1 = createReq(jobId, 1, 1024, new String[] { "h1" });
    allocator.sendRequest(event1);
    ContainerRequestEvent event2 = createReq(jobId, 2, 2048, new String[] { "h1", "h2" });
    allocator.sendRequest(event2);
    // Send events to blacklist h2
    ContainerFailedEvent f1 = createFailEvent(jobId, 1, "h2", false);
    allocator.sendFailure(f1);
    // send allocate request and 1 blacklisted nodes
    List<TaskAttemptContainerAssignedEvent> assignedContainers = allocator.schedule();
    dispatcher.await();
    Assert.assertEquals("No of assignments must be 0", 0, assignedContainers.size());
    // Why ask is 3, not 4? --> ask from blacklisted node h2 is removed
    assertAsksAndReleases(3, 0, rm1);
    assertBlacklistAdditionsAndRemovals(1, 0, rm1);
    // Node heartbeat
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    // Step-2 : 2 containers are allocated by RM.
    assignedContainers = allocator.schedule();
    dispatcher.await();
    Assert.assertEquals("No of assignments must be 2", 2, assignedContainers.size());
    assertAsksAndReleases(0, 0, rm1);
    assertBlacklistAdditionsAndRemovals(0, 0, rm1);
    assignedContainers = allocator.schedule();
    Assert.assertEquals("No of assignments must be 0", 0, assignedContainers.size());
    assertAsksAndReleases(3, 0, rm1);
    assertBlacklistAdditionsAndRemovals(0, 0, rm1);
    // Step-3 : AM Send 1 containerRequest(event3) and 1 releaseRequests to
    // RM
    // send container request
    ContainerRequestEvent event3 = createReq(jobId, 3, 1000, new String[] { "h1" });
    allocator.sendRequest(event3);
    // send deallocate request
    ContainerAllocatorEvent deallocate1 = createDeallocateEvent(jobId, 1, false);
    allocator.sendDeallocate(deallocate1);
    assignedContainers = allocator.schedule();
    Assert.assertEquals("No of assignments must be 0", 0, assignedContainers.size());
    assertAsksAndReleases(3, 1, rm1);
    assertBlacklistAdditionsAndRemovals(0, 0, rm1);
    // Phase-2 start 2nd RM is up
    MyResourceManager rm2 = new MyResourceManager(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    allocator.updateSchedulerProxy(rm2);
    dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
    // NM should be rebooted on heartbeat, even first heartbeat for nm2
    NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
    // new NM to represent NM re-register
    nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
    nm1.registerNode();
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    // Step-4 : On RM restart, AM(does not know RM is restarted) sends
    // additional containerRequest(event4) and blacklisted nodes.
    // Intern RM send resync command
    // send deallocate request, release=1
    ContainerAllocatorEvent deallocate2 = createDeallocateEvent(jobId, 2, false);
    allocator.sendDeallocate(deallocate2);
    // Send events to blacklist nodes h3
    ContainerFailedEvent f2 = createFailEvent(jobId, 1, "h3", false);
    allocator.sendFailure(f2);
    ContainerRequestEvent event4 = createReq(jobId, 4, 2000, new String[] { "h1", "h2" });
    allocator.sendRequest(event4);
    // send allocate request to 2nd RM and get resync command
    allocator.schedule();
    dispatcher.await();
    // Step-5 : On Resync,AM sends all outstanding
    // asks,release,blacklistAaddition
    // and another containerRequest(event5)
    ContainerRequestEvent event5 = createReq(jobId, 5, 3000, new String[] { "h1", "h2", "h3" });
    allocator.sendRequest(event5);
    // send all outstanding request again.
    assignedContainers = allocator.schedule();
    dispatcher.await();
    assertAsksAndReleases(3, 2, rm2);
    assertBlacklistAdditionsAndRemovals(2, 0, rm2);
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    // Step-6 : RM allocates containers i.e event3,event4 and cRequest5
    assignedContainers = allocator.schedule();
    dispatcher.await();
    Assert.assertEquals("Number of container should be 3", 3, assignedContainers.size());
    for (TaskAttemptContainerAssignedEvent assig : assignedContainers) {
        Assert.assertTrue("Assigned count not correct", "h1".equals(assig.getContainer().getNodeId().getHost()));
    }
    rm1.stop();
    rm2.stop();
}
Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) TaskAttemptContainerAssignedEvent(org.apache.hadoop.mapreduce.v2.app.job.event.TaskAttemptContainerAssignedEvent) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) Job(org.apache.hadoop.mapreduce.v2.app.job.Job) JobId(org.apache.hadoop.mapreduce.v2.api.records.JobId) Test(org.junit.Test)

Example 2 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestAMRMClientOnRMRestart method testAMRMClientForUnregisterAMOnRMRestart.

// Test verify for
// 1. AM try to unregister without registering
// 2. AM register to RM, and try to unregister immediately after RM restart
@Test(timeout = 60000)
public void testAMRMClientForUnregisterAMOnRMRestart() throws Exception {
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // Phase-1 Start 1st RM
    MyResourceManager rm1 = new MyResourceManager(conf, memStore);
    rm1.start();
    DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();
    // Submit the application
    RMApp app = rm1.submitApp(1024);
    dispatcher.await();
    MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    // Node heartbeat
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
    rm1.sendAMLaunched(appAttemptId);
    dispatcher.await();
    org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> token = rm1.getRMContext().getRMApps().get(appAttemptId.getApplicationId()).getRMAppAttempt(appAttemptId).getAMRMToken();
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    ugi.addTokenIdentifier(token.decodeIdentifier());
    AMRMClient<ContainerRequest> amClient = new MyAMRMClientImpl(rm1);
    amClient.init(conf);
    amClient.start();
    amClient.registerApplicationMaster("h1", 10000, "");
    amClient.allocate(0.1f);
    // Phase-2 start 2nd RM is up
    MyResourceManager rm2 = new MyResourceManager(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    ((MyAMRMClientImpl) amClient).updateRMProxy(rm2);
    dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
    // NM should be rebooted on heartbeat, even first heartbeat for nm2
    NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
    // new NM to represent NM re-register
    nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 1);
    NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, ContainerState.RUNNING, Resource.newInstance(1024, 1), "recover container", 0, Priority.newInstance(0), 0);
    nm1.registerNode(Arrays.asList(containerReport), null);
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    amClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, null, null);
    rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHING);
    nm1.nodeHeartbeat(appAttemptId, 1, ContainerState.COMPLETE);
    rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHED);
    rm2.waitForState(app.getApplicationId(), RMAppState.FINISHED);
    amClient.stop();
    rm1.stop();
    rm2.stop();
}
Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerRequest(org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest) UpdateContainerRequest(org.apache.hadoop.yarn.api.records.UpdateContainerRequest) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) Test(org.junit.Test)

Example 3 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class NMSimulator method middleStep.

@Override
public void middleStep() throws Exception {
    // we check the lifetime for each running containers
    ContainerSimulator cs = null;
    synchronized (completedContainerList) {
        while ((cs = containerQueue.poll()) != null) {
            runningContainers.remove(cs.getId());
            completedContainerList.add(cs.getId());
            LOG.debug(MessageFormat.format("Container {0} has completed", cs.getId()));
        }
    }
    // send heart beat
    NodeHeartbeatRequest beatRequest = Records.newRecord(NodeHeartbeatRequest.class);
    beatRequest.setLastKnownNMTokenMasterKey(masterKey);
    NodeStatus ns = Records.newRecord(NodeStatus.class);
    ns.setContainersStatuses(generateContainerStatusList());
    ns.setNodeId(node.getNodeID());
    ns.setKeepAliveApplications(new ArrayList<ApplicationId>());
    ns.setResponseId(RESPONSE_ID++);
    ns.setNodeHealthStatus(NodeHealthStatus.newInstance(true, "", 0));
    beatRequest.setNodeStatus(ns);
    NodeHeartbeatResponse beatResponse = rm.getResourceTrackerService().nodeHeartbeat(beatRequest);
    if (!beatResponse.getContainersToCleanup().isEmpty()) {
        // remove from queue
        synchronized (releasedContainerList) {
            for (ContainerId containerId : beatResponse.getContainersToCleanup()) {
                if (amContainerList.contains(containerId)) {
                    // AM container (not killed?, only release)
                    synchronized (amContainerList) {
                        amContainerList.remove(containerId);
                    }
                    LOG.debug(MessageFormat.format("NodeManager {0} releases " + "an AM ({1}).", node.getNodeID(), containerId));
                } else {
                    cs = runningContainers.remove(containerId);
                    containerQueue.remove(cs);
                    releasedContainerList.add(containerId);
                    LOG.debug(MessageFormat.format("NodeManager {0} releases a " + "container ({1}).", node.getNodeID(), containerId));
                }
            }
        }
    }
    if (beatResponse.getNodeAction() == NodeAction.SHUTDOWN) {
        lastStep();
    }
}
Also used : ContainerSimulator(org.apache.hadoop.yarn.sls.scheduler.ContainerSimulator) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NodeHeartbeatRequest(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) NodeStatus(org.apache.hadoop.yarn.server.api.records.NodeStatus)

Example 4 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class TestApplicationCleanup method testContainerCleanupWhenRMRestartedAppNotRegistered.

@SuppressWarnings("resource")
@Test(timeout = 60000)
public void testContainerCleanupWhenRMRestartedAppNotRegistered() throws Exception {
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // start RM
    final DrainDispatcher dispatcher = new DrainDispatcher();
    MockRM rm1 = new MockRM(conf, memStore) {

        @Override
        protected Dispatcher createDispatcher() {
            return dispatcher;
        }
    };
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    // create app and launch the AM
    RMApp app0 = rm1.submitApp(200);
    MockAM am0 = launchAM(app0, rm1, nm1);
    nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    rm1.waitForState(app0.getApplicationId(), RMAppState.RUNNING);
    // start new RM
    final DrainDispatcher dispatcher2 = new DrainDispatcher();
    MockRM rm2 = new MockRM(conf, memStore) {

        @Override
        protected Dispatcher createDispatcher() {
            return dispatcher2;
        }
    };
    rm2.start();
    // nm1 register to rm2, and do a heartbeat
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    nm1.registerNode(Arrays.asList(app0.getApplicationId()));
    rm2.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
    // Add unknown container for application unknown to scheduler
    NodeHeartbeatResponse response = nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    waitForContainerCleanup(dispatcher2, nm1, response);
    rm1.stop();
    rm2.stop();
}
Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) Test(org.junit.Test)

Example 5 with NodeHeartbeatResponse

use of org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse in project hadoop by apache.

the class YarnServerBuilderUtils method newNodeHeartbeatResponse.

public static NodeHeartbeatResponse newNodeHeartbeatResponse(int responseId, NodeAction action, List<ContainerId> containersToCleanUp, List<ApplicationId> applicationsToCleanUp, MasterKey containerTokenMasterKey, MasterKey nmTokenMasterKey, long nextHeartbeatInterval) {
    NodeHeartbeatResponse response = recordFactory.newRecordInstance(NodeHeartbeatResponse.class);
    response.setResponseId(responseId);
    response.setNodeAction(action);
    response.setContainerTokenMasterKey(containerTokenMasterKey);
    response.setNMTokenMasterKey(nmTokenMasterKey);
    response.setNextHeartBeatInterval(nextHeartbeatInterval);
    if (containersToCleanUp != null) {
        response.addAllContainersToCleanup(containersToCleanUp);
    }
    if (applicationsToCleanUp != null) {
        response.addAllApplicationsToCleanup(applicationsToCleanUp);
    }
    return response;
}
Also used : NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse)

Aggregations

NodeHeartbeatResponse (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse)49 Test (org.junit.Test)33 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)23 Configuration (org.apache.hadoop.conf.Configuration)21 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)16 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)13 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)12 ArrayList (java.util.ArrayList)10 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)9 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)8 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)7 NodeHeartbeatRequest (org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatRequest)7 MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)7 RMNodeStatusEvent (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNodeStatusEvent)7 Container (org.apache.hadoop.yarn.api.records.Container)6 Resource (org.apache.hadoop.yarn.api.records.Resource)6 DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher)6 NodeHealthStatus (org.apache.hadoop.yarn.server.api.records.NodeHealthStatus)6 ByteBuffer (java.nio.ByteBuffer)5 ContainerStatus (org.apache.hadoop.yarn.api.records.ContainerStatus)5