Search in sources :

Example 61 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestAMRestart method testRMRestartOrFailoverNotCountedForAMFailures.

// Test regular RM restart/failover, new RM should not count
// AM failure towards the max-retry-account and should be able to
// re-launch the AM.
@Test(timeout = 50000)
public void testRMRestartOrFailoverNotCountedForAMFailures() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    // explicitly set max-am-retry count as 1.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    // AM should be restarted even though max-am-attempt is 1.
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    // Restart rm.
    MockRM rm2 = new MockRM(conf, memStore);
    rm2.start();
    ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
    // re-register the NM
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
    status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
    status.setContainerId(attempt1.getMasterContainer().getId());
    status.setContainerState(ContainerState.COMPLETE);
    status.setDiagnostics("");
    nm1.registerNode(Collections.singletonList(status), null);
    rm2.waitForState(attempt1.getAppAttemptId(), RMAppAttemptState.FAILED);
    Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, appState.getAttempt(am1.getApplicationAttemptId()).getAMContainerExitStatus());
    // Will automatically start a new AppAttempt in rm2
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am2 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
    MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
    RMAppAttempt attempt3 = rm2.getRMContext().getRMApps().get(app1.getApplicationId()).getCurrentAppAttempt();
    Assert.assertTrue(attempt3.shouldCountTowardsMaxAttemptRetry());
    Assert.assertEquals(ContainerExitStatus.INVALID, appState.getAttempt(am2.getApplicationAttemptId()).getAMContainerExitStatus());
    rm1.stop();
    rm2.stop();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) Test(org.junit.Test)

Example 62 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestWorkPreservingRMRestart method testContainerCompleteMsgNotLostAfterAMFailedAndRMRestart.

@Test(timeout = 20000)
public void testContainerCompleteMsgNotLostAfterAMFailedAndRMRestart() throws Exception {
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    // submit app with keepContainersAcrossApplicationAttempts true
    Resource resource = Records.newRecord(Resource.class);
    resource.setMemorySize(200);
    RMApp app0 = rm1.submitApp(resource, "", UserGroupInformation.getCurrentUser().getShortUserName(), null, false, null, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS, null, null, true, true, false, null, 0, null, true, null);
    MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
    am0.allocate("127.0.0.1", 1000, 2, new ArrayList<ContainerId>());
    nm1.nodeHeartbeat(true);
    List<Container> conts = am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
    while (conts.size() == 0) {
        nm1.nodeHeartbeat(true);
        conts.addAll(am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
    // am failed,and relaunch it
    nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am1 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
    // rm failover
    rm2 = new MockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // container launched by first am completed
    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 1, ContainerState.RUNNING);
    NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 2, ContainerState.COMPLETE);
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am0.getApplicationAttemptId(), 3, ContainerState.RUNNING);
    nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
    Thread.sleep(200);
    // check whether current am could get containerCompleteMsg
    RMApp recoveredApp0 = rm2.getRMContext().getRMApps().get(app0.getApplicationId());
    RMAppAttempt loadedAttempt1 = recoveredApp0.getCurrentAppAttempt();
    assertEquals(1, loadedAttempt1.getJustFinishedContainers().size());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) Resource(org.apache.hadoop.yarn.api.records.Resource) ArrayList(java.util.ArrayList) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) Container(org.apache.hadoop.yarn.api.records.Container) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) Test(org.junit.Test)

Example 63 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestWorkPreservingRMRestart method testNewContainersNotAllocatedDuringSchedulerRecovery.

@Test(timeout = 20000)
public void testNewContainersNotAllocatedDuringSchedulerRecovery() throws Exception {
    conf.setLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, 4000);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    // Restart RM
    rm2 = new MockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    nm1.registerNode();
    ControlledClock clock = new ControlledClock();
    long startTime = System.currentTimeMillis();
    ((RMContextImpl) rm2.getRMContext()).setSystemClock(clock);
    am1.setAMRMProtocol(rm2.getApplicationMasterService(), rm2.getRMContext());
    am1.registerAppAttempt(true);
    rm2.waitForState(app1.getApplicationId(), RMAppState.RUNNING);
    // AM request for new containers
    am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
    List<Container> containers = new ArrayList<Container>();
    clock.setTime(startTime + 2000);
    nm1.nodeHeartbeat(true);
    // sleep some time as allocation happens asynchronously.
    Thread.sleep(3000);
    containers.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
    // container is not allocated during scheduling recovery.
    Assert.assertTrue(containers.isEmpty());
    clock.setTime(startTime + 8000);
    nm1.nodeHeartbeat(true);
    // Container is created after recovery is done.
    while (containers.isEmpty()) {
        containers.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) ArrayList(java.util.ArrayList) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Container(org.apache.hadoop.yarn.api.records.Container) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) Test(org.junit.Test)

Example 64 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestWorkPreservingRMRestart method testContainersNotRecoveredForCompletedApps.

// Apps already completed before RM restart. Restarted RM scheduler should not
// recover containers for completed apps.
@Test(timeout = 20000)
public void testContainersNotRecoveredForCompletedApps() throws Exception {
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    MockRM.finishAMAndVerifyAppState(app1, rm1, nm1, am1);
    rm2 = new MockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(runningContainer, completedContainer), null);
    RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    assertEquals(RMAppState.FINISHED, recoveredApp1.getState());
    // Wait for RM to settle down on recovering containers;
    Thread.sleep(3000);
    YarnScheduler scheduler = rm2.getResourceScheduler();
    // scheduler should not recover containers for finished apps.
    assertNull(scheduler.getRMContainer(runningContainer.getContainerId()));
    assertNull(scheduler.getRMContainer(completedContainer.getContainerId()));
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) YarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) Test(org.junit.Test)

Example 65 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestRMDelegationTokens method testRMDTMasterKeyStateOnRollingMasterKey.

// Test the DT mast key in the state-store when the mast key is being rolled.
@Test(timeout = 15000)
public void testRMDTMasterKeyStateOnRollingMasterKey() throws Exception {
    Configuration conf = new Configuration(testConf);
    conf.set("hadoop.security.authentication", "kerberos");
    UserGroupInformation.setConfiguration(conf);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    RMState rmState = memStore.getState();
    Map<RMDelegationTokenIdentifier, Long> rmDTState = rmState.getRMDTSecretManagerState().getTokenState();
    Set<DelegationKey> rmDTMasterKeyState = rmState.getRMDTSecretManagerState().getMasterKeyState();
    MockRM rm1 = new MyMockRM(conf, memStore);
    rm1.start();
    // on rm start, two master keys are created.
    // One is created at RMDTSecretMgr.startThreads.updateCurrentKey();
    // the other is created on the first run of
    // tokenRemoverThread.rollMasterKey()
    RMDelegationTokenSecretManager dtSecretManager = rm1.getRMContext().getRMDelegationTokenSecretManager();
    // assert all master keys are saved
    Assert.assertEquals(dtSecretManager.getAllMasterKeys(), rmDTMasterKeyState);
    // request to generate a RMDelegationToken
    GetDelegationTokenRequest request = mock(GetDelegationTokenRequest.class);
    when(request.getRenewer()).thenReturn("renewer1");
    GetDelegationTokenResponse response = rm1.getClientRMService().getDelegationToken(request);
    org.apache.hadoop.yarn.api.records.Token delegationToken = response.getRMDelegationToken();
    Token<RMDelegationTokenIdentifier> token1 = ConverterUtils.convertFromYarn(delegationToken, (Text) null);
    RMDelegationTokenIdentifier dtId1 = token1.decodeIdentifier();
    // in state-store also.
    while (((TestRMDelegationTokenSecretManager) dtSecretManager).numUpdatedKeys.get() < 3) {
        ((TestRMDelegationTokenSecretManager) dtSecretManager).checkCurrentKeyInStateStore(rmDTMasterKeyState);
        Thread.sleep(100);
    }
    // wait for token to expire and remove from state-store
    // rollMasterKey is called every 1 second.
    int count = 0;
    while (rmDTState.containsKey(dtId1) && count < 100) {
        Thread.sleep(100);
        count++;
    }
    rm1.stop();
}
Also used : YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Configuration(org.apache.hadoop.conf.Configuration) GetDelegationTokenResponse(org.apache.hadoop.yarn.api.protocolrecords.GetDelegationTokenResponse) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) RMDelegationTokenIdentifier(org.apache.hadoop.yarn.security.client.RMDelegationTokenIdentifier) GetDelegationTokenRequest(org.apache.hadoop.yarn.api.protocolrecords.GetDelegationTokenRequest) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) DelegationKey(org.apache.hadoop.security.token.delegation.DelegationKey) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) Test(org.junit.Test)

Aggregations

MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)84 Test (org.junit.Test)81 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)68 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)28 ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData)27 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)23 TestSecurityMockRM (org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM)22 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)21 RMState (org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState)21 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)20 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)20 MockRM (org.apache.hadoop.yarn.server.resourcemanager.MockRM)19 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)16 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)16 IOException (java.io.IOException)15 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)14 Configuration (org.apache.hadoop.conf.Configuration)12 AbstractYarnScheduler (org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler)10 ArrayList (java.util.ArrayList)9 ApplicationAccessType (org.apache.hadoop.yarn.api.records.ApplicationAccessType)9