Search in sources :

Example 16 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestWorkPreservingRMRestart method testCapacitySchedulerQueueRemovedRecovery.

//Test behavior of an app if queue is removed during recovery. Test case does
//following:
//1. Add some apps to two queues, attempt to add an app to a non-existant
//   queue to verify that the new logic is not in effect during normal app
//   submission
//2. Remove one of the queues, restart the RM, once with fail fast config as
//   false and once with fail fast as true.
//3. Verify that app was killed if fail fast is false.
//4. Verify that QueueException was thrown if fail fast is true.
@Test(timeout = 30000)
public void testCapacitySchedulerQueueRemovedRecovery() throws Exception {
    if (getSchedulerType() != SchedulerType.CAPACITY) {
        return;
    }
    conf.setBoolean(CapacitySchedulerConfiguration.ENABLE_USER_METRICS, true);
    conf.set(CapacitySchedulerConfiguration.RESOURCE_CALCULATOR_CLASS, DominantResourceCalculator.class.getName());
    CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration(conf);
    setupQueueConfiguration(csConf);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(csConf);
    rm1 = new MockRM(csConf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    MockNM nm2 = new MockNM("127.1.1.1:4321", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    nm2.registerNode();
    RMApp app1_1 = rm1.submitApp(1024, "app1_1", USER_1, null, A);
    MockAM am1_1 = MockRM.launchAndRegisterAM(app1_1, rm1, nm1);
    RMApp app1_2 = rm1.submitApp(1024, "app1_2", USER_1, null, A);
    MockAM am1_2 = MockRM.launchAndRegisterAM(app1_2, rm1, nm2);
    RMApp app2 = rm1.submitApp(1024, "app2", USER_2, null, B);
    MockAM am2 = MockRM.launchAndRegisterAM(app2, rm1, nm2);
    assertEquals(rm1.getApplicationReport(app2.getApplicationId()).getYarnApplicationState(), YarnApplicationState.RUNNING);
    //Submit an app with a non existant queue to make sure it does not
    //cause a fatal failure in the non-recovery case
    RMApp appNA = rm1.submitApp(1024, "app1_2", USER_1, null, QUEUE_DOESNT_EXIST, false);
    // clear queue metrics
    rm1.clearQueueMetrics(app1_1);
    rm1.clearQueueMetrics(app1_2);
    rm1.clearQueueMetrics(app2);
    // Take a copy of state store so that it can be reset to this state.
    RMState state = memStore.loadState();
    // Set new configuration with QueueB removed.
    csConf = new CapacitySchedulerConfiguration(conf);
    setupQueueConfigurationOnlyA(csConf);
    String diags = "Application killed on recovery as it was submitted to " + "queue QueueB which no longer exists after restart.";
    verifyAppRecoveryWithWrongQueueConfig(csConf, app2, diags, memStore, state);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) DominantResourceCalculator(org.apache.hadoop.yarn.util.resource.DominantResourceCalculator) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) CapacitySchedulerConfiguration(org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacitySchedulerConfiguration) Test(org.junit.Test)

Example 17 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestWorkPreservingRMRestart method testAMfailedBetweenRMRestart.

// Test RM shuts down, in the meanwhile, AM fails. Restarted RM scheduler
// should not recover the containers that belong to the failed AM.
@Test(timeout = 20000)
public void testAMfailedBetweenRMRestart() throws Exception {
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    conf.setLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, 0);
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    rm2 = new MockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NMContainerStatus amContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    NMContainerStatus runningContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    NMContainerStatus completedContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(amContainer, runningContainer, completedContainer), null);
    rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // Wait for RM to settle down on recovering containers;
    Thread.sleep(3000);
    YarnScheduler scheduler = rm2.getResourceScheduler();
    // Previous AM failed, The failed AM should once again release the
    // just-recovered containers.
    assertNull(scheduler.getRMContainer(runningContainer.getContainerId()));
    assertNull(scheduler.getRMContainer(completedContainer.getContainerId()));
    rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
    MockNM nm2 = new MockNM("127.1.1.1:4321", 8192, rm2.getResourceTrackerService());
    NMContainerStatus previousAttemptContainer = TestRMRestart.createNMContainerStatus(am1.getApplicationAttemptId(), 4, ContainerState.RUNNING);
    nm2.registerNode(Arrays.asList(previousAttemptContainer), null);
    // Wait for RM to settle down on recovering containers;
    Thread.sleep(3000);
    // check containers from previous failed attempt should not be recovered.
    assertNull(scheduler.getRMContainer(previousAttemptContainer.getContainerId()));
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) YarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) Test(org.junit.Test)

Example 18 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestWorkPreservingRMRestart method testRecoverSchedulerAppAndAttemptSynchronously.

@Test(timeout = 20000)
public void testRecoverSchedulerAppAndAttemptSynchronously() throws Exception {
    // start RM
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    // create app and launch the AM
    RMApp app0 = rm1.submitApp(200);
    MockAM am0 = MockRM.launchAndRegisterAM(app0, rm1, nm1);
    rm2 = new MockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // scheduler app/attempt is immediately available after RM is re-started.
    Assert.assertNotNull(rm2.getResourceScheduler().getSchedulerAppInfo(am0.getApplicationAttemptId()));
    // getTransferredContainers should not throw NPE.
    rm2.getResourceScheduler().getTransferredContainers(am0.getApplicationAttemptId());
    List<NMContainerStatus> containers = createNMContainerStatusForApp(am0);
    nm1.registerNode(containers, null);
    waitForNumContainersToRecover(2, rm2, am0.getApplicationAttemptId());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) Test(org.junit.Test)

Example 19 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestAMLivelinessMonitor method testResetTimer.

@Test(timeout = 10000)
public void testResetTimer() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    UserGroupInformation.setConfiguration(conf);
    conf.set(YarnConfiguration.RECOVERY_ENABLED, "true");
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
    conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 6000);
    final ControlledClock clock = new ControlledClock();
    clock.setTime(0);
    MemoryRMStateStore memStore = new MemoryRMStateStore() {

        @Override
        public synchronized RMState loadState() throws Exception {
            clock.setTime(8000);
            return super.loadState();
        }
    };
    memStore.init(conf);
    final ApplicationAttemptId attemptId = mock(ApplicationAttemptId.class);
    final Dispatcher dispatcher = mock(Dispatcher.class);
    final boolean[] expired = new boolean[] { false };
    final AMLivelinessMonitor monitor = new AMLivelinessMonitor(dispatcher, clock) {

        @Override
        protected void expire(ApplicationAttemptId id) {
            Assert.assertEquals(id, attemptId);
            expired[0] = true;
        }
    };
    monitor.register(attemptId);
    MockRM rm = new MockRM(conf, memStore) {

        @Override
        protected AMLivelinessMonitor createAMLivelinessMonitor() {
            return monitor;
        }
    };
    rm.start();
    // make sure that monitor has started
    while (monitor.getServiceState() != Service.STATE.STARTED) {
        Thread.sleep(100);
    }
    // expired[0] would be set to true without resetTimer
    Assert.assertFalse(expired[0]);
    rm.stop();
}
Also used : MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) Dispatcher(org.apache.hadoop.yarn.event.Dispatcher) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Test(org.junit.Test)

Example 20 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestDelegationTokenRenewer method testRMRestartWithExpiredToken.

// 1. token is expired before app completes.
// 2. RM shutdown.
// 3. When RM recovers the app, token renewal will fail as token expired.
//    RM should request a new token and sent it to NM for log-aggregation.
@Test
public void testRMRestartWithExpiredToken() throws Exception {
    Configuration yarnConf = new YarnConfiguration();
    yarnConf.setBoolean(YarnConfiguration.RM_PROXY_USER_PRIVILEGES_ENABLED, true);
    yarnConf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
    yarnConf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    yarnConf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    UserGroupInformation.setConfiguration(yarnConf);
    // create Token1:
    Text userText1 = new Text("user1");
    DelegationTokenIdentifier dtId1 = new DelegationTokenIdentifier(userText1, new Text("renewer1"), userText1);
    final Token<DelegationTokenIdentifier> originalToken = new Token<>(dtId1.getBytes(), "password1".getBytes(), dtId1.getKind(), new Text("service1"));
    Credentials credentials = new Credentials();
    credentials.addToken(userText1, originalToken);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(yarnConf);
    MockRM rm1 = new TestSecurityMockRM(yarnConf, memStore);
    rm1.start();
    RMApp app = rm1.submitApp(200, "name", "user", new HashMap<ApplicationAccessType, String>(), false, "default", 1, credentials);
    // create token2
    Text userText2 = new Text("user1");
    DelegationTokenIdentifier dtId2 = new DelegationTokenIdentifier(userText1, new Text("renewer2"), userText2);
    final Token<DelegationTokenIdentifier> updatedToken = new Token<DelegationTokenIdentifier>(dtId2.getBytes(), "password2".getBytes(), dtId2.getKind(), new Text("service2"));
    AtomicBoolean firstRenewInvoked = new AtomicBoolean(false);
    AtomicBoolean secondRenewInvoked = new AtomicBoolean(false);
    MockRM rm2 = new TestSecurityMockRM(yarnConf, memStore) {

        @Override
        protected DelegationTokenRenewer createDelegationTokenRenewer() {
            return new DelegationTokenRenewer() {

                @Override
                protected void renewToken(final DelegationTokenToRenew dttr) throws IOException {
                    if (dttr.token.equals(updatedToken)) {
                        secondRenewInvoked.set(true);
                        super.renewToken(dttr);
                    } else if (dttr.token.equals(originalToken)) {
                        firstRenewInvoked.set(true);
                        throw new InvalidToken("Failed to renew");
                    } else {
                        throw new IOException("Unexpected");
                    }
                }

                @Override
                protected Token<?>[] obtainSystemTokensForUser(String user, final Credentials credentials) throws IOException {
                    credentials.addToken(updatedToken.getService(), updatedToken);
                    return new Token<?>[] { updatedToken };
                }
            };
        }
    };
    // simulating restart the rm
    rm2.start();
    // check nm can retrieve the token
    final MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
    nm1.registerNode();
    NodeHeartbeatResponse response = nm1.nodeHeartbeat(true);
    ByteBuffer tokenBuffer = response.getSystemCredentialsForApps().get(app.getApplicationId());
    Assert.assertNotNull(tokenBuffer);
    Credentials appCredentials = new Credentials();
    DataInputByteBuffer buf = new DataInputByteBuffer();
    tokenBuffer.rewind();
    buf.reset(tokenBuffer);
    appCredentials.readTokenStorageStream(buf);
    Assert.assertTrue(firstRenewInvoked.get() && secondRenewInvoked.get());
    Assert.assertTrue(appCredentials.getAllTokens().contains(updatedToken));
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) DelegationTokenIdentifier(org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) DataInputByteBuffer(org.apache.hadoop.io.DataInputByteBuffer) Text(org.apache.hadoop.io.Text) InvalidToken(org.apache.hadoop.security.token.SecretManager.InvalidToken) Token(org.apache.hadoop.security.token.Token) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) DelegationTokenToRenew(org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer.DelegationTokenToRenew) IOException(java.io.IOException) ByteBuffer(java.nio.ByteBuffer) DataInputByteBuffer(org.apache.hadoop.io.DataInputByteBuffer) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ApplicationAccessType(org.apache.hadoop.yarn.api.records.ApplicationAccessType) InvalidToken(org.apache.hadoop.security.token.SecretManager.InvalidToken) Credentials(org.apache.hadoop.security.Credentials) Test(org.junit.Test)

Aggregations

MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)84 Test (org.junit.Test)81 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)68 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)28 ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData)27 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)23 TestSecurityMockRM (org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM)22 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)21 RMState (org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState)21 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)20 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)20 MockRM (org.apache.hadoop.yarn.server.resourcemanager.MockRM)19 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)16 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)16 IOException (java.io.IOException)15 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)14 Configuration (org.apache.hadoop.conf.Configuration)12 AbstractYarnScheduler (org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler)10 ArrayList (java.util.ArrayList)9 ApplicationAccessType (org.apache.hadoop.yarn.api.records.ApplicationAccessType)9