Search in sources :

Example 56 with MockNM

use of org.apache.hadoop.yarn.server.resourcemanager.MockNM in project hadoop by apache.

the class TestDelegationTokenRenewer method testRenewTokenUsingTokenConfProvidedByApp.

// Test DelegationTokenRenewer uses the tokenConf provided by application
// for token renewal.
@Test
public void testRenewTokenUsingTokenConfProvidedByApp() throws Exception {
    conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
    UserGroupInformation.setConfiguration(conf);
    MockRM rm = new TestSecurityMockRM(conf, null);
    rm.start();
    final MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm.getResourceTrackerService());
    nm1.registerNode();
    // create a token
    Text userText1 = new Text("user1");
    DelegationTokenIdentifier dtId1 = new DelegationTokenIdentifier(userText1, new Text("renewer1"), userText1);
    final Token<DelegationTokenIdentifier> token1 = new Token<DelegationTokenIdentifier>(dtId1.getBytes(), "password1".getBytes(), dtId1.getKind(), new Text("service1"));
    Credentials credentials = new Credentials();
    credentials.addToken(userText1, token1);
    // create token conf for renewal
    Configuration appConf = new Configuration(false);
    appConf.set("dfs.nameservices", "mycluster1,mycluster2");
    appConf.set("dfs.namenode.rpc-address.mycluster2.nn1", "123.0.0.1");
    appConf.set("dfs.namenode.rpc-address.mycluster2.nn2", "123.0.0.2");
    appConf.set("dfs.ha.namenodes.mycluster2", "nn1,nn2");
    appConf.set("dfs.client.failover.proxy.provider.mycluster2", "provider");
    DataOutputBuffer dob = new DataOutputBuffer();
    appConf.write(dob);
    ByteBuffer tokenConf = ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
    final int confSize = appConf.size();
    // submit app
    RMApp app = rm.submitApp(credentials, tokenConf);
    GenericTestUtils.waitFor(new Supplier<Boolean>() {

        public Boolean get() {
            DelegationTokenToRenew toRenew = rm.getRMContext().getDelegationTokenRenewer().getAllTokens().get(token1);
            // the specific config we added.
            return toRenew != null && toRenew.conf != null && toRenew.conf.size() == confSize && toRenew.conf.get("dfs.namenode.rpc-address.mycluster2.nn1").equals("123.0.0.1");
        }
    }, 200, 10000);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) DelegationTokenIdentifier(org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) Text(org.apache.hadoop.io.Text) InvalidToken(org.apache.hadoop.security.token.SecretManager.InvalidToken) Token(org.apache.hadoop.security.token.Token) DelegationTokenToRenew(org.apache.hadoop.yarn.server.resourcemanager.security.DelegationTokenRenewer.DelegationTokenToRenew) ByteBuffer(java.nio.ByteBuffer) DataInputByteBuffer(org.apache.hadoop.io.DataInputByteBuffer) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) DataOutputBuffer(org.apache.hadoop.io.DataOutputBuffer) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Credentials(org.apache.hadoop.security.Credentials) Test(org.junit.Test)

Example 57 with MockNM

use of org.apache.hadoop.yarn.server.resourcemanager.MockNM in project hadoop by apache.

the class TestAMRestart method testAMRestartNotLostContainerCompleteMsg.

@Test(timeout = 30000)
public void testAMRestartNotLostContainerCompleteMsg() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
    MockRM rm1 = new MockRM(conf);
    rm1.start();
    RMApp app1 = rm1.submitApp(200, "name", "user", new HashMap<ApplicationAccessType, String>(), false, "default", -1, null, "MAPREDUCE", false, true);
    MockNM nm1 = new MockNM("127.0.0.1:1234", 10240, rm1.getResourceTrackerService());
    nm1.registerNode();
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    allocateContainers(nm1, am1, 1);
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
    ContainerId containerId2 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 2);
    rm1.waitForState(nm1, containerId2, RMContainerState.RUNNING);
    // container complete
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 2, ContainerState.COMPLETE);
    rm1.waitForState(nm1, containerId2, RMContainerState.COMPLETED);
    // before this msg pass to AM, AM may crash
    while (true) {
        AllocateResponse response = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>());
        List<ContainerStatus> containerStatuses = response.getCompletedContainersStatuses();
        if (isContainerIdInContainerStatus(containerStatuses, containerId2) == false) {
            Thread.sleep(100);
            continue;
        }
        // is containerId still in justFinishedContainer?
        containerStatuses = app1.getCurrentAppAttempt().getJustFinishedContainers();
        if (isContainerIdInContainerStatus(containerStatuses, containerId2)) {
            Assert.fail();
        }
        break;
    }
    // fail the AM by sending CONTAINER_FINISHED event without registering.
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // wait for app to start a new attempt.
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    // assert this is a new AM.
    ApplicationAttemptId newAttemptId = app1.getCurrentAppAttempt().getAppAttemptId();
    Assert.assertFalse(newAttemptId.equals(am1.getApplicationAttemptId()));
    // launch the new AM
    RMAppAttempt attempt2 = app1.getCurrentAppAttempt();
    MockAM am2 = rm1.launchAndRegisterAM(app1, rm1, nm1);
    // whether new AM could get container complete msg
    AllocateResponse allocateResponse = am2.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>());
    List<ContainerStatus> containerStatuses = allocateResponse.getCompletedContainersStatuses();
    if (isContainerIdInContainerStatus(containerStatuses, containerId2) == false) {
        Assert.fail();
    }
    containerStatuses = attempt2.getJustFinishedContainers();
    if (isContainerIdInContainerStatus(containerStatuses, containerId2)) {
        Assert.fail();
    }
    // the second allocate should not get container complete msg
    allocateResponse = am2.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>());
    containerStatuses = allocateResponse.getCompletedContainersStatuses();
    if (isContainerIdInContainerStatus(containerStatuses, containerId2)) {
        Assert.fail();
    }
    rm1.stop();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) ArrayList(java.util.ArrayList) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) AllocateResponse(org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ApplicationAccessType(org.apache.hadoop.yarn.api.records.ApplicationAccessType) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) Test(org.junit.Test)

Example 58 with MockNM

use of org.apache.hadoop.yarn.server.resourcemanager.MockNM in project hadoop by apache.

the class TestAMRestart method testRMAppAttemptFailuresValidityInterval.

@Test(timeout = 120000)
public void testRMAppAttemptFailuresValidityInterval() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    // explicitly set max-am-retry count as 2.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
    nm1.registerNode();
    // set window size to a larger number : 60s
    // we will verify the app should be failed if
    // two continuous attempts failed in 60s.
    RMApp app = rm1.submitApp(200, 60000, false);
    MockAM am = MockRM.launchAM(app, rm1, nm1);
    // Fail current attempt normally
    nm1.nodeHeartbeat(am.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // launch the second attempt
    rm1.waitForState(app.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(2, app.getAppAttempts().size());
    MockAM am_2 = MockRM.launchAndRegisterAM(app, rm1, nm1);
    rm1.waitForState(am_2.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    nm1.nodeHeartbeat(am_2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am_2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // current app should be failed.
    rm1.waitForState(app.getApplicationId(), RMAppState.FAILED);
    ControlledClock clock = new ControlledClock();
    // set window size to 10s
    RMAppImpl app1 = (RMAppImpl) rm1.submitApp(200, 10000, false);
    app1.setSystemClock(clock);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    // Fail attempt1 normally
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    //Wait to make sure attempt1 be removed in State Store
    //TODO explore a better way than sleeping for a while (YARN-4929)
    Thread.sleep(15 * 1000);
    // launch the second attempt
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(2, app1.getAppAttempts().size());
    RMAppAttempt attempt2 = app1.getCurrentAppAttempt();
    MockAM am2 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    // wait for 10 seconds
    clock.setTime(System.currentTimeMillis() + 10 * 1000);
    // Fail attempt2 normally
    nm1.nodeHeartbeat(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    // can launch the third attempt successfully
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(3, app1.getAppAttempts().size());
    RMAppAttempt attempt3 = app1.getCurrentAppAttempt();
    clock.reset();
    MockAM am3 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    rm1.waitForState(am3.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    // Restart rm.
    @SuppressWarnings("resource") MockRM rm2 = new MockRM(conf, memStore);
    rm2.start();
    ApplicationStateData app1State = memStore.getState().getApplicationState().get(app1.getApplicationId());
    Assert.assertEquals(1, app1State.getFirstAttemptId());
    // re-register the NM
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
    status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
    status.setContainerId(attempt3.getMasterContainer().getId());
    status.setContainerState(ContainerState.COMPLETE);
    status.setDiagnostics("");
    nm1.registerNode(Collections.singletonList(status), null);
    rm2.waitForState(attempt3.getAppAttemptId(), RMAppAttemptState.FAILED);
    //Wait to make sure attempt3 be removed in State Store
    //TODO explore a better way than sleeping for a while (YARN-4929)
    Thread.sleep(15 * 1000);
    Assert.assertEquals(2, app1State.getAttemptCount());
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    // Lauch Attempt 4
    MockAM am4 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 4, nm1);
    // wait for 10 seconds
    clock.setTime(System.currentTimeMillis() + 10 * 1000);
    // Fail attempt4 normally
    nm1.nodeHeartbeat(am4.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm2.waitForState(am4.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    Assert.assertEquals(2, app1State.getAttemptCount());
    // can launch the 5th attempt successfully
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am5 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 5, nm1);
    clock.reset();
    rm2.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.RUNNING);
    // Fail attempt5 normally
    nm1.nodeHeartbeat(am5.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm2.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    Assert.assertEquals(2, app1State.getAttemptCount());
    rm2.waitForState(app1.getApplicationId(), RMAppState.FAILED);
    rm1.stop();
    rm2.stop();
}
Also used : RMAppImpl(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMAppImpl) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ControlledClock(org.apache.hadoop.yarn.util.ControlledClock) Test(org.junit.Test)

Example 59 with MockNM

use of org.apache.hadoop.yarn.server.resourcemanager.MockNM in project hadoop by apache.

the class TestAMRestart method testPreemptedAMRestartOnRMRestart.

// Test RM restarts after AM container is preempted, new RM should not count
// AM preemption failure towards the max-retry-account and should be able to
// re-launch the AM.
@Test(timeout = 20000)
public void testPreemptedAMRestartOnRMRestart() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    // explicitly set max-am-retry count as 1.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    CapacityScheduler scheduler = (CapacityScheduler) rm1.getResourceScheduler();
    ContainerId amContainer = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
    // Forcibly preempt the am container;
    scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    Assert.assertTrue(!attempt1.shouldCountTowardsMaxAttemptRetry());
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    // state store has 1 attempt stored.
    ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
    Assert.assertEquals(1, appState.getAttemptCount());
    // attempt stored has the preempted container exit status.
    Assert.assertEquals(ContainerExitStatus.PREEMPTED, appState.getAttempt(am1.getApplicationAttemptId()).getAMContainerExitStatus());
    // Restart rm.
    MockRM rm2 = new MockRM(conf, memStore);
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    nm1.registerNode();
    rm2.start();
    // Restarted RM should re-launch the am.
    MockAM am2 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
    MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
    RMAppAttempt attempt2 = rm2.getRMContext().getRMApps().get(app1.getApplicationId()).getCurrentAppAttempt();
    Assert.assertTrue(attempt2.shouldCountTowardsMaxAttemptRetry());
    Assert.assertEquals(ContainerExitStatus.INVALID, appState.getAttempt(am2.getApplicationAttemptId()).getAMContainerExitStatus());
    rm1.stop();
    rm2.stop();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) CapacityScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler) Test(org.junit.Test)

Example 60 with MockNM

use of org.apache.hadoop.yarn.server.resourcemanager.MockNM in project hadoop by apache.

the class TestAMRestart method testShouldNotCountFailureToMaxAttemptRetry.

// AM container preempted, nm disk failure
// should not be counted towards AM max retry count.
@Test(timeout = 100000)
public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    // explicitly set max-am-retry count as 1.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    CapacityScheduler scheduler = (CapacityScheduler) rm1.getResourceScheduler();
    ContainerId amContainer = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
    // Preempt the first attempt;
    scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am1.getApplicationAttemptId());
    Assert.assertTrue(!attempt1.shouldCountTowardsMaxAttemptRetry());
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
    // AM should be restarted even though max-am-attempt is 1.
    MockAM am2 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
    RMAppAttempt attempt2 = app1.getCurrentAppAttempt();
    // Preempt the second attempt.
    ContainerId amContainer2 = ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
    scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer2));
    rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am2.getApplicationAttemptId());
    Assert.assertTrue(!attempt2.shouldCountTowardsMaxAttemptRetry());
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am3 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 3, nm1);
    RMAppAttempt attempt3 = app1.getCurrentAppAttempt();
    // mimic NM disk_failure
    ContainerStatus containerStatus = Records.newRecord(ContainerStatus.class);
    containerStatus.setContainerId(attempt3.getMasterContainer().getId());
    containerStatus.setDiagnostics("mimic NM disk_failure");
    containerStatus.setState(ContainerState.COMPLETE);
    containerStatus.setExitStatus(ContainerExitStatus.DISKS_FAILED);
    Map<ApplicationId, List<ContainerStatus>> conts = new HashMap<ApplicationId, List<ContainerStatus>>();
    conts.put(app1.getApplicationId(), Collections.singletonList(containerStatus));
    nm1.nodeHeartbeat(conts, true);
    rm1.waitForState(am3.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am3.getApplicationAttemptId());
    Assert.assertTrue(!attempt3.shouldCountTowardsMaxAttemptRetry());
    Assert.assertEquals(ContainerExitStatus.DISKS_FAILED, appState.getAttempt(am3.getApplicationAttemptId()).getAMContainerExitStatus());
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am4 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 4, nm1);
    RMAppAttempt attempt4 = app1.getCurrentAppAttempt();
    // create second NM, and register to rm1
    MockNM nm2 = new MockNM("127.0.0.1:2234", 8000, rm1.getResourceTrackerService());
    nm2.registerNode();
    // nm1 heartbeats to report unhealthy
    // This will mimic ContainerExitStatus.ABORT
    nm1.nodeHeartbeat(false);
    rm1.waitForState(am4.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am4.getApplicationAttemptId());
    Assert.assertTrue(!attempt4.shouldCountTowardsMaxAttemptRetry());
    Assert.assertEquals(ContainerExitStatus.ABORTED, appState.getAttempt(am4.getApplicationAttemptId()).getAMContainerExitStatus());
    // launch next AM in nm2
    MockAM am5 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 5, nm2);
    RMAppAttempt attempt5 = app1.getCurrentAppAttempt();
    // fail the AM normally
    nm2.nodeHeartbeat(am5.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am5.getApplicationAttemptId());
    Assert.assertTrue(attempt5.shouldCountTowardsMaxAttemptRetry());
    // AM should not be restarted.
    rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED);
    Assert.assertEquals(5, app1.getAppAttempts().size());
    rm1.stop();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) HashMap(java.util.HashMap) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerStatus(org.apache.hadoop.yarn.api.records.ContainerStatus) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) ArrayList(java.util.ArrayList) List(java.util.List) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) CapacityScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.CapacityScheduler) Test(org.junit.Test)

Aggregations

MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)224 Test (org.junit.Test)218 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)196 MockRM (org.apache.hadoop.yarn.server.resourcemanager.MockRM)128 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)127 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)79 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)69 FiCaSchedulerApp (org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp)48 ClientResponse (com.sun.jersey.api.client.ClientResponse)47 Configuration (org.apache.hadoop.conf.Configuration)47 WebResource (com.sun.jersey.api.client.WebResource)39 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)38 RMNode (org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode)37 JSONObject (org.codehaus.jettison.json.JSONObject)37 NodeUpdateSchedulerEvent (org.apache.hadoop.yarn.server.resourcemanager.scheduler.event.NodeUpdateSchedulerEvent)36 DrainDispatcher (org.apache.hadoop.yarn.event.DrainDispatcher)33 Container (org.apache.hadoop.yarn.api.records.Container)29 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)28 Job (org.apache.hadoop.mapreduce.v2.app.job.Job)23 RMContainer (org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainer)22