Search in sources :

Example 36 with ApplicationStateData

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.

the class TestAMRestart method testRMRestartOrFailoverNotCountedForAMFailures.

// Test regular RM restart/failover, new RM should not count
// AM failure towards the max-retry-account and should be able to
// re-launch the AM.
@Test(timeout = 50000)
public void testRMRestartOrFailoverNotCountedForAMFailures() throws Exception {
    YarnConfiguration conf = new YarnConfiguration();
    conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
    conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
    conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
    conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
    // explicitly set max-am-retry count as 1.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    MockRM rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    // AM should be restarted even though max-am-attempt is 1.
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    // Restart rm.
    MockRM rm2 = new MockRM(conf, memStore);
    rm2.start();
    ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
    // re-register the NM
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NMContainerStatus status = Records.newRecord(NMContainerStatus.class);
    status.setContainerExitStatus(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER);
    status.setContainerId(attempt1.getMasterContainer().getId());
    status.setContainerState(ContainerState.COMPLETE);
    status.setDiagnostics("");
    nm1.registerNode(Collections.singletonList(status), null);
    rm2.waitForState(attempt1.getAppAttemptId(), RMAppAttemptState.FAILED);
    Assert.assertEquals(ContainerExitStatus.KILLED_BY_RESOURCEMANAGER, appState.getAttempt(am1.getApplicationAttemptId()).getAMContainerExitStatus());
    // Will automatically start a new AppAttempt in rm2
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am2 = rm2.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
    MockRM.finishAMAndVerifyAppState(app1, rm2, nm1, am2);
    RMAppAttempt attempt3 = rm2.getRMContext().getRMApps().get(app1.getApplicationId()).getCurrentAppAttempt();
    Assert.assertTrue(attempt3.shouldCountTowardsMaxAttemptRetry());
    Assert.assertEquals(ContainerExitStatus.INVALID, appState.getAttempt(am2.getApplicationAttemptId()).getAMContainerExitStatus());
    rm1.stop();
    rm2.stop();
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) MockAM(org.apache.hadoop.yarn.server.resourcemanager.MockAM) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) Test(org.junit.Test)

Example 37 with ApplicationStateData

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.

the class TestZKRMStateStore method testDuplicateRMAppDeletion.

@Test
public void testDuplicateRMAppDeletion() throws Exception {
    TestZKRMStateStoreTester zkTester = new TestZKRMStateStoreTester();
    long submitTime = System.currentTimeMillis();
    long startTime = System.currentTimeMillis() + 1234;
    RMStateStore store = zkTester.getRMStateStore();
    TestDispatcher dispatcher = new TestDispatcher();
    store.setRMDispatcher(dispatcher);
    ApplicationAttemptId attemptIdRemoved = ApplicationAttemptId.fromString("appattempt_1352994193343_0002_000001");
    ApplicationId appIdRemoved = attemptIdRemoved.getApplicationId();
    storeApp(store, appIdRemoved, submitTime, startTime);
    storeAttempt(store, attemptIdRemoved, "container_1352994193343_0002_01_000001", null, null, dispatcher);
    ApplicationSubmissionContext context = new ApplicationSubmissionContextPBImpl();
    context.setApplicationId(appIdRemoved);
    ApplicationStateData appStateRemoved = ApplicationStateData.newInstance(submitTime, startTime, context, "user1");
    appStateRemoved.attempts.put(attemptIdRemoved, null);
    store.removeApplicationStateInternal(appStateRemoved);
    try {
        store.removeApplicationStateInternal(appStateRemoved);
    } catch (KeeperException.NoNodeException nne) {
        Assert.fail("NoNodeException should not happen.");
    }
    store.close();
}
Also used : ApplicationSubmissionContextPBImpl(org.apache.hadoop.yarn.api.records.impl.pb.ApplicationSubmissionContextPBImpl) ApplicationSubmissionContext(org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) KeeperException(org.apache.zookeeper.KeeperException) Test(org.junit.Test)

Example 38 with ApplicationStateData

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.

the class CapacityScheduler method updateApplicationPriority.

@Override
public Priority updateApplicationPriority(Priority newPriority, ApplicationId applicationId, SettableFuture<Object> future, UserGroupInformation user) throws YarnException {
    try {
        writeLock.lock();
        Priority appPriority = null;
        SchedulerApplication<FiCaSchedulerApp> application = applications.get(applicationId);
        if (application == null) {
            throw new YarnException("Application '" + applicationId + "' is not present, hence could not change priority.");
        }
        RMApp rmApp = rmContext.getRMApps().get(applicationId);
        appPriority = checkAndGetApplicationPriority(newPriority, user, rmApp.getQueue(), applicationId);
        if (application.getPriority().equals(appPriority)) {
            future.set(null);
            return appPriority;
        }
        // Update new priority in Submission Context to update to StateStore.
        rmApp.getApplicationSubmissionContext().setPriority(appPriority);
        // Update to state store
        ApplicationStateData appState = ApplicationStateData.newInstance(rmApp.getSubmitTime(), rmApp.getStartTime(), rmApp.getApplicationSubmissionContext(), rmApp.getUser(), rmApp.getCallerContext());
        appState.setApplicationTimeouts(rmApp.getApplicationTimeouts());
        rmContext.getStateStore().updateApplicationStateSynchronously(appState, false, future);
        // As we use iterator over a TreeSet for OrderingPolicy, once we change
        // priority then reinsert back to make order correct.
        LeafQueue queue = (LeafQueue) getQueue(rmApp.getQueue());
        queue.updateApplicationPriority(application, appPriority);
        LOG.info("Priority '" + appPriority + "' is updated in queue :" + rmApp.getQueue() + " for application: " + applicationId + " for the user: " + rmApp.getUser());
        return appPriority;
    } finally {
        writeLock.unlock();
    }
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) Priority(org.apache.hadoop.yarn.api.records.Priority) FiCaSchedulerApp(org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) YarnException(org.apache.hadoop.yarn.exceptions.YarnException)

Example 39 with ApplicationStateData

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.

the class TestWorkPreservingRMRestart method testAppStateSavedButAttemptStateNotSaved.

// Test that if application state was saved, but attempt state was not saved.
// RM should start correctly.
@Test(timeout = 20000)
public void testAppStateSavedButAttemptStateNotSaved() throws Exception {
    MemoryRMStateStore memStore = new MemoryRMStateStore() {

        @Override
        public synchronized void updateApplicationAttemptStateInternal(ApplicationAttemptId appAttemptId, ApplicationAttemptStateData attemptState) {
        // do nothing;
        // simulate the failure that attempt final state is not saved.
        }
    };
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200);
    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
    MockRM.finishAMAndVerifyAppState(app1, rm1, nm1, am1);
    ApplicationStateData appSavedState = memStore.getState().getApplicationState().get(app1.getApplicationId());
    // check that app state is  saved.
    assertEquals(RMAppState.FINISHED, appSavedState.getState());
    // check that attempt state is not saved.
    assertNull(appSavedState.getAttempt(am1.getApplicationAttemptId()).getState());
    rm2 = new MockRM(conf, memStore);
    rm2.start();
    RMApp recoveredApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    assertEquals(RMAppState.FINISHED, recoveredApp1.getState());
    // check that attempt state is recovered correctly.
    assertEquals(RMAppAttemptState.FINISHED, recoveredApp1.getCurrentAppAttempt().getState());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) TestSecurityMockRM(org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationAttemptStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData) Test(org.junit.Test)

Example 40 with ApplicationStateData

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in project hadoop by apache.

the class TestRMRestart method testRMRestartWaitForPreviousAMToFinish.

@Test(timeout = 60000)
public void testRMRestartWaitForPreviousAMToFinish() throws Exception {
    // testing 3 cases
    // After RM restarts
    // 1) New application attempt is not started until previous AM container
    // finish event is reported back to RM as a part of nm registration.
    // 2) If previous AM container finish event is never reported back (i.e.
    // node manager on which this AM container was running also went down) in
    // that case AMLivenessMonitor should time out previous attempt and start
    // new attempt.
    // 3) If all the stored attempts had finished then new attempt should
    // be started immediately.
    YarnConfiguration conf = new YarnConfiguration(this.conf);
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 40);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    RMState rmState = memStore.getState();
    Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
    // start RM
    final MockRM rm1 = createMockRM(conf, memStore);
    rm1.start();
    AbstractYarnScheduler ys = (AbstractYarnScheduler) rm1.getResourceScheduler();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 16382, rm1.getResourceTrackerService());
    nm1.registerNode();
    // submitting app
    RMApp app1 = rm1.submitApp(200);
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    MockAM am1 = launchAM(app1, rm1, nm1);
    nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    // Fail first AM.
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am1.getApplicationAttemptId());
    // launch another AM.
    MockAM am2 = launchAM(app1, rm1, nm1);
    Assert.assertEquals(1, rmAppState.size());
    Assert.assertEquals(app1.getState(), RMAppState.RUNNING);
    Assert.assertEquals(app1.getAppAttempts().get(app1.getCurrentAppAttempt().getAppAttemptId()).getAppAttemptState(), RMAppAttemptState.RUNNING);
    //  start new RM.
    MockRM rm2 = createMockRM(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    NodeHeartbeatResponse res = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.RESYNC, res.getNodeAction());
    RMApp rmApp = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    // application should be in ACCEPTED state
    rm2.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
    // new attempt should not be started
    Assert.assertEquals(2, rmApp.getAppAttempts().size());
    // am1 attempt should be in FAILED state where as am2 attempt should be in
    // LAUNCHED state
    rm2.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.LAUNCHED);
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
    Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
    NMContainerStatus status = TestRMRestart.createNMContainerStatus(am2.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(status), null);
    rm2.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    ys = (AbstractYarnScheduler) rm2.getResourceScheduler();
    TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(ys, am2.getApplicationAttemptId());
    launchAM(rmApp, rm2, nm1);
    Assert.assertEquals(3, rmApp.getAppAttempts().size());
    rm2.waitForState(rmApp.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.RUNNING);
    // Now restart RM ...
    // Setting AMLivelinessMonitor interval to be 10 Secs. 
    conf.setInt(YarnConfiguration.RM_AM_EXPIRY_INTERVAL_MS, 10000);
    MockRM rm3 = createMockRM(conf, memStore);
    rm3.start();
    // Wait for RM to process all the events as a part of rm recovery.
    nm1.setResourceTrackerService(rm3.getResourceTrackerService());
    rmApp = rm3.getRMContext().getRMApps().get(app1.getApplicationId());
    // application should be in ACCEPTED state
    rm3.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(rmApp.getState(), RMAppState.ACCEPTED);
    // new attempt should not be started
    Assert.assertEquals(3, rmApp.getAppAttempts().size());
    // am1 and am2 attempts should be in FAILED state where as am3 should be
    // in LAUNCHED state
    rm3.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    rm3.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    ApplicationAttemptId latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
    rm3.waitForState(latestAppAttemptId, RMAppAttemptState.LAUNCHED);
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am1.getApplicationAttemptId()).getAppAttemptState());
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(am2.getApplicationAttemptId()).getAppAttemptState());
    Assert.assertEquals(RMAppAttemptState.LAUNCHED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
    rm3.waitForState(latestAppAttemptId, RMAppAttemptState.FAILED);
    rm3.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
    final int maxRetry = 10;
    final RMApp rmAppForCheck = rmApp;
    GenericTestUtils.waitFor(new Supplier<Boolean>() {

        @Override
        public Boolean get() {
            return new Boolean(rmAppForCheck.getAppAttempts().size() == 4);
        }
    }, 100, maxRetry);
    Assert.assertEquals(RMAppAttemptState.FAILED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
    latestAppAttemptId = rmApp.getCurrentAppAttempt().getAppAttemptId();
    // The 4th attempt has started but is not yet saved into RMStateStore
    // It will be saved only when we launch AM.
    // submitting app but not starting AM for it.
    RMApp app2 = rm3.submitApp(200);
    rm3.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(1, app2.getAppAttempts().size());
    Assert.assertEquals(0, memStore.getState().getApplicationState().get(app2.getApplicationId()).getAttemptCount());
    MockRM rm4 = createMockRM(conf, memStore);
    rm4.start();
    rmApp = rm4.getRMContext().getRMApps().get(app1.getApplicationId());
    rm4.waitForState(rmApp.getApplicationId(), RMAppState.ACCEPTED);
    // wait for the attempt to be created.
    int timeoutSecs = 0;
    while (rmApp.getAppAttempts().size() != 2 && timeoutSecs++ < 40) {
        Thread.sleep(200);
    }
    Assert.assertEquals(4, rmApp.getAppAttempts().size());
    Assert.assertEquals(RMAppState.ACCEPTED, rmApp.getState());
    rm4.waitForState(latestAppAttemptId, RMAppAttemptState.SCHEDULED);
    Assert.assertEquals(RMAppAttemptState.SCHEDULED, rmApp.getAppAttempts().get(latestAppAttemptId).getAppAttemptState());
    // The initial application for which an AM was not started should be in
    // ACCEPTED state with one application attempt started.
    app2 = rm4.getRMContext().getRMApps().get(app2.getApplicationId());
    rm4.waitForState(app2.getApplicationId(), RMAppState.ACCEPTED);
    Assert.assertEquals(RMAppState.ACCEPTED, app2.getState());
    Assert.assertEquals(1, app2.getAppAttempts().size());
    rm4.waitForState(app2.getCurrentAppAttempt().getAppAttemptId(), RMAppAttemptState.SCHEDULED);
    Assert.assertEquals(RMAppAttemptState.SCHEDULED, app2.getCurrentAppAttempt().getAppAttemptState());
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) Test(org.junit.Test)

Aggregations

ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData)51 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)32 Test (org.junit.Test)29 MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)27 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)26 RMState (org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState)21 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)14 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)13 ApplicationAttemptStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData)12 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)8 ApplicationAccessType (org.apache.hadoop.yarn.api.records.ApplicationAccessType)7 IOException (java.io.IOException)6 ApplicationSubmissionContext (org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext)6 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)6 MockRM (org.apache.hadoop.yarn.server.resourcemanager.MockRM)6 ArrayList (java.util.ArrayList)5 YarnRuntimeException (org.apache.hadoop.yarn.exceptions.YarnRuntimeException)5 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)5 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)5 HashMap (java.util.HashMap)4