Search in sources :

Example 41 with ApplicationStateData

Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in the Apache Hadoop project.

Class TestRMRestart, method testAppRecoveredInOrderOnRMRestart.

/**
 * Seeds a {@link MemoryRMStateStore} with ten mocked applications, inserted with
 * ids 10 down to 1, then starts a {@link MockRM} whose app manager asserts that
 * every application passed to {@code recoverApplication} has an id that compares
 * greater than the previously recovered one.
 */
@Test(timeout = 20000)
public void testAppRecoveredInOrderOnRMRestart() throws Exception {
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.init(conf);
    // Insert in descending id order so that a correct recovery order cannot be
    // an accident of insertion order.
    for (int seq = 10; seq > 0; seq--) {
        ApplicationId appId = ApplicationId.newInstance(1234, seq);
        ApplicationSubmissionContext submissionContext = mock(ApplicationSubmissionContext.class);
        when(submissionContext.getApplicationId()).thenReturn(appId);
        ApplicationStateData storedApp = mock(ApplicationStateData.class);
        when(storedApp.getApplicationSubmissionContext()).thenReturn(submissionContext);
        stateStore.getState().getApplicationState().put(appId, storedApp);
    }
    MockRM rm1 = new MockRM(conf, stateStore) {

        @Override
        protected RMAppManager createRMAppManager() {
            return new TestRMAppManager(this.rmContext, this.scheduler, this.masterService, this.applicationACLsManager, conf);
        }

        /** App manager that replaces real recovery with an ordering check. */
        class TestRMAppManager extends RMAppManager {

            // Starts below every seeded id (1..10) so the first recovered app passes.
            ApplicationId lastRecoveredId = ApplicationId.newInstance(1234, 0);

            public TestRMAppManager(RMContext context, YarnScheduler scheduler, ApplicationMasterService masterService, ApplicationACLsManager applicationACLsManager, Configuration conf) {
                super(context, scheduler, masterService, applicationACLsManager, conf);
            }

            @Override
            protected void recoverApplication(ApplicationStateData appState, RMState rmState) throws Exception {
                // The recovered state must contain applications at all.
                Assert.assertFalse(rmState.getApplicationState().isEmpty());
                // Each recovered id must be strictly greater than the one before it.
                ApplicationId currentId = appState.getApplicationSubmissionContext().getApplicationId();
                Assert.assertTrue(currentId.compareTo(lastRecoveredId) > 0);
                lastRecoveredId = currentId;
            }
        }
    };
    try {
        rm1.start();
    } finally {
        rm1.stop();
    }
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) IOException(java.io.IOException) ApplicationAttemptNotFoundException(org.apache.hadoop.yarn.exceptions.ApplicationAttemptNotFoundException) ApplicationACLsManager(org.apache.hadoop.yarn.server.security.ApplicationACLsManager) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) AbstractYarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler) YarnScheduler(org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler) ApplicationSubmissionContext(org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) Test(org.junit.Test)

Example 42 with ApplicationStateData

Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in the Apache Hadoop project.

Class TestRMRestart, method testRMRestartFailedApp.

/**
 * Fails an application's only AM attempt on a first RM, checks that the FAILED
 * app and attempt states were written to the state store, then starts a second
 * RM on the same store and checks that the app is loaded as FAILED with a
 * single attempt.
 */
@Test(timeout = 60000)
public void testRMRestartFailedApp() throws Exception {
    // Allow a single AM attempt so one AM failure fails the whole application.
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
    MemoryRMStateStore stateStore = new MemoryRMStateStore();
    stateStore.init(conf);
    Map<ApplicationId, ApplicationStateData> storedApps = stateStore.getState().getApplicationState();

    // Bring up the first RM with one registered node.
    MockRM rm1 = createMockRM(conf, stateStore);
    rm1.start();
    MockNM nodeManager = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nodeManager.registerNode();

    // Submit an application and launch its AM.
    RMApp app0 = rm1.submitApp(200);
    MockAM am0 = launchAM(app0, rm1, nodeManager);
    ApplicationId appId = app0.getApplicationId();

    // Report the AM container as COMPLETE before the AM registers, which
    // fails the attempt and then the application.
    nodeManager.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    rm1.waitForState(appId, RMAppState.FAILED);

    // The FAILED state of both the app and its attempt must be in the store.
    ApplicationStateData savedApp = storedApps.get(appId);
    Assert.assertEquals(RMAppState.FAILED, savedApp.getState());
    Assert.assertEquals(RMAppAttemptState.FAILED, savedApp.getAttempt(am0.getApplicationAttemptId()).getState());

    // Start a second RM backed by the same store.
    MockRM rm2 = createMockRM(conf, stateStore);
    rm2.start();
    RMApp recoveredApp = rm2.getRMContext().getRMApps().get(appId);
    rm2.waitForState(appId, RMAppState.FAILED);
    rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);

    // Recovery must not have created another attempt.
    Assert.assertEquals(1, recoveredApp.getAppAttempts().size());
    verifyAppReportAfterRMRestart(app0, rm2);
    Assert.assertTrue(app0.getDiagnostics().toString().contains("Failing the application."));
    // The attempt's own failure diagnostics are lost: they are not yet
    // available at the time the app saves its state.
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) Test(org.junit.Test)

Example 43 with ApplicationStateData

Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in the Apache Hadoop project.

Class TestApplicationLifetimeMonitor, method testUpdateApplicationTimeoutForStateStoreUpdateFail.

/**
 * Checks that a failed state-store update leaves an application's lifetime
 * timeout untouched.
 *
 * <p>The state store is overridden so that the first call to
 * {@code updateApplicationStateInternal} throws. The test submits an app with a
 * lifetime timeout, requests a new timeout, expects the request to fail with a
 * {@link YarnException} that names the application id, and then asserts that the
 * timeout value is unchanged and that the app is eventually KILLED with a finish
 * time later than that value.
 */
@Test(timeout = 60000)
public void testUpdateApplicationTimeoutForStateStoreUpdateFail() throws Exception {
    MockRM rm1 = null;
    try {
        conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
        MemoryRMStateStore memStore = new MemoryRMStateStore() {

            // Number of update calls seen so far; only the first one fails.
            private int count = 0;

            @Override
            public synchronized void updateApplicationStateInternal(ApplicationId appId, ApplicationStateData appState) throws Exception {
                // fail only 1 time.
                if (count++ == 0) {
                    throw new Exception("State-store update failed");
                }
                super.updateApplicationStateInternal(appId, appState);
            }
        };
        memStore.init(conf);
        rm1 = new MockRM(conf, memStore);
        rm1.start();
        MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
        nm1.registerNode();
        nm1.nodeHeartbeat(true);
        // submit app1 with an initial lifetime timeout
        long appLifetime = 30L;
        Map<ApplicationTimeoutType, Long> timeouts = new HashMap<ApplicationTimeoutType, Long>();
        timeouts.put(ApplicationTimeoutType.LIFETIME, appLifetime);
        RMApp app1 = rm1.submitApp(200, Priority.newInstance(0), timeouts);
        Map<ApplicationTimeoutType, String> updateTimeout = new HashMap<ApplicationTimeoutType, String>();
        long newLifetime = 10L;
        // request a new expiry of now + newLifetime seconds, formatted as ISO8601
        updateTimeout.put(ApplicationTimeoutType.LIFETIME, Times.formatISO8601(System.currentTimeMillis() + newLifetime * 1000));
        UpdateApplicationTimeoutsRequest request = UpdateApplicationTimeoutsRequest.newInstance(app1.getApplicationId(), updateTimeout);
        Map<ApplicationTimeoutType, Long> applicationTimeouts = app1.getApplicationTimeouts();
        // timeout value before the update attempt
        long beforeUpdate = applicationTimeouts.get(ApplicationTimeoutType.LIFETIME);
        try {
            // try to update app1's lifetime; the overridden store makes this fail
            rm1.getRMContext().getClientRMService().updateApplicationTimeouts(request);
            fail("Update application should fail.");
        } catch (YarnException e) {
            // expected: the failure must identify the affected application
            assertTrue("State-store exception does not contain appId", e.getMessage().contains(app1.getApplicationId().toString()));
        }
        applicationTimeouts = app1.getApplicationTimeouts();
        // must still hold the old timeout value because the update failed
        long afterUpdate = applicationTimeouts.get(ApplicationTimeoutType.LIFETIME);
        Assert.assertEquals("Application timeout is updated", beforeUpdate, afterUpdate);
        rm1.waitForState(app1.getApplicationId(), RMAppState.KILLED);
        // the app must not have finished before the (unchanged) lifetime value
        Assert.assertTrue("Application killed before lifetime value", app1.getFinishTime() > afterUpdate);
    } finally {
        stopRM(rm1);
    }
}
Also used : HashMap(java.util.HashMap) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) ApplicationTimeoutType(org.apache.hadoop.yarn.api.records.ApplicationTimeoutType) MockRM(org.apache.hadoop.yarn.server.resourcemanager.MockRM) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) IOException(java.io.IOException) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) UpdateApplicationTimeoutsRequest(org.apache.hadoop.yarn.api.protocolrecords.UpdateApplicationTimeoutsRequest) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) Test(org.junit.Test)

Example 44 with ApplicationStateData

Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in the Apache Hadoop project.

Class TestLeaderElectorService, method testStateStoreFailureCauseFailover.

// Scenario exercised by this test:
// 1. rm1 becomes active.
// 2. rm2 is started and is standby.
// 3. A job is submitted to rm1, which triggers a state-store failure.
// 4. rm1 goes to standby and rm2 becomes active.
// 5. rm2 is stopped and rm1 becomes active again.
@Test
public void testStateStoreFailureCauseFailover() throws Exception {
    conf.set(YarnConfiguration.RM_HA_ID, "rm1");
    // State store whose application-store operation always throws.
    MemoryRMStateStore memStore = new MemoryRMStateStore() {

        @Override
        public synchronized void storeApplicationStateInternal(ApplicationId appId, ApplicationStateData appState) throws Exception {
            throw new Exception("store app failure.");
        }
    };
    memStore.init(conf);
    rm1 = new MockRM(conf, memStore, true);
    rm1.init(conf);
    rm1.start();
    waitFor(rm1, HAServiceState.ACTIVE);
    rm2 = startRM("rm2", HAServiceState.STANDBY);
    // submit an app which will trigger state-store failure.
    rm1.submitApp(200, "app1", "user1", null, "default", false);
    // rm1 is expected to give up the active role after the failure
    waitFor(rm1, HAServiceState.STANDBY);
    // rm2 should become active;
    waitFor(rm2, HAServiceState.ACTIVE);
    rm2.stop();
    // rm1 will become active again
    waitFor(rm1, HAServiceState.ACTIVE);
}
Also used : MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) Test(org.junit.Test)

Example 45 with ApplicationStateData

Use of org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData in the Apache Hadoop project.

Class TestRMHA, method testTransitionedToStandbyShouldNotHang.

/**
 * Runs a transition to standby on a background thread while the test thread
 * triggers a store-fenced failure, then waits for the background thread to
 * finish.
 *
 * <p>The state store reports a {@link StoreFencedException} from
 * {@code updateApplicationState}, and the RM's {@code stopActiveServices} is
 * delayed by ten seconds so the two paths overlap. The test then asks the admin
 * service for standby and re-checks standby behaviour.
 */
@Test
public void testTransitionedToStandbyShouldNotHang() throws Exception {
    configuration.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, false);
    Configuration haConf = new YarnConfiguration(configuration);

    // Store that reports a fenced failure on every application-state update.
    MemoryRMStateStore fencedStore = new MemoryRMStateStore() {

        @Override
        public void updateApplicationState(ApplicationStateData appState) {
            notifyStoreOperationFailed(new StoreFencedException());
        }
    };
    fencedStore.init(haConf);

    // RM whose active services take ten seconds longer to stop.
    rm = new MockRM(haConf, fencedStore) {

        @Override
        void stopActiveServices() {
            try {
                Thread.sleep(10000);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            super.stopActiveServices();
        }
    };
    rm.init(haConf);
    final StateChangeRequestInfo userRequest = new StateChangeRequestInfo(HAServiceProtocol.RequestSource.REQUEST_BY_USER);

    // Before start: the RM is INITIALIZING and not ready to become active.
    assertEquals(STATE_ERR, HAServiceState.INITIALIZING, rm.adminService.getServiceStatus().getState());
    assertFalse("RM is ready to become active before being started", rm.adminService.getServiceStatus().isReadyToBecomeActive());
    checkMonitorHealth();

    rm.start();
    checkMonitorHealth();
    checkStandbyRMFunctionality();

    // Make the RM active.
    rm.adminService.transitionToActive(userRequest);

    // Start a transition to standby on a separate thread...
    Thread standbyThread = new Thread(new Runnable() {

        @Override
        public void run() {
            try {
                rm.transitionToStandby(true);
            } catch (Exception e) {
                // Failures here are only printed; they are not propagated
                // to the test thread.
                e.printStackTrace();
            }
        }
    });
    standbyThread.start();
    // ...and trigger the store-fenced failure from the test thread meanwhile.
    rm.getRMContext().getStateStore().updateApplicationState(null);
    // Block until the background transition has completed.
    standbyThread.join();

    rm.adminService.transitionToStandby(userRequest);
    checkStandbyRMFunctionality();
    rm.stop();
}
Also used : YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) IOException(java.io.IOException) StoreFencedException(org.apache.hadoop.yarn.server.resourcemanager.recovery.StoreFencedException) ServiceFailedException(org.apache.hadoop.ha.ServiceFailedException) HealthCheckFailedException(org.apache.hadoop.ha.HealthCheckFailedException) IOException(java.io.IOException) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) JSONException(org.codehaus.jettison.json.JSONException) AccessControlException(org.apache.hadoop.security.AccessControlException) StoreFencedException(org.apache.hadoop.yarn.server.resourcemanager.recovery.StoreFencedException) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) YarnRuntimeException(org.apache.hadoop.yarn.exceptions.YarnRuntimeException) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) StateChangeRequestInfo(org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo) Test(org.junit.Test)

Aggregations

ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData)51 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)32 Test (org.junit.Test)29 MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)27 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)26 RMState (org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState)21 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)14 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)13 ApplicationAttemptStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationAttemptStateData)12 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)8 ApplicationAccessType (org.apache.hadoop.yarn.api.records.ApplicationAccessType)7 IOException (java.io.IOException)6 ApplicationSubmissionContext (org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext)6 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)6 MockRM (org.apache.hadoop.yarn.server.resourcemanager.MockRM)6 ArrayList (java.util.ArrayList)5 YarnRuntimeException (org.apache.hadoop.yarn.exceptions.YarnRuntimeException)5 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)5 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)5 HashMap (java.util.HashMap)4