Search in sources :

Example 31 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestAMRMClientOnRMRestart method testAMRMClientForUnregisterAMOnRMRestart.

// Test verify for
// 1. AM try to unregister without registering
// 2. AM register to RM, and try to unregister immediately after RM restart
@Test(timeout = 60000)
public void testAMRMClientForUnregisterAMOnRMRestart() throws Exception {
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // Phase-1 Start 1st RM
    MyResourceManager rm1 = new MyResourceManager(conf, memStore);
    rm1.start();
    DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();
    // Submit the application
    RMApp app = rm1.submitApp(1024);
    dispatcher.await();
    MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    // Node heartbeat
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
    rm1.sendAMLaunched(appAttemptId);
    dispatcher.await();
    org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> token = rm1.getRMContext().getRMApps().get(appAttemptId.getApplicationId()).getRMAppAttempt(appAttemptId).getAMRMToken();
    UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
    ugi.addTokenIdentifier(token.decodeIdentifier());
    AMRMClient<ContainerRequest> amClient = new MyAMRMClientImpl(rm1);
    amClient.init(conf);
    amClient.start();
    amClient.registerApplicationMaster("h1", 10000, "");
    amClient.allocate(0.1f);
    // Phase-2 start 2nd RM is up
    MyResourceManager rm2 = new MyResourceManager(conf, memStore);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    ((MyAMRMClientImpl) amClient).updateRMProxy(rm2);
    dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
    // NM should be rebooted on heartbeat, even first heartbeat for nm2
    NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
    Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
    // new NM to represent NM re-register
    nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
    ContainerId containerId = ContainerId.newContainerId(appAttemptId, 1);
    NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, ContainerState.RUNNING, Resource.newInstance(1024, 1), "recover container", 0, Priority.newInstance(0), 0);
    nm1.registerNode(Arrays.asList(containerReport), null);
    nm1.nodeHeartbeat(true);
    dispatcher.await();
    amClient.unregisterApplicationMaster(FinalApplicationStatus.SUCCEEDED, null, null);
    rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHING);
    nm1.nodeHeartbeat(appAttemptId, 1, ContainerState.COMPLETE);
    rm2.waitForState(appAttemptId, RMAppAttemptState.FINISHED);
    rm2.waitForState(app.getApplicationId(), RMAppState.FINISHED);
    amClient.stop();
    rm1.stop();
    rm2.stop();
}
Also used : DrainDispatcher(org.apache.hadoop.yarn.event.DrainDispatcher) RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) NodeHeartbeatResponse(org.apache.hadoop.yarn.server.api.protocolrecords.NodeHeartbeatResponse) MockNM(org.apache.hadoop.yarn.server.resourcemanager.MockNM) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) AMRMTokenIdentifier(org.apache.hadoop.yarn.security.AMRMTokenIdentifier) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ContainerRequest(org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest) UpdateContainerRequest(org.apache.hadoop.yarn.api.records.UpdateContainerRequest) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) Test(org.junit.Test)

Example 32 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestRMRestart method testRMRestartRecoveringNodeLabelManager.

// Test does following verification
// 1. Start RM1 with store patch /tmp
// 2. Add/remove/replace labels to cluster and node lable and verify
// 3. Start RM2 with store patch /tmp only
// 4. Get cluster and node lobel, it should be present by recovering it
@Test(timeout = 20000)
public void testRMRestartRecoveringNodeLabelManager() throws Exception {
    // Initial FS node label store root dir to a random tmp dir
    File nodeLabelFsStoreDir = new File("target", this.getClass().getSimpleName() + "-testRMRestartRecoveringNodeLabelManager");
    if (nodeLabelFsStoreDir.exists()) {
        FileUtils.deleteDirectory(nodeLabelFsStoreDir);
    }
    nodeLabelFsStoreDir.deleteOnExit();
    String nodeLabelFsStoreDirURI = nodeLabelFsStoreDir.toURI().toString();
    conf.set(YarnConfiguration.FS_NODE_LABELS_STORE_ROOT_DIR, nodeLabelFsStoreDirURI);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    conf.setBoolean(YarnConfiguration.NODE_LABELS_ENABLED, true);
    MockRM rm1 = new MockRM(conf, memStore) {

        @Override
        protected RMNodeLabelsManager createNodeLabelManager() {
            RMNodeLabelsManager mgr = new RMNodeLabelsManager();
            mgr.init(getConfig());
            return mgr;
        }
    };
    rm1.init(conf);
    rm1.start();
    RMNodeLabelsManager nodeLabelManager = rm1.getRMContext().getNodeLabelManager();
    Set<String> clusterNodeLabels = new HashSet<String>();
    clusterNodeLabels.add("x");
    clusterNodeLabels.add("y");
    clusterNodeLabels.add("z");
    // Add node label x,y,z
    nodeLabelManager.addToCluserNodeLabelsWithDefaultExclusivity(clusterNodeLabels);
    // Add node Label to Node h1->x
    NodeId n1 = NodeId.newInstance("h1", 0);
    nodeLabelManager.addLabelsToNode(ImmutableMap.of(n1, toSet("x")));
    clusterNodeLabels.remove("z");
    // Remove cluster label z
    nodeLabelManager.removeFromClusterNodeLabels(toSet("z"));
    // Replace nodelabel h1->x,y
    nodeLabelManager.replaceLabelsOnNode(ImmutableMap.of(n1, toSet("y")));
    // Wait for updating store.It is expected NodeStore update should happen
    // very fast since it has separate dispatcher. So waiting for max 5 seconds,
    // which is sufficient time to update NodeStore.
    int count = 10;
    while (count-- > 0) {
        if (nodeLabelManager.getNodeLabels().size() > 0) {
            break;
        }
        Thread.sleep(500);
    }
    Assert.assertEquals(clusterNodeLabels.size(), nodeLabelManager.getClusterNodeLabelNames().size());
    Map<NodeId, Set<String>> nodeLabels = nodeLabelManager.getNodeLabels();
    Assert.assertEquals(1, nodeLabelManager.getNodeLabels().size());
    Assert.assertTrue(nodeLabels.get(n1).equals(toSet("y")));
    MockRM rm2 = new MockRM(conf, memStore) {

        @Override
        protected RMNodeLabelsManager createNodeLabelManager() {
            RMNodeLabelsManager mgr = new RMNodeLabelsManager();
            mgr.init(getConfig());
            return mgr;
        }
    };
    rm2.init(conf);
    rm2.start();
    nodeLabelManager = rm2.getRMContext().getNodeLabelManager();
    Assert.assertEquals(clusterNodeLabels.size(), nodeLabelManager.getClusterNodeLabelNames().size());
    nodeLabels = nodeLabelManager.getNodeLabels();
    Assert.assertEquals(1, nodeLabelManager.getNodeLabels().size());
    Assert.assertTrue(nodeLabels.get(n1).equals(toSet("y")));
    rm1.stop();
    rm2.stop();
}
Also used : MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) EnumSet(java.util.EnumSet) Set(java.util.Set) HashSet(java.util.HashSet) NodeId(org.apache.hadoop.yarn.api.records.NodeId) File(java.io.File) RMNodeLabelsManager(org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 33 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestRMRestart method testRMRestartAppRunningAMFailed.

@Test(timeout = 60000)
public void testRMRestartAppRunningAMFailed() throws Exception {
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    RMState rmState = memStore.getState();
    Map<ApplicationId, ApplicationStateData> rmAppState = rmState.getApplicationState();
    // start RM
    MockRM rm1 = createMockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    // create app and launch the AM
    RMApp app0 = rm1.submitApp(200, "name", "user", new HashMap<ApplicationAccessType, String>(), false, "default", -1, null, "MAPREDUCE", true, true);
    MockAM am0 = launchAM(app0, rm1, nm1);
    // fail the AM by sending CONTAINER_FINISHED event without registering.
    nm1.nodeHeartbeat(am0.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
    rm1.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
    ApplicationStateData appState = rmAppState.get(app0.getApplicationId());
    // assert the AM failed state is saved.
    Assert.assertEquals(RMAppAttemptState.FAILED, appState.getAttempt(am0.getApplicationAttemptId()).getState());
    // assert app state has not been saved.
    Assert.assertNull(rmAppState.get(app0.getApplicationId()).getState());
    // new AM started but not registered, app still stays at ACCECPTED state.
    rm1.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
    // start new RM
    MockRM rm2 = createMockRM(conf, memStore);
    rm2.start();
    // assert the previous AM state is loaded back on RM recovery.
    rm2.waitForState(am0.getApplicationAttemptId(), RMAppAttemptState.FAILED);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ApplicationAccessType(org.apache.hadoop.yarn.api.records.ApplicationAccessType) ApplicationStateData(org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) RMState(org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState) Test(org.junit.Test)

Example 34 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestRMRestart method testQueueMetricsOnRMRestart.

@SuppressWarnings("resource")
@Test(timeout = 60000)
public void testQueueMetricsOnRMRestart() throws Exception {
    conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
    MemoryRMStateStore memStore = new MemoryRMStateStore();
    memStore.init(conf);
    // PHASE 1: create state in an RM
    // start RM
    MockRM rm1 = createMockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    QueueMetrics qm1 = rm1.getResourceScheduler().getRootQueueMetrics();
    resetQueueMetrics(qm1);
    assertQueueMetrics(qm1, 0, 0, 0, 0);
    // create app that gets launched and does allocate before RM restart
    RMApp app1 = rm1.submitApp(200);
    // Need to wait first for AppAttempt to be started (RMAppState.ACCEPTED)
    // and then for it to reach RMAppAttemptState.SCHEDULED
    // inorder to ensure appsPending metric is incremented
    rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
    RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
    ApplicationAttemptId attemptId1 = attempt1.getAppAttemptId();
    rm1.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
    assertQueueMetrics(qm1, 1, 1, 0, 0);
    nm1.nodeHeartbeat(true);
    rm1.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
    MockAM am1 = rm1.sendAMLaunched(attempt1.getAppAttemptId());
    am1.registerAppAttempt();
    am1.allocate("127.0.0.1", 1000, 1, new ArrayList<ContainerId>());
    nm1.nodeHeartbeat(true);
    List<Container> conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
    while (conts.size() == 0) {
        nm1.nodeHeartbeat(true);
        conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
    assertQueueMetrics(qm1, 1, 0, 1, 0);
    // PHASE 2: create new RM and start from old state
    // create new RM to represent restart and recover state
    MockRM rm2 = createMockRM(conf, memStore);
    QueueMetrics qm2 = rm2.getResourceScheduler().getRootQueueMetrics();
    resetQueueMetrics(qm2);
    assertQueueMetrics(qm2, 0, 0, 0, 0);
    rm2.start();
    nm1.setResourceTrackerService(rm2.getResourceTrackerService());
    // recover app
    RMApp loadedApp1 = rm2.getRMContext().getRMApps().get(app1.getApplicationId());
    nm1.nodeHeartbeat(true);
    nm1 = new MockNM("127.0.0.1:1234", 15120, rm2.getResourceTrackerService());
    NMContainerStatus status = TestRMRestart.createNMContainerStatus(loadedApp1.getCurrentAppAttempt().getAppAttemptId(), 1, ContainerState.COMPLETE);
    nm1.registerNode(Arrays.asList(status), null);
    while (loadedApp1.getAppAttempts().size() != 2) {
        Thread.sleep(200);
    }
    attempt1 = loadedApp1.getCurrentAppAttempt();
    attemptId1 = attempt1.getAppAttemptId();
    rm2.waitForState(attemptId1, RMAppAttemptState.SCHEDULED);
    assertQueueMetrics(qm2, 1, 1, 0, 0);
    nm1.nodeHeartbeat(true);
    rm2.waitForState(attemptId1, RMAppAttemptState.ALLOCATED);
    assertQueueMetrics(qm2, 1, 0, 1, 0);
    am1 = rm2.sendAMLaunched(attempt1.getAppAttemptId());
    am1.registerAppAttempt();
    am1.allocate("127.0.0.1", 1000, 3, new ArrayList<ContainerId>());
    nm1.nodeHeartbeat(true);
    conts = am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
    while (conts.size() == 0) {
        nm1.nodeHeartbeat(true);
        conts.addAll(am1.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
        Thread.sleep(500);
    }
    // finish the AMs
    finishApplicationMaster(loadedApp1, rm2, nm1, am1);
    // now AppAttempt and App becomes FINISHED,
    // we should also grant APP_ATTEMPT_REMOVE/APP_REMOVE event
    // had processed by scheduler
    rm2.waitForAppRemovedFromScheduler(loadedApp1.getApplicationId());
    assertQueueMetrics(qm2, 1, 0, 0, 1);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) RMAppAttempt(org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt) ArrayList(java.util.ArrayList) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) QueueMetrics(org.apache.hadoop.yarn.server.resourcemanager.scheduler.QueueMetrics) Container(org.apache.hadoop.yarn.api.records.Container) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) ContainerId(org.apache.hadoop.yarn.api.records.ContainerId) NMContainerStatus(org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus) ResourceRequest(org.apache.hadoop.yarn.api.records.ResourceRequest) Test(org.junit.Test)

Example 35 with MemoryRMStateStore

use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.

the class TestRMRestart method testClientRetryOnKillingApplication.

// This is to test Killing application should be able to wait until app
// reaches killed state and also check that attempt state is saved before app
// state is saved.
@Test(timeout = 60000)
public void testClientRetryOnKillingApplication() throws Exception {
    MemoryRMStateStore memStore = new TestMemoryRMStateStore();
    memStore.init(conf);
    // start RM
    MockRM rm1 = createMockRM(conf, memStore);
    rm1.start();
    MockNM nm1 = new MockNM("127.0.0.1:1234", 15120, rm1.getResourceTrackerService());
    nm1.registerNode();
    RMApp app1 = rm1.submitApp(200, "name", "user", null, false, "default", 1, null, "myType");
    MockAM am1 = launchAM(app1, rm1, nm1);
    KillApplicationResponse response;
    int count = 0;
    while (true) {
        response = rm1.killApp(app1.getApplicationId());
        if (response.getIsKillCompleted()) {
            break;
        }
        Thread.sleep(100);
        count++;
    }
    // we expect at least 2 calls for killApp as the first killApp always return
    // false.
    Assert.assertTrue(count >= 1);
    rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.KILLED);
    rm1.waitForState(app1.getApplicationId(), RMAppState.KILLED);
    Assert.assertEquals(1, ((TestMemoryRMStateStore) memStore).updateAttempt);
    Assert.assertEquals(2, ((TestMemoryRMStateStore) memStore).updateApp);
}
Also used : RMApp(org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp) MemoryRMStateStore(org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore) KillApplicationResponse(org.apache.hadoop.yarn.api.protocolrecords.KillApplicationResponse) Test(org.junit.Test)

Aggregations

MemoryRMStateStore (org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore)84 Test (org.junit.Test)81 RMApp (org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp)68 ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId)28 ApplicationStateData (org.apache.hadoop.yarn.server.resourcemanager.recovery.records.ApplicationStateData)27 NMContainerStatus (org.apache.hadoop.yarn.server.api.protocolrecords.NMContainerStatus)23 TestSecurityMockRM (org.apache.hadoop.yarn.server.resourcemanager.TestRMRestart.TestSecurityMockRM)22 YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration)21 RMState (org.apache.hadoop.yarn.server.resourcemanager.recovery.RMStateStore.RMState)21 ContainerId (org.apache.hadoop.yarn.api.records.ContainerId)20 MockNM (org.apache.hadoop.yarn.server.resourcemanager.MockNM)20 MockRM (org.apache.hadoop.yarn.server.resourcemanager.MockRM)19 ApplicationAttemptId (org.apache.hadoop.yarn.api.records.ApplicationAttemptId)16 RMAppAttempt (org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt)16 IOException (java.io.IOException)15 MockAM (org.apache.hadoop.yarn.server.resourcemanager.MockAM)14 Configuration (org.apache.hadoop.conf.Configuration)12 AbstractYarnScheduler (org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler)10 ArrayList (java.util.ArrayList)9 ApplicationAccessType (org.apache.hadoop.yarn.api.records.ApplicationAccessType)9