use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestLeaderElectorService method testStateStoreFailureCauseFailover.
// 1. rm1 active
// 2. rm2 standby
// 3. submit a job to rm1 which triggers state-store failure.
// 4. rm2 become
@Test
public void testStateStoreFailureCauseFailover() throws Exception {
conf.set(YarnConfiguration.RM_HA_ID, "rm1");
MemoryRMStateStore memStore = new MemoryRMStateStore() {
@Override
public synchronized void storeApplicationStateInternal(ApplicationId appId, ApplicationStateData appState) throws Exception {
throw new Exception("store app failure.");
}
};
memStore.init(conf);
rm1 = new MockRM(conf, memStore, true);
rm1.init(conf);
rm1.start();
waitFor(rm1, HAServiceState.ACTIVE);
rm2 = startRM("rm2", HAServiceState.STANDBY);
// submit an app which will trigger state-store failure.
rm1.submitApp(200, "app1", "user1", null, "default", false);
waitFor(rm1, HAServiceState.STANDBY);
// rm2 should become active;
waitFor(rm2, HAServiceState.ACTIVE);
rm2.stop();
// rm1 will become active again
waitFor(rm1, HAServiceState.ACTIVE);
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestContainerResourceUsage method testUsageWithMultipleContainersAndRMRestart.
@Test(timeout = 120000)
public void testUsageWithMultipleContainersAndRMRestart() throws Exception {
// Set max attempts to 1 so that when the first attempt fails, the app
// won't try to start a new one.
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, false);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm0 = new MockRM(conf, memStore);
rm0.start();
MockNM nm = new MockNM("127.0.0.1:1234", 65536, rm0.getResourceTrackerService());
nm.registerNode();
RMApp app0 = rm0.submitApp(200);
rm0.waitForState(app0.getApplicationId(), RMAppState.ACCEPTED);
RMAppAttempt attempt0 = app0.getCurrentAppAttempt();
ApplicationAttemptId attemptId0 = attempt0.getAppAttemptId();
rm0.waitForState(attemptId0, RMAppAttemptState.SCHEDULED);
nm.nodeHeartbeat(true);
rm0.waitForState(attemptId0, RMAppAttemptState.ALLOCATED);
MockAM am0 = rm0.sendAMLaunched(attempt0.getAppAttemptId());
am0.registerAppAttempt();
int NUM_CONTAINERS = 2;
am0.allocate("127.0.0.1", 1000, NUM_CONTAINERS, new ArrayList<ContainerId>());
nm.nodeHeartbeat(true);
List<Container> conts = am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers();
while (conts.size() != NUM_CONTAINERS) {
nm.nodeHeartbeat(true);
conts.addAll(am0.allocate(new ArrayList<ResourceRequest>(), new ArrayList<ContainerId>()).getAllocatedContainers());
Thread.sleep(500);
}
// launch the 2nd and 3rd containers.
for (Container c : conts) {
nm.nodeHeartbeat(attempt0.getAppAttemptId(), c.getId().getContainerId(), ContainerState.RUNNING);
rm0.waitForState(nm, c.getId(), RMContainerState.RUNNING);
}
// Get the RMContainers for all of the live containers, to be used later
// for metrics calculations and comparisons.
Collection<RMContainer> rmContainers = rm0.scheduler.getSchedulerAppInfo(attempt0.getAppAttemptId()).getLiveContainers();
// Allow metrics to accumulate.
int sleepInterval = 1000;
int cumulativeSleepTime = 0;
while (app0.getRMAppMetrics().getMemorySeconds() <= 0 && cumulativeSleepTime < 5000) {
Thread.sleep(sleepInterval);
cumulativeSleepTime += sleepInterval;
}
// Stop all non-AM containers
for (Container c : conts) {
if (c.getId().getContainerId() == 1)
continue;
nm.nodeHeartbeat(attempt0.getAppAttemptId(), c.getId().getContainerId(), ContainerState.COMPLETE);
rm0.waitForState(nm, c.getId(), RMContainerState.COMPLETED);
}
// After all other containers have completed, manually complete the master
// container in order to trigger a save to the state store of the resource
// usage metrics. This will cause the attempt to fail, and, since the max
// attempt retries is 1, the app will also fail. This is intentional so
// that all containers will complete prior to saving.
ContainerId cId = ContainerId.newContainerId(attempt0.getAppAttemptId(), 1);
nm.nodeHeartbeat(attempt0.getAppAttemptId(), cId.getContainerId(), ContainerState.COMPLETE);
rm0.waitForState(nm, cId, RMContainerState.COMPLETED);
// Check that the container metrics match those from the app usage report.
long memorySeconds = 0;
long vcoreSeconds = 0;
for (RMContainer c : rmContainers) {
AggregateAppResourceUsage ru = calculateContainerResourceMetrics(c);
memorySeconds += ru.getMemorySeconds();
vcoreSeconds += ru.getVcoreSeconds();
}
RMAppMetrics metricsBefore = app0.getRMAppMetrics();
Assert.assertEquals("Unexpected MemorySeconds value", memorySeconds, metricsBefore.getMemorySeconds());
Assert.assertEquals("Unexpected VcoreSeconds value", vcoreSeconds, metricsBefore.getVcoreSeconds());
// create new RM to represent RM restart. Load up the state store.
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
RMApp app0After = rm1.getRMContext().getRMApps().get(app0.getApplicationId());
// Compare container resource usage metrics from before and after restart.
RMAppMetrics metricsAfter = app0After.getRMAppMetrics();
Assert.assertEquals("Vcore seconds were not the same after RM Restart", metricsBefore.getVcoreSeconds(), metricsAfter.getVcoreSeconds());
Assert.assertEquals("Memory seconds were not the same after RM Restart", metricsBefore.getMemorySeconds(), metricsAfter.getMemorySeconds());
rm0.stop();
rm0.close();
rm1.stop();
rm1.close();
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestRMHA method testTransitionedToStandbyShouldNotHang.
@Test
public void testTransitionedToStandbyShouldNotHang() throws Exception {
configuration.setBoolean(YarnConfiguration.AUTO_FAILOVER_ENABLED, false);
Configuration conf = new YarnConfiguration(configuration);
MemoryRMStateStore memStore = new MemoryRMStateStore() {
@Override
public void updateApplicationState(ApplicationStateData appState) {
notifyStoreOperationFailed(new StoreFencedException());
}
};
memStore.init(conf);
rm = new MockRM(conf, memStore) {
@Override
void stopActiveServices() {
try {
Thread.sleep(10000);
} catch (Exception e) {
throw new RuntimeException(e);
}
super.stopActiveServices();
}
};
rm.init(conf);
final StateChangeRequestInfo requestInfo = new StateChangeRequestInfo(HAServiceProtocol.RequestSource.REQUEST_BY_USER);
assertEquals(STATE_ERR, HAServiceState.INITIALIZING, rm.adminService.getServiceStatus().getState());
assertFalse("RM is ready to become active before being started", rm.adminService.getServiceStatus().isReadyToBecomeActive());
checkMonitorHealth();
rm.start();
checkMonitorHealth();
checkStandbyRMFunctionality();
// 2. Transition to Active.
rm.adminService.transitionToActive(requestInfo);
// 3. Try Transition to standby
Thread t = new Thread(new Runnable() {
@Override
public void run() {
try {
rm.transitionToStandby(true);
} catch (IOException e) {
e.printStackTrace();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
});
t.start();
rm.getRMContext().getStateStore().updateApplicationState(null);
// wait for thread to finish
t.join();
rm.adminService.transitionToStandby(requestInfo);
checkStandbyRMFunctionality();
rm.stop();
}
use of org.apache.hadoop.yarn.server.resourcemanager.recovery.MemoryRMStateStore in project hadoop by apache.
the class TestAMRMClientOnRMRestart method testAMRMClientResendsRequestsOnRMRestart.
// Test does major 6 steps verification.
// Step-1 : AMRMClient send allocate request for 3 container requests
// Step-2 : 3 containers are allocated by RM.
// Step-3 : AM Send 1 containerRequest(cRequest4) and 1 releaseRequests to
// RM
// Step-3.5 : AM Send 1 container resource increase request to RM
// Step-4 : On RM restart, AM(does not know RM is restarted) sends additional
// containerRequest(cRequest5) and blacklisted nodes.
// Intern RM send resync command
// Verify AM can recover increase request after resync
// Step-5 : Allocater after resync command & new containerRequest(cRequest6)
// Step-6 : RM allocates containers i.e cRequest4,cRequest5 and cRequest6
@Test(timeout = 60000)
public void testAMRMClientResendsRequestsOnRMRestart() throws Exception {
UserGroupInformation.setLoginUser(null);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
// Phase-1 Start 1st RM
MyResourceManager rm1 = new MyResourceManager(conf, memStore);
rm1.start();
DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();
// Submit the application
RMApp app = rm1.submitApp(1024);
dispatcher.await();
MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// Node heartbeat
nm1.nodeHeartbeat(true);
dispatcher.await();
ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
rm1.sendAMLaunched(appAttemptId);
dispatcher.await();
org.apache.hadoop.security.token.Token<AMRMTokenIdentifier> token = rm1.getRMContext().getRMApps().get(appAttemptId.getApplicationId()).getRMAppAttempt(appAttemptId).getAMRMToken();
UserGroupInformation ugi = UserGroupInformation.getCurrentUser();
ugi.addTokenIdentifier(token.decodeIdentifier());
// Step-1 : AMRMClient send allocate request for 3 ContainerRequest
// cRequest1 = h1, cRequest2 = h1,h2 and cRequest3 = h1
// blacklisted nodes = h2
AMRMClient<ContainerRequest> amClient = new MyAMRMClientImpl(rm1);
amClient.init(conf);
amClient.start();
amClient.registerApplicationMaster("Host", 10000, "");
ContainerRequest cRequest1 = createReq(1, 1024, new String[] { "h1" });
amClient.addContainerRequest(cRequest1);
ContainerRequest cRequest2 = createReq(1, 1024, new String[] { "h1", "h2" });
amClient.addContainerRequest(cRequest2);
ContainerRequest cRequest3 = createReq(1, 1024, new String[] { "h1" });
amClient.addContainerRequest(cRequest3);
List<String> blacklistAdditions = new ArrayList<String>();
List<String> blacklistRemoval = new ArrayList<String>();
blacklistAdditions.add("h2");
blacklistRemoval.add("h10");
amClient.updateBlacklist(blacklistAdditions, blacklistRemoval);
// remove from local list
blacklistAdditions.remove("h2");
AllocateResponse allocateResponse = amClient.allocate(0.1f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
// Why 4 ask, why not 3 ask even h2 is blacklisted?
// On blacklisting host,applicationmaster has to remove ask request from
// remoterequest table.Here,test does not remove explicitely
assertAsksAndReleases(4, 0, rm1);
assertBlacklistAdditionsAndRemovals(1, 1, rm1);
// Step-2 : NM heart beat is sent.
// On 2nd AM allocate request, RM allocates 3 containers to AM
// Node heartbeat
nm1.nodeHeartbeat(true);
dispatcher.await();
allocateResponse = amClient.allocate(0.2f);
dispatcher.await();
// 3 containers are allocated i.e for cRequest1, cRequest2 and cRequest3.
Assert.assertEquals("No of assignments must be 0", 3, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(0, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
List<Container> allocatedContainers = allocateResponse.getAllocatedContainers();
// removed allocated container requests
amClient.removeContainerRequest(cRequest1);
amClient.removeContainerRequest(cRequest2);
amClient.removeContainerRequest(cRequest3);
allocateResponse = amClient.allocate(0.2f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(4, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
// Step-3 : Send 1 containerRequest and 1 releaseRequests to RM
ContainerRequest cRequest4 = createReq(1, 1024, new String[] { "h1" });
amClient.addContainerRequest(cRequest4);
int pendingRelease = 0;
Iterator<Container> it = allocatedContainers.iterator();
while (it.hasNext()) {
amClient.releaseAssignedContainer(it.next().getId());
pendingRelease++;
it.remove();
// remove one container
break;
}
// Step-3.5 : Send 1 container resource increase request to RM
Container container = it.next();
ContainerId containerId = container.getId();
// Make sure that container is in RUNNING state before sending increase
// request
nm1.nodeHeartbeat(containerId.getApplicationAttemptId(), containerId.getContainerId(), ContainerState.RUNNING);
dispatcher.await();
amClient.requestContainerUpdate(container, UpdateContainerRequest.newInstance(container.getVersion(), container.getId(), ContainerUpdateType.INCREASE_RESOURCE, Resource.newInstance(2048, 1), null));
it.remove();
allocateResponse = amClient.allocate(0.3f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(3, pendingRelease, rm1);
// Verify there is one increase and zero decrease
assertChanges(1, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
int completedContainer = allocateResponse.getCompletedContainersStatuses().size();
pendingRelease -= completedContainer;
// Phase-2 start 2nd RM is up
MyResourceManager rm2 = new MyResourceManager(conf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
((MyAMRMClientImpl) amClient).updateRMProxy(rm2);
dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
// NM should be rebooted on heartbeat, even first heartbeat for nm2
NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
// new NM to represent NM re-register
nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
NMContainerStatus containerReport = NMContainerStatus.newInstance(containerId, 0, ContainerState.RUNNING, Resource.newInstance(1024, 1), "recover container", 0, Priority.newInstance(0), 0);
nm1.registerNode(Collections.singletonList(containerReport), Collections.singletonList(containerId.getApplicationAttemptId().getApplicationId()));
nm1.nodeHeartbeat(true);
dispatcher.await();
blacklistAdditions.add("h3");
amClient.updateBlacklist(blacklistAdditions, null);
blacklistAdditions.remove("h3");
it = allocatedContainers.iterator();
while (it.hasNext()) {
amClient.releaseAssignedContainer(it.next().getId());
pendingRelease++;
it.remove();
}
ContainerRequest cRequest5 = createReq(1, 1024, new String[] { "h1", "h2" });
amClient.addContainerRequest(cRequest5);
// Step-4 : On RM restart, AM(does not know RM is restarted) sends
// additional
// containerRequest and blacklisted nodes.
// Intern RM send resync command,AMRMClient resend allocate request
allocateResponse = amClient.allocate(0.3f);
dispatcher.await();
completedContainer = allocateResponse.getCompletedContainersStatuses().size();
pendingRelease -= completedContainer;
assertAsksAndReleases(4, pendingRelease, rm2);
// Verify there is one increase and zero decrease
assertChanges(1, 0, rm2);
assertBlacklistAdditionsAndRemovals(2, 0, rm2);
ContainerRequest cRequest6 = createReq(1, 1024, new String[] { "h1", "h2", "h3" });
amClient.addContainerRequest(cRequest6);
// Step-5 : Allocater after resync command
allocateResponse = amClient.allocate(0.5f);
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, allocateResponse.getAllocatedContainers().size());
assertAsksAndReleases(5, 0, rm2);
// Verify there is no increase or decrease requests any more
assertChanges(0, 0, rm2);
assertBlacklistAdditionsAndRemovals(0, 0, rm2);
int noAssignedContainer = 0;
int count = 5;
while (count-- > 0) {
nm1.nodeHeartbeat(true);
dispatcher.await();
allocateResponse = amClient.allocate(0.5f);
dispatcher.await();
noAssignedContainer += allocateResponse.getAllocatedContainers().size();
if (noAssignedContainer == 3) {
break;
}
Thread.sleep(1000);
}
// Step-6 : RM allocates containers i.e cRequest4,cRequest5 and cRequest6
Assert.assertEquals("Number of container should be 3", 3, noAssignedContainer);
amClient.stop();
rm1.stop();
rm2.stop();
}
Aggregations