use of org.apache.hadoop.yarn.api.records.ContainerStatus in project hadoop by apache.
the class TestAMRestart method testShouldNotCountFailureToMaxAttemptRetry.
// AM container preempted, nm disk failure
// should not be counted towards AM max retry count.
@Test(timeout = 100000)
public void testShouldNotCountFailureToMaxAttemptRetry() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
// explicitly set max-am-retry count as 1.
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 1);
conf.setBoolean(YarnConfiguration.RECOVERY_ENABLED, true);
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MockRM rm1 = new MockRM(conf, memStore);
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8000, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
CapacityScheduler scheduler = (CapacityScheduler) rm1.getResourceScheduler();
ContainerId amContainer = ContainerId.newContainerId(am1.getApplicationAttemptId(), 1);
// Preempt the first attempt;
scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer));
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am1.getApplicationAttemptId());
Assert.assertTrue(!attempt1.shouldCountTowardsMaxAttemptRetry());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
ApplicationStateData appState = memStore.getState().getApplicationState().get(app1.getApplicationId());
// AM should be restarted even though max-am-attempt is 1.
MockAM am2 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 2, nm1);
RMAppAttempt attempt2 = app1.getCurrentAppAttempt();
// Preempt the second attempt.
ContainerId amContainer2 = ContainerId.newContainerId(am2.getApplicationAttemptId(), 1);
scheduler.markContainerForKillable(scheduler.getRMContainer(amContainer2));
rm1.waitForState(am2.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am2.getApplicationAttemptId());
Assert.assertTrue(!attempt2.shouldCountTowardsMaxAttemptRetry());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am3 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 3, nm1);
RMAppAttempt attempt3 = app1.getCurrentAppAttempt();
// mimic NM disk_failure
ContainerStatus containerStatus = Records.newRecord(ContainerStatus.class);
containerStatus.setContainerId(attempt3.getMasterContainer().getId());
containerStatus.setDiagnostics("mimic NM disk_failure");
containerStatus.setState(ContainerState.COMPLETE);
containerStatus.setExitStatus(ContainerExitStatus.DISKS_FAILED);
Map<ApplicationId, List<ContainerStatus>> conts = new HashMap<ApplicationId, List<ContainerStatus>>();
conts.put(app1.getApplicationId(), Collections.singletonList(containerStatus));
nm1.nodeHeartbeat(conts, true);
rm1.waitForState(am3.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am3.getApplicationAttemptId());
Assert.assertTrue(!attempt3.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.DISKS_FAILED, appState.getAttempt(am3.getApplicationAttemptId()).getAMContainerExitStatus());
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
MockAM am4 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 4, nm1);
RMAppAttempt attempt4 = app1.getCurrentAppAttempt();
// create second NM, and register to rm1
MockNM nm2 = new MockNM("127.0.0.1:2234", 8000, rm1.getResourceTrackerService());
nm2.registerNode();
// nm1 heartbeats to report unhealthy
// This will mimic ContainerExitStatus.ABORT
nm1.nodeHeartbeat(false);
rm1.waitForState(am4.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am4.getApplicationAttemptId());
Assert.assertTrue(!attempt4.shouldCountTowardsMaxAttemptRetry());
Assert.assertEquals(ContainerExitStatus.ABORTED, appState.getAttempt(am4.getApplicationAttemptId()).getAMContainerExitStatus());
// launch next AM in nm2
MockAM am5 = rm1.waitForNewAMToLaunchAndRegister(app1.getApplicationId(), 5, nm2);
RMAppAttempt attempt5 = app1.getCurrentAppAttempt();
// fail the AM normally
nm2.nodeHeartbeat(am5.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(am5.getApplicationAttemptId(), RMAppAttemptState.FAILED);
TestSchedulerUtils.waitSchedulerApplicationAttemptStopped(scheduler, am5.getApplicationAttemptId());
Assert.assertTrue(attempt5.shouldCountTowardsMaxAttemptRetry());
// AM should not be restarted.
rm1.waitForState(app1.getApplicationId(), RMAppState.FAILED);
Assert.assertEquals(5, app1.getAppAttempts().size());
rm1.stop();
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project hadoop by apache.
the class TestAMRestart method testAMRestartWithExistingContainers.
@Test(timeout = 30000)
public void testAMRestartWithExistingContainers() throws Exception {
YarnConfiguration conf = new YarnConfiguration();
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, 2);
MockRM rm1 = new MockRM(conf);
rm1.start();
RMApp app1 = rm1.submitApp(200, "name", "user", new HashMap<ApplicationAccessType, String>(), false, "default", -1, null, "MAPREDUCE", false, true);
MockNM nm1 = new MockNM("127.0.0.1:1234", 10240, rm1.getResourceTrackerService());
nm1.registerNode();
MockNM nm2 = new MockNM("127.0.0.1:2351", 4089, rm1.getResourceTrackerService());
nm2.registerNode();
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
int NUM_CONTAINERS = 3;
allocateContainers(nm1, am1, NUM_CONTAINERS);
// launch the 2nd container, for testing running container transferred.
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 2, ContainerState.RUNNING);
ContainerId containerId2 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 2);
rm1.waitForState(nm1, containerId2, RMContainerState.RUNNING);
// launch the 3rd container, for testing container allocated by previous
// attempt is completed by the next new attempt/
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 3, ContainerState.RUNNING);
ContainerId containerId3 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 3);
rm1.waitForState(nm1, containerId3, RMContainerState.RUNNING);
// 4th container still in AQUIRED state. for testing Acquired container is
// always killed.
ContainerId containerId4 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 4);
rm1.waitForState(nm1, containerId4, RMContainerState.ACQUIRED);
// 5th container is in Allocated state. for testing allocated container is
// always killed.
am1.allocate("127.0.0.1", 1024, 1, new ArrayList<ContainerId>());
nm1.nodeHeartbeat(true);
ContainerId containerId5 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 5);
rm1.waitForState(nm1, containerId5, RMContainerState.ALLOCATED);
// 6th container is in Reserved state.
am1.allocate("127.0.0.1", 6000, 1, new ArrayList<ContainerId>());
ContainerId containerId6 = ContainerId.newContainerId(am1.getApplicationAttemptId(), 6);
nm1.nodeHeartbeat(true);
SchedulerApplicationAttempt schedulerAttempt = ((AbstractYarnScheduler) rm1.getResourceScheduler()).getCurrentAttemptForContainer(containerId6);
while (schedulerAttempt.getReservedContainers().isEmpty()) {
System.out.println("Waiting for container " + containerId6 + " to be reserved.");
nm1.nodeHeartbeat(true);
Thread.sleep(200);
}
// assert containerId6 is reserved.
Assert.assertEquals(containerId6, schedulerAttempt.getReservedContainers().get(0).getContainerId());
// fail the AM by sending CONTAINER_FINISHED event without registering.
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FAILED);
// wait for some time. previous AM's running containers should still remain
// in scheduler even though am failed
Thread.sleep(3000);
rm1.waitForState(nm1, containerId2, RMContainerState.RUNNING);
// acquired/allocated containers are cleaned up.
Assert.assertNull(rm1.getResourceScheduler().getRMContainer(containerId4));
Assert.assertNull(rm1.getResourceScheduler().getRMContainer(containerId5));
// wait for app to start a new attempt.
rm1.waitForState(app1.getApplicationId(), RMAppState.ACCEPTED);
// assert this is a new AM.
ApplicationAttemptId newAttemptId = app1.getCurrentAppAttempt().getAppAttemptId();
Assert.assertFalse(newAttemptId.equals(am1.getApplicationAttemptId()));
// launch the new AM
MockAM am2 = rm1.launchAM(app1, rm1, nm1);
RegisterApplicationMasterResponse registerResponse = am2.registerAppAttempt();
// Assert two containers are running: container2 and container3;
Assert.assertEquals(2, registerResponse.getContainersFromPreviousAttempts().size());
boolean containerId2Exists = false, containerId3Exists = false;
for (Container container : registerResponse.getContainersFromPreviousAttempts()) {
if (container.getId().equals(containerId2)) {
containerId2Exists = true;
}
if (container.getId().equals(containerId3)) {
containerId3Exists = true;
}
}
Assert.assertTrue(containerId2Exists && containerId3Exists);
rm1.waitForState(app1.getApplicationId(), RMAppState.RUNNING);
// complete container by sending the container complete event which has earlier
// attempt's attemptId
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 3, ContainerState.COMPLETE);
// Even though the completed container containerId3 event was sent to the
// earlier failed attempt, new RMAppAttempt can also capture this container
// info.
// completed containerId4 is also transferred to the new attempt.
RMAppAttempt newAttempt = app1.getRMAppAttempt(am2.getApplicationAttemptId());
// 4 containers finished, acquired/allocated/reserved/completed.
waitForContainersToFinish(4, newAttempt);
boolean container3Exists = false, container4Exists = false, container5Exists = false, container6Exists = false;
for (ContainerStatus status : newAttempt.getJustFinishedContainers()) {
if (status.getContainerId().equals(containerId3)) {
// containerId3 is the container ran by previous attempt but finished by the
// new attempt.
container3Exists = true;
}
if (status.getContainerId().equals(containerId4)) {
// containerId4 is the Acquired Container killed by the previous attempt,
// it's now inside new attempt's finished container list.
container4Exists = true;
}
if (status.getContainerId().equals(containerId5)) {
// containerId5 is the Allocated container killed by previous failed attempt.
container5Exists = true;
}
if (status.getContainerId().equals(containerId6)) {
// containerId6 is the reserved container killed by previous failed attempt.
container6Exists = true;
}
}
Assert.assertTrue(container3Exists && container4Exists && container5Exists && container6Exists);
// New SchedulerApplicationAttempt also has the containers info.
rm1.waitForState(nm1, containerId2, RMContainerState.RUNNING);
// record the scheduler attempt for testing.
SchedulerApplicationAttempt schedulerNewAttempt = ((AbstractYarnScheduler) rm1.getResourceScheduler()).getCurrentAttemptForContainer(containerId2);
// finish this application
MockRM.finishAMAndVerifyAppState(app1, rm1, nm1, am2);
// the 2nd attempt released the 1st attempt's running container, when the
// 2nd attempt finishes.
Assert.assertFalse(schedulerNewAttempt.getLiveContainers().contains(containerId2));
// all 4 normal containers finished.
System.out.println("New attempt's just finished containers: " + newAttempt.getJustFinishedContainers());
waitForContainersToFinish(5, newAttempt);
rm1.stop();
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project hadoop by apache.
the class TestCapacityScheduler method testResourceOverCommit.
@Test
public void testResourceOverCommit() throws Exception {
int waitCount;
Configuration conf = new Configuration();
conf.setClass(YarnConfiguration.RM_SCHEDULER, CapacityScheduler.class, ResourceScheduler.class);
MockRM rm = new MockRM(conf);
rm.start();
MockNM nm1 = rm.registerNode("127.0.0.1:1234", 4 * GB);
RMApp app1 = rm.submitApp(2048);
// kick the scheduling, 2 GB given to AM1, remaining 2GB on nm1
nm1.nodeHeartbeat(true);
RMAppAttempt attempt1 = app1.getCurrentAppAttempt();
MockAM am1 = rm.sendAMLaunched(attempt1.getAppAttemptId());
am1.registerAppAttempt();
SchedulerNodeReport report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
// check node report, 2 GB used and 2 GB available
Assert.assertEquals(2 * GB, report_nm1.getUsedResource().getMemorySize());
Assert.assertEquals(2 * GB, report_nm1.getAvailableResource().getMemorySize());
// add request for containers
am1.addRequests(new String[] { "127.0.0.1", "127.0.0.2" }, 2 * GB, 1, 1);
// send the request
AllocateResponse alloc1Response = am1.schedule();
// kick the scheduler, 2 GB given to AM1, resource remaining 0
nm1.nodeHeartbeat(true);
while (alloc1Response.getAllocatedContainers().size() < 1) {
LOG.info("Waiting for containers to be created for app 1...");
Thread.sleep(100);
alloc1Response = am1.schedule();
}
List<Container> allocated1 = alloc1Response.getAllocatedContainers();
Assert.assertEquals(1, allocated1.size());
Assert.assertEquals(2 * GB, allocated1.get(0).getResource().getMemorySize());
Assert.assertEquals(nm1.getNodeId(), allocated1.get(0).getNodeId());
report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
// check node report, 4 GB used and 0 GB available
Assert.assertEquals(0, report_nm1.getAvailableResource().getMemorySize());
Assert.assertEquals(4 * GB, report_nm1.getUsedResource().getMemorySize());
// check container is assigned with 2 GB.
Container c1 = allocated1.get(0);
Assert.assertEquals(2 * GB, c1.getResource().getMemorySize());
// update node resource to 2 GB, so resource is over-consumed.
Map<NodeId, ResourceOption> nodeResourceMap = new HashMap<NodeId, ResourceOption>();
nodeResourceMap.put(nm1.getNodeId(), ResourceOption.newInstance(Resource.newInstance(2 * GB, 1), -1));
UpdateNodeResourceRequest request = UpdateNodeResourceRequest.newInstance(nodeResourceMap);
AdminService as = ((MockRM) rm).getAdminService();
as.updateNodeResource(request);
waitCount = 0;
while (waitCount++ != 20) {
report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
if (report_nm1.getAvailableResource().getMemorySize() != 0) {
break;
}
LOG.info("Waiting for RMNodeResourceUpdateEvent to be handled... Tried " + waitCount + " times already..");
Thread.sleep(1000);
}
// Now, the used resource is still 4 GB, and available resource is minus value.
report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
Assert.assertEquals(4 * GB, report_nm1.getUsedResource().getMemorySize());
Assert.assertEquals(-2 * GB, report_nm1.getAvailableResource().getMemorySize());
// Check container can complete successfully in case of resource over-commitment.
ContainerStatus containerStatus = BuilderUtils.newContainerStatus(c1.getId(), ContainerState.COMPLETE, "", 0, c1.getResource());
nm1.containerStatus(containerStatus);
waitCount = 0;
while (attempt1.getJustFinishedContainers().size() < 1 && waitCount++ != 20) {
LOG.info("Waiting for containers to be finished for app 1... Tried " + waitCount + " times already..");
Thread.sleep(100);
}
Assert.assertEquals(1, attempt1.getJustFinishedContainers().size());
Assert.assertEquals(1, am1.schedule().getCompletedContainersStatuses().size());
report_nm1 = rm.getResourceScheduler().getNodeReport(nm1.getNodeId());
Assert.assertEquals(2 * GB, report_nm1.getUsedResource().getMemorySize());
// As container return 2 GB back, the available resource becomes 0 again.
Assert.assertEquals(0 * GB, report_nm1.getAvailableResource().getMemorySize());
// Verify no NPE is trigger in schedule after resource is updated.
am1.addRequests(new String[] { "127.0.0.1", "127.0.0.2" }, 3 * GB, 1, 1);
alloc1Response = am1.schedule();
Assert.assertEquals("Shouldn't have enough resource to allocate containers", 0, alloc1Response.getAllocatedContainers().size());
int times = 0;
// try 10 times as scheduling is async process.
while (alloc1Response.getAllocatedContainers().size() < 1 && times++ < 10) {
LOG.info("Waiting for containers to be allocated for app 1... Tried " + times + " times already..");
Thread.sleep(100);
}
Assert.assertEquals("Shouldn't have enough resource to allocate containers", 0, alloc1Response.getAllocatedContainers().size());
rm.stop();
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project hadoop by apache.
the class TestSchedulerUtils method testCreateAbnormalContainerStatus.
@Test
public void testCreateAbnormalContainerStatus() {
ContainerStatus cd = SchedulerUtils.createAbnormalContainerStatus(ContainerId.newContainerId(ApplicationAttemptId.newInstance(ApplicationId.newInstance(System.currentTimeMillis(), 1), 1), 1), "x");
Assert.assertEquals(ContainerExitStatus.ABORTED, cd.getExitStatus());
}
use of org.apache.hadoop.yarn.api.records.ContainerStatus in project hadoop by apache.
the class TestRMWebServicesNodes method testNodesResourceUtilization.
@Test
public void testNodesResourceUtilization() throws JSONException, Exception {
WebResource r = resource();
RMNode rmnode1 = getRunningRMNode("h1", 1234, 5120);
NodeId nodeId1 = rmnode1.getNodeID();
RMNodeImpl node = (RMNodeImpl) rm.getRMContext().getRMNodes().get(nodeId1);
NodeHealthStatus nodeHealth = NodeHealthStatus.newInstance(true, "test health report", System.currentTimeMillis());
ResourceUtilization nodeResource = ResourceUtilization.newInstance(4096, 0, (float) 10.5);
ResourceUtilization containerResource = ResourceUtilization.newInstance(2048, 0, (float) 5.05);
NodeStatus nodeStatus = NodeStatus.newInstance(nodeId1, 0, new ArrayList<ContainerStatus>(), null, nodeHealth, containerResource, nodeResource, null);
node.handle(new RMNodeStatusEvent(nodeId1, nodeStatus, null));
rm.waitForState(nodeId1, NodeState.RUNNING);
ClientResponse response = r.path("ws").path("v1").path("cluster").path("nodes").accept(MediaType.APPLICATION_JSON).get(ClientResponse.class);
assertEquals(MediaType.APPLICATION_JSON_TYPE + "; " + JettyUtils.UTF_8, response.getType().toString());
JSONObject json = response.getEntity(JSONObject.class);
assertEquals("incorrect number of elements", 1, json.length());
JSONObject nodes = json.getJSONObject("nodes");
assertEquals("incorrect number of elements", 1, nodes.length());
JSONArray nodeArray = nodes.getJSONArray("node");
assertEquals("incorrect number of elements", 1, nodeArray.length());
JSONObject info = nodeArray.getJSONObject(0);
// verify the resource utilization
verifyNodeInfo(info, rmnode1);
}
Aggregations