use of org.apache.hadoop.yarn.event.DrainDispatcher in project hadoop by apache.
the class TestRM method testKillFinishingApp.
// Test Kill an app while the app is finishing in the meanwhile.
@Test(timeout = 30000)
public void testKillFinishingApp() throws Exception {
// this dispatcher ignores RMAppAttemptEventType.KILL event
final Dispatcher dispatcher = new DrainDispatcher() {
@Override
public EventHandler<Event> getEventHandler() {
class EventArgMatcher extends ArgumentMatcher<AbstractEvent> {
@Override
public boolean matches(Object argument) {
if (argument instanceof RMAppAttemptEvent) {
if (((RMAppAttemptEvent) argument).getType().equals(RMAppAttemptEventType.KILL)) {
return true;
}
}
return false;
}
}
EventHandler handler = spy(super.getEventHandler());
doNothing().when(handler).handle(argThat(new EventArgMatcher()));
return handler;
}
};
MockRM rm1 = new MockRM(conf) {
@Override
protected Dispatcher createDispatcher() {
return dispatcher;
}
};
rm1.start();
MockNM nm1 = new MockNM("127.0.0.1:1234", 8192, rm1.getResourceTrackerService());
nm1.registerNode();
RMApp app1 = rm1.submitApp(200);
MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
rm1.killApp(app1.getApplicationId());
FinishApplicationMasterRequest req = FinishApplicationMasterRequest.newInstance(FinalApplicationStatus.SUCCEEDED, "", "");
am1.unregisterAppAttempt(req, true);
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FINISHING);
nm1.nodeHeartbeat(am1.getApplicationAttemptId(), 1, ContainerState.COMPLETE);
rm1.waitForState(am1.getApplicationAttemptId(), RMAppAttemptState.FINISHED);
rm1.waitForState(app1.getApplicationId(), RMAppState.FINISHED);
}
use of org.apache.hadoop.yarn.event.DrainDispatcher in project hadoop by apache.
the class TestReservations method testFindNodeToUnreserve.
@Test
public void testFindNodeToUnreserve() throws Exception {
CapacitySchedulerConfiguration csConf = new CapacitySchedulerConfiguration();
setup(csConf);
final String user_0 = "user_0";
final ApplicationAttemptId appAttemptId_0 = TestUtils.getMockApplicationAttemptId(0, 0);
LeafQueue a = stubLeafQueue((LeafQueue) queues.get(A));
FiCaSchedulerApp app_0 = new FiCaSchedulerApp(appAttemptId_0, user_0, a, mock(ActiveUsersManager.class), spyRMContext);
String host_1 = "host_1";
FiCaSchedulerNode node_1 = TestUtils.getMockNode(host_1, DEFAULT_RACK, 0, 8 * GB);
// Setup resource-requests
Priority p = TestUtils.createMockPriority(5);
SchedulerRequestKey priorityMap = toSchedulerKey(p);
Resource capability = Resources.createResource(2 * GB, 0);
RMApplicationHistoryWriter writer = mock(RMApplicationHistoryWriter.class);
SystemMetricsPublisher publisher = mock(SystemMetricsPublisher.class);
RMContext rmContext = mock(RMContext.class);
ContainerAllocationExpirer expirer = mock(ContainerAllocationExpirer.class);
DrainDispatcher drainDispatcher = new DrainDispatcher();
when(rmContext.getContainerAllocationExpirer()).thenReturn(expirer);
when(rmContext.getDispatcher()).thenReturn(drainDispatcher);
when(rmContext.getRMApplicationHistoryWriter()).thenReturn(writer);
when(rmContext.getSystemMetricsPublisher()).thenReturn(publisher);
when(rmContext.getYarnConfiguration()).thenReturn(new YarnConfiguration());
ApplicationAttemptId appAttemptId = BuilderUtils.newApplicationAttemptId(app_0.getApplicationId(), 1);
ContainerId containerId = BuilderUtils.newContainerId(appAttemptId, 1);
Container container = TestUtils.getMockContainer(containerId, node_1.getNodeID(), Resources.createResource(2 * GB), priorityMap.getPriority());
RMContainer rmContainer = new RMContainerImpl(container, SchedulerRequestKey.extractFrom(container), appAttemptId, node_1.getNodeID(), "user", rmContext);
// nothing reserved
RMContainer toUnreserveContainer = app_0.findNodeToUnreserve(csContext.getClusterResource(), node_1, priorityMap, capability);
assertTrue(toUnreserveContainer == null);
// reserved but scheduler doesn't know about that node.
app_0.reserve(node_1, priorityMap, rmContainer, container);
node_1.reserveResource(app_0, priorityMap, rmContainer);
toUnreserveContainer = app_0.findNodeToUnreserve(csContext.getClusterResource(), node_1, priorityMap, capability);
assertTrue(toUnreserveContainer == null);
}
use of org.apache.hadoop.yarn.event.DrainDispatcher in project hadoop by apache.
the class TestJobImpl method testUnusableNodeTransition.
@Test(timeout = 20000)
public void testUnusableNodeTransition() throws Exception {
Configuration conf = new Configuration();
conf.set(MRJobConfig.MR_AM_STAGING_DIR, stagingDir);
conf.setInt(MRJobConfig.NUM_REDUCES, 1);
DrainDispatcher dispatcher = new DrainDispatcher();
dispatcher.init(conf);
dispatcher.start();
CyclicBarrier syncBarrier = new CyclicBarrier(2);
OutputCommitter committer = new TestingOutputCommitter(syncBarrier, true);
CommitterEventHandler commitHandler = createCommitterEventHandler(dispatcher, committer);
commitHandler.init(conf);
commitHandler.start();
final JobImpl job = createRunningStubbedJob(conf, dispatcher, 2, null);
// add a special task event handler to put the task back to running in case
// of task rescheduling/killing
EventHandler<TaskAttemptEvent> taskAttemptEventHandler = new EventHandler<TaskAttemptEvent>() {
@Override
public void handle(TaskAttemptEvent event) {
if (event.getType() == TaskAttemptEventType.TA_KILL) {
job.decrementSucceededMapperCount();
}
}
};
dispatcher.register(TaskAttemptEventType.class, taskAttemptEventHandler);
// replace the tasks with spied versions to return the right attempts
Map<TaskId, Task> spiedTasks = new HashMap<TaskId, Task>();
List<NodeReport> nodeReports = new ArrayList<NodeReport>();
Map<NodeReport, TaskId> nodeReportsToTaskIds = new HashMap<NodeReport, TaskId>();
for (Map.Entry<TaskId, Task> e : job.tasks.entrySet()) {
TaskId taskId = e.getKey();
Task task = e.getValue();
if (taskId.getTaskType() == TaskType.MAP) {
// add an attempt to the task to simulate nodes
NodeId nodeId = mock(NodeId.class);
TaskAttempt attempt = mock(TaskAttempt.class);
when(attempt.getNodeId()).thenReturn(nodeId);
TaskAttemptId attemptId = MRBuilderUtils.newTaskAttemptId(taskId, 0);
when(attempt.getID()).thenReturn(attemptId);
// create a spied task
Task spied = spy(task);
doReturn(attempt).when(spied).getAttempt(any(TaskAttemptId.class));
spiedTasks.put(taskId, spied);
// create a NodeReport based on the node id
NodeReport report = mock(NodeReport.class);
when(report.getNodeState()).thenReturn(NodeState.UNHEALTHY);
when(report.getNodeId()).thenReturn(nodeId);
nodeReports.add(report);
nodeReportsToTaskIds.put(report, taskId);
}
}
// replace the tasks with the spied tasks
job.tasks.putAll(spiedTasks);
// complete all mappers first
for (TaskId taskId : job.tasks.keySet()) {
if (taskId.getTaskType() == TaskType.MAP) {
// generate a task attempt completed event first to populate the
// nodes-to-succeeded-attempts map
TaskAttemptCompletionEvent tce = Records.newRecord(TaskAttemptCompletionEvent.class);
TaskAttemptId attemptId = MRBuilderUtils.newTaskAttemptId(taskId, 0);
tce.setAttemptId(attemptId);
tce.setStatus(TaskAttemptCompletionEventStatus.SUCCEEDED);
job.handle(new JobTaskAttemptCompletedEvent(tce));
// complete the task itself
job.handle(new JobTaskEvent(taskId, TaskState.SUCCEEDED));
Assert.assertEquals(JobState.RUNNING, job.getState());
}
}
// add an event for a node transition
NodeReport firstMapperNodeReport = nodeReports.get(0);
NodeReport secondMapperNodeReport = nodeReports.get(1);
job.handle(new JobUpdatedNodesEvent(job.getID(), Collections.singletonList(firstMapperNodeReport)));
dispatcher.await();
// complete the reducer
for (TaskId taskId : job.tasks.keySet()) {
if (taskId.getTaskType() == TaskType.REDUCE) {
job.handle(new JobTaskEvent(taskId, TaskState.SUCCEEDED));
}
}
// add another event for a node transition for the other mapper
// this should not trigger rescheduling
job.handle(new JobUpdatedNodesEvent(job.getID(), Collections.singletonList(secondMapperNodeReport)));
// complete the first mapper that was rescheduled
TaskId firstMapper = nodeReportsToTaskIds.get(firstMapperNodeReport);
job.handle(new JobTaskEvent(firstMapper, TaskState.SUCCEEDED));
// verify the state is moving to committing
assertJobState(job, JobStateInternal.COMMITTING);
// let the committer complete and verify the job succeeds
syncBarrier.await();
assertJobState(job, JobStateInternal.SUCCEEDED);
dispatcher.stop();
commitHandler.stop();
}
use of org.apache.hadoop.yarn.event.DrainDispatcher in project hadoop by apache.
the class TestRMContainerAllocator method testRMContainerAllocatorResendsRequestsOnRMRestart.
// Step-1 : AM send allocate request for 2 ContainerRequests and 1
// blackListeNode
// Step-2 : 2 containers are allocated by RM.
// Step-3 : AM Send 1 containerRequest(event3) and 1 releaseRequests to
// RM
// Step-4 : On RM restart, AM(does not know RM is restarted) sends
// additional containerRequest(event4) and blacklisted nodes.
// Intern RM send resync command
// Step-5 : On Resync,AM sends all outstanding
// asks,release,blacklistAaddition
// and another containerRequest(event5)
// Step-6 : RM allocates containers i.e event3,event4 and cRequest5
@Test
public void testRMContainerAllocatorResendsRequestsOnRMRestart() throws Exception {
Configuration conf = new Configuration();
conf.set(YarnConfiguration.RECOVERY_ENABLED, "true");
conf.set(YarnConfiguration.RM_STORE, MemoryRMStateStore.class.getName());
conf.setInt(YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS);
conf.setBoolean(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_ENABLED, true);
conf.setLong(YarnConfiguration.RM_WORK_PRESERVING_RECOVERY_SCHEDULING_WAIT_MS, 0);
conf.setBoolean(MRJobConfig.MR_AM_JOB_NODE_BLACKLISTING_ENABLE, true);
conf.setInt(MRJobConfig.MAX_TASK_FAILURES_PER_TRACKER, 1);
conf.setInt(MRJobConfig.MR_AM_IGNORE_BLACKLISTING_BLACKLISTED_NODE_PERECENT, -1);
MemoryRMStateStore memStore = new MemoryRMStateStore();
memStore.init(conf);
MyResourceManager rm1 = new MyResourceManager(conf, memStore);
rm1.start();
DrainDispatcher dispatcher = (DrainDispatcher) rm1.getRMContext().getDispatcher();
// Submit the application
RMApp app = rm1.submitApp(1024);
dispatcher.await();
MockNM nm1 = new MockNM("h1:1234", 15120, rm1.getResourceTrackerService());
nm1.registerNode();
// Node heartbeat
nm1.nodeHeartbeat(true);
dispatcher.await();
ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
rm1.sendAMLaunched(appAttemptId);
dispatcher.await();
JobId jobId = MRBuilderUtils.newJobId(appAttemptId.getApplicationId(), 0);
Job mockJob = mock(Job.class);
when(mockJob.getReport()).thenReturn(MRBuilderUtils.newJobReport(jobId, "job", "user", JobState.RUNNING, 0, 0, 0, 0, 0, 0, 0, "jobfile", null, false, ""));
MyContainerAllocator allocator = new MyContainerAllocator(rm1, conf, appAttemptId, mockJob);
// Step-1 : AM send allocate request for 2 ContainerRequests and 1
// blackListeNode
// create the container request
// send MAP request
ContainerRequestEvent event1 = createReq(jobId, 1, 1024, new String[] { "h1" });
allocator.sendRequest(event1);
ContainerRequestEvent event2 = createReq(jobId, 2, 2048, new String[] { "h1", "h2" });
allocator.sendRequest(event2);
// Send events to blacklist h2
ContainerFailedEvent f1 = createFailEvent(jobId, 1, "h2", false);
allocator.sendFailure(f1);
// send allocate request and 1 blacklisted nodes
List<TaskAttemptContainerAssignedEvent> assignedContainers = allocator.schedule();
dispatcher.await();
Assert.assertEquals("No of assignments must be 0", 0, assignedContainers.size());
// Why ask is 3, not 4? --> ask from blacklisted node h2 is removed
assertAsksAndReleases(3, 0, rm1);
assertBlacklistAdditionsAndRemovals(1, 0, rm1);
// Node heartbeat
nm1.nodeHeartbeat(true);
dispatcher.await();
// Step-2 : 2 containers are allocated by RM.
assignedContainers = allocator.schedule();
dispatcher.await();
Assert.assertEquals("No of assignments must be 2", 2, assignedContainers.size());
assertAsksAndReleases(0, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
assignedContainers = allocator.schedule();
Assert.assertEquals("No of assignments must be 0", 0, assignedContainers.size());
assertAsksAndReleases(3, 0, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
// Step-3 : AM Send 1 containerRequest(event3) and 1 releaseRequests to
// RM
// send container request
ContainerRequestEvent event3 = createReq(jobId, 3, 1000, new String[] { "h1" });
allocator.sendRequest(event3);
// send deallocate request
ContainerAllocatorEvent deallocate1 = createDeallocateEvent(jobId, 1, false);
allocator.sendDeallocate(deallocate1);
assignedContainers = allocator.schedule();
Assert.assertEquals("No of assignments must be 0", 0, assignedContainers.size());
assertAsksAndReleases(3, 1, rm1);
assertBlacklistAdditionsAndRemovals(0, 0, rm1);
// Phase-2 start 2nd RM is up
MyResourceManager rm2 = new MyResourceManager(conf, memStore);
rm2.start();
nm1.setResourceTrackerService(rm2.getResourceTrackerService());
allocator.updateSchedulerProxy(rm2);
dispatcher = (DrainDispatcher) rm2.getRMContext().getDispatcher();
// NM should be rebooted on heartbeat, even first heartbeat for nm2
NodeHeartbeatResponse hbResponse = nm1.nodeHeartbeat(true);
Assert.assertEquals(NodeAction.RESYNC, hbResponse.getNodeAction());
// new NM to represent NM re-register
nm1 = new MockNM("h1:1234", 10240, rm2.getResourceTrackerService());
nm1.registerNode();
nm1.nodeHeartbeat(true);
dispatcher.await();
// Step-4 : On RM restart, AM(does not know RM is restarted) sends
// additional containerRequest(event4) and blacklisted nodes.
// Intern RM send resync command
// send deallocate request, release=1
ContainerAllocatorEvent deallocate2 = createDeallocateEvent(jobId, 2, false);
allocator.sendDeallocate(deallocate2);
// Send events to blacklist nodes h3
ContainerFailedEvent f2 = createFailEvent(jobId, 1, "h3", false);
allocator.sendFailure(f2);
ContainerRequestEvent event4 = createReq(jobId, 4, 2000, new String[] { "h1", "h2" });
allocator.sendRequest(event4);
// send allocate request to 2nd RM and get resync command
allocator.schedule();
dispatcher.await();
// Step-5 : On Resync,AM sends all outstanding
// asks,release,blacklistAaddition
// and another containerRequest(event5)
ContainerRequestEvent event5 = createReq(jobId, 5, 3000, new String[] { "h1", "h2", "h3" });
allocator.sendRequest(event5);
// send all outstanding request again.
assignedContainers = allocator.schedule();
dispatcher.await();
assertAsksAndReleases(3, 2, rm2);
assertBlacklistAdditionsAndRemovals(2, 0, rm2);
nm1.nodeHeartbeat(true);
dispatcher.await();
// Step-6 : RM allocates containers i.e event3,event4 and cRequest5
assignedContainers = allocator.schedule();
dispatcher.await();
Assert.assertEquals("Number of container should be 3", 3, assignedContainers.size());
for (TaskAttemptContainerAssignedEvent assig : assignedContainers) {
Assert.assertTrue("Assigned count not correct", "h1".equals(assig.getContainer().getNodeId().getHost()));
}
rm1.stop();
rm2.stop();
}
use of org.apache.hadoop.yarn.event.DrainDispatcher in project hadoop by apache.
the class TestRMContainerAllocator method testReducerRampdownDiagnostics.
@Test(timeout = 30000)
public void testReducerRampdownDiagnostics() throws Exception {
LOG.info("Running tesReducerRampdownDiagnostics");
final Configuration conf = new Configuration();
conf.setFloat(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 0.0f);
final MyResourceManager rm = new MyResourceManager(conf);
rm.start();
final DrainDispatcher dispatcher = (DrainDispatcher) rm.getRMContext().getDispatcher();
final RMApp app = rm.submitApp(1024);
dispatcher.await();
final String host = "host1";
final MockNM nm = rm.registerNode(String.format("%s:1234", host), 2048);
nm.nodeHeartbeat(true);
dispatcher.await();
final ApplicationAttemptId appAttemptId = app.getCurrentAppAttempt().getAppAttemptId();
rm.sendAMLaunched(appAttemptId);
dispatcher.await();
final JobId jobId = MRBuilderUtils.newJobId(appAttemptId.getApplicationId(), 0);
final Job mockJob = mock(Job.class);
when(mockJob.getReport()).thenReturn(MRBuilderUtils.newJobReport(jobId, "job", "user", JobState.RUNNING, 0, 0, 0, 0, 0, 0, 0, "jobfile", null, false, ""));
final MyContainerAllocator allocator = new MyContainerAllocator(rm, conf, appAttemptId, mockJob, SystemClock.getInstance());
// add resources to scheduler
dispatcher.await();
// create the container request
final String[] locations = new String[] { host };
allocator.sendRequest(createReq(jobId, 0, 1024, locations, false, true));
for (int i = 0; i < 1; ) {
dispatcher.await();
i += allocator.schedule().size();
nm.nodeHeartbeat(true);
}
allocator.sendRequest(createReq(jobId, 0, 1024, locations, true, false));
while (allocator.getTaskAttemptKillEvents().size() == 0) {
dispatcher.await();
allocator.schedule().size();
nm.nodeHeartbeat(true);
}
final String killEventMessage = allocator.getTaskAttemptKillEvents().get(0).getMessage();
Assert.assertTrue("No reducer rampDown preemption message", killEventMessage.contains(RMContainerAllocator.RAMPDOWN_DIAGNOSTIC));
}
Aggregations