Use of org.apache.hadoop.mapreduce.v2.app.TestRecovery.MRAppWithHistory in project hadoop by apache.
The class TestRecovery, method testSpeculative.
/**
* AM with 2 maps and 1 reduce. A speculative attempt is launched for the
* 1st map, and its first attempt succeeds. The AM crashes after the first
* map task finishes, then recovers completely and succeeds in the second
* generation, preserving the recovered task's start and finish times.
*
* @throws Exception
*/
@Test
public void testSpeculative() throws Exception {
int runCount = 0;
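// wall-clock estimate taken before the first AM starts; used to bound the AMInfo start times asserted at the end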
long am1StartTimeEst = System.currentTimeMillis();
MRApp app = new MRAppWithHistory(2, 1, false, this.getClass().getName(), true, ++runCount);
Configuration conf = new Configuration();
conf.setBoolean("mapred.mapper.new-api", true);
conf.setBoolean("mapred.reducer.new-api", true);
conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
Job job = app.submit(conf);
app.waitForState(job, JobState.RUNNING);
long jobStartTime = job.getReport().getStartTime();
//the job consists of 2 maps and 1 reduce
Assert.assertEquals("No of tasks not correct", 3, job.getTasks().size());
Iterator<Task> it = job.getTasks().values().iterator();
Task mapTask1 = it.next();
Task mapTask2 = it.next();
Task reduceTask = it.next();
// all maps must be running
app.waitForState(mapTask1, TaskState.RUNNING);
app.waitForState(mapTask2, TaskState.RUNNING);
// Launch a Speculative Task for the first Task
app.getContext().getEventHandler().handle(new TaskEvent(mapTask1.getID(), TaskEventType.T_ADD_SPEC_ATTEMPT));
int timeOut = 0;
while (mapTask1.getAttempts().size() != 2 && timeOut++ < 10) {
Thread.sleep(1000);
LOG.info("Waiting for next attempt to start");
}
Iterator<TaskAttempt> t1it = mapTask1.getAttempts().values().iterator();
TaskAttempt task1Attempt1 = t1it.next();
TaskAttempt task1Attempt2 = t1it.next();
TaskAttempt task2Attempt = mapTask2.getAttempts().values().iterator().next();
// wait for the second task attempt to be assigned.
waitForContainerAssignment(task1Attempt2);
ContainerId t1a2contId = task1Attempt2.getAssignedContainerID();
LOG.info(t1a2contId.toString());
LOG.info(task1Attempt1.getID().toString());
LOG.info(task1Attempt2.getID().toString());
// Launch container for speculative attempt
app.getContext().getEventHandler().handle(new TaskAttemptContainerLaunchedEvent(task1Attempt2.getID(), runCount));
//before sending the TA_DONE event, make sure the attempts have reached
//RUNNING state
app.waitForState(task1Attempt1, TaskAttemptState.RUNNING);
app.waitForState(task1Attempt2, TaskAttemptState.RUNNING);
app.waitForState(task2Attempt, TaskAttemptState.RUNNING);
app.waitForState(reduceTask, TaskState.RUNNING);
//send the done signal to the map 1 attempt 1
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task1Attempt1.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(task1Attempt1, TaskAttemptState.SUCCEEDED);
//wait for first map task to complete
app.waitForState(mapTask1, TaskState.SUCCEEDED);
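// record the first map's start and finish times; recovery must report the same values after the AM restart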
long task1StartTime = mapTask1.getReport().getStartTime();
long task1FinishTime = mapTask1.getReport().getFinishTime();
//stop the app
app.stop();
//rerun
//in rerun the 1st map will be recovered from previous run
long am2StartTimeEst = System.currentTimeMillis();
app = new MRAppWithHistory(2, 1, false, this.getClass().getName(), false, ++runCount);
conf = new Configuration();
conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
conf.setBoolean("mapred.mapper.new-api", true);
conf.setBoolean("mapred.reducer.new-api", true);
conf.set(FileOutputFormat.OUTDIR, outputDir.toString());
conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
job = app.submit(conf);
app.waitForState(job, JobState.RUNNING);
//the job consists of 2 maps and 1 reduce
Assert.assertEquals("No of tasks not correct", 3, job.getTasks().size());
it = job.getTasks().values().iterator();
mapTask1 = it.next();
mapTask2 = it.next();
reduceTask = it.next();
// first map will be recovered, no need to send done
app.waitForState(mapTask1, TaskState.SUCCEEDED);
app.waitForState(mapTask2, TaskState.RUNNING);
task2Attempt = mapTask2.getAttempts().values().iterator().next();
//before sending the TA_DONE event, make sure the attempt has reached
//RUNNING state
app.waitForState(task2Attempt, TaskAttemptState.RUNNING);
//send the done signal to the 2nd map task
app.getContext().getEventHandler().handle(new TaskAttemptEvent(mapTask2.getAttempts().values().iterator().next().getID(), TaskAttemptEventType.TA_DONE));
//wait to get it completed
app.waitForState(mapTask2, TaskState.SUCCEEDED);
//wait for reduce to be running before sending done
app.waitForState(reduceTask, TaskState.RUNNING);
//send the done signal to the reduce
app.getContext().getEventHandler().handle(new TaskAttemptEvent(reduceTask.getAttempts().values().iterator().next().getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(job, JobState.SUCCEEDED);
app.verifyCompleted();
Assert.assertEquals("Job Start time not correct", jobStartTime, job.getReport().getStartTime());
Assert.assertEquals("Task Start time not correct", task1StartTime, mapTask1.getReport().getStartTime());
Assert.assertEquals("Task Finish time not correct", task1FinishTime, mapTask1.getReport().getFinishTime());
Assert.assertEquals(2, job.getAMInfos().size());
int attemptNum = 1;
// Verify AMInfo
for (AMInfo amInfo : job.getAMInfos()) {
Assert.assertEquals(attemptNum++, amInfo.getAppAttemptId().getAttemptId());
Assert.assertEquals(amInfo.getAppAttemptId(), amInfo.getContainerId().getApplicationAttemptId());
Assert.assertEquals(MRApp.NM_HOST, amInfo.getNodeManagerHost());
Assert.assertEquals(MRApp.NM_PORT, amInfo.getNodeManagerPort());
Assert.assertEquals(MRApp.NM_HTTP_PORT, amInfo.getNodeManagerHttpPort());
}
long am1StartTimeReal = job.getAMInfos().get(0).getStartTime();
long am2StartTimeReal = job.getAMInfos().get(1).getStartTime();
Assert.assertTrue(am1StartTimeReal >= am1StartTimeEst && am1StartTimeReal <= am2StartTimeEst);
Assert.assertTrue(am2StartTimeReal >= am2StartTimeEst && am2StartTimeReal <= System.currentTimeMillis());
}
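The waitForContainerAssignment helper called above is not part of this listing. A minimal sketch, assuming a polling utility along the lines of org.apache.hadoop.test.GenericTestUtils.waitFor, could look like this:

private static void waitForContainerAssignment(final TaskAttempt taskAttempt) throws Exception {
  // poll every 10 ms, for at most 10 s, until a container is bound to the attempt
  GenericTestUtils.waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      return taskAttempt.getAssignedContainerID() != null;
    }
  }, 10, 10000);
}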
Use of org.apache.hadoop.mapreduce.v2.app.TestRecovery.MRAppWithHistory in project hadoop by apache.
The class TestMRApp, method testUpdatedNodes.
/**
* Verifies that the AM re-runs maps that have run on bad nodes, that it
* records all success/killed events so that reduces are notified about map
* output status changes, and that the re-run information is preserved
* across an AM restart.
*/
@Test
public void testUpdatedNodes() throws Exception {
int runCount = 0;
Dispatcher disp = Mockito.spy(new AsyncDispatcher());
MRApp app = new MRAppWithHistory(2, 2, false, this.getClass().getName(), true, ++runCount, disp);
Configuration conf = new Configuration();
// after half of the map completion, reduce will start
conf.setFloat(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 0.5f);
// uberization forces full slowstart (1.0), so disable that
conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
ContainerAllocEventHandler handler = new ContainerAllocEventHandler();
disp.register(ContainerAllocator.EventType.class, handler);
final Job job1 = app.submit(conf);
app.waitForState(job1, JobState.RUNNING);
Assert.assertEquals("Num tasks not correct", 4, job1.getTasks().size());
Iterator<Task> it = job1.getTasks().values().iterator();
Task mapTask1 = it.next();
Task mapTask2 = it.next();
// all maps must be running
app.waitForState(mapTask1, TaskState.RUNNING);
app.waitForState(mapTask2, TaskState.RUNNING);
TaskAttempt task1Attempt = mapTask1.getAttempts().values().iterator().next();
TaskAttempt task2Attempt = mapTask2.getAttempts().values().iterator().next();
NodeId node1 = task1Attempt.getNodeId();
NodeId node2 = task2Attempt.getNodeId();
Assert.assertEquals(node1, node2);
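// MRApp simulates a single NodeManager, so both attempts run on the same node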
// send the done signal to the task
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task1Attempt.getID(), TaskAttemptEventType.TA_DONE));
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task2Attempt.getID(), TaskAttemptEventType.TA_DONE));
// all maps must be succeeded
app.waitForState(mapTask1, TaskState.SUCCEEDED);
app.waitForState(mapTask2, TaskState.SUCCEEDED);
final int checkIntervalMillis = 100;
final int waitForMillis = 800;
waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
TaskAttemptCompletionEvent[] events = job1.getTaskAttemptCompletionEvents(0, 100);
return events.length == 2;
}
}, checkIntervalMillis, waitForMillis);
TaskAttemptCompletionEvent[] events = job1.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Expecting 2 completion events for success", 2, events.length);
// send updated nodes info
ArrayList<NodeReport> updatedNodes = new ArrayList<NodeReport>();
NodeReport nr = RecordFactoryProvider.getRecordFactory(null).newRecordInstance(NodeReport.class);
nr.setNodeId(node1);
nr.setNodeState(NodeState.UNHEALTHY);
updatedNodes.add(nr);
app.getContext().getEventHandler().handle(new JobUpdatedNodesEvent(job1.getID(), updatedNodes));
app.waitForState(task1Attempt, TaskAttemptState.KILLED);
app.waitForState(task2Attempt, TaskAttemptState.KILLED);
waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
TaskAttemptCompletionEvent[] events = job1.getTaskAttemptCompletionEvents(0, 100);
return events.length == 4;
}
}, checkIntervalMillis, waitForMillis);
events = job1.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Expecting 2 more completion events for killed", 4, events.length);
// The 2 map task attempts killed above should be re-requested from the
// container allocator with the earlier attempt marked as failed. If this
// happens, the allocator will request the containers for these mappers from
// the RM at the higher priority of 5 (i.e. a priority equivalent to that of
// a fail-fast map).
handler.waitForFailedMapContainerReqEvents(2);
// all maps must be back to running
app.waitForState(mapTask1, TaskState.RUNNING);
app.waitForState(mapTask2, TaskState.RUNNING);
Iterator<TaskAttempt> itr = mapTask1.getAttempts().values().iterator();
itr.next();
task1Attempt = itr.next();
// send the done signal to the task
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task1Attempt.getID(), TaskAttemptEventType.TA_DONE));
// map1 must be succeeded. map2 must be running
app.waitForState(mapTask1, TaskState.SUCCEEDED);
app.waitForState(mapTask2, TaskState.RUNNING);
waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
TaskAttemptCompletionEvent[] events = job1.getTaskAttemptCompletionEvents(0, 100);
return events.length == 5;
}
}, checkIntervalMillis, waitForMillis);
events = job1.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Expecting 1 more completion events for success", 5, events.length);
// Crash the app again.
app.stop();
// rerun
// in rerun the 1st map will be recovered from previous run
app = new MRAppWithHistory(2, 2, false, this.getClass().getName(), false, ++runCount, (Dispatcher) new AsyncDispatcher());
conf = new Configuration();
conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
final Job job2 = app.submit(conf);
app.waitForState(job2, JobState.RUNNING);
Assert.assertEquals("No of tasks not correct", 4, job2.getTasks().size());
it = job2.getTasks().values().iterator();
mapTask1 = it.next();
mapTask2 = it.next();
Task reduceTask1 = it.next();
Task reduceTask2 = it.next();
// map 1 will be recovered, no need to send done
app.waitForState(mapTask1, TaskState.SUCCEEDED);
app.waitForState(mapTask2, TaskState.RUNNING);
waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
TaskAttemptCompletionEvent[] events = job2.getTaskAttemptCompletionEvents(0, 100);
return events.length == 2;
}
}, checkIntervalMillis, waitForMillis);
events = job2.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Expecting 2 completion events for killed & success of map1", 2, events.length);
task2Attempt = mapTask2.getAttempts().values().iterator().next();
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task2Attempt.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(mapTask2, TaskState.SUCCEEDED);
waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
TaskAttemptCompletionEvent[] events = job2.getTaskAttemptCompletionEvents(0, 100);
return events.length == 3;
}
}, checkIntervalMillis, waitForMillis);
events = job2.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Expecting 1 more completion events for success", 3, events.length);
app.waitForState(reduceTask1, TaskState.RUNNING);
app.waitForState(reduceTask2, TaskState.RUNNING);
TaskAttempt task3Attempt = reduceTask1.getAttempts().values().iterator().next();
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task3Attempt.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(reduceTask1, TaskState.SUCCEEDED);
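// a KILL arriving after the attempt has already succeeded must not change the task state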
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task3Attempt.getID(), TaskAttemptEventType.TA_KILL));
app.waitForState(reduceTask1, TaskState.SUCCEEDED);
TaskAttempt task4Attempt = reduceTask2.getAttempts().values().iterator().next();
app.getContext().getEventHandler().handle(new TaskAttemptEvent(task4Attempt.getID(), TaskAttemptEventType.TA_DONE));
app.waitForState(reduceTask2, TaskState.SUCCEEDED);
waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
TaskAttemptCompletionEvent[] events = job2.getTaskAttemptCompletionEvents(0, 100);
return events.length == 5;
}
}, checkIntervalMillis, waitForMillis);
events = job2.getTaskAttemptCompletionEvents(0, 100);
Assert.assertEquals("Expecting 2 more completion events for reduce success", 5, events.length);
// job succeeds
app.waitForState(job2, JobState.SUCCEEDED);
}
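Two helpers referenced in testUpdatedNodes are not shown on this page. The waitFor(check, checkEveryMillis, waitForMillis) calls can be assumed to delegate to GenericTestUtils.waitFor, and ContainerAllocEventHandler counts container requests that re-schedule a failed attempt. A sketch of such a handler, assuming ContainerRequestEvent exposes getEarlierAttemptFailed():

private final class ContainerAllocEventHandler implements EventHandler<ContainerAllocatorEvent> {
  private final AtomicInteger failedMapContainerReqEventCnt = new AtomicInteger(0);

  @Override
  public void handle(ContainerAllocatorEvent event) {
    // count only CONTAINER_REQ events that re-run a previously failed/killed attempt
    if (event.getType() == ContainerAllocator.EventType.CONTAINER_REQ
        && ((ContainerRequestEvent) event).getEarlierAttemptFailed()) {
      failedMapContainerReqEventCnt.incrementAndGet();
    }
  }

  public void waitForFailedMapContainerReqEvents(int count) throws InterruptedException {
    // busy-wait until the expected number of fail-fast container requests arrives
    while (failedMapContainerReqEventCnt.get() != count) {
      Thread.sleep(50);
    }
    failedMapContainerReqEventCnt.set(0);
  }
}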
Use of org.apache.hadoop.mapreduce.v2.app.TestRecovery.MRAppWithHistory in project hadoop by apache.
The class TestJobHistoryServer, method testReports.
//Test the reports of the JobHistoryServer: the history server should pick up the log files written by MRApp and read them.
@Test(timeout = 50000)
public void testReports() throws Exception {
Configuration config = new Configuration();
config.setClass(CommonConfigurationKeysPublic.NET_TOPOLOGY_NODE_SWITCH_MAPPING_IMPL_KEY, MyResolver.class, DNSToSwitchMapping.class);
RackResolver.init(config);
MRApp app = new MRAppWithHistory(1, 1, true, this.getClass().getName(), true);
app.submit(config);
Job job = app.getContext().getAllJobs().values().iterator().next();
app.waitForState(job, JobState.SUCCEEDED);
historyServer = new JobHistoryServer();
historyServer.init(config);
historyServer.start();
// search JobHistory service
JobHistory jobHistory = null;
for (Service service : historyServer.getServices()) {
if (service instanceof JobHistory) {
jobHistory = (JobHistory) service;
}
}
assertNotNull(jobHistory);
Map<JobId, Job> jobs = jobHistory.getAllJobs();
assertEquals(1, jobs.size());
assertEquals("job_0_0000", jobs.keySet().iterator().next().toString());
Task task = job.getTasks().values().iterator().next();
TaskAttempt attempt = task.getAttempts().values().iterator().next();
HistoryClientService historyService = historyServer.getClientService();
MRClientProtocol protocol = historyService.getClientHandler();
GetTaskAttemptReportRequest gtarRequest = recordFactory.newRecordInstance(GetTaskAttemptReportRequest.class);
// test getTaskAttemptReport
TaskAttemptId taId = attempt.getID();
taId.setTaskId(task.getID());
taId.getTaskId().setJobId(job.getID());
gtarRequest.setTaskAttemptId(taId);
GetTaskAttemptReportResponse response = protocol.getTaskAttemptReport(gtarRequest);
assertEquals("container_0_0000_01_000000", response.getTaskAttemptReport().getContainerId().toString());
assertTrue(response.getTaskAttemptReport().getDiagnosticInfo().isEmpty());
// counters
assertNotNull(response.getTaskAttemptReport().getCounters().getCounter(TaskCounter.PHYSICAL_MEMORY_BYTES));
assertEquals(taId.toString(), response.getTaskAttemptReport().getTaskAttemptId().toString());
// test getTaskReport
GetTaskReportRequest request = recordFactory.newRecordInstance(GetTaskReportRequest.class);
TaskId taskId = task.getID();
taskId.setJobId(job.getID());
request.setTaskId(taskId);
GetTaskReportResponse reportResponse = protocol.getTaskReport(request);
assertEquals("", reportResponse.getTaskReport().getDiagnosticsList().iterator().next());
// progress
assertEquals(1.0f, reportResponse.getTaskReport().getProgress(), 0.01);
// report has corrected taskId
assertEquals(taskId.toString(), reportResponse.getTaskReport().getTaskId().toString());
// Task state should be SUCCEEDED
assertEquals(TaskState.SUCCEEDED, reportResponse.getTaskReport().getTaskState());
// For invalid jobid, throw IOException
GetTaskReportsRequest gtreportsRequest = recordFactory.newRecordInstance(GetTaskReportsRequest.class);
gtreportsRequest.setJobId(TypeConverter.toYarn(JobID.forName("job_1415730144495_0001")));
gtreportsRequest.setTaskType(TaskType.REDUCE);
try {
protocol.getTaskReports(gtreportsRequest);
fail("IOException not thrown for invalid job id");
} catch (IOException e) {
// Expected
}
// test getTaskAttemptCompletionEvents
GetTaskAttemptCompletionEventsRequest taskAttemptRequest = recordFactory.newRecordInstance(GetTaskAttemptCompletionEventsRequest.class);
taskAttemptRequest.setJobId(job.getID());
GetTaskAttemptCompletionEventsResponse taskAttemptCompletionEventsResponse = protocol.getTaskAttemptCompletionEvents(taskAttemptRequest);
assertEquals(0, taskAttemptCompletionEventsResponse.getCompletionEventCount());
// test getDiagnostics
GetDiagnosticsRequest diagnosticRequest = recordFactory.newRecordInstance(GetDiagnosticsRequest.class);
diagnosticRequest.setTaskAttemptId(taId);
GetDiagnosticsResponse diagnosticResponse = protocol.getDiagnostics(diagnosticRequest);
// oddly, the diagnostics list contains a single empty string
assertEquals(1, diagnosticResponse.getDiagnosticsCount());
assertEquals("", diagnosticResponse.getDiagnostics(0));
}
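MyResolver, the topology resolver wired into the configuration above, is not reproduced here; a minimal sketch, assuming it pins every host to one synthetic rack:

private static class MyResolver implements DNSToSwitchMapping {
  @Override
  public List<String> resolve(List<String> names) {
    // map every host to the same synthetic rack
    return Arrays.asList(new String[] { "/MyRackName" });
  }

  @Override
  public void reloadCachedMappings() {
  }

  @Override
  public void reloadCachedMappings(List<String> names) {
  }
}

The recordFactory field used to build the request records is conventionally initialized the same way the test itself does at the NodeReport, i.e.:

private static final RecordFactory recordFactory = RecordFactoryProvider.getRecordFactory(null);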
Use of org.apache.hadoop.mapreduce.v2.app.TestRecovery.MRAppWithHistory in project hadoop by apache.
The class TestJobHistoryEvents, method testAssignedQueue.
@Test
public void testAssignedQueue() throws Exception {
Configuration conf = new Configuration();
MRApp app = new MRAppWithHistory(2, 1, true, this.getClass().getName(), true, "assignedQueue");
app.submit(conf);
Job job = app.getContext().getAllJobs().values().iterator().next();
JobId jobId = job.getID();
LOG.info("JOBID is " + TypeConverter.fromYarn(jobId).toString());
app.waitForState(job, JobState.SUCCEEDED);
//make sure all events are flushed
app.waitForState(Service.STATE.STOPPED);
/*
* Use HistoryContext to read logged events and verify the number of
* completed maps
*/
HistoryContext context = new JobHistory();
// test start and stop states
((JobHistory) context).init(conf);
((JobHistory) context).start();
Assert.assertTrue(context.getStartTime() > 0);
Assert.assertEquals(Service.STATE.STARTED, ((JobHistory) context).getServiceState());
// get job before stopping JobHistory
Job parsedJob = context.getJob(jobId);
// stop JobHistory
((JobHistory) context).stop();
Assert.assertEquals(Service.STATE.STOPPED, ((JobHistory) context).getServiceState());
Assert.assertEquals("QueueName not correct", "assignedQueue", parsedJob.getQueueName());
}
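Every example on this page builds on an MRAppWithHistory subclass of MRApp. Its essential shape is a test app that replaces MRApp's no-op history handler with a real JobHistoryEventHandler; the variants taking extra arguments (a start count, a Dispatcher, or the assignedQueue name above) thread them through to matching MRApp constructors. A sketch, assuming MRApp exposes createJobHistoryHandler and getStartCount:

static class MRAppWithHistory extends MRApp {
  public MRAppWithHistory(int maps, int reduces, boolean autoComplete, String testName, boolean cleanOnStart) {
    super(maps, reduces, autoComplete, testName, cleanOnStart);
  }

  @Override
  protected EventHandler<JobHistoryEvent> createJobHistoryHandler(AppContext context) {
    // emit real history events so JobHistory/JobHistoryServer can read them back
    return new JobHistoryEventHandler(context, getStartCount());
  }
}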
Use of org.apache.hadoop.mapreduce.v2.app.TestRecovery.MRAppWithHistory in project hadoop by apache.
The class TestJobHistoryEvents, method testHistoryEvents.
@Test
public void testHistoryEvents() throws Exception {
Configuration conf = new Configuration();
MRApp app = new MRAppWithHistory(2, 1, true, this.getClass().getName(), true);
app.submit(conf);
Job job = app.getContext().getAllJobs().values().iterator().next();
JobId jobId = job.getID();
LOG.info("JOBID is " + TypeConverter.fromYarn(jobId).toString());
app.waitForState(job, JobState.SUCCEEDED);
//make sure all events are flushed
app.waitForState(Service.STATE.STOPPED);
/*
* Use HistoryContext to read logged events and verify the number of
* completed maps
*/
HistoryContext context = new JobHistory();
// test start and stop states
((JobHistory) context).init(conf);
((JobHistory) context).start();
Assert.assertTrue(context.getStartTime() > 0);
Assert.assertEquals(Service.STATE.STARTED, ((JobHistory) context).getServiceState());
// get job before stopping JobHistory
Job parsedJob = context.getJob(jobId);
// stop JobHistory
((JobHistory) context).stop();
Assert.assertEquals(Service.STATE.STOPPED, ((JobHistory) context).getServiceState());
Assert.assertEquals("CompletedMaps not correct", 2, parsedJob.getCompletedMaps());
Assert.assertEquals(System.getProperty("user.name"), parsedJob.getUserName());
Map<TaskId, Task> tasks = parsedJob.getTasks();
Assert.assertEquals("No of tasks not correct", 3, tasks.size());
for (Task task : tasks.values()) {
verifyTask(task);
}
Map<TaskId, Task> maps = parsedJob.getTasks(TaskType.MAP);
Assert.assertEquals("No of maps not correct", 2, maps.size());
Map<TaskId, Task> reduces = parsedJob.getTasks(TaskType.REDUCE);
Assert.assertEquals("No of reduces not correct", 1, reduces.size());
Assert.assertEquals("CompletedReduce not correct", 1, parsedJob.getCompletedReduces());
Assert.assertEquals("Job state not currect", JobState.SUCCEEDED, parsedJob.getState());
}
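The verifyTask helper called above is private to TestJobHistoryEvents and not shown; a plausible sketch, checking the state of each parsed task and of its single attempt:

private void verifyTask(Task task) {
  Assert.assertEquals("Task state not correct", TaskState.SUCCEEDED, task.getState());
  Map<TaskAttemptId, TaskAttempt> attempts = task.getAttempts();
  Assert.assertEquals("No of attempts not correct", 1, attempts.size());
  for (TaskAttempt attempt : attempts.values()) {
    Assert.assertEquals("Attempt state not correct", TaskAttemptState.SUCCEEDED, attempt.getState());
  }
}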