Use of org.apache.hadoop.yarn.api.records.NodeReport in project hadoop by apache.
The class RMContainerAllocator, method handleUpdatedNodes:
@SuppressWarnings("unchecked")
private void handleUpdatedNodes(AllocateResponse response) {
  // send event to the job about updated nodes
  List<NodeReport> updatedNodes = response.getUpdatedNodes();
  if (!updatedNodes.isEmpty()) {
    // send event to the job to act upon completed tasks
    eventHandler.handle(new JobUpdatedNodesEvent(getJob().getID(), updatedNodes));
    // act upon running tasks
    HashSet<NodeId> unusableNodes = new HashSet<NodeId>();
    for (NodeReport nr : updatedNodes) {
      NodeState nodeState = nr.getNodeState();
      if (nodeState.isUnusable()) {
        unusableNodes.add(nr.getNodeId());
      }
    }
    for (int i = 0; i < 2; ++i) {
      HashMap<TaskAttemptId, Container> taskSet =
          i == 0 ? assignedRequests.maps : assignedRequests.reduces;
      // kill running containers
      for (Map.Entry<TaskAttemptId, Container> entry : taskSet.entrySet()) {
        TaskAttemptId tid = entry.getKey();
        NodeId taskAttemptNodeId = entry.getValue().getNodeId();
        if (unusableNodes.contains(taskAttemptNodeId)) {
          LOG.info("Killing taskAttempt:" + tid
              + " because it is running on unusable node:" + taskAttemptNodeId);
          // if this is a map, reschedule the next task attempt
          boolean rescheduleNextAttempt = (i == 0) ? true : false;
          eventHandler.handle(new TaskAttemptKillEvent(tid,
              "TaskAttempt killed because it ran on unusable node"
                  + taskAttemptNodeId, rescheduleNextAttempt));
        }
      }
    }
  }
}
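For reference, the unusable-node check above can be read in isolation. The following is a minimal sketch, not part of RMContainerAllocator (the class and method names are assumptions); it relies only on the public NodeReport and NodeState APIs used in the snippet.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.hadoop.yarn.api.records.NodeId;
import org.apache.hadoop.yarn.api.records.NodeReport;

// Hypothetical helper: collect the IDs of nodes whose updated reports say they
// can no longer run containers (NodeState.isUnusable() covers states such as
// UNHEALTHY, DECOMMISSIONED and LOST).
public final class UnusableNodes {

  private UnusableNodes() {
  }

  public static Set<NodeId> from(List<NodeReport> updatedNodes) {
    Set<NodeId> unusable = new HashSet<>();
    for (NodeReport nr : updatedNodes) {
      if (nr.getNodeState().isUnusable()) {
        unusable.add(nr.getNodeId());
      }
    }
    return unusable;
  }
}

The resulting set can then be used, as in handleUpdatedNodes, to decide which running task attempts should be killed and rescheduled.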
Use of org.apache.hadoop.yarn.api.records.NodeReport in project hadoop by apache.
The class TestMRApp, method testUpdatedNodes:
/**
 * Verifies that the AM re-runs maps that have run on bad nodes, that the AM
 * records all success/killed events so that reduces are notified about map
 * output status changes, and that the re-run information is preserved across
 * an AM restart.
 */
@Test
public void testUpdatedNodes() throws Exception {
  int runCount = 0;
  Dispatcher disp = Mockito.spy(new AsyncDispatcher());
  MRApp app = new MRAppWithHistory(2, 2, false, this.getClass().getName(),
      true, ++runCount, disp);
  Configuration conf = new Configuration();
  // after half of the maps complete, the reduces will start
  conf.setFloat(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 0.5f);
  // uberization forces full slowstart (1.0), so disable that
  conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
  ContainerAllocEventHandler handler = new ContainerAllocEventHandler();
  disp.register(ContainerAllocator.EventType.class, handler);
  final Job job1 = app.submit(conf);
  app.waitForState(job1, JobState.RUNNING);
  Assert.assertEquals("Num tasks not correct", 4, job1.getTasks().size());
  Iterator<Task> it = job1.getTasks().values().iterator();
  Task mapTask1 = it.next();
  Task mapTask2 = it.next();
  // all maps must be running
  app.waitForState(mapTask1, TaskState.RUNNING);
  app.waitForState(mapTask2, TaskState.RUNNING);
  TaskAttempt task1Attempt = mapTask1.getAttempts().values().iterator().next();
  TaskAttempt task2Attempt = mapTask2.getAttempts().values().iterator().next();
  NodeId node1 = task1Attempt.getNodeId();
  NodeId node2 = task2Attempt.getNodeId();
  Assert.assertEquals(node1, node2);
  // send the done signal to both map attempts
  app.getContext().getEventHandler().handle(
      new TaskAttemptEvent(task1Attempt.getID(), TaskAttemptEventType.TA_DONE));
  app.getContext().getEventHandler().handle(
      new TaskAttemptEvent(task2Attempt.getID(), TaskAttemptEventType.TA_DONE));
  // all maps must have succeeded
  app.waitForState(mapTask1, TaskState.SUCCEEDED);
  app.waitForState(mapTask2, TaskState.SUCCEEDED);
  final int checkIntervalMillis = 100;
  final int waitForMillis = 800;
  waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      TaskAttemptCompletionEvent[] events =
          job1.getTaskAttemptCompletionEvents(0, 100);
      return events.length == 2;
    }
  }, checkIntervalMillis, waitForMillis);
  TaskAttemptCompletionEvent[] events = job1.getTaskAttemptCompletionEvents(0, 100);
  Assert.assertEquals("Expecting 2 completion events for success", 2, events.length);
  // send updated nodes info
  ArrayList<NodeReport> updatedNodes = new ArrayList<NodeReport>();
  NodeReport nr = RecordFactoryProvider.getRecordFactory(null)
      .newRecordInstance(NodeReport.class);
  nr.setNodeId(node1);
  nr.setNodeState(NodeState.UNHEALTHY);
  updatedNodes.add(nr);
  app.getContext().getEventHandler().handle(
      new JobUpdatedNodesEvent(job1.getID(), updatedNodes));
  app.waitForState(task1Attempt, TaskAttemptState.KILLED);
  app.waitForState(task2Attempt, TaskAttemptState.KILLED);
  waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      TaskAttemptCompletionEvent[] events =
          job1.getTaskAttemptCompletionEvents(0, 100);
      return events.length == 4;
    }
  }, checkIntervalMillis, waitForMillis);
  events = job1.getTaskAttemptCompletionEvents(0, 100);
  Assert.assertEquals("Expecting 2 more completion events for killed", 4, events.length);
  // The 2 map task attempts killed above should be re-requested from the
  // container allocator with the previous map attempt marked as failed. If
  // that happens, the allocator will request the container for each mapper
  // from the RM at the higher priority of 5 (i.e. a priority equivalent to
  // that of a fail-fast map).
  handler.waitForFailedMapContainerReqEvents(2);
  // all maps must be back to running
  app.waitForState(mapTask1, TaskState.RUNNING);
  app.waitForState(mapTask2, TaskState.RUNNING);
  Iterator<TaskAttempt> itr = mapTask1.getAttempts().values().iterator();
  itr.next();
  task1Attempt = itr.next();
  // send the done signal to the second attempt of map 1
  app.getContext().getEventHandler().handle(
      new TaskAttemptEvent(task1Attempt.getID(), TaskAttemptEventType.TA_DONE));
  // map1 must have succeeded; map2 must still be running
  app.waitForState(mapTask1, TaskState.SUCCEEDED);
  app.waitForState(mapTask2, TaskState.RUNNING);
  waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      TaskAttemptCompletionEvent[] events =
          job1.getTaskAttemptCompletionEvents(0, 100);
      return events.length == 5;
    }
  }, checkIntervalMillis, waitForMillis);
  events = job1.getTaskAttemptCompletionEvents(0, 100);
  Assert.assertEquals("Expecting 1 more completion event for success", 5, events.length);
  // crash the app
  app.stop();
  // rerun; in the rerun the 1st map will be recovered from the previous run
  app = new MRAppWithHistory(2, 2, false, this.getClass().getName(), false,
      ++runCount, (Dispatcher) new AsyncDispatcher());
  conf = new Configuration();
  conf.setBoolean(MRJobConfig.MR_AM_JOB_RECOVERY_ENABLE, true);
  conf.setBoolean(MRJobConfig.JOB_UBERTASK_ENABLE, false);
  final Job job2 = app.submit(conf);
  app.waitForState(job2, JobState.RUNNING);
  Assert.assertEquals("No of tasks not correct", 4, job2.getTasks().size());
  it = job2.getTasks().values().iterator();
  mapTask1 = it.next();
  mapTask2 = it.next();
  Task reduceTask1 = it.next();
  Task reduceTask2 = it.next();
  // map 1 will be recovered, no need to send the done signal
  app.waitForState(mapTask1, TaskState.SUCCEEDED);
  app.waitForState(mapTask2, TaskState.RUNNING);
  waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      TaskAttemptCompletionEvent[] events =
          job2.getTaskAttemptCompletionEvents(0, 100);
      return events.length == 2;
    }
  }, checkIntervalMillis, waitForMillis);
  events = job2.getTaskAttemptCompletionEvents(0, 100);
  Assert.assertEquals("Expecting 2 completion events for killed & success of map1",
      2, events.length);
  task2Attempt = mapTask2.getAttempts().values().iterator().next();
  app.getContext().getEventHandler().handle(
      new TaskAttemptEvent(task2Attempt.getID(), TaskAttemptEventType.TA_DONE));
  app.waitForState(mapTask2, TaskState.SUCCEEDED);
  waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      TaskAttemptCompletionEvent[] events =
          job2.getTaskAttemptCompletionEvents(0, 100);
      return events.length == 3;
    }
  }, checkIntervalMillis, waitForMillis);
  events = job2.getTaskAttemptCompletionEvents(0, 100);
  Assert.assertEquals("Expecting 1 more completion event for success", 3, events.length);
  app.waitForState(reduceTask1, TaskState.RUNNING);
  app.waitForState(reduceTask2, TaskState.RUNNING);
  TaskAttempt task3Attempt = reduceTask1.getAttempts().values().iterator().next();
  app.getContext().getEventHandler().handle(
      new TaskAttemptEvent(task3Attempt.getID(), TaskAttemptEventType.TA_DONE));
  app.waitForState(reduceTask1, TaskState.SUCCEEDED);
  app.getContext().getEventHandler().handle(
      new TaskAttemptEvent(task3Attempt.getID(), TaskAttemptEventType.TA_KILL));
  app.waitForState(reduceTask1, TaskState.SUCCEEDED);
  TaskAttempt task4Attempt = reduceTask2.getAttempts().values().iterator().next();
  app.getContext().getEventHandler().handle(
      new TaskAttemptEvent(task4Attempt.getID(), TaskAttemptEventType.TA_DONE));
  app.waitForState(reduceTask2, TaskState.SUCCEEDED);
  waitFor(new Supplier<Boolean>() {
    @Override
    public Boolean get() {
      TaskAttemptCompletionEvent[] events =
          job2.getTaskAttemptCompletionEvents(0, 100);
      return events.length == 5;
    }
  }, checkIntervalMillis, waitForMillis);
  events = job2.getTaskAttemptCompletionEvents(0, 100);
  Assert.assertEquals("Expecting 2 more completion events for reduce success",
      5, events.length);
  // job succeeds
  app.waitForState(job2, JobState.SUCCEEDED);
}
Use of org.apache.hadoop.yarn.api.records.NodeReport in project hadoop by apache.
The class NodeCLI, method listClusterNodes:
/**
 * Lists the cluster nodes matching the given node states.
 *
 * @param nodeStates the node states to filter the report by
 * @throws YarnException
 * @throws IOException
 */
private void listClusterNodes(Set<NodeState> nodeStates)
    throws YarnException, IOException {
  PrintWriter writer = new PrintWriter(
      new OutputStreamWriter(sysout, Charset.forName("UTF-8")));
  List<NodeReport> nodesReport = client.getNodeReports(
      nodeStates.toArray(new NodeState[0]));
  writer.println("Total Nodes:" + nodesReport.size());
  writer.printf(NODES_PATTERN, "Node-Id", "Node-State", "Node-Http-Address",
      "Number-of-Running-Containers");
  for (NodeReport nodeReport : nodesReport) {
    writer.printf(NODES_PATTERN, nodeReport.getNodeId(), nodeReport.getNodeState(),
        nodeReport.getHttpAddress(), nodeReport.getNumContainers());
  }
  writer.flush();
}
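The same node report data is available to any application through YarnClient, not only the yarn node CLI. Below is a minimal sketch, assuming the default YarnConfiguration points at the cluster, that prints the same four columns for RUNNING nodes.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.NodeState;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.conf.YarnConfiguration;

// Sketch: list RUNNING nodes programmatically instead of via `yarn node -list`.
public class ListRunningNodes {

  public static void main(String[] args) throws Exception {
    Configuration conf = new YarnConfiguration();
    YarnClient client = YarnClient.createYarnClient();
    client.init(conf);
    client.start();
    try {
      for (NodeReport report : client.getNodeReports(NodeState.RUNNING)) {
        System.out.printf("%s\t%s\t%s\t%d%n",
            report.getNodeId(), report.getNodeState(),
            report.getHttpAddress(), report.getNumContainers());
      }
    } finally {
      client.stop();
    }
  }
}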
Use of org.apache.hadoop.yarn.api.records.NodeReport in project hadoop by apache.
The class TestAMRMClientAsync, method createAllocateResponse:
private AllocateResponse createAllocateResponse(List<ContainerStatus> completed,
    List<Container> allocated, List<Container> increased, List<Container> decreased,
    List<NMToken> nmTokens) {
  List<UpdatedContainer> updatedContainers = new ArrayList<>();
  for (Container c : increased) {
    updatedContainers.add(
        UpdatedContainer.newInstance(ContainerUpdateType.INCREASE_RESOURCE, c));
  }
  for (Container c : decreased) {
    updatedContainers.add(
        UpdatedContainer.newInstance(ContainerUpdateType.DECREASE_RESOURCE, c));
  }
  AllocateResponse response = AllocateResponse.newInstance(0, completed, allocated,
      new ArrayList<NodeReport>(), null, null, 1, null, nmTokens, updatedContainers);
  return response;
}
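The helper above always passes an empty NodeReport list. The sketch below mirrors the same AllocateResponse.newInstance overload but lets the caller supply the updated-nodes list instead; the class and method names are assumptions for illustration, not part of TestAMRMClientAsync.

import java.util.Collections;
import java.util.List;

import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
import org.apache.hadoop.yarn.api.records.Container;
import org.apache.hadoop.yarn.api.records.ContainerStatus;
import org.apache.hadoop.yarn.api.records.NMToken;
import org.apache.hadoop.yarn.api.records.NodeReport;
import org.apache.hadoop.yarn.api.records.UpdatedContainer;

// Hypothetical test helper: build an AllocateResponse that carries only the
// given node reports, leaving every other list empty.
public final class AllocateResponses {

  private AllocateResponses() {
  }

  public static AllocateResponse withUpdatedNodes(List<NodeReport> updatedNodes) {
    return AllocateResponse.newInstance(0,
        Collections.<ContainerStatus>emptyList(),
        Collections.<Container>emptyList(),
        updatedNodes,
        null, null, 1, null,
        Collections.<NMToken>emptyList(),
        Collections.<UpdatedContainer>emptyList());
  }
}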
Use of org.apache.hadoop.yarn.api.records.NodeReport in project apex-core by apache.
The class ResourceRequestHandler, method getHost:
public String getHost(ContainerStartRequest csr, boolean first) {
  String host = null;
  PTContainer c = csr.container;
  if (first) {
    for (PTOperator oper : c.getOperators()) {
      HostOperatorSet grpObj = oper.getNodeLocalOperators();
      host = nodeLocalMapping.get(grpObj.getOperatorSet());
      if (host != null) {
        antiAffinityMapping.put(c, host);
        return host;
      }
      if (grpObj.getHost() != null) {
        host = grpObj.getHost();
        // use the first host value found as the host for the container
        break;
      }
    }
    if (host != null && nodeReportMap.get(host) != null) {
      for (PTOperator oper : c.getOperators()) {
        HostOperatorSet grpObj = oper.getNodeLocalOperators();
        Set<PTOperator> nodeLocalSet = grpObj.getOperatorSet();
        NodeReport report = nodeReportMap.get(host);
        int aggrMemory = c.getRequiredMemoryMB();
        int vCores = c.getRequiredVCores();
        Set<PTContainer> containers = Sets.newHashSet();
        containers.add(c);
        for (PTOperator nodeLocalOper : nodeLocalSet) {
          if (!containers.contains(nodeLocalOper.getContainer())) {
            aggrMemory += nodeLocalOper.getContainer().getRequiredMemoryMB();
            vCores += nodeLocalOper.getContainer().getRequiredVCores();
            containers.add(nodeLocalOper.getContainer());
          }
        }
        int memAvailable =
            report.getCapability().getMemory() - report.getUsed().getMemory();
        int vCoresAvailable =
            report.getCapability().getVirtualCores() - report.getUsed().getVirtualCores();
        if (memAvailable >= aggrMemory && vCoresAvailable >= vCores) {
          nodeLocalMapping.put(nodeLocalSet, host);
          antiAffinityMapping.put(c, host);
          return host;
        }
      }
    }
  }
  // the requested host did not have the resources, so look for other hosts
  host = null;
  List<String> antiHosts = new ArrayList<>();
  List<String> antiPreferredHosts = new ArrayList<>();
  if (!c.getStrictAntiPrefs().isEmpty()) {
    // check whether containers are already allocated for the anti-affinity containers
    populateAntiHostList(c, antiHosts);
  }
  if (!c.getPreferredAntiPrefs().isEmpty()) {
    populateAntiHostList(c, antiPreferredHosts);
  }
  LOG.info("Strict anti-affinity = {} for container with operators {}", antiHosts,
      StringUtils.join(c.getOperators(), ","));
  for (PTOperator oper : c.getOperators()) {
    HostOperatorSet grpObj = oper.getNodeLocalOperators();
    Set<PTOperator> nodeLocalSet = grpObj.getOperatorSet();
    if (nodeLocalSet.size() > 1 || !c.getStrictAntiPrefs().isEmpty()
        || !c.getPreferredAntiPrefs().isEmpty()) {
      LOG.info("Finding new host for {}", nodeLocalSet);
      int aggrMemory = c.getRequiredMemoryMB();
      int vCores = c.getRequiredVCores();
      Set<PTContainer> containers = Sets.newHashSet();
      containers.add(c);
      // aggregate the memory and vcores required by all containers in the node-local set
      for (PTOperator nodeLocalOper : nodeLocalSet) {
        if (!containers.contains(nodeLocalOper.getContainer())) {
          aggrMemory += nodeLocalOper.getContainer().getRequiredMemoryMB();
          vCores += nodeLocalOper.getContainer().getRequiredVCores();
          containers.add(nodeLocalOper.getContainer());
        }
      }
      host = assignHost(host, antiHosts, antiPreferredHosts, grpObj, nodeLocalSet,
          aggrMemory, vCores);
      if (host == null && !antiPreferredHosts.isEmpty() && !antiHosts.isEmpty()) {
        // drop the preferred constraint and retry the allocation
        antiPreferredHosts.clear();
        host = assignHost(host, antiHosts, antiPreferredHosts, grpObj, nodeLocalSet,
            aggrMemory, vCores);
      }
      if (host != null) {
        antiAffinityMapping.put(c, host);
      } else {
        host = INVALID_HOST;
      }
    }
  }
  LOG.info("Found host {}", host);
  return host;
}
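The availability test inside getHost can be read on its own: a host qualifies when the difference between the capability and the usage in its NodeReport covers the aggregated demand of the node-local containers. A minimal sketch of that check as a standalone helper (the class and method names are assumptions):

import org.apache.hadoop.yarn.api.records.NodeReport;

// Hypothetical helper mirroring the headroom test above: available resources are
// the node's reported capability minus what the report says is already in use.
public final class NodeHeadroom {

  private NodeHeadroom() {
  }

  public static boolean hasCapacityFor(NodeReport report, int requiredMemoryMB,
      int requiredVCores) {
    int memAvailable =
        report.getCapability().getMemory() - report.getUsed().getMemory();
    int vCoresAvailable =
        report.getCapability().getVirtualCores() - report.getUsed().getVirtualCores();
    return memAvailable >= requiredMemoryMB && vCoresAvailable >= requiredVCores;
  }
}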