use of org.apache.hadoop.hive.llap.LlapNodeId in project hive by apache.
the class TestLlapDaemonProtocolClientProxy method testSingleInvocationPerNode.
/**
 * Checks that a second request for a node is held back while an earlier request for
 * that node is still outstanding, and is submitted once the earlier one finishes.
 */
@Test(timeout = 5000)
public void testSingleInvocationPerNode() {
  // NOTE(review): the constructor argument is presumably the per-node concurrency limit - confirm.
  RequestManagerForTest manager = new RequestManagerForTest(1);
  LlapNodeId node = LlapNodeId.getInstance("host1", 1025);
  Message message = mock(Message.class);
  LlapProtocolClientProxy.ExecuteRequestCallback callback =
      mock(LlapProtocolClientProxy.ExecuteRequestCallback.class);

  // Round 1: the first request for the host is submitted immediately.
  manager.queueRequest(new CallableRequestForTest(node, message, callback));
  manager.process();
  assertEquals(1, manager.numSubmissionsCounters);
  assertNotNull(manager.numInvocationsPerNode.get(node));
  assertEquals(1, manager.numInvocationsPerNode.get(node).getValue().intValue());
  assertEquals(0, manager.currentLoopSkippedRequests.size());

  // Round 2: a second request for the same host is skipped because the first has not
  // completed, so the submission and invocation counters stay at 1.
  manager.queueRequest(new CallableRequestForTest(node, message, callback));
  manager.process();
  assertEquals(1, manager.numSubmissionsCounters);
  assertNotNull(manager.numInvocationsPerNode.get(node));
  assertEquals(1, manager.numInvocationsPerNode.get(node).getValue().intValue());
  assertEquals(1, manager.currentLoopSkippedRequests.size());
  assertEquals(1, manager.currentLoopDisabledNodes.size());
  assertTrue(manager.currentLoopDisabledNodes.contains(node));

  // Round 3: once the first request is reported finished, the pending one goes through.
  manager.requestFinished(node);
  manager.process();
  assertEquals(2, manager.numSubmissionsCounters);
  assertNotNull(manager.numInvocationsPerNode.get(node));
  assertEquals(2, manager.numInvocationsPerNode.get(node).getValue().intValue());
  assertEquals(0, manager.currentLoopSkippedRequests.size());
  assertEquals(0, manager.currentLoopDisabledNodes.size());
  assertFalse(manager.currentLoopDisabledNodes.contains(node));
}
use of org.apache.hadoop.hive.llap.LlapNodeId in project hive by apache.
the class LlapTaskCommunicator method nodePinged.
/**
 * Handles a ping received from an LLAP node.
 *
 * <p>The node is registered as pinging. If the entity tracker has a container/attempt map for
 * the node, every entry whose recorded unique node id equals {@code uniqueId} has its container
 * reported alive, and its task is reported alive when the attempt is listed in {@code tasks}.
 * Attempts that were expected but not listed are collected and logged. If the tracker has no
 * map for the node, a warning is logged, rate-limited to roughly one per 5 seconds.
 *
 * @param hostname host name the ping came from
 * @param uniqueId unique id of the node instance, compared against the id recorded per attempt
 * @param port port of the node; combined with {@code hostname} to form the node id
 * @param tasks the attempts the node reports; each element is cast to {@code TezTaskAttemptID}
 */
void nodePinged(String hostname, String uniqueId, int port, TezAttemptArray tasks) {
  // TODO: do we ever need the port? we could just do away with nodeId altogether.
  LlapNodeId nodeId = LlapNodeId.getInstance(hostname, port);
  registerPingingNode(nodeId);
  BiMap<ContainerId, TezTaskAttemptID> biMap = entityTracker.getContainerAttemptMapForNode(nodeId);
  if (biMap != null) {
    HashSet<TezTaskAttemptID> attempts = new HashSet<>();
    for (Writable w : tasks.get()) {
      attempts.add((TezTaskAttemptID) w);
    }
    // Accumulate in a builder: repeated String concatenation here would copy the growing
    // message on every iteration while the monitor on biMap is held.
    StringBuilder error = new StringBuilder();
    synchronized (biMap) {
      for (Map.Entry<ContainerId, TezTaskAttemptID> entry : biMap.entrySet()) {
        // TODO: this is a stopgap fix. We really need to change all mappings by unique node ID,
        // or at least (in this case) track the latest unique ID for LlapNode and retry all
        // older-node tasks proactively. For now let the heartbeats fail them.
        TezTaskAttemptID attemptId = entry.getValue();
        String taskNodeId = entityTracker.getUniqueNodeId(attemptId);
        // Also, we prefer a missed heartbeat over a stuck query in case of discrepancy in ET.
        if (taskNodeId != null && taskNodeId.equals(uniqueId)) {
          if (attempts.contains(attemptId)) {
            getContext().taskAlive(attemptId);
          } else {
            error.append(attemptId).append(", ");
          }
          getContext().containerAlive(entry.getKey());
        }
      }
    }
    if (error.length() > 0) {
      LOG.info("The tasks we expected to be on the node are not there: " + error);
    }
  } else {
    long currentTs = TimeUnit.MILLISECONDS.convert(System.nanoTime(), TimeUnit.NANOSECONDS);
    // Only warn if more than 5000 ms have passed since the last such warning.
    if (currentTs > nodeNotFoundLogTime.get() + 5000L) {
      LOG.warn("Received ping from node without any registered tasks or containers: " + hostname + ":" + port + ". Could be caused by pre-emption by the AM," + " or a mismatched hostname. Enable debug logging for mismatched host names");
      nodeNotFoundLogTime.set(currentTs);
    }
  }
}
use of org.apache.hadoop.hive.llap.LlapNodeId in project hive by apache.
the class LlapTaskCommunicator method sendTaskTerminated.
/**
 * Sends a terminate-fragment request for the given task attempt to the node it is tracked on.
 *
 * <p>If the entity tracker no longer knows a node for the attempt, nothing is sent and the
 * situation is only logged.
 *
 * @param taskAttemptId the attempt whose fragment should be terminated
 * @param invokedByContainerEnd {@code true} if triggered by a container end, {@code false} if
 *        triggered by a task end; used only for the log message
 */
private void sendTaskTerminated(final TezTaskAttemptID taskAttemptId, boolean invokedByContainerEnd) {
  LOG.info("Attempting to send terminateRequest for fragment {} due to internal preemption invoked by {}",
      taskAttemptId.toString(), invokedByContainerEnd ? "containerEnd" : "taskEnd");
  LlapNodeId nodeId = entityTracker.getNodeIdForTaskAttempt(taskAttemptId);
  // NodeId can be null if the task gets unregistered due to failure / being killed by the daemon itself
  if (nodeId != null) {
    TerminateFragmentRequestProto request = TerminateFragmentRequestProto.newBuilder()
        .setQueryIdentifier(constructQueryIdentifierProto(
            taskAttemptId.getTaskID().getVertexID().getDAGId().getId()))
        .setFragmentIdentifierString(taskAttemptId.toString())
        .build();
    communicator.sendTerminateFragment(request, nodeId.getHostname(), nodeId.getPort(),
        new LlapProtocolClientProxy.ExecuteRequestCallback<TerminateFragmentResponseProto>() {
          @Override
          public void setResponse(TerminateFragmentResponseProto response) {
            // Intentionally empty: the response is not used.
          }

          @Override
          public void indicateError(Throwable t) {
            // Pass the throwable as the trailing argument so the failure cause is part of
            // this log entry rather than being dropped.
            LOG.warn("Failed to send terminate fragment request for {}", taskAttemptId.toString(), t);
            processSendError(t);
          }
        });
  } else {
    LOG.info("Not sending terminate request for fragment {} since its node is not known. Already unregistered",
        taskAttemptId.toString());
  }
}
use of org.apache.hadoop.hive.llap.LlapNodeId in project hive by apache.
the class SourceStateTracker method registerTaskForStateUpdates.
/**
 * Registers a task for source state updates, so that updates for the sources the task
 * is interested in are sent to the node identified by {@code host} and {@code port}.
 *
 * <p>For each source of interest not yet registered for that node, the node is added to the
 * source, the source's last known state is recorded for the node, and that state is sent to
 * the node right away if it is already {@code SUCCEEDED}.
 *
 * @param host host name of the node running the task
 * @param port port of the node running the task
 * @param inputSpecList input specs of the task, from which the sources of interest are derived
 */
public synchronized void registerTaskForStateUpdates(String host, int port, List<InputSpec> inputSpecList) {
  List<String> sources = getSourceInterestList(inputSpecList);
  if (sources == null || sources.isEmpty()) {
    // Don't need to track anything for this task. No new notifications, etc.
    return;
  }

  // Add tracking information. Check if source state already known and send out an update if it is.
  LlapNodeId targetNode = LlapNodeId.getInstance(host, port);
  NodeInfo trackedNode = getNodeInfo(targetNode);

  // Set up the data structures, before any notifications come in.
  for (String sourceName : sources) {
    boolean registeredForNode = trackedNode.getLastKnownStateForSource(sourceName) != null;
    if (!registeredForNode) {
      // Not registered for this node.
      // Register and send state if it is successful.
      SourceInfo sourceInfo = getSourceInfo(sourceName);
      sourceInfo.addNode(targetNode);
      trackedNode.addSource(sourceName, sourceInfo.lastKnownState);
      if (sourceInfo.lastKnownState == VertexState.SUCCEEDED) {
        sendStateUpdateToNode(targetNode, sourceName, sourceInfo.lastKnownState);
      }
    }
    // Otherwise: already registered to send updates to this node for the specific source.
    // Nothing to do for now, unless tracking tasks at a later point.

    // Setup for actual notifications, if not already done for a previous task.
    maybeRegisterForVertexUpdates(sourceName);
  }
}
use of org.apache.hadoop.hive.llap.LlapNodeId in project hive by apache.
the class AMReporter method registerTask.
/**
 * Registers a task attempt for heartbeating to the AM of the given query.
 *
 * <p>An {@code AMNodeInfo} is looked up by {@code queryIdentifier}; when none exists yet, one is
 * created, stored, given its first heartbeat time and added to the pending heartbeat queue.
 * The attempt is then added to that {@code AMNodeInfo}. All of this happens while holding the
 * monitor on {@code knownAppMasters}.
 *
 * @param amLocation host of the AM
 * @param port port of the AM
 * @param umbilicalUser user passed through to the created {@code AMNodeInfo}
 * @param jobToken token passed through to the created {@code AMNodeInfo}
 * @param queryIdentifier key under which the {@code AMNodeInfo} is tracked
 * @param attemptId the task attempt being registered
 */
public void registerTask(String amLocation, int port, String umbilicalUser, Token<JobTokenIdentifier> jobToken, QueryIdentifier queryIdentifier, TezTaskAttemptID attemptId) {
  if (LOG.isTraceEnabled()) {
    final String amAddress = amLocation + ":" + port;
    LOG.trace("Registering for heartbeat: {}, queryIdentifier={}, attemptId={}", amAddress, queryIdentifier, attemptId);
  }

  // AMNodeInfo instances are created and tracked per query (keyed by queryIdentifier).
  synchronized (knownAppMasters) {
    LlapNodeId nodeId = LlapNodeId.getInstance(amLocation, port);
    AMNodeInfo nodeInfo = knownAppMasters.get(queryIdentifier);
    if (nodeInfo == null) {
      nodeInfo = new AMNodeInfo(nodeId, umbilicalUser, jobToken, queryIdentifier, retryPolicy,
          retryTimeout, socketFactory, conf);
      knownAppMasters.put(queryIdentifier, nodeInfo);
      // Add to the queue only the first time this is registered, and on
      // subsequent instances when it's taken off the queue.
      nodeInfo.setNextHeartbeatTime(System.currentTimeMillis() + heartbeatInterval);
      pendingHeartbeatQueeu.add(nodeInfo);
      // AMNodeInfo will only be cleared when a queryComplete is received for this query, or
      // when we detect a failure on the AM side (failure to heartbeat).
      // A single queueLookupCallable is added here. We have to make sure one instance stays
      // in the queue till the query completes.
    }
    nodeInfo.addTaskAttempt(attemptId);
  }
}
Aggregations