use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.
the class TaskExecutor method offerSlotsToJobManager.
// ------------------------------------------------------------------------
// Internal job manager connection methods
// ------------------------------------------------------------------------
/**
 * Offers the slots which are allocated for the given job to the job's JobManager.
 *
 * <p>Nothing is offered if there is no JobManager connection for the job or if the job has no
 * allocated slots. Every allocated slot is marked active before it is offered; a slot which
 * cannot be marked active is reported to the JobManager via {@code failSlot} and is left out of
 * the offer. Once the JobManager answers, all offered slots it did not accept are freed. If the
 * offer times out it is repeated, and on any other failure all offered slots are freed.
 *
 * @param jobId id of the job whose allocated slots are offered
 */
private void offerSlotsToJobManager(final JobID jobId) {
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);
    if (jobManagerConnection == null) {
        log.debug("There is no job manager connection to the leader of job {}.", jobId);
    } else {
        if (taskSlotTable.hasAllocatedSlots(jobId)) {
            log.info("Offer reserved slots to the leader of job {}.", jobId);
            final JobMasterGateway jobMasterGateway = jobManagerConnection.getJobManagerGateway();
            final Iterator<TaskSlot> reservedSlotsIterator = taskSlotTable.getAllocatedSlots(jobId);
            final UUID leaderId = jobManagerConnection.getLeaderId();
            final Collection<SlotOffer> reservedSlots = new HashSet<>(2);
            while (reservedSlotsIterator.hasNext()) {
                SlotOffer offer = reservedSlotsIterator.next().generateSlotOffer();
                try {
                    if (!taskSlotTable.markSlotActive(offer.getAllocationId())) {
                        // the slot is either free or releasing at the moment
                        final String message = "Could not mark slot " + offer.getAllocationId() + " of job " + jobId + " active.";
                        log.debug(message);
                        jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message));
                        // the slot has just been reported as failed, so it must not be offered
                        continue;
                    }
                } catch (SlotNotFoundException e) {
                    final String message = "Could not mark slot " + offer.getAllocationId() + " of job " + jobId + " active.";
                    log.debug(message, e);
                    // keep the original exception as the cause so the reason is not lost
                    jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message, e));
                    continue;
                }
                reservedSlots.add(offer);
            }
            Future<Iterable<SlotOffer>> acceptedSlotsFuture = jobMasterGateway.offerSlots(getResourceID(), reservedSlots, leaderId, taskManagerConfiguration.getTimeout());
            acceptedSlotsFuture.thenAcceptAsync(new AcceptFunction<Iterable<SlotOffer>>() {
                @Override
                public void accept(Iterable<SlotOffer> acceptedSlots) {
                    // check if the response is still valid
                    if (isJobManagerConnectionValid(jobId, leaderId)) {
                        // drop the accepted offers; whatever remains was rejected
                        for (SlotOffer acceptedSlot : acceptedSlots) {
                            reservedSlots.remove(acceptedSlot);
                        }
                        final Exception e = new Exception("The slot was rejected by the JobManager.");
                        for (SlotOffer rejectedSlot : reservedSlots) {
                            freeSlot(rejectedSlot.getAllocationId(), e);
                        }
                    } else {
                        // discard the response since there is a new leader for the job
                        log.debug("Discard offer slot response since there is a new leader " + "for the job {}.", jobId);
                    }
                }
            }, getMainThreadExecutor());
            acceptedSlotsFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {
                @Override
                public Void apply(Throwable throwable) {
                    if (throwable instanceof TimeoutException) {
                        // We ran into a timeout. Try again.
                        offerSlotsToJobManager(jobId);
                    } else {
                        // We encountered an exception. Free the slots and return them to the RM.
                        for (SlotOffer reservedSlot : reservedSlots) {
                            freeSlot(reservedSlot.getAllocationId(), throwable);
                        }
                    }
                    return null;
                }
            }, getMainThreadExecutor());
        } else {
            log.debug("There are no unassigned slots for the job {}.", jobId);
        }
    }
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.
the class TaskExecutor method closeJobManagerConnection.
/**
 * Closes the connection to the JobManager of the given job.
 *
 * <p>The steps are: fail all tasks running under the job, mark the job's active slots inactive
 * (freeing any slot for which that is refused), and finally remove the job's entry from the job
 * manager table. If an entry existed, heartbeat monitoring of that JobManager is stopped and the
 * TaskExecutor disassociates from it.
 *
 * @param jobId id of the job whose JobManager connection is closed
 * @param cause reason for closing the connection; handed on to {@code disassociateFromJobManager}
 */
private void closeJobManagerConnection(JobID jobId, Exception cause) {
    log.info("Close JobManager connection for job {}.", jobId);
    // 1. fail tasks running under this JobID
    Iterator<Task> tasks = taskSlotTable.getTasks(jobId);
    while (tasks.hasNext()) {
        tasks.next().failExternally(new Exception("JobManager responsible for " + jobId + " lost the leadership."));
    }
    // 2. Move the active slots to state allocated (possible to time out again)
    Iterator<AllocationID> activeSlots = taskSlotTable.getActiveSlots(jobId);
    while (activeSlots.hasNext()) {
        AllocationID activeSlot = activeSlots.next();
        try {
            if (!taskSlotTable.markSlotInactive(activeSlot, taskManagerConfiguration.getTimeout())) {
                freeSlot(activeSlot, new Exception("Slot could not be marked inactive."));
            }
        } catch (SlotNotFoundException e) {
            // log the slot that could not be found, not the job it belonged to
            log.debug("Could not mark the slot {} inactive.", activeSlot, e);
        }
    }
    // 3. Disassociate from the JobManager
    JobManagerConnection jobManagerConnection = jobManagerTable.remove(jobId);
    if (jobManagerConnection != null) {
        try {
            jobManagerHeartbeatManager.unmonitorTarget(jobManagerConnection.getResourceID());
            jobManagerConnections.remove(jobManagerConnection.getResourceID());
            disassociateFromJobManager(jobManagerConnection, cause);
        } catch (IOException e) {
            log.warn("Could not properly disassociate from JobManager {}.", jobManagerConnection.getJobManagerGateway().getAddress(), e);
        }
    }
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.
the class TaskExecutor method requestSlot.
// ----------------------------------------------------------------------
// Slot allocation RPCs
// ----------------------------------------------------------------------
/**
 * Handles a slot request coming from the resource manager.
 *
 * <p>The request is refused unless this TaskManager is connected to a resource manager whose
 * leader id equals {@code rmLeaderId}. A free slot is allocated for the given job and allocation
 * id; a slot which is not free is only accepted if it is already allocated for the same job and
 * allocation id. Afterwards the job's slots are offered to its job manager if the job is already
 * contained in the job manager table; otherwise the job is added to the job leader service. If
 * adding the job fails, the slot is freed again before the request is refused.
 *
 * @param slotId identifying the requested slot
 * @param jobId identifying the job for which the request is issued
 * @param allocationId id for the request
 * @param targetAddress of the job manager requesting the slot
 * @param rmLeaderId current leader id of the ResourceManager
 * @return answer to the slot request
 * @throws SlotAllocationException if the slot allocation fails
 */
@RpcMethod
public TMSlotRequestReply requestSlot(final SlotID slotId, final JobID jobId, final AllocationID allocationId, final String targetAddress, final UUID rmLeaderId) throws SlotAllocationException {
    log.info("Receive slot request {} for job {} from resource manager with leader id {}.", allocationId, jobId, rmLeaderId);

    // Guard: a resource manager connection is required.
    if (resourceManagerConnection == null) {
        final String notConnected = "TaskManager is not connected to a resource manager.";
        log.debug(notConnected);
        throw new SlotAllocationException(notConnected);
    }

    // Guard: the request has to originate from the resource manager leader we are connected to.
    if (!resourceManagerConnection.getTargetLeaderId().equals(rmLeaderId)) {
        final String leaderMismatch = "The leader id " + rmLeaderId + " does not match with the leader id of the connected resource manager " + resourceManagerConnection.getTargetLeaderId() + '.';
        log.debug(leaderMismatch);
        throw new SlotAllocationException(leaderMismatch);
    }

    final boolean slotIsFree = taskSlotTable.isSlotFree(slotId.getSlotNumber());
    if (!slotIsFree) {
        // An occupied slot is only acceptable if it already belongs to this very request.
        if (!taskSlotTable.isAllocated(slotId.getSlotNumber(), jobId, allocationId)) {
            final String occupied = "The slot " + slotId + " has already been allocated for a different job.";
            log.info(occupied);
            throw new SlotAllocationException(occupied);
        }
    } else {
        if (!taskSlotTable.allocateSlot(slotId.getSlotNumber(), jobId, allocationId, taskManagerConfiguration.getTimeout())) {
            log.info("Could not allocate slot for {}.", allocationId);
            throw new SlotAllocationException("Could not allocate slot.");
        }
        log.info("Allocated slot for {}.", allocationId);
    }

    if (!jobManagerTable.contains(jobId)) {
        try {
            jobLeaderService.addJob(jobId, targetAddress);
        } catch (Exception addJobFailure) {
            // Roll back: give the slot back since the job could not be registered.
            try {
                taskSlotTable.freeSlot(allocationId);
            } catch (SlotNotFoundException missingSlot) {
                // The slot has just been allocated above, so it not being there anymore is
                // unexpected and is treated as a fatal error.
                onFatalError(missingSlot);
            }
            // Sanity check: the slot has to be free again after the roll back.
            if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
                onFatalError(new Exception("Could not free slot " + slotId));
            }
            throw new SlotAllocationException("Could not add job to job leader service.", addJobFailure);
        }
    } else {
        offerSlotsToJobManager(jobId);
    }

    return new TMSlotRequestRegistered(resourceManagerConnection.getRegistrationId(), getResourceID(), allocationId);
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.
the class TaskExecutor method freeSlot.
/**
 * Frees the slot belonging to the given allocation id and, if a slot was actually freed while
 * being connected to a resource manager, notifies the resource manager that the slot is
 * available again.
 *
 * <p>A {@code SlotNotFoundException} raised while freeing is only logged on debug level.
 *
 * @param allocationId allocation id of the slot to free; must not be null
 * @param cause reason for freeing the slot, passed on to the task slot table
 */
private void freeSlot(AllocationID allocationId, Throwable cause) {
    Preconditions.checkNotNull(allocationId);
    try {
        final int slotIndex = taskSlotTable.freeSlot(allocationId, cause);
        // -1 signals that no slot was freed; without a resource manager nobody needs to know
        if (slotIndex == -1 || !isConnectedToResourceManager()) {
            return;
        }
        // the slot was freed. Tell the RM about it
        final ResourceManagerGateway gateway = resourceManagerConnection.getTargetGateway();
        gateway.notifySlotAvailable(
            resourceManagerConnection.getTargetLeaderId(),
            resourceManagerConnection.getRegistrationId(),
            new SlotID(getResourceID(), slotIndex));
    } catch (SlotNotFoundException notFound) {
        log.debug("Could not free slot for allocation id {}.", allocationId, notFound);
    }
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.
the class MetricUtils method getUsedManagedMemory.
/**
 * Adds up, over all active task slot allocations of the given table, the difference between the
 * memory size and the available memory of each slot's memory manager.
 *
 * <p>Allocations whose slot cannot be found anymore are skipped.
 *
 * @param taskSlotTable table providing the active allocations and their memory managers
 * @return the accumulated amount of used memory
 */
private static long getUsedManagedMemory(TaskSlotTable<?> taskSlotTable) {
    long total = 0L;
    for (AllocationID slotAllocation : taskSlotTable.getActiveTaskSlotAllocationIds()) {
        try {
            final MemoryManager slotMemory = taskSlotTable.getTaskMemoryManager(slotAllocation);
            // used = total size minus what is still available
            total += slotMemory.getMemorySize() - slotMemory.availableMemory();
        } catch (SlotNotFoundException e) {
            LOG.debug("The task slot {} is not present anymore and will be ignored in calculating the amount of used memory.", slotAllocation);
        }
    }
    return total;
}
Aggregations