use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class SlotPool method offerSlot.
/**
 * Slot offering by TaskManager with AllocationID. The AllocationID is originally generated by this pool and
 * transferred through the ResourceManager to the TaskManager. We use it to distinguish the different
 * allocations we issued.
 *
 * <p>The offer is rejected only if the offering TaskManager is not registered at this pool. An offer for a
 * slot that is already tracked (allocated or available) is acknowledged without changing any state. Otherwise
 * the slot either fulfills the pending request registered under its AllocationID, or, if no such request
 * exists (maybe it was fulfilled by some other returned slot), it is added to the available slots.
 *
 * @param slot The offered slot
 * @return True if we accept the offering, false if the offering TaskManager is not registered
 */
@RpcMethod
public boolean offerSlot(final AllocatedSlot slot) {
    validateRunsInMainThread();

    final ResourceID resourceID = slot.getTaskManagerId();
    final AllocationID allocationID = slot.getSlotAllocationId();

    // check if this TaskManager is valid
    if (!registeredTaskManagers.contains(resourceID)) {
        LOG.debug("Received outdated slot offering [{}] from unregistered TaskManager: {}", allocationID, slot);
        return false;
    }

    // check whether we are already using this slot
    if (allocatedSlots.contains(allocationID) || availableSlots.contains(allocationID)) {
        LOG.debug("Received repeated offer for slot [{}]. Ignoring.", allocationID);
        // and mark the offering as a success
        return true;
    }

    // check whether we have a request waiting for this slot
    final PendingRequest pendingRequest = pendingRequests.remove(allocationID);
    if (pendingRequest != null) {
        // we were waiting for this!
        final SimpleSlot resultSlot = createSimpleSlot(slot, Locality.UNKNOWN);
        // Record the slot as allocated BEFORE completing the future: a callback that runs synchronously
        // on completion and calls back into this pool (e.g. to hand the slot back right away) must
        // already find the slot in the bookkeeping.
        allocatedSlots.add(resultSlot);
        pendingRequest.future().complete(resultSlot);
    } else {
        // we were actually not waiting for this:
        // - could be that this request had been fulfilled
        // - we are receiving the slots from TaskManagers after becoming leaders
        // The slot is stored together with the time it became available.
        availableSlots.add(slot, clock.relativeTimeMillis());
    }

    // The offer is accepted in both cases.
    // NOTE(review): the original trailing comment was truncated ("too long and timed out"); presumably
    // an unused available slot is released elsewhere once it has idled too long - confirm.
    return true;
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class SlotManager method handleSlotRequestFailedAtTaskManager.
/**
 * Reacts to a slot request that did not succeed at the TaskManager, either because the rpc itself failed
 * (timeout, network error, etc.) or because the TaskManager rejected it. The request is retried by:
 * <ul>
 * <li>1. verifying and clearing the allocation recorded for this request
 * <li>2. putting the request back among the pending ones and offering the slot as free again
 * </ul>
 * <p>
 * This may cause some duplicate allocation, e.g. the slot request to TaskManager is successful but the response
 * is lost somehow, so we may request a slot in another TaskManager, this causes two slots assigned to one request,
 * but it can be taken care of by rejecting registration at JobManager.
 * <p>
 * Nothing is changed (an error is logged instead) when the slot has no recorded allocation, or when the
 * recorded allocation belongs to a different AllocationID.
 *
 * @param originalRequest The original slot request
 * @param slotId The target SlotID
 */
void handleSlotRequestFailedAtTaskManager(final SlotRequest originalRequest, final SlotID slotId) {
    final AllocationID failedAllocationId = originalRequest.getAllocationId();
    LOG.info("Slot request failed at TaskManager, SlotID:{}, AllocationID:{}, JobID:{}", slotId, failedAllocationId, originalRequest.getJobId());

    // guard: the slot must currently be recorded as allocated
    if (!allocationMap.isAllocated(slotId)) {
        LOG.error("Slot request failed for slot {} with allocation id {}: Slot was not previously registered.", slotId, failedAllocationId);
        return;
    }

    // guard: bookkeeping and request must agree on whom this slot belongs to
    final AllocationID recordedAllocationId = allocationMap.getAllocationID(slotId);
    if (!failedAllocationId.equals(recordedAllocationId)) {
        LOG.error("Slot request failed for slot {} with allocation id {}: Allocation id did not match the expected allocation id {}.", slotId, failedAllocationId, recordedAllocationId);
        return;
    }

    LOG.info("De-allocate this request and retry");
    allocationMap.removeAllocation(recordedAllocationId);
    pendingSlotRequests.put(originalRequest.getAllocationId(), originalRequest);

    // treat this slot as empty again so that a pending request can be matched to it
    final ResourceSlot releasedSlot = checkNotNull(getRegisteredSlot(slotId));
    handleFreeSlot(releasedSlot);
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class SlotManager method sendSlotRequest.
/**
 * Sends the given slot request to the TaskExecutor that owns the given free slot and registers a callback,
 * executed on the main thread executor, that evaluates the outcome:
 * <ul>
 * <li>the rpc failed (no reply available): the request is handed to
 * {@link #handleSlotRequestFailedAtTaskManager(SlotRequest, SlotID)}
 * <li>the reply is a {@code TMSlotRequestRejected} from the currently registered TaskExecutor instance:
 * same failure handling
 * <li>the reply comes from an instance that is no longer the registered one: the reply is discarded
 * <li>otherwise: the request succeeded and only a debug message is logged
 * </ul>
 *
 * @param freeSlot The slot which is requested at its TaskExecutor
 * @param slotRequest The slot request to fulfill with the given slot
 */
private void sendSlotRequest(final ResourceSlot freeSlot, final SlotRequest slotRequest) {
    final AllocationID allocationID = slotRequest.getAllocationId();
    final TaskExecutorRegistration registration = freeSlot.getTaskExecutorRegistration();
    final Future<TMSlotRequestReply> slotRequestReplyFuture = registration.getTaskExecutorGateway().requestSlot(
        freeSlot.getSlotId(),
        slotRequest.getJobId(),
        allocationID,
        // TODO: set proper JM address
        "foobar",
        rmServices.getLeaderID(),
        timeout);
    slotRequestReplyFuture.handleAsync(new BiFunction<TMSlotRequestReply, Throwable, Void>() {
        @Override
        public Void apply(TMSlotRequestReply slotRequestReply, Throwable throwable) {
            // A failed rpc delivers no reply object, so the reply must not be dereferenced before this
            // check; otherwise the callback dies with a NullPointerException and the request is never
            // retried. The failure handler validates the allocation bookkeeping itself before changing
            // anything, so calling it without the instance check below only affects still-recorded slots.
            if (throwable != null || slotRequestReply == null) {
                LOG.debug("Slot request for slot {} failed without a reply from the TaskExecutor.", freeSlot.getSlotId(), throwable);
                handleSlotRequestFailedAtTaskManager(slotRequest, freeSlot.getSlotId());
                return null;
            }

            // only trust replies from the TaskExecutor instance which is currently registered
            final TaskExecutorRegistration current = taskManagers.get(slotRequestReply.getResourceID());
            if (current != null && current.getInstanceID().equals(slotRequestReply.getInstanceID())) {
                if (slotRequestReply instanceof TMSlotRequestRejected) {
                    handleSlotRequestFailedAtTaskManager(slotRequest, freeSlot.getSlotId());
                } else {
                    LOG.debug("Successfully registered slot {} ", freeSlot.getSlotId());
                }
            } else {
                LOG.debug("Discarding message from obsolete TaskExecutor with InstanceID {}", slotRequestReply.getInstanceID());
            }
            return null;
        }
    }, rmServices.getMainThreadExecutor());
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class SlotManager method requestSlot.
// ------------------------------------------------------------------------
// slot managements
// ------------------------------------------------------------------------
/**
 * Registers a slot request. A request whose allocation is already known is only confirmed again. Otherwise
 * the request is either served from the free slots or parked as pending:
 * <ul>
 * <li>if a free slot is chosen for the request, the allocation is recorded, the slot is taken out of the
 * free pool and the request is sent on to the TaskManager owning the slot
 * <li>if no free slot is chosen, a new resource matching the request's profile is requested and the
 * request is stored as pending
 * </ul>
 *
 * @param request The detailed request of the slot
 * @return RMSlotRequestRegistered The confirmation message to be send to the caller
 */
public RMSlotRequestRegistered requestSlot(final SlotRequest request) {
    final AllocationID allocationId = request.getAllocationId();

    if (isRequestDuplicated(request)) {
        LOG.warn("Duplicated slot request, AllocationID:{}", allocationId);
        return new RMSlotRequestRegistered(allocationId);
    }

    // try to fulfil the request with current free slots
    final ResourceSlot matchingSlot = chooseSlotToUse(request, freeSlots);

    if (matchingSlot == null) {
        // nothing suitable is free: ask for more resources and remember the request until a slot shows up
        LOG.info("Cannot fulfil slot request, try to allocate a new container for it, AllocationID:{}, JobID:{}", allocationId, request.getJobId());
        Preconditions.checkState(rmServices != null, "Attempted to allocate resources but no ResourceManagerServices set.");
        rmServices.allocateResource(request.getResourceProfile());
        pendingSlotRequests.put(allocationId, request);
    } else {
        LOG.info("Assigning SlotID({}) to AllocationID({}), JobID:{}", matchingSlot.getSlotId(), allocationId, request.getJobId());
        // book the allocation first, then take the slot out of the free pool
        allocationMap.addAllocation(matchingSlot.getSlotId(), allocationId);
        freeSlots.remove(matchingSlot.getSlotId());
        sendSlotRequest(matchingSlot, request);
    }

    return new RMSlotRequestRegistered(allocationId);
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class SlotManager method handleFreeSlot.
// ------------------------------------------------------------------------
// internal behaviors
// ------------------------------------------------------------------------
/**
 * Decides what to do with a slot that has become free. Pending requests take precedence: if one of them is
 * chosen for the slot, it is removed from the pending requests, the allocation is recorded in the
 * bookkeeping and the request is sent to the TaskManager. If no pending request is chosen, the slot is
 * put into the free pool.
 *
 * @param freeSlot The free slot
 */
private void handleFreeSlot(final ResourceSlot freeSlot) {
    final SlotRequest candidate = chooseRequestToFulfill(freeSlot, pendingSlotRequests);

    // nobody is waiting for a slot like this one: keep it for later requests
    if (candidate == null) {
        freeSlots.put(freeSlot.getSlotId(), freeSlot);
        return;
    }

    final AllocationID allocationId = candidate.getAllocationId();
    // the instance taken out of the pending requests is the one that gets forwarded
    final SlotRequest dequeuedRequest = pendingSlotRequests.remove(allocationId);

    LOG.info("Assigning SlotID({}) to AllocationID({}), JobID:{}", freeSlot.getSlotId(), allocationId, candidate.getJobId());
    allocationMap.addAllocation(freeSlot.getSlotId(), allocationId);
    sendSlotRequest(freeSlot, dequeuedRequest);
}
Aggregations