Usage of org.apache.flink.runtime.clusterframework.types.ResourceSlot in the Apache Flink project.
Class SlotManager, method handleSlotRequestFailedAtTaskManager.
/**
 * Handles a slot request that did not succeed at the TaskManager, either because the rpc
 * communication failed (timeout, network error, etc.) or because the TaskManager rejected it.
 *
 * <p>If the bookkeeping still maps the slot to the allocation of the failed request, the request
 * is retried:
 * <ol>
 * <li>the recorded allocation is removed,
 * <li>the request is put back into the pending slot requests,
 * <li>the registered slot is passed to {@code handleFreeSlot} so it can be used again.
 * </ol>
 * In every other case only an error is logged and the bookkeeping is left untouched.
 *
 * <p>Retrying may cause a duplicate allocation, e.g. when the slot request to the TaskManager
 * succeeded but the response was lost: a slot in another TaskManager may then be requested,
 * leaving two slots assigned to one request. This is expected to be resolved by rejecting the
 * registration at the JobManager.
 *
 * @param originalRequest The original slot request
 * @param slotId The target SlotID
 */
void handleSlotRequestFailedAtTaskManager(final SlotRequest originalRequest, final SlotID slotId) {
    final AllocationID failedAllocationId = originalRequest.getAllocationId();
    LOG.info("Slot request failed at TaskManager, SlotID:{}, AllocationID:{}, JobID:{}", slotId, failedAllocationId, originalRequest.getJobId());

    // Nothing is recorded for this slot, so there is nothing to roll back.
    if (!allocationMap.isAllocated(slotId)) {
        LOG.error("Slot request failed for slot {} with allocation id {}: " + "Slot was not previously registered.", slotId, failedAllocationId);
        return;
    }

    // Only roll back if the bookkeeping agrees on whom this slot belongs to.
    final AllocationID recordedAllocationId = allocationMap.getAllocationID(slotId);
    if (!failedAllocationId.equals(recordedAllocationId)) {
        LOG.error("Slot request failed for slot {} with allocation id {}:" + " Allocation id did not match the expected allocation id {}.", slotId, failedAllocationId, recordedAllocationId);
        return;
    }

    LOG.info("De-allocate this request and retry");
    allocationMap.removeAllocation(recordedAllocationId);
    pendingSlotRequests.put(originalRequest.getAllocationId(), originalRequest);

    // Treat the slot as empty again; the retry may be served by a different request.
    final ResourceSlot releasedSlot = checkNotNull(getRegisteredSlot(slotId));
    handleFreeSlot(releasedSlot);
}
Usage of org.apache.flink.runtime.clusterframework.types.ResourceSlot in the Apache Flink project.
Class SlotManager, method requestSlot.
// ------------------------------------------------------------------------
// slot managements
// ------------------------------------------------------------------------
/**
 * Requests a slot matching the given requirements. The request is either fulfilled from the
 * currently free slots or kept as a pending request.
 *
 * <p>If a free slot matches, the allocation is recorded, the slot is removed from the free pool
 * and the request is forwarded via {@code sendSlotRequest} (to the TaskManager through the
 * ResourceManager, so that it is done by the RPC main thread and race conditions are avoided).
 * If no free slot matches, a new resource is requested from the ResourceManagerServices and the
 * request is stored as pending.
 *
 * <p>A request detected as duplicate is only logged and acknowledged.
 *
 * @param request The detailed request of the slot
 * @return RMSlotRequestRegistered The confirmation message to be sent to the caller
 */
public RMSlotRequestRegistered requestSlot(final SlotRequest request) {
    final AllocationID allocationId = request.getAllocationId();

    if (isRequestDuplicated(request)) {
        LOG.warn("Duplicated slot request, AllocationID:{}", allocationId);
        return new RMSlotRequestRegistered(allocationId);
    }

    // Try to serve the request from the pool of free slots first.
    final ResourceSlot matchingSlot = chooseSlotToUse(request, freeSlots);

    if (matchingSlot == null) {
        // No suitable free slot: ask for a new resource and park the request until one shows up.
        LOG.info("Cannot fulfil slot request, try to allocate a new container for it, " + "AllocationID:{}, JobID:{}", allocationId, request.getJobId());
        Preconditions.checkState(rmServices != null, "Attempted to allocate resources but no ResourceManagerServices set.");
        rmServices.allocateResource(request.getResourceProfile());
        pendingSlotRequests.put(allocationId, request);
        return new RMSlotRequestRegistered(allocationId);
    }

    LOG.info("Assigning SlotID({}) to AllocationID({}), JobID:{}", matchingSlot.getSlotId(), allocationId, request.getJobId());
    // Bookkeeping first, then take the slot out of the free pool, then notify the TaskManager.
    allocationMap.addAllocation(matchingSlot.getSlotId(), allocationId);
    freeSlots.remove(matchingSlot.getSlotId());
    sendSlotRequest(matchingSlot, request);

    return new RMSlotRequestRegistered(allocationId);
}
Usage of org.apache.flink.runtime.clusterframework.types.ResourceSlot in the Apache Flink project.
Class SlotManager, method registerTaskExecutor.
/**
 * Registers a TaskExecutor together with the slots it reports.
 *
 * <p>If a TaskExecutor is already registered under the same ResourceID, it is first reported as
 * failed via {@code notifyTaskManagerFailure}. Every reported slot is then registered as a new
 * slot; a slot that carries an AllocationID is recorded as allocated, any other slot is handled
 * as a free slot.
 *
 * <p>NOTE(review): processing of the slot report stops at the first slot whose ResourceID is
 * unknown to this SlotManager; the remaining slots of the report are not looked at. Confirm
 * that this is intended.
 *
 * @param resourceID TaskExecutor's ResourceID
 * @param registration TaskExecutor's registration
 * @param slotReport TaskExecutor's free and allocated slots
 */
public void registerTaskExecutor(ResourceID resourceID, TaskExecutorRegistration registration, SlotReport slotReport) {
    // A previous registration under the same id is treated as a failed TaskManager.
    if (taskManagers.get(resourceID) != null) {
        notifyTaskManagerFailure(resourceID);
    }
    taskManagers.put(resourceID, registration);

    for (SlotStatus reportedStatus : slotReport.getSlotsStatus()) {
        final SlotID slotId = reportedStatus.getSlotID();
        final ResourceID owningResource = slotId.getResourceID();

        final TaskExecutorRegistration owner = taskManagers.get(owningResource);
        if (owner == null) {
            LOG.info("Received SlotStatus but ResourceID {} is unknown to the SlotManager", owningResource);
            return;
        }

        final ResourceSlot reportedSlot = new ResourceSlot(slotId, reportedStatus.getProfiler(), owner);
        registerNewSlot(reportedSlot);
        LOG.info("New slot appeared, SlotID:{}, AllocationID:{}", slotId, reportedStatus.getAllocationID());

        if (reportedStatus.getAllocationID() == null) {
            // Slot is not in use.
            handleFreeSlot(reportedSlot);
        } else {
            // Slot in use, record this in bookkeeping.
            allocationMap.addAllocation(slotId, reportedStatus.getAllocationID());
        }
    }
}
Usage of org.apache.flink.runtime.clusterframework.types.ResourceSlot in the Apache Flink project.
Class SlotManagerTest, method testNotifyTaskManagerFailure.
/**
 * Checks the slot bookkeeping after a TaskManager failure notification: the allocated and free
 * slot counts are expected to drop by the failed TaskManager's slots, and a notification for an
 * unknown resource is expected to change nothing.
 */
@Test
public void testNotifyTaskManagerFailure() {
    TestingSlotManager manager = new TestingSlotManager();
    ResourceID firstResource = ResourceID.generate();
    ResourceID secondResource = ResourceID.generate();

    // One free slot per resource, both of which get consumed by the two requests below.
    ResourceSlot firstResourceSlot1 = new ResourceSlot(new SlotID(firstResource, 1), DEFAULT_TESTING_PROFILE, taskExecutorRegistration);
    ResourceSlot secondResourceSlot1 = new ResourceSlot(new SlotID(secondResource, 1), DEFAULT_TESTING_PROFILE, taskExecutorRegistration);
    manager.addFreeSlot(firstResourceSlot1);
    manager.addFreeSlot(secondResourceSlot1);

    manager.requestSlot(new SlotRequest(new JobID(), new AllocationID(), DEFAULT_TESTING_PROFILE));
    manager.requestSlot(new SlotRequest(new JobID(), new AllocationID(), DEFAULT_TESTING_PROFILE));

    assertEquals(2, manager.getAllocatedSlotCount());
    assertEquals(0, manager.getFreeSlotCount());
    assertEquals(0, manager.getPendingRequestCount());

    // A second slot per resource; with no pending requests these stay free.
    ResourceSlot firstResourceSlot2 = new ResourceSlot(new SlotID(firstResource, 2), DEFAULT_TESTING_PROFILE, taskExecutorRegistration);
    ResourceSlot secondResourceSlot2 = new ResourceSlot(new SlotID(secondResource, 2), DEFAULT_TESTING_PROFILE, taskExecutorRegistration);
    manager.addFreeSlot(firstResourceSlot2);
    manager.addFreeSlot(secondResourceSlot2);

    assertEquals(2, manager.getAllocatedSlotCount());
    assertEquals(2, manager.getFreeSlotCount());
    assertEquals(0, manager.getPendingRequestCount());

    // Failing the second resource: one allocated and one free slot are expected to remain.
    manager.notifyTaskManagerFailure(secondResource);

    assertEquals(1, manager.getAllocatedSlotCount());
    assertEquals(1, manager.getFreeSlotCount());
    assertEquals(0, manager.getPendingRequestCount());

    // Notifying the failure of a resource that does not exist must not change anything.
    manager.notifyTaskManagerFailure(ResourceID.generate());

    assertEquals(1, manager.getAllocatedSlotCount());
    assertEquals(1, manager.getFreeSlotCount());
    assertEquals(0, manager.getPendingRequestCount());
}
Aggregations