use of org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException in project flink by apache.
the class DefaultSlotStatusSyncer method allocateSlot.
/**
 * Requests a new slot for the given job on the given task manager.
 *
 * <p>Before the request is sent, the allocation is recorded as {@code SlotState.PENDING} in the
 * task manager tracker, reported as an acquired resource to the resource tracker, and added to
 * {@code pendingSlotAllocations}. The task executor's response is handled on
 * {@code mainThreadExecutor}.
 *
 * @param instanceId instance id of the task manager to allocate the slot on; must be registered
 *     with the task manager tracker
 * @param jobId job for which the slot is allocated
 * @param targetAddress address passed on to the task executor in the slot request
 * @param resourceProfile resource profile of the requested slot
 * @return a future that completes normally when the request was acknowledged or when the
 *     response is ignored (allocation no longer pending, or slot already removed from the
 *     tracker), and completes exceptionally with the reported failure otherwise
 */
@Override
public CompletableFuture<Void> allocateSlot(InstanceID instanceId, JobID jobId, String targetAddress, ResourceProfile resourceProfile) {
    Preconditions.checkNotNull(instanceId);
    Preconditions.checkNotNull(jobId);
    Preconditions.checkNotNull(targetAddress);
    Preconditions.checkNotNull(resourceProfile);
    checkStarted();
    final AllocationID allocationId = new AllocationID();
    final Optional<TaskManagerInfo> taskManager = taskManagerTracker.getRegisteredTaskManager(instanceId);
    Preconditions.checkState(taskManager.isPresent(), "Could not find a registered task manager for instance id " + instanceId + '.');
    final TaskExecutorGateway gateway = taskManager.get().getTaskExecutorConnection().getTaskExecutorGateway();
    final ResourceID resourceId = taskManager.get().getTaskExecutorConnection().getResourceID();
    LOG.info("Starting allocation of slot {} from {} for job {} with resource profile {}.", allocationId, resourceId, jobId, resourceProfile);
    // Book the slot as pending up front so the trackers already account for it while the
    // request is in flight.
    taskManagerTracker.notifySlotStatus(allocationId, jobId, instanceId, resourceProfile, SlotState.PENDING);
    resourceTracker.notifyAcquiredResource(jobId, resourceProfile);
    pendingSlotAllocations.add(allocationId);
    // RPC call to the task manager
    CompletableFuture<Acknowledge> requestFuture = gateway.requestSlot(SlotID.getDynamicSlotID(resourceId), jobId, allocationId, resourceProfile, targetAddress, resourceManagerId, taskManagerRequestTimeout);
    CompletableFuture<Void> returnedFuture = new CompletableFuture<>();
    FutureUtils.assertNoException(requestFuture.handleAsync((Acknowledge acknowledge, Throwable throwable) -> {
        // The allocation is no longer pending: somebody else already finished or cancelled it.
        if (!pendingSlotAllocations.remove(allocationId)) {
            LOG.debug("Ignoring slot allocation update from task manager {} for allocation {} and job {}, because the allocation was already completed or cancelled.", instanceId, allocationId, jobId);
            returnedFuture.complete(null);
            return null;
        }
        if (!taskManagerTracker.getAllocatedOrPendingSlot(allocationId).isPresent()) {
            LOG.debug("The slot {} has been removed before. Ignore the future.", allocationId);
            // Complete the future handed to the caller. requestFuture is already done at this
            // point (this handler runs on its completion), so completing it would be a no-op
            // and would leave the caller's future pending forever.
            returnedFuture.complete(null);
            return null;
        }
        if (acknowledge != null) {
            LOG.trace("Completed allocation of allocation {} for job {}.", allocationId, jobId);
            taskManagerTracker.notifySlotStatus(allocationId, jobId, instanceId, resourceProfile, SlotState.ALLOCATED);
            returnedFuture.complete(null);
        } else {
            if (throwable instanceof SlotOccupiedException) {
                // Only logged; the tracker state is deliberately left untouched in this case.
                LOG.error("Should not get this exception.", throwable);
            } else {
                // TODO If the taskManager does not have enough resource, we
                // may endlessly allocate slot on it until the next heartbeat.
                LOG.warn("Slot allocation for allocation {} for job {} failed.", allocationId, jobId, throwable);
                // Roll back the bookkeeping done before the request was sent.
                resourceTracker.notifyLostResource(jobId, resourceProfile);
                taskManagerTracker.notifySlotStatus(allocationId, jobId, instanceId, resourceProfile, SlotState.FREE);
            }
            returnedFuture.completeExceptionally(throwable);
        }
        return null;
    }, mainThreadExecutor));
    return returnedFuture;
}
use of org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException in project flink by apache.
the class DeclarativeSlotManagerTest method testSlotRequestRemovedIfTMReportsAllocation.
/**
 * Verifies that a pending slot request is dropped when the task executor answers the request
 * with a {@link SlotOccupiedException} that names the same job id.
 */
@Test
public void testSlotRequestRemovedIfTMReportsAllocation() throws Exception {
    final ResourceTracker resourceTracker = new DefaultResourceTracker();
    final DefaultSlotTracker slotTracker = new DefaultSlotTracker();
    try (final DeclarativeSlotManager slotManager = createDeclarativeSlotManagerBuilder().setResourceTracker(resourceTracker).setSlotTracker(slotTracker).buildAndStartWithDirectExec()) {
        final JobID jobId = new JobID();
        slotManager.processResourceRequirements(createResourceRequirementsForSingleSlot(jobId));

        // Captures the arguments of every slot request that reaches the task executor.
        final BlockingQueue<Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>> capturedRequests = new ArrayBlockingQueue<>(1);

        // Manually controlled responses, handed out in order: one per expected slot request.
        final CompletableFuture<Acknowledge> firstResponse = new CompletableFuture<>();
        final CompletableFuture<Acknowledge> secondResponse = new CompletableFuture<>();
        final BlockingQueue<CompletableFuture<Acknowledge>> preparedResponses = new ArrayBlockingQueue<>(2);
        preparedResponses.offer(firstResponse);
        preparedResponses.offer(secondResponse);

        final TestingTaskExecutorGateway taskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setRequestSlotFunction(requestArguments -> {
            capturedRequests.offer(requestArguments);
            try {
                return preparedResponses.take();
            } catch (InterruptedException ignored) {
                return FutureUtils.completedExceptionally(new FlinkException("Response queue was interrupted."));
            }
        }).createTestingTaskExecutorGateway();

        // Register a task executor offering a single free slot.
        final ResourceID taskExecutorId = ResourceID.generate();
        final TaskExecutorConnection connection = new TaskExecutorConnection(taskExecutorId, taskExecutorGateway);
        final SlotID onlySlot = new SlotID(taskExecutorId, 0);
        final SlotReport initialReport = new SlotReport(createFreeSlotStatus(onlySlot));
        slotManager.registerTaskManager(connection, initialReport, ResourceProfile.ANY, ResourceProfile.ANY);

        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> firstRequest = capturedRequests.take();
        // the first attempt fails with a timeout
        firstResponse.completeExceptionally(new TimeoutException("Test exception to fail first allocation"));

        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> secondRequest = capturedRequests.take();
        // the second attempt fails because the slot is reported as occupied by the same job
        secondResponse.completeExceptionally(new SlotOccupiedException("Test exception", new AllocationID(), jobId));

        // both attempts targeted the same slot on behalf of the same job
        assertThat(firstRequest.f1, equalTo(jobId));
        assertThat(secondRequest.f1, equalTo(jobId));
        assertThat(secondRequest.f0, equalTo(firstRequest.f0));

        // the slot ends up allocated to the job and is counted exactly once
        final DeclarativeTaskManagerSlot trackedSlot = slotTracker.getSlot(secondRequest.f0);
        assertThat(trackedSlot.getState(), equalTo(SlotState.ALLOCATED));
        assertThat(trackedSlot.getJobId(), equalTo(firstRequest.f1));
        assertThat(slotManager.getNumberRegisteredSlots(), is(1));
        assertThat(getTotalResourceCount(resourceTracker.getAcquiredResources(jobId)), is(1));
    }
}
use of org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException in project flink by apache.
the class DeclarativeSlotManager method allocateSlot.
/**
 * Starts the allocation of the given slot by sending a slot request to the owning task
 * executor, and reacts to the response on {@code mainThreadExecutor}.
 *
 * <p>A response is only processed while the allocation id created here is still the one
 * registered for the slot in {@code pendingSlotAllocations}; otherwise it is ignored.
 *
 * @param taskManagerSlot slot to allocate
 * @param jobId job for which the slot should be allocated for
 * @param targetAddress address of the job master
 * @param resourceProfile resource profile for the requirement for which the slot is used
 * @throws IllegalStateException if the task manager owning the slot is not registered
 */
private void allocateSlot(TaskManagerSlotInformation taskManagerSlot, JobID jobId, String targetAddress, ResourceProfile resourceProfile) {
    final SlotID slotId = taskManagerSlot.getSlotId();
    LOG.debug("Starting allocation of slot {} for job {} with resource profile {}.", slotId, jobId, resourceProfile);

    final InstanceID instanceId = taskManagerSlot.getInstanceId();
    final boolean ownerRegistered = taskExecutorManager.isTaskManagerRegistered(instanceId);
    if (!ownerRegistered) {
        throw new IllegalStateException("Could not find a registered task manager for instance id " + instanceId + '.');
    }

    final TaskExecutorGateway gateway = taskManagerSlot.getTaskManagerConnection().getTaskExecutorGateway();
    final AllocationID allocationId = new AllocationID();

    // Record the in-flight allocation before the request leaves.
    slotTracker.notifyAllocationStart(slotId, jobId);
    taskExecutorManager.markUsed(instanceId);
    pendingSlotAllocations.put(slotId, allocationId);

    // RPC call to the task manager
    final CompletableFuture<Acknowledge> requestFuture = gateway.requestSlot(slotId, jobId, allocationId, resourceProfile, targetAddress, resourceManagerId, taskManagerRequestTimeout);

    final CompletableFuture<Void> responseHandling = requestFuture.handleAsync((Acknowledge acknowledge, Throwable throwable) -> {
        final AllocationID registeredAllocation = pendingSlotAllocations.get(slotId);
        final boolean stillCurrent = registeredAllocation != null && registeredAllocation.equals(allocationId);
        if (!stillCurrent) {
            LOG.debug("Ignoring slot allocation update from task executor {} for slot {} and job {}, because the allocation was already completed or cancelled.", instanceId, slotId, jobId);
            return null;
        }

        // Success path: the task executor acknowledged the request.
        if (acknowledge != null) {
            LOG.trace("Completed allocation of slot {} for job {}.", slotId, jobId);
            slotTracker.notifyAllocationComplete(slotId, jobId);
            return null;
        }

        // Failure path: either the slot is already taken, or the request failed otherwise.
        if (throwable instanceof SlotOccupiedException) {
            final SlotOccupiedException occupied = (SlotOccupiedException) throwable;
            LOG.debug("Tried allocating slot {} for job {}, but it was already allocated for job {}.", slotId, jobId, occupied.getJobId());
            // report as a slot status to force the state transition
            // this could be a problem if we ever assume that the task
            // executor always reports about all slots
            final SlotStatus reportedStatus = new SlotStatus(slotId, taskManagerSlot.getResourceProfile(), occupied.getJobId(), occupied.getAllocationId());
            slotTracker.notifySlotStatus(Collections.singleton(reportedStatus));
        } else {
            LOG.warn("Slot allocation for slot {} for job {} failed.", slotId, jobId, throwable);
            slotTracker.notifyFree(slotId);
        }
        checkResourceRequirements();
        return null;
    }, mainThreadExecutor);

    FutureUtils.assertNoException(responseHandling);
}
Aggregations