use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.
the class TaskExecutor method requestSlot.
// ----------------------------------------------------------------------
// Slot allocation RPCs
// ----------------------------------------------------------------------
/**
 * Handles a slot request sent by the ResourceManager on behalf of a job.
 *
 * <p>The request is rejected when this TaskManager has no resource manager connection, or when
 * {@code rmLeaderId} does not equal the leader id of that connection. A free slot is allocated
 * for the job; a slot that is not free is accepted only if the slot table reports it as
 * allocated for the same job and allocation id. Afterwards the slots are offered to the job
 * manager if the job is already contained in the job manager table; otherwise the job is added
 * to the job leader service.
 *
 * @param slotId identifying the requested slot
 * @param jobId identifying the job for which the request is issued
 * @param allocationId id for the request
 * @param targetAddress of the job manager requesting the slot
 * @param rmLeaderId current leader id of the ResourceManager
 * @return answer to the slot request
 * @throws SlotAllocationException if the slot allocation fails
 */
@RpcMethod
public TMSlotRequestReply requestSlot(
        final SlotID slotId,
        final JobID jobId,
        final AllocationID allocationId,
        final String targetAddress,
        final UUID rmLeaderId)
        throws SlotAllocationException {
    log.info(
            "Receive slot request {} for job {} from resource manager with leader id {}.",
            allocationId,
            jobId,
            rmLeaderId);

    // Guard 1: a resource manager connection must exist.
    if (resourceManagerConnection == null) {
        final String notConnected = "TaskManager is not connected to a resource manager.";
        log.debug(notConnected);
        throw new SlotAllocationException(notConnected);
    }

    // Guard 2: the request must carry the leader id of the connected resource manager.
    if (!resourceManagerConnection.getTargetLeaderId().equals(rmLeaderId)) {
        final String leaderMismatch =
                "The leader id "
                        + rmLeaderId
                        + " does not match with the leader id of the connected resource manager "
                        + resourceManagerConnection.getTargetLeaderId()
                        + '.';
        log.debug(leaderMismatch);
        throw new SlotAllocationException(leaderMismatch);
    }

    if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
        // An occupied slot is only acceptable if it already belongs to this very request.
        if (!taskSlotTable.isAllocated(slotId.getSlotNumber(), jobId, allocationId)) {
            final String occupied =
                    "The slot " + slotId + " has already been allocated for a different job.";
            log.info(occupied);
            throw new SlotAllocationException(occupied);
        }
    } else {
        final boolean allocated =
                taskSlotTable.allocateSlot(
                        slotId.getSlotNumber(),
                        jobId,
                        allocationId,
                        taskManagerConfiguration.getTimeout());
        if (!allocated) {
            log.info("Could not allocate slot for {}.", allocationId);
            throw new SlotAllocationException("Could not allocate slot.");
        }
        log.info("Allocated slot for {}.", allocationId);
    }

    if (!jobManagerTable.contains(jobId)) {
        try {
            jobLeaderService.addJob(jobId, targetAddress);
        } catch (Exception addJobFailure) {
            // Undo the allocation performed above.
            try {
                taskSlotTable.freeSlot(allocationId);
            } catch (SlotNotFoundException missingSlot) {
                // The slot was allocated just a moment ago, so it has to exist. Treat its
                // absence as a fatal error.
                onFatalError(missingSlot);
            }

            // Double-check that the slot really is free again.
            if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
                onFatalError(new Exception("Could not free slot " + slotId));
            }

            throw new SlotAllocationException(
                    "Could not add job to job leader service.", addJobFailure);
        }
    } else {
        offerSlotsToJobManager(jobId);
    }

    return new TMSlotRequestRegistered(
            resourceManagerConnection.getRegistrationId(), getResourceID(), allocationId);
}
use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.
the class TaskExecutor method tryLoadLocalAllocationSnapshots.
/**
 * Best-effort recovery of the {@link JobTable} and {@link TaskSlotTable} from the slot
 * allocation snapshots stored on the local filesystem.
 *
 * <p>Every loaded snapshot is re-allocated via {@code allocateSlotForJob}; a failing
 * re-allocation is only logged. Finally the local state stores manager is asked to retain the
 * local state of all allocation ids found in the snapshots.
 */
private void tryLoadLocalAllocationSnapshots() {
    final Collection<SlotAllocationSnapshot> recoveredSnapshots =
            slotAllocationSnapshotPersistenceService.loadAllocationSnapshots();

    log.debug("Recovered slot allocation snapshots {}.", recoveredSnapshots);

    final Set<AllocationID> allocationsToRetain = new HashSet<>();

    for (SlotAllocationSnapshot snapshot : recoveredSnapshots) {
        try {
            allocateSlotForJob(
                    snapshot.getJobId(),
                    snapshot.getSlotID(),
                    snapshot.getAllocationId(),
                    snapshot.getResourceProfile(),
                    snapshot.getJobTargetAddress());
        } catch (SlotAllocationException e) {
            // Recovery is best effort: log the failure and continue with the next snapshot.
            log.debug("Cannot reallocate restored slot {}.", snapshot, e);
        }

        // NOTE(review): the allocation id is recorded even when the re-allocation above
        // failed. Presumably intentional so that local state is not dropped — confirm.
        allocationsToRetain.add(snapshot.getAllocationId());
    }

    localStateStoresManager.retainLocalStateForAllocations(allocationsToRetain);
}
use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.
the class DeclarativeSlotManagerTest method testSlotRequestFailure.
/**
 * Verifies that the SlotManager retries the slot allocation when the first
 * TaskExecutor#requestSlot call completes exceptionally: a second request for the same slot
 * and job must arrive, and once it is acknowledged the slot must be ALLOCATED for that job.
 */
@Test
public void testSlotRequestFailure() throws Exception {
    final DefaultSlotTracker slotTracker = new DefaultSlotTracker();

    try (final DeclarativeSlotManager slotManager =
            createDeclarativeSlotManagerBuilder()
                    .setSlotTracker(slotTracker)
                    .buildAndStartWithDirectExec()) {

        slotManager.processResourceRequirements(createResourceRequirementsForSingleSlot());

        // Captures every requestSlot invocation that reaches the gateway.
        final BlockingQueue<
                        Tuple6<
                                SlotID,
                                JobID,
                                AllocationID,
                                ResourceProfile,
                                String,
                                ResourceManagerId>>
                receivedRequests = new ArrayBlockingQueue<>(1);

        // Manually completed responses, handed out in FIFO order to the incoming requests.
        final BlockingQueue<CompletableFuture<Acknowledge>> pendingResponses =
                new ArrayBlockingQueue<>(2);
        final CompletableFuture<Acknowledge> firstResponse = new CompletableFuture<>();
        pendingResponses.offer(firstResponse);
        final CompletableFuture<Acknowledge> secondResponse = new CompletableFuture<>();
        pendingResponses.offer(secondResponse);

        final TestingTaskExecutorGateway taskExecutorGateway =
                new TestingTaskExecutorGatewayBuilder()
                        .setRequestSlotFunction(
                                request -> {
                                    receivedRequests.offer(request);
                                    try {
                                        return pendingResponses.take();
                                    } catch (InterruptedException ignored) {
                                        return FutureUtils.completedExceptionally(
                                                new FlinkException(
                                                        "Response queue was interrupted."));
                                    }
                                })
                        .createTestingTaskExecutorGateway();

        final ResourceID taskExecutorId = ResourceID.generate();
        final TaskExecutorConnection taskExecutorConnection =
                new TaskExecutorConnection(taskExecutorId, taskExecutorGateway);
        final SlotReport initialSlotReport =
                new SlotReport(createFreeSlotStatus(new SlotID(taskExecutorId, 0)));

        slotManager.registerTaskManager(
                taskExecutorConnection,
                initialSlotReport,
                ResourceProfile.ANY,
                ResourceProfile.ANY);

        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>
                initialRequest = receivedRequests.take();

        // Let the first request fail; the slot manager is expected to send another one.
        firstResponse.completeExceptionally(new SlotAllocationException("Test exception"));

        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>
                retriedRequest = receivedRequests.take();

        // The retry has to target the same job (f1) and the same slot (f0).
        assertThat(retriedRequest.f1, equalTo(initialRequest.f1));
        assertThat(retriedRequest.f0, equalTo(initialRequest.f0));

        secondResponse.complete(Acknowledge.get());

        final DeclarativeTaskManagerSlot trackedSlot = slotTracker.getSlot(retriedRequest.f0);
        assertThat(trackedSlot.getState(), equalTo(SlotState.ALLOCATED));
        assertThat(trackedSlot.getJobId(), equalTo(retriedRequest.f1));
    }
}
use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.
the class TaskExecutor method requestSlot.
// ----------------------------------------------------------------------
// Slot allocation RPCs
// ----------------------------------------------------------------------
/**
 * Handles a slot request sent by the resource manager.
 *
 * <p>If this TaskManager is not connected to the given resource manager, an exceptionally
 * completed future carrying a {@code TaskManagerException} is returned. Otherwise an
 * allocation snapshot is handed to {@code tryPersistAllocationSnapshot}, the slot is allocated
 * for the job and, if the job is reported as connected, the slots are offered to its job
 * manager.
 *
 * @param slotId identifying the requested slot
 * @param jobId identifying the job for which the request is issued
 * @param allocationId id for the request
 * @param resourceProfile resource profile passed on to the slot allocation
 * @param targetAddress address passed on when allocating the slot for the job
 * @param resourceManagerId id of the resource manager issuing the request
 * @param timeout not read by this method body
 * @return a future completed with an {@code Acknowledge} on success, or completed
 *     exceptionally if the resource manager is not connected or the allocation failed
 */
@Override
public CompletableFuture<Acknowledge> requestSlot(
        final SlotID slotId,
        final JobID jobId,
        final AllocationID allocationId,
        final ResourceProfile resourceProfile,
        final String targetAddress,
        final ResourceManagerId resourceManagerId,
        final Time timeout) {
    // TODO: Filter invalid requests from the resource manager by using the
    // instance/registration Id

    log.info(
            "Receive slot request {} for job {} from resource manager with leader id {}.",
            allocationId,
            jobId,
            resourceManagerId);

    if (!isConnectedToResourceManager(resourceManagerId)) {
        final String notConnected =
                String.format(
                        "TaskManager is not connected to the resource manager %s.",
                        resourceManagerId);
        log.debug(notConnected);
        return FutureUtils.completedExceptionally(new TaskManagerException(notConnected));
    }

    // The snapshot is handed over before the allocation itself is attempted.
    final SlotAllocationSnapshot allocationSnapshot =
            new SlotAllocationSnapshot(
                    slotId, jobId, targetAddress, allocationId, resourceProfile);
    tryPersistAllocationSnapshot(allocationSnapshot);

    try {
        final boolean jobIsConnected =
                allocateSlotForJob(jobId, slotId, allocationId, resourceProfile, targetAddress);

        if (jobIsConnected) {
            offerSlotsToJobManager(jobId);
        }

        return CompletableFuture.completedFuture(Acknowledge.get());
    } catch (SlotAllocationException allocationFailure) {
        log.debug(
                "Could not allocate slot for allocation id {}.",
                allocationId,
                allocationFailure);
        return FutureUtils.completedExceptionally(allocationFailure);
    }
}
use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.
the class TaskExecutor method allocateSlotForJob.
/**
 * Allocates the given slot and makes sure the job is present in the job table.
 *
 * <p>If obtaining or creating the job fails, the slot allocation is rolled back and the
 * failure is rethrown wrapped in a {@link SlotAllocationException}.
 *
 * @param jobId id of the job the slot is allocated for
 * @param slotId id of the slot to allocate
 * @param allocationId id of the allocation
 * @param resourceProfile resource profile passed on to the slot allocation
 * @param targetAddress address used when registering a new job
 * @return the value of {@code isConnected()} of the job table entry
 * @throws SlotAllocationException if the slot could not be allocated or the job could not be
 *     created
 */
private boolean allocateSlotForJob(
        JobID jobId,
        SlotID slotId,
        AllocationID allocationId,
        ResourceProfile resourceProfile,
        String targetAddress)
        throws SlotAllocationException {
    allocateSlot(slotId, jobId, allocationId, resourceProfile);

    final JobTable.Job jobEntry;
    try {
        jobEntry =
                jobTable.getOrCreateJob(
                        jobId, () -> registerNewJobAndCreateServices(jobId, targetAddress));
    } catch (Exception jobCreationFailure) {
        freeSlotAfterFailedJobCreation(slotId, allocationId);
        throw new SlotAllocationException("Could not create new job.", jobCreationFailure);
    }

    return jobEntry.isConnected();
}

/**
 * Rolls back a slot allocation after the job could not be created: frees the slot, releases
 * the local state kept under the allocation id and verifies that the slot is free afterwards.
 */
private void freeSlotAfterFailedJobCreation(SlotID slotId, AllocationID allocationId) {
    try {
        taskSlotTable.freeSlot(allocationId);
    } catch (SlotNotFoundException missingSlot) {
        // The slot was allocated just a moment ago, so it has to exist. Treat its absence as
        // a fatal error.
        onFatalError(missingSlot);
    }

    // Drop the local state stored under the allocation id.
    localStateStoresManager.releaseLocalStateForAllocationId(allocationId);

    // Double-check that the slot really is free again.
    if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
        onFatalError(new Exception("Could not free slot " + slotId));
    }
}
Aggregations