use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.
the class DeclarativeSlotManager method allocateSlot.
/**
 * Requests the given slot from its task executor on behalf of a job and processes the
 * asynchronous answer.
 *
 * <p>Before the RPC is sent, the slot tracker is told that the allocation started, the task
 * executor is marked as used and a freshly generated allocation id is remembered as the pending
 * allocation of the slot. The answer is handled on the main thread executor and is ignored if
 * the slot's pending allocation is no longer the one created by this call.
 *
 * @param taskManagerSlot slot to allocate
 * @param jobId job for which the slot should be allocated
 * @param targetAddress address of the job master
 * @param resourceProfile resource profile of the requirement for which the slot is used
 * @throws IllegalStateException if the task manager owning the slot is not registered
 */
private void allocateSlot(TaskManagerSlotInformation taskManagerSlot, JobID jobId, String targetAddress, ResourceProfile resourceProfile) {
    final SlotID slotId = taskManagerSlot.getSlotId();
    LOG.debug("Starting allocation of slot {} for job {} with resource profile {}.", slotId, jobId, resourceProfile);

    final InstanceID instanceId = taskManagerSlot.getInstanceId();
    if (!taskExecutorManager.isTaskManagerRegistered(instanceId)) {
        throw new IllegalStateException("Could not find a registered task manager for instance id " + instanceId + '.');
    }

    final TaskExecutorGateway taskExecutor = taskManagerSlot.getTaskManagerConnection().getTaskExecutorGateway();
    final AllocationID allocationId = new AllocationID();

    // bookkeeping has to happen before the request is sent out
    slotTracker.notifyAllocationStart(slotId, jobId);
    taskExecutorManager.markUsed(instanceId);
    pendingSlotAllocations.put(slotId, allocationId);

    // RPC call to the task manager; the answer is processed in the main thread
    final CompletableFuture<Void> responseProcessedFuture =
            taskExecutor
                    .requestSlot(slotId, jobId, allocationId, resourceProfile, targetAddress, resourceManagerId, taskManagerRequestTimeout)
                    .handleAsync(
                            (Acknowledge ack, Throwable failure) -> {
                                final AllocationID pendingAllocation = pendingSlotAllocations.get(slotId);
                                final boolean stillCurrent = pendingAllocation != null && pendingAllocation.equals(allocationId);
                                if (!stillCurrent) {
                                    LOG.debug("Ignoring slot allocation update from task executor {} for slot {} and job {}, because the allocation was already completed or cancelled.", instanceId, slotId, jobId);
                                    return null;
                                }

                                if (ack != null) {
                                    LOG.trace("Completed allocation of slot {} for job {}.", slotId, jobId);
                                    slotTracker.notifyAllocationComplete(slotId, jobId);
                                    return null;
                                }

                                if (failure instanceof SlotOccupiedException) {
                                    final SlotOccupiedException occupied = (SlotOccupiedException) failure;
                                    LOG.debug("Tried allocating slot {} for job {}, but it was already allocated for job {}.", slotId, jobId, occupied.getJobId());
                                    // report as a slot status to force the state transition
                                    // this could be a problem if we ever assume that the task
                                    // executor always reports about all slots
                                    final SlotStatus reportedStatus = new SlotStatus(slotId, taskManagerSlot.getResourceProfile(), occupied.getJobId(), occupied.getAllocationId());
                                    slotTracker.notifySlotStatus(Collections.singleton(reportedStatus));
                                } else {
                                    LOG.warn("Slot allocation for slot {} for job {} failed.", slotId, jobId, failure);
                                    slotTracker.notifyFree(slotId);
                                }

                                // the requirement this slot was meant for is unfulfilled again
                                checkResourceRequirements();
                                return null;
                            },
                            mainThreadExecutor);

    FutureUtils.assertNoException(responseProcessedFuture);
}
use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.
the class AbstractFineGrainedSlotManagerITCase method testAllocationUpdatesIgnoredIfTaskExecutorUnregistered.
// ---------------------------------------------------------------------------------------------
// Allocation update
// ---------------------------------------------------------------------------------------------
/**
 * Verify that the ack of a slot request from an unregistered task manager will not cause a
 * system breakdown.
 *
 * <p>The test installs a {@code SystemExitTrackingSecurityManager} for its duration and always
 * restores the previously installed security manager afterwards, even if the test body fails.
 */
@Test
public void testAllocationUpdatesIgnoredIfTaskExecutorUnregistered() throws Exception {
    final CompletableFuture<Acknowledge> slotRequestFuture = new CompletableFuture<>();
    final CompletableFuture<Void> slotRequestCallFuture = new CompletableFuture<>();
    final TestingTaskExecutorGateway taskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setRequestSlotFunction(ignored -> {
        // signal that the slot manager issued the request, but keep the answer pending
        slotRequestCallFuture.complete(null);
        return slotRequestFuture;
    }).createTestingTaskExecutorGateway();
    final JobID jobId = new JobID();
    final ResourceID taskExecutorResourceId = ResourceID.generate();
    final TaskExecutorConnection taskExecutionConnection = new TaskExecutorConnection(taskExecutorResourceId, taskExecutorGateway);
    final SlotReport slotReport = new SlotReport();

    // The fatal error handler will exit the system if there is any exceptions in handling the
    // ack of request slot. We need the security manager to verify that would not happen.
    final SecurityManager previousSecurityManager = System.getSecurityManager();
    final SystemExitTrackingSecurityManager trackingSecurityManager = new SystemExitTrackingSecurityManager();
    System.setSecurityManager(trackingSecurityManager);
    try {
        new Context() {
            {
                runTest(() -> {
                    runInMainThread(() -> {
                        getSlotManager().processResourceRequirements(createResourceRequirements(jobId, 1));
                        getSlotManager().registerTaskManager(taskExecutionConnection, slotReport, DEFAULT_TOTAL_RESOURCE_PROFILE, DEFAULT_SLOT_RESOURCE_PROFILE);
                    });
                    assertFutureCompleteAndReturn(slotRequestCallFuture);
                    // unregister the task executor first, then deliver the late ack
                    runInMainThread(() -> {
                        getSlotManager().unregisterTaskManager(taskExecutionConnection.getInstanceID(), TEST_EXCEPTION);
                        slotRequestFuture.complete(Acknowledge.get());
                    });
                    assertThat(trackingSecurityManager.getSystemExitFuture().isDone(), is(false));
                });
            }
        };
    } finally {
        // the security manager is JVM-wide state; never leak the tracking instance into other tests
        System.setSecurityManager(previousSecurityManager);
    }
}
use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.
the class AbstractFineGrainedSlotManagerITCase method testSlotRequestFailure.
// ---------------------------------------------------------------------------------------------
// Slot allocation failure handling
// ---------------------------------------------------------------------------------------------
/**
 * Checks that a failed TaskExecutor#requestSlot call makes the SlotManager issue a second slot
 * request, and that only the second allocation ends up being tracked.
 */
@Test
public void testSlotRequestFailure() throws Exception {
    // answers handed out by the gateway: the first one stays open so the test can fail it,
    // the second one is already acknowledged
    final CompletableFuture<Acknowledge> failingRequestFuture = new CompletableFuture<>();
    final CompletableFuture<Acknowledge> acknowledgedRequestFuture = CompletableFuture.completedFuture(Acknowledge.get());
    final Iterator<CompletableFuture<Acknowledge>> responses = Arrays.asList(failingRequestFuture, acknowledgedRequestFuture).iterator();

    // records the allocation id of every slot request that reaches the gateway
    final ArrayBlockingQueue<AllocationID> requestedAllocationIds = new ArrayBlockingQueue<>(2);
    final TestingTaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setRequestSlotFunction(
                            FunctionUtils.uncheckedFunction(
                                    slotRequest -> {
                                        requestedAllocationIds.put(slotRequest.f2);
                                        return responses.next();
                                    }))
                    .createTestingTaskExecutorGateway();

    final TaskExecutorConnection taskManagerConnection = new TaskExecutorConnection(ResourceID.generate(), taskExecutorGateway);
    final SlotReport slotReport = new SlotReport();

    final JobID jobId = new JobID();
    final ResourceRequirements resourceRequirements = createResourceRequirementsForSingleSlot(jobId);

    new Context() {
        {
            runTest(
                    () -> {
                        runInMainThread(
                                () -> {
                                    getSlotManager().registerTaskManager(taskManagerConnection, slotReport, DEFAULT_TOTAL_RESOURCE_PROFILE, DEFAULT_SLOT_RESOURCE_PROFILE);
                                    getSlotManager().processResourceRequirements(resourceRequirements);
                                });

                        final AllocationID failedAllocationId = requestedAllocationIds.take();
                        assertThat(requestedAllocationIds, is(empty()));

                        // let the first attempt fail --> this should trigger a second attempt
                        runInMainThread(() -> failingRequestFuture.completeExceptionally(new SlotAllocationException("Test exception.")));

                        final AllocationID retriedAllocationId = requestedAllocationIds.take();
                        assertThat(requestedAllocationIds, is(empty()));

                        final TaskManagerSlotInformation retriedSlot = getTaskManagerTracker().getAllocatedOrPendingSlot(retriedAllocationId).get();
                        assertEquals(jobId, retriedSlot.getJobId());
                        assertFalse(getTaskManagerTracker().getAllocatedOrPendingSlot(failedAllocationId).isPresent());
                    });
        }
    };
}
use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.
the class AbstractFineGrainedSlotManagerITCase method testRequirementDeclaration.
/**
 * Declares a single-slot requirement and registers a task executor, in the order selected by
 * the scenario, then verifies that exactly the expected slot request reaches the task executor
 * and that the task manager tracker knows a slot under the requested allocation id.
 *
 * @param scenario whether the task executor is registered before or after the requirement is
 *     declared
 */
private void testRequirementDeclaration(RequirementDeclarationScenario scenario) throws Exception {
    final ResourceID resourceID = ResourceID.generate();
    final JobID jobId = new JobID();
    final SlotID slotId = SlotID.getDynamicSlotID(resourceID);
    final String targetAddress = "localhost";
    final ResourceRequirements requirements = ResourceRequirements.create(jobId, targetAddress, Collections.singleton(ResourceRequirement.create(DEFAULT_SLOT_RESOURCE_PROFILE, 1)));
    final CompletableFuture<Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>> requestFuture = new CompletableFuture<>();
    // accept an incoming slot request
    final TaskExecutorGateway taskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setRequestSlotFunction(tuple6 -> {
        requestFuture.complete(tuple6);
        return CompletableFuture.completedFuture(Acknowledge.get());
    }).createTestingTaskExecutorGateway();
    final TaskExecutorConnection taskExecutorConnection = new TaskExecutorConnection(resourceID, taskExecutorGateway);
    new Context() {
        {
            runTest(() -> {
                if (scenario == RequirementDeclarationScenario.TASK_EXECUTOR_REGISTRATION_BEFORE_REQUIREMENT_DECLARATION) {
                    runInMainThread(() -> getSlotManager().registerTaskManager(taskExecutorConnection, new SlotReport(), DEFAULT_TOTAL_RESOURCE_PROFILE, DEFAULT_SLOT_RESOURCE_PROFILE));
                }
                runInMainThread(() -> getSlotManager().processResourceRequirements(requirements));
                if (scenario == RequirementDeclarationScenario.TASK_EXECUTOR_REGISTRATION_AFTER_REQUIREMENT_DECLARATION) {
                    runInMainThread(() -> getSlotManager().registerTaskManager(taskExecutorConnection, new SlotReport(), DEFAULT_TOTAL_RESOURCE_PROFILE, DEFAULT_SLOT_RESOURCE_PROFILE));
                }

                // wait for the slot request once and reuse the received parameters below
                final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> receivedRequest = assertFutureCompleteAndReturn(requestFuture);
                // the allocation id is generated by the slot manager, so it can only be taken
                // from the request itself
                final AllocationID requestedAllocationId = receivedRequest.f2;

                assertThat(receivedRequest, is(equalTo(Tuple6.of(slotId, jobId, requestedAllocationId, DEFAULT_SLOT_RESOURCE_PROFILE, targetAddress, getResourceManagerId()))));

                final TaskManagerSlotInformation slot =
                        getTaskManagerTracker()
                                .getAllocatedOrPendingSlot(requestedAllocationId)
                                .orElseThrow(() -> new AssertionError("No allocated or pending slot is tracked for allocation id " + requestedAllocationId + '.'));
                assertEquals("The slot has not been allocated to the expected allocation id.", requestedAllocationId, slot.getAllocationId());
            });
        }
    };
}
use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.
the class AbstractFineGrainedSlotManagerITCase method testResourceCanBeAllocatedForDifferentJobAfterFree.
/**
 * Checks that a resource which was allocated for one job can be handed to a different job once
 * it has been freed.
 *
 * @param secondRequirementDeclarationTime whether the second job declares its requirements
 *     before or after the slot of the first job is freed
 */
private void testResourceCanBeAllocatedForDifferentJobAfterFree(SecondRequirementDeclarationTime secondRequirementDeclarationTime) throws Exception {
    final CompletableFuture<AllocationID> firstAllocationIdFuture = new CompletableFuture<>();
    final CompletableFuture<AllocationID> secondAllocationIdFuture = new CompletableFuture<>();

    final ResourceRequirements firstJobRequirements = createResourceRequirementsForSingleSlot();
    final ResourceRequirements secondJobRequirements = createResourceRequirementsForSingleSlot();

    // the first incoming slot request completes the first future, any later one the second
    final TaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setRequestSlotFunction(
                            slotRequest -> {
                                if (firstAllocationIdFuture.isDone()) {
                                    secondAllocationIdFuture.complete(slotRequest.f2);
                                } else {
                                    firstAllocationIdFuture.complete(slotRequest.f2);
                                }
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            })
                    .createTestingTaskExecutorGateway();

    final ResourceID resourceID = ResourceID.generate();
    final TaskExecutorConnection taskManagerConnection = new TaskExecutorConnection(resourceID, taskExecutorGateway);
    final SlotReport slotReport = new SlotReport();

    new Context() {
        {
            runTest(
                    () -> {
                        runInMainThread(
                                () -> {
                                    getSlotManager().registerTaskManager(taskManagerConnection, slotReport, DEFAULT_SLOT_RESOURCE_PROFILE, DEFAULT_SLOT_RESOURCE_PROFILE);
                                    getSlotManager().processResourceRequirements(firstJobRequirements);
                                });

                        final AllocationID firstAllocationId = assertFutureCompleteAndReturn(firstAllocationIdFuture);
                        final TaskManagerSlotInformation firstSlot = getTaskManagerTracker().getAllocatedOrPendingSlot(firstAllocationId).get();
                        assertEquals("The slot has not been allocated to the expected job id.", firstJobRequirements.getJobId(), firstSlot.getJobId());

                        if (secondRequirementDeclarationTime == SecondRequirementDeclarationTime.BEFORE_FREE) {
                            runInMainThread(() -> getSlotManager().processResourceRequirements(secondJobRequirements));
                        }

                        // clear resource requirements first so that the freed slot isn't
                        // immediately re-assigned to the job
                        runInMainThread(
                                () -> {
                                    final ResourceRequirements noRequirementsForFirstJob = ResourceRequirements.create(firstJobRequirements.getJobId(), firstJobRequirements.getTargetAddress(), Collections.emptyList());
                                    getSlotManager().processResourceRequirements(noRequirementsForFirstJob);
                                    getSlotManager().freeSlot(SlotID.getDynamicSlotID(resourceID), firstAllocationId);
                                });

                        if (secondRequirementDeclarationTime == SecondRequirementDeclarationTime.AFTER_FREE) {
                            runInMainThread(() -> getSlotManager().processResourceRequirements(secondJobRequirements));
                        }

                        final AllocationID secondAllocationId = assertFutureCompleteAndReturn(secondAllocationIdFuture);
                        final TaskManagerSlotInformation secondSlot = getTaskManagerTracker().getAllocatedOrPendingSlot(secondAllocationId).get();
                        assertEquals("The slot has not been allocated to the expected job id.", secondJobRequirements.getJobId(), secondSlot.getJobId());
                    });
        }
    };
}
Aggregations