Search in sources :

Example 6 with SlotAllocationException

Use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.

In the class AbstractFineGrainedSlotManagerITCase, method testSlotRequestFailure.

// ---------------------------------------------------------------------------------------------
// Slot allocation failure handling
// ---------------------------------------------------------------------------------------------
/**
 * Verifies that the SlotManager issues a new slot request after a TaskExecutor#requestSlot
 * call has failed.
 *
 * <p>The testing gateway records the allocation id of every request it receives. The first
 * request is answered with a future that the test fails by hand, the second one with an
 * already acknowledged future.
 */
@Test
public void testSlotRequestFailure() throws Exception {
    final JobID jobId = new JobID();
    final ResourceRequirements requirements = createResourceRequirementsForSingleSlot(jobId);

    // the first response stays pending until the test fails it explicitly,
    // the second response is acknowledged right away
    final CompletableFuture<Acknowledge> firstResponse = new CompletableFuture<>();
    final CompletableFuture<Acknowledge> secondResponse =
            CompletableFuture.completedFuture(Acknowledge.get());
    final Iterator<CompletableFuture<Acknowledge>> responses =
            Arrays.asList(firstResponse, secondResponse).iterator();

    // collects the allocation id (field f2 of the request parameters) of each request
    final ArrayBlockingQueue<AllocationID> requestedAllocations = new ArrayBlockingQueue<>(2);
    final TestingTaskExecutorGateway gateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setRequestSlotFunction(
                            FunctionUtils.uncheckedFunction(
                                    request -> {
                                        requestedAllocations.put(request.f2);
                                        return responses.next();
                                    }))
                    .createTestingTaskExecutorGateway();

    final TaskExecutorConnection connection =
            new TaskExecutorConnection(ResourceID.generate(), gateway);
    final SlotReport emptyReport = new SlotReport();

    new Context() {

        {
            runTest(
                    () -> {
                        runInMainThread(
                                () -> {
                                    getSlotManager()
                                            .registerTaskManager(
                                                    connection,
                                                    emptyReport,
                                                    DEFAULT_TOTAL_RESOURCE_PROFILE,
                                                    DEFAULT_SLOT_RESOURCE_PROFILE);
                                    getSlotManager().processResourceRequirements(requirements);
                                });

                        // exactly one request must have reached the gateway so far
                        final AllocationID failedAllocation = requestedAllocations.take();
                        assertThat(requestedAllocations, is(empty()));

                        // fail the first attempt; a second attempt is expected to follow
                        runInMainThread(
                                () ->
                                        firstResponse.completeExceptionally(
                                                new SlotAllocationException("Test exception.")));

                        final AllocationID retriedAllocation = requestedAllocations.take();
                        assertThat(requestedAllocations, is(empty()));

                        // the retried allocation is tracked for the job, the failed one is gone
                        final TaskManagerSlotInformation retriedSlot =
                                getTaskManagerTracker()
                                        .getAllocatedOrPendingSlot(retriedAllocation)
                                        .get();
                        assertEquals(jobId, retriedSlot.getJobId());
                        assertFalse(
                                getTaskManagerTracker()
                                        .getAllocatedOrPendingSlot(failedAllocation)
                                        .isPresent());
                    });
        }
    };
}
Also used : Acknowledge(org.apache.flink.runtime.messages.Acknowledge) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) JobID(org.apache.flink.api.common.JobID) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Test(org.junit.Test)

Example 7 with SlotAllocationException

Use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.

In the class DeclarativeSlotManagerTest, method testTaskExecutorSlotAllocationTimeoutHandling.

/**
 * Tests that a slot allocation is retried when the request fails on the task executor side.
 *
 * <p>The failure (e.g. a timeout) is simulated by completing the future of the first slot
 * request exceptionally with a SlotAllocationException. The testing gateway records the slot
 * id of every request it receives.
 */
@Test
public void testTaskExecutorSlotAllocationTimeoutHandling() throws Exception {
    final JobID jobId = new JobID();
    final ResourceRequirements requirements = createResourceRequirementsForSingleSlot(jobId);

    // both responses stay pending until the test completes them explicitly
    final CompletableFuture<Acknowledge> firstResponse = new CompletableFuture<>();
    final CompletableFuture<Acknowledge> secondResponse = new CompletableFuture<>();
    final Iterator<CompletableFuture<Acknowledge>> responses =
            Arrays.asList(firstResponse, secondResponse).iterator();

    // collects the slot id (field f0 of the request parameters) of each request
    final ArrayBlockingQueue<SlotID> requestedSlots = new ArrayBlockingQueue<>(2);
    final TestingTaskExecutorGateway gateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setRequestSlotFunction(
                            FunctionUtils.uncheckedFunction(
                                    request -> {
                                        requestedSlots.put(request.f0);
                                        return responses.next();
                                    }))
                    .createTestingTaskExecutorGateway();

    final ResourceID taskExecutorId = ResourceID.generate();
    final TaskExecutorConnection connection =
            new TaskExecutorConnection(taskExecutorId, gateway);

    // the task executor reports two free slots
    final SlotID slotZero = new SlotID(taskExecutorId, 0);
    final SlotID slotOne = new SlotID(taskExecutorId, 1);
    final SlotReport twoFreeSlots =
            new SlotReport(
                    Arrays.asList(
                            createFreeSlotStatus(slotZero), createFreeSlotStatus(slotOne)));

    final ResourceTracker tracker = new DefaultResourceTracker();
    final DefaultSlotTracker slots = new DefaultSlotTracker();

    try (DeclarativeSlotManager manager =
            createDeclarativeSlotManagerBuilder()
                    .setResourceTracker(tracker)
                    .setSlotTracker(slots)
                    .buildAndStartWithDirectExec()) {
        manager.registerTaskManager(
                connection, twoFreeSlots, ResourceProfile.ANY, ResourceProfile.ANY);
        manager.processResourceRequirements(requirements);

        // exactly one request must have reached the gateway so far
        final SlotID firstRequestedSlot = requestedSlots.take();
        assertThat(requestedSlots, is(empty()));
        DeclarativeTaskManagerSlot firstAttemptSlot = slots.getSlot(firstRequestedSlot);

        // fail the first attempt; a second attempt is expected to follow
        firstResponse.completeExceptionally(new SlotAllocationException("Test exception."));
        // one resource is still reported as acquired for the job
        assertThat(getTotalResourceCount(tracker.getAcquiredResources(jobId)), is(1));

        // acknowledge the second attempt
        secondResponse.complete(Acknowledge.get());
        final SlotID secondRequestedSlot = requestedSlots.take();
        assertThat(requestedSlots, is(empty()));

        DeclarativeTaskManagerSlot secondAttemptSlot = slots.getSlot(secondRequestedSlot);
        assertThat(secondAttemptSlot.getState(), is(SlotState.ALLOCATED));
        assertEquals(jobId, secondAttemptSlot.getJobId());

        // if the retry used a different slot, the slot of the failed attempt must be free
        final boolean retriedOnOtherSlot =
                !firstAttemptSlot.getSlotId().equals(secondAttemptSlot.getSlotId());
        if (retriedOnOtherSlot) {
            assertThat(firstAttemptSlot.getState(), is(SlotState.FREE));
        }
    }
}
Also used : Acknowledge(org.apache.flink.runtime.messages.Acknowledge) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) JobID(org.apache.flink.api.common.JobID) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Test(org.junit.Test)

Aggregations

SlotAllocationException (org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException)7 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)3 CompletableFuture (java.util.concurrent.CompletableFuture)3 TimeoutException (java.util.concurrent.TimeoutException)3 JobID (org.apache.flink.api.common.JobID)3 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)3 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)3 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)3 TaskExecutorConnection (org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)3 ResourceRequirements (org.apache.flink.runtime.slots.ResourceRequirements)3 SlotReport (org.apache.flink.runtime.taskexecutor.SlotReport)3 IOException (java.io.IOException)2 HashSet (java.util.HashSet)2 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)2 TestingTaskExecutorGateway (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway)2 TestingTaskExecutorGatewayBuilder (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder)2 TaskManagerException (org.apache.flink.runtime.taskexecutor.exceptions.TaskManagerException)2 Test (org.junit.Test)2 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1