Search in sources :

Example 11 with TaskExecutorConnection

use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.

the class DeclarativeSlotManagerTest method testSlotRequestRemovedIfTMReportsAllocation.

/**
 * Tests that a pending slot request is removed if the task executor reports a slot as already
 * allocated for the same job id.
 *
 * <p>The test fails the first slot request with a {@link TimeoutException} and the second one
 * with a {@link SlotOccupiedException} that names the same job. It then asserts that both
 * requests targeted the same slot, that this slot is tracked as allocated for the job, and that
 * exactly one resource is accounted as acquired for the job.
 */
@Test
public void testSlotRequestRemovedIfTMReportsAllocation() throws Exception {
    final ResourceTracker resourceTracker = new DefaultResourceTracker();
    final DefaultSlotTracker slotTracker = new DefaultSlotTracker();
    try (final DeclarativeSlotManager slotManager = createDeclarativeSlotManagerBuilder().setResourceTracker(resourceTracker).setSlotTracker(slotTracker).buildAndStartWithDirectExec()) {
        final JobID jobID = new JobID();
        slotManager.processResourceRequirements(createResourceRequirementsForSingleSlot(jobID));
        // Records the arguments of every slot request that reaches the task executor gateway.
        final BlockingQueue<Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>> requestSlotQueue = new ArrayBlockingQueue<>(1);
        // Holds the manually controlled responses, handed out in order, one per slot request.
        final BlockingQueue<CompletableFuture<Acknowledge>> responseQueue = new ArrayBlockingQueue<>(2);
        final CompletableFuture<Acknowledge> firstManualSlotRequestResponse = new CompletableFuture<>();
        responseQueue.offer(firstManualSlotRequestResponse);
        final CompletableFuture<Acknowledge> secondManualSlotRequestResponse = new CompletableFuture<>();
        responseQueue.offer(secondManualSlotRequestResponse);
        final TestingTaskExecutorGateway testingTaskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setRequestSlotFunction(requestSlotArguments -> {
            requestSlotQueue.offer(requestSlotArguments);
            try {
                return responseQueue.take();
            } catch (InterruptedException e) {
                // Restore the interrupt flag so the calling thread can still observe the
                // interruption, and keep the cause for diagnostics.
                Thread.currentThread().interrupt();
                return FutureUtils.completedExceptionally(new FlinkException("Response queue was interrupted.", e));
            }
        }).createTestingTaskExecutorGateway();
        final ResourceID taskExecutorResourceId = ResourceID.generate();
        final TaskExecutorConnection taskExecutorConnection = new TaskExecutorConnection(taskExecutorResourceId, testingTaskExecutorGateway);
        final SlotReport slotReport = new SlotReport(createFreeSlotStatus(new SlotID(taskExecutorResourceId, 0)));
        slotManager.registerTaskManager(taskExecutorConnection, slotReport, ResourceProfile.ANY, ResourceProfile.ANY);
        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> firstRequest = requestSlotQueue.take();
        // fail first request
        firstManualSlotRequestResponse.completeExceptionally(new TimeoutException("Test exception to fail first allocation"));
        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> secondRequest = requestSlotQueue.take();
        // fail second request, reporting the slot as occupied by the same job
        secondManualSlotRequestResponse.completeExceptionally(new SlotOccupiedException("Test exception", new AllocationID(), jobID));
        assertThat(firstRequest.f1, equalTo(jobID));
        assertThat(secondRequest.f1, equalTo(jobID));
        assertThat(secondRequest.f0, equalTo(firstRequest.f0));
        final DeclarativeTaskManagerSlot slot = slotTracker.getSlot(secondRequest.f0);
        assertThat(slot.getState(), equalTo(SlotState.ALLOCATED));
        assertThat(slot.getJobId(), equalTo(firstRequest.f1));
        assertThat(slotManager.getNumberRegisteredSlots(), is(1));
        assertThat(getTotalResourceCount(resourceTracker.getAcquiredResources(jobID)), is(1));
    }
}
Also used : ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) Arrays(java.util.Arrays) CoreMatchers.hasItem(org.hamcrest.CoreMatchers.hasItem) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple6(org.apache.flink.api.java.tuple.Tuple6) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) TimeoutException(java.util.concurrent.TimeoutException) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) Assert.assertThat(org.junit.Assert.assertThat) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FunctionUtils(org.apache.flink.util.function.FunctionUtils) TestLogger(org.apache.flink.util.TestLogger) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Collection(java.util.Collection) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) Set(java.util.Set) BlockingQueue(java.util.concurrent.BlockingQueue) SlotManagerMetricGroup(org.apache.flink.runtime.metrics.groups.SlotManagerMetricGroup) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TestingUtils(org.apache.flink.testutils.TestingUtils) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) List(java.util.List) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Assert.assertFalse(org.junit.Assert.assertFalse) 
Matchers.equalTo(org.hamcrest.Matchers.equalTo) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) Matchers.is(org.hamcrest.Matchers.is) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) FlinkException(org.apache.flink.util.FlinkException) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) CoreMatchers.not(org.hamcrest.CoreMatchers.not) CompletableFuture(java.util.concurrent.CompletableFuture) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Assert.assertSame(org.junit.Assert.assertSame) ManuallyTriggeredScheduledExecutorService(org.apache.flink.core.testutils.ManuallyTriggeredScheduledExecutorService) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) Matchers.hasSize(org.hamcrest.Matchers.hasSize) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) ThrowingConsumer(org.apache.flink.util.function.ThrowingConsumer) Matchers.empty(org.hamcrest.Matchers.empty) Iterator(java.util.Iterator) Executor(java.util.concurrent.Executor) Assert.assertNotNull(org.junit.Assert.assertNotNull) Assert.assertTrue(org.junit.Assert.assertTrue) SystemExitTrackingSecurityManager(org.apache.flink.runtime.testutils.SystemExitTrackingSecurityManager) Test(org.junit.Test) InstanceID(org.apache.flink.runtime.instance.InstanceID) Iterators(org.apache.flink.shaded.guava30.com.google.common.collect.Iterators) TimeUnit(java.util.concurrent.TimeUnit) JobID(org.apache.flink.api.common.JobID) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) 
Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) CompletableFuture(java.util.concurrent.CompletableFuture) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) TimeoutException(java.util.concurrent.TimeoutException) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) FlinkException(org.apache.flink.util.FlinkException) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) Tuple6(org.apache.flink.api.java.tuple.Tuple6) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 12 with TaskExecutorConnection

use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.

the class FineGrainedSlotManagerTest method testRequirementCheckOnlyTriggeredOnce.

/**
 * Verifies that several consecutive calls which trigger a resource requirement check lead to
 * only a single invocation of the check, and that a later trigger causes another one.
 */
@Test
public void testRequirementCheckOnlyTriggeredOnce() throws Exception {
    new Context() {

        {
            final long checkDelayMillis = 50;
            // Completed by the first and by the second invocation of the requirement check.
            final CompletableFuture<Void> firstCheckFuture = new CompletableFuture<>();
            final CompletableFuture<Void> secondCheckFuture = new CompletableFuture<>();
            resourceAllocationStrategyBuilder.setTryFulfillRequirementsFunction((ignored1, ignored2) -> {
                final CompletableFuture<Void> toComplete =
                        firstCheckFuture.isDone() ? secondCheckFuture : firstCheckFuture;
                toComplete.complete(null);
                return ResourceAllocationResult.builder().build();
            });
            setRequirementCheckDelay(checkDelayMillis);
            runTest(() -> {
                final ResourceRequirements firstRequirements = createResourceRequirementsForSingleSlot();
                final ResourceRequirements secondRequirements = createResourceRequirementsForSingleSlot();
                final ResourceRequirements thirdRequirements = createResourceRequirementsForSingleSlot();
                final TaskExecutorConnection connection = createTaskExecutorConnection();
                final CompletableFuture<Void> registrationDone = new CompletableFuture<>();
                final long startNanos = System.nanoTime();
                runInMainThread(() -> {
                    getSlotManager().processResourceRequirements(firstRequirements);
                    getSlotManager().processResourceRequirements(secondRequirements);
                    getSlotManager().registerTaskManager(
                            connection,
                            new SlotReport(),
                            DEFAULT_TOTAL_RESOURCE_PROFILE,
                            DEFAULT_SLOT_RESOURCE_PROFILE);
                    registrationDone.complete(null);
                });
                assertFutureCompleteAndReturn(registrationDone);
                final long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000;
                // Skip the test if the triggering calls did not all fit into one delay window.
                assumeTrue("The time of process requirement and register task manager must not take longer than the requirement check delay. If it does, then this indicates a very slow machine.", elapsedMillis < checkDelayMillis);
                // the three triggers above must have produced exactly one check
                assertFutureCompleteAndReturn(firstCheckFuture);
                assertFutureNotComplete(secondCheckFuture);
                // without any further event no additional check may happen
                Thread.sleep(checkDelayMillis * 2);
                assertFutureNotComplete(secondCheckFuture);
                // another processResourceRequirements call triggers the check again
                runInMainThread(() -> getSlotManager().processResourceRequirements(thirdRequirements));
                assertFutureCompleteAndReturn(secondCheckFuture);
            });
        }
    };
}
Also used : CompletableFuture(java.util.concurrent.CompletableFuture) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) ArrayList(java.util.ArrayList) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Test(org.junit.Test)

Example 13 with TaskExecutorConnection

use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.

the class FineGrainedSlotManagerTest method testTimeoutForUnusedTaskManager.

// ---------------------------------------------------------------------------------------------
// Task manager timeout
// ---------------------------------------------------------------------------------------------
/**
 * Tests that formerly used task managers can timeout after all of their slots have been freed.
 *
 * <p>The task manager is registered with one allocated slot. Once that slot is freed, the test
 * expects the release-resource callback to be invoked with the task manager's instance id, while
 * the slots stay registered until the task manager is explicitly unregistered.
 */
@Test
public void testTimeoutForUnusedTaskManager() throws Exception {
    final Time taskManagerTimeout = Time.milliseconds(50L);
    final CompletableFuture<InstanceID> releaseResourceFuture = new CompletableFuture<>();
    final AllocationID allocationId = new AllocationID();
    final TaskExecutorConnection taskExecutionConnection = createTaskExecutorConnection();
    final InstanceID instanceId = taskExecutionConnection.getInstanceID();
    new Context() {

        {
            resourceActionsBuilder.setReleaseResourceConsumer((instanceID, e) -> releaseResourceFuture.complete(instanceID));
            slotManagerConfigurationBuilder.setTaskManagerTimeout(taskManagerTimeout);
            runTest(() -> {
                final CompletableFuture<Boolean> registerTaskManagerFuture = new CompletableFuture<>();
                runInMainThread(() -> registerTaskManagerFuture.complete(getSlotManager().registerTaskManager(taskExecutionConnection, new SlotReport(createAllocatedSlotStatus(allocationId, DEFAULT_SLOT_RESOURCE_PROFILE)), DEFAULT_TOTAL_RESOURCE_PROFILE, DEFAULT_SLOT_RESOURCE_PROFILE)));
                assertThat(assertFutureCompleteAndReturn(registerTaskManagerFuture), is(true));
                // While the reported slot is still allocated, the idle-since value is expected
                // to be Long.MAX_VALUE (expected value first, as assertEquals requires).
                assertEquals(Long.MAX_VALUE, getSlotManager().getTaskManagerIdleSince(instanceId));
                final CompletableFuture<Long> idleSinceFuture = new CompletableFuture<>();
                runInMainThread(() -> {
                    getSlotManager().freeSlot(new SlotID(taskExecutionConnection.getResourceID(), 0), allocationId);
                    idleSinceFuture.complete(getSlotManager().getTaskManagerIdleSince(instanceId));
                });
                assertThat(assertFutureCompleteAndReturn(idleSinceFuture), not(equalTo(Long.MAX_VALUE)));
                assertThat(assertFutureCompleteAndReturn(releaseResourceFuture), is(equalTo(instanceId)));
                // A task manager timeout does not remove the slots from the
                // SlotManager. The receiver of the callback can then decide what to do
                // with the TaskManager.
                assertEquals(DEFAULT_NUM_SLOTS_PER_WORKER, getSlotManager().getNumberRegisteredSlots());
                final CompletableFuture<Boolean> unregisterTaskManagerFuture = new CompletableFuture<>();
                runInMainThread(() -> unregisterTaskManagerFuture.complete(getSlotManager().unregisterTaskManager(instanceId, TEST_EXCEPTION)));
                assertThat(assertFutureCompleteAndReturn(unregisterTaskManagerFuture), is(true));
                assertEquals(0, getSlotManager().getNumberRegisteredSlots());
            });
        }
    };
}
Also used : InstanceID(org.apache.flink.runtime.instance.InstanceID) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) Time(org.apache.flink.api.common.time.Time) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) CompletableFuture(java.util.concurrent.CompletableFuture) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Test(org.junit.Test)

Example 14 with TaskExecutorConnection

use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.

the class FineGrainedSlotManagerTest method testGetResourceOverview.

/**
 * Registers two task managers, each reporting one allocated slot and a total resource of twice
 * its slot profile, and checks the free and registered resources reported by the slot manager,
 * both overall and per task manager.
 */
@Test
public void testGetResourceOverview() throws Exception {
    final TaskExecutorConnection firstConnection = createTaskExecutorConnection();
    final TaskExecutorConnection secondConnection = createTaskExecutorConnection();
    final ResourceID firstResourceId = ResourceID.generate();
    final ResourceID secondResourceId = ResourceID.generate();
    final ResourceProfile firstProfile = ResourceProfile.fromResources(1, 10);
    final ResourceProfile secondProfile = ResourceProfile.fromResources(2, 20);
    // One slot status per task manager, each carrying a job id and an allocation id.
    final SlotReport firstReport =
            new SlotReport(
                    new SlotStatus(
                            new SlotID(firstResourceId, 0),
                            firstProfile,
                            new JobID(),
                            new AllocationID()));
    final SlotReport secondReport =
            new SlotReport(
                    new SlotStatus(
                            new SlotID(secondResourceId, 0),
                            secondProfile,
                            new JobID(),
                            new AllocationID()));
    new Context() {

        {
            runTest(() -> {
                final CompletableFuture<Boolean> firstRegistration = new CompletableFuture<>();
                final CompletableFuture<Boolean> secondRegistration = new CompletableFuture<>();
                runInMainThread(() -> {
                    firstRegistration.complete(
                            getSlotManager().registerTaskManager(
                                    firstConnection,
                                    firstReport,
                                    firstProfile.multiply(2),
                                    firstProfile));
                    secondRegistration.complete(
                            getSlotManager().registerTaskManager(
                                    secondConnection,
                                    secondReport,
                                    secondProfile.multiply(2),
                                    secondProfile));
                });
                assertThat(assertFutureCompleteAndReturn(firstRegistration), is(true));
                assertThat(assertFutureCompleteAndReturn(secondRegistration), is(true));

                // free resources: overall, then per task manager
                assertThat(
                        getSlotManager().getFreeResource(),
                        equalTo(firstProfile.merge(secondProfile)));
                assertThat(
                        getSlotManager().getFreeResourceOf(firstConnection.getInstanceID()),
                        equalTo(firstProfile));
                assertThat(
                        getSlotManager().getFreeResourceOf(secondConnection.getInstanceID()),
                        equalTo(secondProfile));

                // registered resources: overall, then per task manager
                assertThat(
                        getSlotManager().getRegisteredResource(),
                        equalTo(firstProfile.merge(secondProfile).multiply(2)));
                assertThat(
                        getSlotManager().getRegisteredResourceOf(firstConnection.getInstanceID()),
                        equalTo(firstProfile.multiply(2)));
                assertThat(
                        getSlotManager().getRegisteredResourceOf(secondConnection.getInstanceID()),
                        equalTo(secondProfile.multiply(2)));
            });
        }
    };
}
Also used : SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) JobID(org.apache.flink.api.common.JobID) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Test(org.junit.Test)

Example 15 with TaskExecutorConnection

use of org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection in project flink by apache.

the class FineGrainedSlotManagerTest method testTaskManagerRegistration.

// ---------------------------------------------------------------------------------------------
// Register / unregister TaskManager and slot status reconciliation
// ---------------------------------------------------------------------------------------------
/**
 * Verifies that a task manager can be registered at the slot manager and that, after
 * registering it with an empty slot report, its available resource equals its total resource.
 */
@Test
public void testTaskManagerRegistration() throws Exception {
    final TaskExecutorConnection connection = createTaskExecutorConnection();
    new Context() {

        {
            runTest(() -> {
                final CompletableFuture<Boolean> registrationResult = new CompletableFuture<>();
                runInMainThread(() -> {
                    registrationResult.complete(
                            getSlotManager().registerTaskManager(
                                    connection,
                                    new SlotReport(),
                                    DEFAULT_TOTAL_RESOURCE_PROFILE,
                                    DEFAULT_SLOT_RESOURCE_PROFILE));
                });
                assertThat(assertFutureCompleteAndReturn(registrationResult), is(true));

                // the slot manager and the tracker must both know the new task manager
                assertThat(
                        getSlotManager().getNumberRegisteredSlots(),
                        equalTo(DEFAULT_NUM_SLOTS_PER_WORKER));
                assertThat(
                        getTaskManagerTracker().getRegisteredTaskManagers().size(),
                        equalTo(1));
                assertTrue(
                        getTaskManagerTracker()
                                .getRegisteredTaskManager(connection.getInstanceID())
                                .isPresent());

                // nothing is allocated, so the available resource equals the total resource
                assertThat(
                        getTaskManagerTracker()
                                .getRegisteredTaskManager(connection.getInstanceID())
                                .get()
                                .getAvailableResource(),
                        equalTo(DEFAULT_TOTAL_RESOURCE_PROFILE));
                assertThat(
                        getTaskManagerTracker()
                                .getRegisteredTaskManager(connection.getInstanceID())
                                .get()
                                .getTotalResource(),
                        equalTo(DEFAULT_TOTAL_RESOURCE_PROFILE));
            });
        }
    };
}
Also used : CompletableFuture(java.util.concurrent.CompletableFuture) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Test(org.junit.Test)

Aggregations

TaskExecutorConnection (org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)42 SlotReport (org.apache.flink.runtime.taskexecutor.SlotReport)40 Test (org.junit.Test)38 CompletableFuture (java.util.concurrent.CompletableFuture)33 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)32 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)30 TestingTaskExecutorGatewayBuilder (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder)30 JobID (org.apache.flink.api.common.JobID)29 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)29 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)24 TestingTaskExecutorGateway (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway)24 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)23 ResourceRequirements (org.apache.flink.runtime.slots.ResourceRequirements)23 SlotStatus (org.apache.flink.runtime.taskexecutor.SlotStatus)22 Assert.assertThat (org.junit.Assert.assertThat)21 Tuple6 (org.apache.flink.api.java.tuple.Tuple6)20 ResourceManagerId (org.apache.flink.runtime.resourcemanager.ResourceManagerId)20 ResourceRequirement (org.apache.flink.runtime.slots.ResourceRequirement)20 TaskExecutorGateway (org.apache.flink.runtime.taskexecutor.TaskExecutorGateway)20 Matchers.empty (org.hamcrest.Matchers.empty)20