Search in sources :

Example 1 with SlotOccupiedException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException in project flink by apache.

the class DefaultSlotStatusSyncer method allocateSlot.

/**
 * Requests a new dynamically sized slot from the given task manager on behalf of a job.
 *
 * <p>Before the request is sent, the allocation is recorded as {@code SlotState.PENDING} in the
 * task manager tracker, reported as acquired to the resource tracker, and remembered in
 * {@code pendingSlotAllocations}. The response of the task manager is processed on
 * {@code mainThreadExecutor}.
 *
 * @param instanceId instance id of the task manager the slot is requested from; it must be
 *     known to the task manager tracker
 * @param jobId job the slot is allocated for
 * @param targetAddress address forwarded to the task manager as part of the slot request
 * @param resourceProfile resource profile of the requested slot
 * @return a future that completes normally when the request was acknowledged or when the
 *     response is ignored (the allocation is no longer pending, or its slot was removed in the
 *     meantime), and completes exceptionally with the request's failure otherwise
 */
@Override
public CompletableFuture<Void> allocateSlot(InstanceID instanceId, JobID jobId, String targetAddress, ResourceProfile resourceProfile) {
    Preconditions.checkNotNull(instanceId);
    Preconditions.checkNotNull(jobId);
    Preconditions.checkNotNull(targetAddress);
    Preconditions.checkNotNull(resourceProfile);
    checkStarted();
    final AllocationID allocationId = new AllocationID();
    final Optional<TaskManagerInfo> taskManager = taskManagerTracker.getRegisteredTaskManager(instanceId);
    Preconditions.checkState(taskManager.isPresent(), "Could not find a registered task manager for instance id " + instanceId + '.');
    final TaskExecutorGateway gateway = taskManager.get().getTaskExecutorConnection().getTaskExecutorGateway();
    final ResourceID resourceId = taskManager.get().getTaskExecutorConnection().getResourceID();
    LOG.info("Starting allocation of slot {} from {} for job {} with resource profile {}.", allocationId, resourceId, jobId, resourceProfile);
    // Book the slot as pending before the RPC is issued, so that the bookkeeping is in place
    // by the time the response handler runs.
    taskManagerTracker.notifySlotStatus(allocationId, jobId, instanceId, resourceProfile, SlotState.PENDING);
    resourceTracker.notifyAcquiredResource(jobId, resourceProfile);
    pendingSlotAllocations.add(allocationId);
    // RPC call to the task manager
    CompletableFuture<Acknowledge> requestFuture = gateway.requestSlot(SlotID.getDynamicSlotID(resourceId), jobId, allocationId, resourceProfile, targetAddress, resourceManagerId, taskManagerRequestTimeout);
    CompletableFuture<Void> returnedFuture = new CompletableFuture<>();
    FutureUtils.assertNoException(requestFuture.handleAsync((Acknowledge acknowledge, Throwable throwable) -> {
        if (!pendingSlotAllocations.remove(allocationId)) {
            LOG.debug("Ignoring slot allocation update from task manager {} for allocation {} and job {}, because the allocation was already completed or cancelled.", instanceId, allocationId, jobId);
            returnedFuture.complete(null);
            return null;
        }
        if (!taskManagerTracker.getAllocatedOrPendingSlot(allocationId).isPresent()) {
            LOG.debug("The slot {} has been removed before. Ignore the future.", allocationId);
            // Complete the future handed to the caller. requestFuture is already done at this
            // point (we are inside its handler), so completing it would be a no-op and leave
            // the caller waiting forever.
            returnedFuture.complete(null);
            return null;
        }
        if (acknowledge != null) {
            LOG.trace("Completed allocation of allocation {} for job {}.", allocationId, jobId);
            taskManagerTracker.notifySlotStatus(allocationId, jobId, instanceId, resourceProfile, SlotState.ALLOCATED);
            returnedFuture.complete(null);
        } else {
            if (throwable instanceof SlotOccupiedException) {
                // Slots are requested with a dynamic slot id, so an "occupied" answer is
                // unexpected; the bookkeeping is deliberately left untouched here.
                LOG.error("Should not get this exception.", throwable);
            } else {
                // TODO If the taskManager does not have enough resource, we
                // may endlessly allocate slot on it until the next heartbeat.
                LOG.warn("Slot allocation for allocation {} for job {} failed.", allocationId, jobId, throwable);
                // Roll back the bookkeeping done before the request was sent.
                resourceTracker.notifyLostResource(jobId, resourceProfile);
                taskManagerTracker.notifySlotStatus(allocationId, jobId, instanceId, resourceProfile, SlotState.FREE);
            }
            returnedFuture.completeExceptionally(throwable);
        }
        return null;
    }, mainThreadExecutor));
    return returnedFuture;
}
Also used : CompletableFuture(java.util.concurrent.CompletableFuture) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway)

Example 2 with SlotOccupiedException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException in project flink by apache.

the class DeclarativeSlotManagerTest method testSlotRequestRemovedIfTMReportsAllocation.

/**
 * Verifies that a pending slot request is dropped once the task executor answers with a
 * {@link SlotOccupiedException} that names the same job.
 *
 * <p>The first slot request is failed with a {@link TimeoutException} and the second one with a
 * {@link SlotOccupiedException}. Afterwards the slot must be tracked as allocated for the job
 * and exactly one resource must be counted as acquired.
 */
@Test
public void testSlotRequestRemovedIfTMReportsAllocation() throws Exception {
    final ResourceTracker resourceTracker = new DefaultResourceTracker();
    final DefaultSlotTracker slotTracker = new DefaultSlotTracker();
    try (final DeclarativeSlotManager slotManager =
            createDeclarativeSlotManagerBuilder()
                    .setResourceTracker(resourceTracker)
                    .setSlotTracker(slotTracker)
                    .buildAndStartWithDirectExec()) {
        final JobID jobId = new JobID();
        slotManager.processResourceRequirements(createResourceRequirementsForSingleSlot(jobId));

        // Requests seen by the gateway, and the manually controlled responses handed back for them.
        final BlockingQueue<Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>> observedRequests = new ArrayBlockingQueue<>(1);
        final BlockingQueue<CompletableFuture<Acknowledge>> preparedResponses = new ArrayBlockingQueue<>(2);
        final CompletableFuture<Acknowledge> firstResponse = new CompletableFuture<>();
        preparedResponses.offer(firstResponse);
        final CompletableFuture<Acknowledge> secondResponse = new CompletableFuture<>();
        preparedResponses.offer(secondResponse);

        final TestingTaskExecutorGateway gateway =
                new TestingTaskExecutorGatewayBuilder()
                        .setRequestSlotFunction(
                                request -> {
                                    observedRequests.offer(request);
                                    try {
                                        return preparedResponses.take();
                                    } catch (InterruptedException ignored) {
                                        return FutureUtils.completedExceptionally(new FlinkException("Response queue was interrupted."));
                                    }
                                })
                        .createTestingTaskExecutorGateway();

        final ResourceID taskExecutorId = ResourceID.generate();
        final TaskExecutorConnection connection = new TaskExecutorConnection(taskExecutorId, gateway);
        final SlotID onlySlot = new SlotID(taskExecutorId, 0);
        final SlotReport initialReport = new SlotReport(createFreeSlotStatus(onlySlot));
        slotManager.registerTaskManager(connection, initialReport, ResourceProfile.ANY, ResourceProfile.ANY);

        // the first request fails with a timeout
        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> firstRequest = observedRequests.take();
        firstResponse.completeExceptionally(new TimeoutException("Test exception to fail first allocation"));

        // the second request fails because the slot is reported as occupied for the same job
        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> secondRequest = observedRequests.take();
        secondResponse.completeExceptionally(new SlotOccupiedException("Test exception", new AllocationID(), jobId));

        // both requests were issued for the same job and targeted the same slot
        assertThat(firstRequest.f1, equalTo(jobId));
        assertThat(secondRequest.f1, equalTo(jobId));
        assertThat(secondRequest.f0, equalTo(firstRequest.f0));

        // the slot ends up allocated for that job
        final DeclarativeTaskManagerSlot trackedSlot = slotTracker.getSlot(secondRequest.f0);
        assertThat(trackedSlot.getState(), equalTo(SlotState.ALLOCATED));
        assertThat(trackedSlot.getJobId(), equalTo(firstRequest.f1));

        assertThat(slotManager.getNumberRegisteredSlots(), is(1));
        assertThat(getTotalResourceCount(resourceTracker.getAcquiredResources(jobId)), is(1));
    }
}
Also used : ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) Arrays(java.util.Arrays) CoreMatchers.hasItem(org.hamcrest.CoreMatchers.hasItem) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple6(org.apache.flink.api.java.tuple.Tuple6) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) TimeoutException(java.util.concurrent.TimeoutException) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) Assert.assertThat(org.junit.Assert.assertThat) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FunctionUtils(org.apache.flink.util.function.FunctionUtils) TestLogger(org.apache.flink.util.TestLogger) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Collection(java.util.Collection) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) Set(java.util.Set) BlockingQueue(java.util.concurrent.BlockingQueue) SlotManagerMetricGroup(org.apache.flink.runtime.metrics.groups.SlotManagerMetricGroup) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TestingUtils(org.apache.flink.testutils.TestingUtils) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) List(java.util.List) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Assert.assertFalse(org.junit.Assert.assertFalse) 
Matchers.equalTo(org.hamcrest.Matchers.equalTo) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) Matchers.is(org.hamcrest.Matchers.is) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) FlinkException(org.apache.flink.util.FlinkException) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) CoreMatchers.not(org.hamcrest.CoreMatchers.not) CompletableFuture(java.util.concurrent.CompletableFuture) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Assert.assertSame(org.junit.Assert.assertSame) ManuallyTriggeredScheduledExecutorService(org.apache.flink.core.testutils.ManuallyTriggeredScheduledExecutorService) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) Matchers.hasSize(org.hamcrest.Matchers.hasSize) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) ThrowingConsumer(org.apache.flink.util.function.ThrowingConsumer) Matchers.empty(org.hamcrest.Matchers.empty) Iterator(java.util.Iterator) Executor(java.util.concurrent.Executor) Assert.assertNotNull(org.junit.Assert.assertNotNull) Assert.assertTrue(org.junit.Assert.assertTrue) SystemExitTrackingSecurityManager(org.apache.flink.runtime.testutils.SystemExitTrackingSecurityManager) Test(org.junit.Test) InstanceID(org.apache.flink.runtime.instance.InstanceID) Iterators(org.apache.flink.shaded.guava30.com.google.common.collect.Iterators) TimeUnit(java.util.concurrent.TimeUnit) JobID(org.apache.flink.api.common.JobID) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) 
Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) CompletableFuture(java.util.concurrent.CompletableFuture) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) TimeoutException(java.util.concurrent.TimeoutException) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) FlinkException(org.apache.flink.util.FlinkException) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) Tuple6(org.apache.flink.api.java.tuple.Tuple6) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 3 with SlotOccupiedException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException in project flink by apache.

the class DeclarativeSlotManager method allocateSlot.

/**
 * Requests the given slot from its task manager for a job and processes the outcome of that
 * request on the main thread executor.
 *
 * <p>An acknowledged request marks the allocation as complete. If the task manager reports the
 * slot as occupied, the reported allocation is fed into the slot tracker as a slot status. Any
 * other failure frees the slot again. After either kind of failure the resource requirements
 * are re-checked.
 *
 * @param taskManagerSlot slot to allocate
 * @param jobId job for which the slot should be allocated for
 * @param targetAddress address of the job master
 * @param resourceProfile resource profile for the requirement for which the slot is used
 */
private void allocateSlot(TaskManagerSlotInformation taskManagerSlot, JobID jobId, String targetAddress, ResourceProfile resourceProfile) {
    final SlotID slotId = taskManagerSlot.getSlotId();
    LOG.debug("Starting allocation of slot {} for job {} with resource profile {}.", slotId, jobId, resourceProfile);

    final InstanceID instanceId = taskManagerSlot.getInstanceId();
    if (!taskExecutorManager.isTaskManagerRegistered(instanceId)) {
        throw new IllegalStateException("Could not find a registered task manager for instance id " + instanceId + '.');
    }

    final TaskExecutorGateway gateway = taskManagerSlot.getTaskManagerConnection().getTaskExecutorGateway();
    final AllocationID allocationId = new AllocationID();

    // bookkeeping that has to be in place before the request goes out
    slotTracker.notifyAllocationStart(slotId, jobId);
    taskExecutorManager.markUsed(instanceId);
    pendingSlotAllocations.put(slotId, allocationId);

    // RPC call to the task manager
    final CompletableFuture<Acknowledge> requestFuture = gateway.requestSlot(slotId, jobId, allocationId, resourceProfile, targetAddress, resourceManagerId, taskManagerRequestTimeout);

    FutureUtils.assertNoException(requestFuture.handleAsync((Acknowledge ack, Throwable failure) -> {
        // only react if this allocation is still the one pending for the slot
        final AllocationID pendingAllocation = pendingSlotAllocations.get(slotId);
        final boolean stillPending = pendingAllocation != null && pendingAllocation.equals(allocationId);
        if (!stillPending) {
            LOG.debug("Ignoring slot allocation update from task executor {} for slot {} and job {}, because the allocation was already completed or cancelled.", instanceId, slotId, jobId);
            return null;
        }

        if (ack != null) {
            LOG.trace("Completed allocation of slot {} for job {}.", slotId, jobId);
            slotTracker.notifyAllocationComplete(slotId, jobId);
            return null;
        }

        if (failure instanceof SlotOccupiedException) {
            final SlotOccupiedException occupied = (SlotOccupiedException) failure;
            LOG.debug("Tried allocating slot {} for job {}, but it was already allocated for job {}.", slotId, jobId, occupied.getJobId());
            // report as a slot status to force the state transition
            // this could be a problem if we ever assume that the task
            // executor always reports about all slots
            final SlotStatus reportedStatus = new SlotStatus(slotId, taskManagerSlot.getResourceProfile(), occupied.getJobId(), occupied.getAllocationId());
            slotTracker.notifySlotStatus(Collections.singleton(reportedStatus));
        } else {
            LOG.warn("Slot allocation for slot {} for job {} failed.", slotId, jobId, failure);
            slotTracker.notifyFree(slotId);
        }
        checkResourceRequirements();
        return null;
    }, mainThreadExecutor));
}
Also used : InstanceID(org.apache.flink.runtime.instance.InstanceID) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)

Aggregations

AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)3 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)3 TaskExecutorGateway (org.apache.flink.runtime.taskexecutor.TaskExecutorGateway)3 SlotOccupiedException (org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException)3 CompletableFuture (java.util.concurrent.CompletableFuture)2 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)2 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)2 InstanceID (org.apache.flink.runtime.instance.InstanceID)2 TaskExecutorConnection (org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)2 SlotStatus (org.apache.flink.runtime.taskexecutor.SlotStatus)2 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1 HashSet (java.util.HashSet)1 Iterator (java.util.Iterator)1 List (java.util.List)1 Set (java.util.Set)1 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)1 BlockingQueue (java.util.concurrent.BlockingQueue)1