Search in sources :

Example 1 with SlotAllocationException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.

the class TaskExecutor method requestSlot.

// ----------------------------------------------------------------------
// Slot allocation RPCs
// ----------------------------------------------------------------------
/**
	 * /**
	 * Requests a slot from the TaskManager
	 *
	 * @param slotId identifying the requested slot
	 * @param jobId identifying the job for which the request is issued
	 * @param allocationId id for the request
	 * @param targetAddress of the job manager requesting the slot
	 * @param rmLeaderId current leader id of the ResourceManager
	 * @throws SlotAllocationException if the slot allocation fails
	 * @return answer to the slot request
	 */
@RpcMethod
public TMSlotRequestReply requestSlot(final SlotID slotId, final JobID jobId, final AllocationID allocationId, final String targetAddress, final UUID rmLeaderId) throws SlotAllocationException {
    log.info("Receive slot request {} for job {} from resource manager with leader id {}.", allocationId, jobId, rmLeaderId);
    if (resourceManagerConnection == null) {
        final String message = "TaskManager is not connected to a resource manager.";
        log.debug(message);
        throw new SlotAllocationException(message);
    }
    if (!resourceManagerConnection.getTargetLeaderId().equals(rmLeaderId)) {
        final String message = "The leader id " + rmLeaderId + " does not match with the leader id of the connected resource manager " + resourceManagerConnection.getTargetLeaderId() + '.';
        log.debug(message);
        throw new SlotAllocationException(message);
    }
    if (taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
        if (taskSlotTable.allocateSlot(slotId.getSlotNumber(), jobId, allocationId, taskManagerConfiguration.getTimeout())) {
            log.info("Allocated slot for {}.", allocationId);
        } else {
            log.info("Could not allocate slot for {}.", allocationId);
            throw new SlotAllocationException("Could not allocate slot.");
        }
    } else if (!taskSlotTable.isAllocated(slotId.getSlotNumber(), jobId, allocationId)) {
        final String message = "The slot " + slotId + " has already been allocated for a different job.";
        log.info(message);
        throw new SlotAllocationException(message);
    }
    if (jobManagerTable.contains(jobId)) {
        offerSlotsToJobManager(jobId);
    } else {
        try {
            jobLeaderService.addJob(jobId, targetAddress);
        } catch (Exception e) {
            // free the allocated slot
            try {
                taskSlotTable.freeSlot(allocationId);
            } catch (SlotNotFoundException slotNotFoundException) {
                // slot no longer existent, this should actually never happen, because we've
                // just allocated the slot. So let's fail hard in this case!
                onFatalError(slotNotFoundException);
            }
            // sanity check
            if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
                onFatalError(new Exception("Could not free slot " + slotId));
            }
            throw new SlotAllocationException("Could not add job to job leader service.", e);
        }
    }
    return new TMSlotRequestRegistered(resourceManagerConnection.getRegistrationId(), getResourceID(), allocationId);
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) TimeoutException(java.util.concurrent.TimeoutException) PartitionException(org.apache.flink.runtime.taskexecutor.exceptions.PartitionException) CheckpointException(org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 2 with SlotAllocationException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.

the class TaskExecutor method tryLoadLocalAllocationSnapshots.

/**
 * This method tries to repopulate the {@link JobTable} and {@link TaskSlotTable} from the local
 * filesystem in a best-effort manner.
 */
private void tryLoadLocalAllocationSnapshots() {
    Collection<SlotAllocationSnapshot> slotAllocationSnapshots = slotAllocationSnapshotPersistenceService.loadAllocationSnapshots();
    log.debug("Recovered slot allocation snapshots {}.", slotAllocationSnapshots);
    final Set<AllocationID> allocatedSlots = new HashSet<>();
    for (SlotAllocationSnapshot slotAllocationSnapshot : slotAllocationSnapshots) {
        try {
            allocateSlotForJob(slotAllocationSnapshot.getJobId(), slotAllocationSnapshot.getSlotID(), slotAllocationSnapshot.getAllocationId(), slotAllocationSnapshot.getResourceProfile(), slotAllocationSnapshot.getJobTargetAddress());
        } catch (SlotAllocationException e) {
            log.debug("Cannot reallocate restored slot {}.", slotAllocationSnapshot, e);
        }
        allocatedSlots.add(slotAllocationSnapshot.getAllocationId());
    }
    localStateStoresManager.retainLocalStateForAllocations(allocatedSlots);
}
Also used : SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotAllocationSnapshot(org.apache.flink.runtime.taskexecutor.slot.SlotAllocationSnapshot) HashSet(java.util.HashSet)

Example 3 with SlotAllocationException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.

the class DeclarativeSlotManagerTest method testSlotRequestFailure.

/**
 * Tests that the SlotManager retries allocating a slot if the TaskExecutor#requestSlot call
 * fails.
 */
@Test
public void testSlotRequestFailure() throws Exception {
    final DefaultSlotTracker slotTracker = new DefaultSlotTracker();
    try (final DeclarativeSlotManager slotManager = createDeclarativeSlotManagerBuilder().setSlotTracker(slotTracker).buildAndStartWithDirectExec()) {
        ResourceRequirements requirements = createResourceRequirementsForSingleSlot();
        slotManager.processResourceRequirements(requirements);
        final BlockingQueue<Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>> requestSlotQueue = new ArrayBlockingQueue<>(1);
        final BlockingQueue<CompletableFuture<Acknowledge>> responseQueue = new ArrayBlockingQueue<>(2);
        final CompletableFuture<Acknowledge> firstManualSlotRequestResponse = new CompletableFuture<>();
        responseQueue.offer(firstManualSlotRequestResponse);
        final CompletableFuture<Acknowledge> secondManualSlotRequestResponse = new CompletableFuture<>();
        responseQueue.offer(secondManualSlotRequestResponse);
        final TestingTaskExecutorGateway testingTaskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setRequestSlotFunction(slotIDJobIDAllocationIDStringResourceManagerIdTuple6 -> {
            requestSlotQueue.offer(slotIDJobIDAllocationIDStringResourceManagerIdTuple6);
            try {
                return responseQueue.take();
            } catch (InterruptedException ignored) {
                return FutureUtils.completedExceptionally(new FlinkException("Response queue was interrupted."));
            }
        }).createTestingTaskExecutorGateway();
        final ResourceID taskExecutorResourceId = ResourceID.generate();
        final TaskExecutorConnection taskExecutionConnection = new TaskExecutorConnection(taskExecutorResourceId, testingTaskExecutorGateway);
        final SlotReport slotReport = new SlotReport(createFreeSlotStatus(new SlotID(taskExecutorResourceId, 0)));
        slotManager.registerTaskManager(taskExecutionConnection, slotReport, ResourceProfile.ANY, ResourceProfile.ANY);
        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> firstRequest = requestSlotQueue.take();
        // fail first request
        firstManualSlotRequestResponse.completeExceptionally(new SlotAllocationException("Test exception"));
        final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> secondRequest = requestSlotQueue.take();
        assertThat(secondRequest.f1, equalTo(firstRequest.f1));
        assertThat(secondRequest.f0, equalTo(firstRequest.f0));
        secondManualSlotRequestResponse.complete(Acknowledge.get());
        final DeclarativeTaskManagerSlot slot = slotTracker.getSlot(secondRequest.f0);
        assertThat(slot.getState(), equalTo(SlotState.ALLOCATED));
        assertThat(slot.getJobId(), equalTo(secondRequest.f1));
    }
}
Also used : ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) Arrays(java.util.Arrays) CoreMatchers.hasItem(org.hamcrest.CoreMatchers.hasItem) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple6(org.apache.flink.api.java.tuple.Tuple6) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) TimeoutException(java.util.concurrent.TimeoutException) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) Assert.assertThat(org.junit.Assert.assertThat) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FunctionUtils(org.apache.flink.util.function.FunctionUtils) TestLogger(org.apache.flink.util.TestLogger) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Collection(java.util.Collection) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) Set(java.util.Set) BlockingQueue(java.util.concurrent.BlockingQueue) SlotManagerMetricGroup(org.apache.flink.runtime.metrics.groups.SlotManagerMetricGroup) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TestingUtils(org.apache.flink.testutils.TestingUtils) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) List(java.util.List) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) Assert.assertFalse(org.junit.Assert.assertFalse) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) Matchers.is(org.hamcrest.Matchers.is) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) FlinkException(org.apache.flink.util.FlinkException) WorkerResourceSpec(org.apache.flink.runtime.resourcemanager.WorkerResourceSpec) CoreMatchers.not(org.hamcrest.CoreMatchers.not) CompletableFuture(java.util.concurrent.CompletableFuture) Supplier(java.util.function.Supplier) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Assert.assertSame(org.junit.Assert.assertSame) ManuallyTriggeredScheduledExecutorService(org.apache.flink.core.testutils.ManuallyTriggeredScheduledExecutorService) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) Matchers.hasSize(org.hamcrest.Matchers.hasSize) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) ThrowingConsumer(org.apache.flink.util.function.ThrowingConsumer) Matchers.empty(org.hamcrest.Matchers.empty) Iterator(java.util.Iterator) Executor(java.util.concurrent.Executor) Assert.assertNotNull(org.junit.Assert.assertNotNull) Assert.assertTrue(org.junit.Assert.assertTrue) SystemExitTrackingSecurityManager(org.apache.flink.runtime.testutils.SystemExitTrackingSecurityManager) Test(org.junit.Test) InstanceID(org.apache.flink.runtime.instance.InstanceID) Iterators(org.apache.flink.shaded.guava30.com.google.common.collect.Iterators) TimeUnit(java.util.concurrent.TimeUnit) JobID(org.apache.flink.api.common.JobID) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) TaskExecutorConnection(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) FlinkException(org.apache.flink.util.FlinkException) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) Tuple6(org.apache.flink.api.java.tuple.Tuple6) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 4 with SlotAllocationException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.

the class TaskExecutor method requestSlot.

// ----------------------------------------------------------------------
// Slot allocation RPCs
// ----------------------------------------------------------------------
@Override
public CompletableFuture<Acknowledge> requestSlot(final SlotID slotId, final JobID jobId, final AllocationID allocationId, final ResourceProfile resourceProfile, final String targetAddress, final ResourceManagerId resourceManagerId, final Time timeout) {
    // TODO: Filter invalid requests from the resource manager by using the
    // instance/registration Id
    log.info("Receive slot request {} for job {} from resource manager with leader id {}.", allocationId, jobId, resourceManagerId);
    if (!isConnectedToResourceManager(resourceManagerId)) {
        final String message = String.format("TaskManager is not connected to the resource manager %s.", resourceManagerId);
        log.debug(message);
        return FutureUtils.completedExceptionally(new TaskManagerException(message));
    }
    tryPersistAllocationSnapshot(new SlotAllocationSnapshot(slotId, jobId, targetAddress, allocationId, resourceProfile));
    try {
        final boolean isConnected = allocateSlotForJob(jobId, slotId, allocationId, resourceProfile, targetAddress);
        if (isConnected) {
            offerSlotsToJobManager(jobId);
        }
        return CompletableFuture.completedFuture(Acknowledge.get());
    } catch (SlotAllocationException e) {
        log.debug("Could not allocate slot for allocation id {}.", allocationId, e);
        return FutureUtils.completedExceptionally(e);
    }
}
Also used : TaskManagerException(org.apache.flink.runtime.taskexecutor.exceptions.TaskManagerException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) SlotAllocationSnapshot(org.apache.flink.runtime.taskexecutor.slot.SlotAllocationSnapshot)

Example 5 with SlotAllocationException

use of org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException in project flink by apache.

the class TaskExecutor method allocateSlotForJob.

private boolean allocateSlotForJob(JobID jobId, SlotID slotId, AllocationID allocationId, ResourceProfile resourceProfile, String targetAddress) throws SlotAllocationException {
    allocateSlot(slotId, jobId, allocationId, resourceProfile);
    final JobTable.Job job;
    try {
        job = jobTable.getOrCreateJob(jobId, () -> registerNewJobAndCreateServices(jobId, targetAddress));
    } catch (Exception e) {
        // free the allocated slot
        try {
            taskSlotTable.freeSlot(allocationId);
        } catch (SlotNotFoundException slotNotFoundException) {
            // slot no longer existent, this should actually never happen, because we've
            // just allocated the slot. So let's fail hard in this case!
            onFatalError(slotNotFoundException);
        }
        // release local state under the allocation id.
        localStateStoresManager.releaseLocalStateForAllocationId(allocationId);
        // sanity check
        if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
            onFatalError(new Exception("Could not free slot " + slotId));
        }
        throw new SlotAllocationException("Could not create new job.", e);
    }
    return job.isConnected();
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) SlotOccupiedException(org.apache.flink.runtime.taskexecutor.exceptions.SlotOccupiedException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) FlinkException(org.apache.flink.util.FlinkException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException) TimeoutException(java.util.concurrent.TimeoutException) RegistrationTimeoutException(org.apache.flink.runtime.taskexecutor.exceptions.RegistrationTimeoutException) CompletionException(java.util.concurrent.CompletionException) TaskManagerException(org.apache.flink.runtime.taskexecutor.exceptions.TaskManagerException)

Aggregations

SlotAllocationException (org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException)7 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)3 CompletableFuture (java.util.concurrent.CompletableFuture)3 TimeoutException (java.util.concurrent.TimeoutException)3 JobID (org.apache.flink.api.common.JobID)3 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)3 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)3 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)3 TaskExecutorConnection (org.apache.flink.runtime.resourcemanager.registration.TaskExecutorConnection)3 ResourceRequirements (org.apache.flink.runtime.slots.ResourceRequirements)3 SlotReport (org.apache.flink.runtime.taskexecutor.SlotReport)3 IOException (java.io.IOException)2 HashSet (java.util.HashSet)2 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)2 TestingTaskExecutorGateway (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway)2 TestingTaskExecutorGatewayBuilder (org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder)2 TaskManagerException (org.apache.flink.runtime.taskexecutor.exceptions.TaskManagerException)2 Test (org.junit.Test)2 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1