Search in sources :

Example 1 with TMSlotRequestRejected

use of org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected in project flink by apache.

the class SlotManager method sendSlotRequest.

/**
 * Sends the given slot request to the task executor that owns {@code freeSlot} and registers a
 * callback, run on the resource manager's main thread executor, that evaluates the outcome.
 *
 * <p>The callback distinguishes three situations:
 * <ul>
 *   <li>the request completed without a reply (the future failed, e.g. with a timeout): the
 *       request is handed to {@code handleSlotRequestFailedAtTaskManager}, provided the task
 *       executor the request was sent to is still the registered one;</li>
 *   <li>the task executor answered with a {@link TMSlotRequestRejected}: handled as a failure,
 *       provided the answering task executor is still the registered one;</li>
 *   <li>any other reply from the currently registered task executor: only logged.</li>
 * </ul>
 * Outcomes belonging to a task executor that is no longer registered (or that is registered
 * under a different instance id) are discarded.
 *
 * @param freeSlot    the slot that should serve the request
 * @param slotRequest the request to forward to the slot's task executor
 */
private void sendSlotRequest(final ResourceSlot freeSlot, final SlotRequest slotRequest) {
    final AllocationID allocationID = slotRequest.getAllocationId();
    final TaskExecutorRegistration registration = freeSlot.getTaskExecutorRegistration();
    final Future<TMSlotRequestReply> slotRequestReplyFuture = registration.getTaskExecutorGateway().requestSlot(
        freeSlot.getSlotId(),
        slotRequest.getJobId(),
        allocationID,
        // TODO: set proper JM address
        "foobar",
        rmServices.getLeaderID(),
        timeout);
    slotRequestReplyFuture.handleAsync(new BiFunction<TMSlotRequestReply, Throwable, Void>() {

        @Override
        public Void apply(TMSlotRequestReply slotRequestReply, Throwable throwable) {
            if (throwable != null || slotRequestReply == null) {
                // A failed future carries no reply, so the reply must not be dereferenced here.
                // Identify the task executor through the slot and registration the request was
                // sent to instead.
                TaskExecutorRegistration current = taskManagers.get(freeSlot.getSlotId().getResourceID());
                if (current != null && current.getInstanceID().equals(registration.getInstanceID())) {
                    LOG.debug("Slot request for slot {} failed without a reply.", freeSlot.getSlotId(), throwable);
                    handleSlotRequestFailedAtTaskManager(slotRequest, freeSlot.getSlotId());
                } else {
                    LOG.debug("Discarding message from obsolete TaskExecutor with InstanceID {}", registration.getInstanceID());
                }
                return null;
            }

            // A reply is available: validate it against the currently registered task executor.
            TaskExecutorRegistration current = taskManagers.get(slotRequestReply.getResourceID());
            if (current != null && current.getInstanceID().equals(slotRequestReply.getInstanceID())) {
                if (slotRequestReply instanceof TMSlotRequestRejected) {
                    handleSlotRequestFailedAtTaskManager(slotRequest, freeSlot.getSlotId());
                } else {
                    LOG.debug("Successfully registered slot {} ", freeSlot.getSlotId());
                }
            } else {
                LOG.debug("Discarding message from obsolete TaskExecutor with InstanceID {}", slotRequestReply.getInstanceID());
            }
            return null;
        }
    }, rmServices.getMainThreadExecutor());
}
Also used : TMSlotRequestRejected(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected) TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TaskExecutorRegistration(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorRegistration) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID)

Example 2 with TMSlotRequestRejected

use of org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected in project flink by apache.

the class TaskExecutorTest method testRejectAllocationRequestsForOutOfSyncSlots.

/**
 * Verifies that the TaskExecutor turns down allocation requests for a slot which it has
 * reported as free as long as the ResourceManager has not confirmed that report.
 *
 * <p>The ResourceManager depends on this behaviour to keep its own slot state correct.
 */
@Ignore
@Test
public void testRejectAllocationRequestsForOutOfSyncSlots() throws Exception {
    final String resourceManagerAddress = "/resource/manager/address/one";
    final String jmAddress = "foobar";
    final ResourceID taskExecutorResourceId = ResourceID.generate();
    final UUID rmLeaderId = UUID.randomUUID();
    final JobID job = new JobID();
    final TestingSerialRpcService rpcService = new TestingSerialRpcService();
    try {
        // make a mocked resource manager gateway reachable under the test address
        ResourceManagerGateway resourceManagerGateway = mock(ResourceManagerGateway.class);
        rpcService.registerGateway(resourceManagerAddress, resourceManagerGateway);

        // leader retrieval is driven manually through the testing service
        TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
        TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
        highAvailabilityServices.setResourceManagerLeaderRetriever(leaderRetrievalService);

        // the task executor is configured with a single slot
        TaskManagerConfiguration configuration = mock(TaskManagerConfiguration.class);
        when(configuration.getNumberSlots()).thenReturn(1);
        TaskManagerLocation location = mock(TaskManagerLocation.class);
        when(location.getResourceID()).thenReturn(taskExecutorResourceId);

        final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();
        TaskExecutor taskExecutor = new TaskExecutor(
            configuration,
            location,
            rpcService,
            mock(MemoryManager.class),
            mock(IOManager.class),
            mock(NetworkEnvironment.class),
            highAvailabilityServices,
            mock(HeartbeatServices.class, RETURNS_MOCKS),
            mock(MetricRegistry.class),
            mock(TaskManagerMetricGroup.class),
            mock(BroadcastVariableManager.class),
            mock(FileCache.class),
            mock(TaskSlotTable.class),
            mock(JobManagerTable.class),
            mock(JobLeaderService.class),
            fatalErrorHandler);
        taskExecutor.start();
        String taskExecutorAddress = taskExecutor.getAddress();

        // without a known leader there must not be a resource manager connection
        assertNull(taskExecutor.getResourceManagerConnection());

        // announcing a leader has to trigger the registration at the resource manager
        leaderRetrievalService.notifyListener(resourceManagerAddress, rmLeaderId);
        verify(resourceManagerGateway).registerTaskExecutor(
            eq(rmLeaderId),
            eq(taskExecutorAddress),
            eq(taskExecutorResourceId),
            any(SlotReport.class),
            any(Time.class));
        assertNotNull(taskExecutor.getResourceManagerConnection());

        // a request for the first slot is expected to be accepted
        final SlotID firstSlot = new SlotID(taskExecutorResourceId, 0);
        TMSlotRequestReply firstReply = taskExecutor.requestSlot(
            firstSlot, job, new AllocationID(), jmAddress, rmLeaderId);
        assertTrue(firstReply instanceof TMSlotRequestRegistered);

        // TODO: Figure out the concrete allocation behaviour between RM and TM. Maybe we don't need the SlotID...
        // a slot whose free-report still awaits confirmation by the RM must be rejected
        final SlotID pendingSlot = new SlotID(taskExecutorResourceId, 1);
        TMSlotRequestReply rejectedReply = taskExecutor.requestSlot(
            pendingSlot, job, new AllocationID(), jmAddress, rmLeaderId);
        assertTrue(rejectedReply instanceof TMSlotRequestRejected);

        // re-register
        verify(resourceManagerGateway).registerTaskExecutor(
            eq(rmLeaderId),
            eq(taskExecutorAddress),
            eq(taskExecutorResourceId),
            any(SlotReport.class),
            any(Time.class));
        leaderRetrievalService.notifyListener(resourceManagerAddress, rmLeaderId);

        // the slot status should be synced by now, so the same slot is expected to be granted
        TMSlotRequestReply acceptedReply = taskExecutor.requestSlot(
            pendingSlot, job, new AllocationID(), jmAddress, rmLeaderId);
        assertTrue(acceptedReply instanceof TMSlotRequestRegistered);

        // surface any error that was reported concurrently to the fatal error handler
        fatalErrorHandler.rethrowError();
    } finally {
        rpcService.stopService();
    }
}
Also used : TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) Time(org.apache.flink.api.common.time.Time) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) TMSlotRequestRejected(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) UUID(java.util.UUID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)2 TMSlotRequestRejected (org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected)2 TMSlotRequestReply (org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply)2 UUID (java.util.UUID)1 JobID (org.apache.flink.api.common.JobID)1 Time (org.apache.flink.api.common.time.Time)1 BroadcastVariableManager (org.apache.flink.runtime.broadcast.BroadcastVariableManager)1 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)1 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)1 FileCache (org.apache.flink.runtime.filecache.FileCache)1 HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)1 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)1 IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager)1 NetworkEnvironment (org.apache.flink.runtime.io.network.NetworkEnvironment)1 TestingLeaderRetrievalService (org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService)1 MemoryManager (org.apache.flink.runtime.memory.MemoryManager)1 MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry)1 TaskManagerMetricGroup (org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup)1 ResourceManagerGateway (org.apache.flink.runtime.resourcemanager.ResourceManagerGateway)1 TMSlotRequestRegistered (org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered)1