Search in sources :

Example 21 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class TaskExecutorTest method testIgnoringSlotRequestsIfNotRegistered.

/**
 * Verifies that a slot request sent to the TaskExecutor is answered exceptionally with a
 * {@code TaskManagerException} as long as the registration at the ResourceManager has
 * not been completed.
 */
@Test
public void testIgnoringSlotRequestsIfNotRegistered() throws Exception {
    final TaskExecutor executorUnderTest = createTaskExecutor(1);
    executorUnderTest.start();
    try {
        final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
        // This future is never completed in this test, so the registration stays pending.
        final CompletableFuture<RegistrationResponse> pendingRegistration = new CompletableFuture<>();
        // Completed with the resource id carried by the registration call.
        final CompletableFuture<ResourceID> announcedResourceId = new CompletableFuture<>();
        rmGateway.setRegisterTaskExecutorFunction(registration -> {
            announcedResourceId.complete(registration.getResourceId());
            return pendingRegistration;
        });
        rpc.registerGateway(rmGateway.getAddress(), rmGateway);
        resourceManagerLeaderRetriever.notifyListener(rmGateway.getAddress(), rmGateway.getFencingToken().toUUID());
        final TaskExecutorGateway selfGateway = executorUnderTest.getSelfGateway(TaskExecutorGateway.class);
        // Blocks until the registration function above has been invoked.
        final ResourceID executorId = announcedResourceId.get();
        final SlotID requestedSlot = new SlotID(executorId, 0);
        final AllocationID allocationId = new AllocationID();
        final CompletableFuture<Acknowledge> slotResponse = selfGateway.requestSlot(requestedSlot, jobId, allocationId, ResourceProfile.ZERO, "foobar", rmGateway.getFencingToken(), timeout);
        try {
            slotResponse.get();
            fail("We should not be able to request slots before the TaskExecutor is registered at the ResourceManager.");
        } catch (ExecutionException expected) {
            // The request must fail with a TaskManagerException once the ExecutionException wrapper is stripped.
            assertThat(ExceptionUtils.stripExecutionException(expected), instanceOf(TaskManagerException.class));
        }
    } finally {
        RpcUtils.terminateRpcEndpoint(executorUnderTest, timeout);
    }
}
Also used : Acknowledge(org.apache.flink.runtime.messages.Acknowledge) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) CompletableFuture(java.util.concurrent.CompletableFuture) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Example 22 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class TaskExecutorTest method testInitialSlotReportFailure.

/**
 * Tests that the {@link TaskExecutor} tries to reconnect if the initial slot report fails.
 *
 * <p>The testing ResourceManager gateway answers the first slot report with a failed future
 * and the second one with an acknowledgement. The test passes once the register function
 * has been invoked twice.
 */
@Test
public void testInitialSlotReportFailure() throws Exception {
    final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(1);
    final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();
    final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setTaskSlotTable(taskSlotTable).setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).build();
    final TaskExecutor taskExecutor = createTaskExecutor(taskManagerServices);
    taskExecutor.start();
    try {
        final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
        // Responses handed out, in FIFO order, to successive slot report calls.
        final BlockingQueue<CompletableFuture<Acknowledge>> responseQueue = new ArrayBlockingQueue<>(2);
        testingResourceManagerGateway.setSendSlotReportFunction(resourceIDInstanceIDSlotReportTuple3 -> {
            try {
                return responseQueue.take();
            } catch (InterruptedException e) {
                // Restore the interrupt status so the calling thread can still observe the interruption.
                Thread.currentThread().interrupt();
                return FutureUtils.completedExceptionally(e);
            }
        });
        final CompletableFuture<RegistrationResponse> registrationResponse = CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), testingResourceManagerGateway.getOwnResourceId(), new ClusterInformation("foobar", 1234)));
        // Counts invocations of the register function; two are expected (initial + retry).
        final CountDownLatch numberRegistrations = new CountDownLatch(2);
        testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> {
            numberRegistrations.countDown();
            return registrationResponse;
        });
        // first slot report fails, the second one is acknowledged
        responseQueue.offer(FutureUtils.completedExceptionally(new FlinkException("Test exception")));
        responseQueue.offer(CompletableFuture.completedFuture(Acknowledge.get()));
        rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
        resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
        // wait for the second registration attempt
        numberRegistrations.await();
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
Also used : Task(org.apache.flink.runtime.taskmanager.Task) InstanceID(org.apache.flink.runtime.instance.InstanceID) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) CountDownLatch(java.util.concurrent.CountDownLatch) ClusterInformation(org.apache.flink.runtime.entrypoint.ClusterInformation) FlinkException(org.apache.flink.util.FlinkException) CompletableFuture(java.util.concurrent.CompletableFuture) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) TestingResourceManagerGateway(org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway) LocalUnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) Test(org.junit.Test)

Example 23 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class ResourceManagerTaskExecutorTest method testRegisterTaskExecutorWithUnmatchedLeaderSessionId.

/**
 * Checks that a task executor registration carrying a leader session id that differs from
 * the one used by the resource manager is answered with a
 * {@code RegistrationResponse.Decline}.
 */
@Test
public void testRegisterTaskExecutorWithUnmatchedLeaderSessionId() throws Exception {
    try {
        // register with a freshly generated leader session id and expect the registration to be declined
        final UUID foreignLeaderSessionId = UUID.randomUUID();
        final Future<RegistrationResponse> declinedRegistration = resourceManager.registerTaskExecutor(foreignLeaderSessionId, taskExecutorAddress, taskExecutorResourceID, slotReport);
        final RegistrationResponse response = declinedRegistration.get(5, TimeUnit.SECONDS);
        assertTrue(response instanceof RegistrationResponse.Decline);
    } finally {
        // surface any fatal error reported to the handler while the test was running
        if (testingFatalErrorHandler.hasExceptionOccurred()) {
            testingFatalErrorHandler.rethrowError();
        }
    }
}
Also used : UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) Test(org.junit.Test)

Example 24 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class TaskExecutorITCase method testSlotAllocation.

/**
 * Wires a {@code StandaloneResourceManager} and a {@code TaskExecutor} together over a
 * {@code TestingSerialRpcService}, registers a job manager at the resource manager, requests
 * a slot and verifies that the mocked job master gateway is offered the matching slot.
 */
@Test
public void testSlotAllocation() throws Exception {
    TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
    TestingHighAvailabilityServices testingHAServices = new TestingHighAvailabilityServices();
    final Configuration configuration = new Configuration();
    // backs the TimerService of the TaskSlotTable; shut down in the finally block below
    final ScheduledExecutorService scheduledExecutorService = new ScheduledThreadPoolExecutor(1);
    final ResourceID taskManagerResourceId = new ResourceID("foobar");
    final UUID rmLeaderId = UUID.randomUUID();
    final TestingLeaderElectionService rmLeaderElectionService = new TestingLeaderElectionService();
    final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
    final String rmAddress = "rm";
    final String jmAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    final JobID jobId = new JobID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 1);
    testingHAServices.setResourceManagerLeaderElectionService(rmLeaderElectionService);
    testingHAServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
    testingHAServices.setJobMasterLeaderRetriever(jobId, new TestingLeaderRetrievalService(jmAddress, jmLeaderId));
    TestingSerialRpcService rpcService = new TestingSerialRpcService();
    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.milliseconds(500L), Time.milliseconds(500L), Time.minutes(5L));
    SlotManagerFactory slotManagerFactory = new DefaultSlotManager.Factory();
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHAServices, rpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    MetricRegistry metricRegistry = mock(MetricRegistry.class);
    HeartbeatServices heartbeatServices = mock(HeartbeatServices.class, RETURNS_MOCKS);
    final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
    final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(taskManagerResourceId, InetAddress.getLocalHost(), 1234);
    final MemoryManager memoryManager = mock(MemoryManager.class);
    final IOManager ioManager = mock(IOManager.class);
    final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
    final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
    final BroadcastVariableManager broadcastVariableManager = mock(BroadcastVariableManager.class);
    final FileCache fileCache = mock(FileCache.class);
    final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(resourceProfile), new TimerService<AllocationID>(scheduledExecutorService, 100L));
    final JobManagerTable jobManagerTable = new JobManagerTable();
    final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
    ResourceManager<ResourceID> resourceManager = new StandaloneResourceManager(rpcService, resourceManagerConfiguration, testingHAServices, slotManagerFactory, metricRegistry, jobLeaderIdService, testingFatalErrorHandler);
    TaskExecutor taskExecutor = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpcService, memoryManager, ioManager, networkEnvironment, testingHAServices, heartbeatServices, metricRegistry, taskManagerMetricGroup, broadcastVariableManager, fileCache, taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
    // mocked job master that accepts the task manager registration
    JobMasterGateway jmGateway = mock(JobMasterGateway.class);
    when(jmGateway.registerTaskManager(any(String.class), any(TaskManagerLocation.class), eq(jmLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(taskManagerResourceId, 1234)));
    when(jmGateway.getHostname()).thenReturn(jmAddress);
    rpcService.registerGateway(rmAddress, resourceManager.getSelf());
    rpcService.registerGateway(jmAddress, jmGateway);
    final AllocationID allocationId = new AllocationID();
    final SlotRequest slotRequest = new SlotRequest(jobId, allocationId, resourceProfile);
    final SlotOffer slotOffer = new SlotOffer(allocationId, 0, resourceProfile);
    try {
        resourceManager.start();
        taskExecutor.start();
        // notify the RM that it is the leader
        rmLeaderElectionService.isLeader(rmLeaderId);
        // notify the TM about the new RM leader
        rmLeaderRetrievalService.notifyListener(rmAddress, rmLeaderId);
        Future<RegistrationResponse> registrationResponseFuture = resourceManager.registerJobManager(rmLeaderId, jmLeaderId, jmAddress, jobId);
        RegistrationResponse registrationResponse = registrationResponseFuture.get();
        assertTrue(registrationResponse instanceof JobMasterRegistrationSuccess);
        resourceManager.requestSlot(jmLeaderId, rmLeaderId, slotRequest);
        // the job master must be offered exactly the slot that was requested
        verify(jmGateway).offerSlots(eq(taskManagerResourceId), (Iterable<SlotOffer>) argThat(Matchers.contains(slotOffer)), eq(jmLeaderId), any(Time.class));
    } finally {
        // release the timer thread pool; nothing else in this test shuts it down
        scheduledExecutorService.shutdownNow();
        if (testingFatalErrorHandler.hasExceptionOccurred()) {
            testingFatalErrorHandler.rethrowError();
        }
    }
}
Also used : ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) Configuration(org.apache.flink.configuration.Configuration) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) Time(org.apache.flink.api.common.time.Time) StandaloneResourceManager(org.apache.flink.runtime.resourcemanager.StandaloneResourceManager) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) SlotRequest(org.apache.flink.runtime.resourcemanager.SlotRequest) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) JMTMRegistrationSuccess(org.apache.flink.runtime.jobmaster.JMTMRegistrationSuccess) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) JobMasterRegistrationSuccess(org.apache.flink.runtime.jobmaster.JobMasterRegistrationSuccess) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) 
TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 25 with RegistrationResponse

use of org.apache.flink.runtime.registration.RegistrationResponse in project flink by apache.

the class SlotProtocolTest method testSlotsUnavailableRequest.

/**
 * Tests whether
 * 1) SlotRequest is routed to the SlotManager
 * 2) SlotRequest is confirmed
 * 3) SlotRequest leads to a container allocation
 * 4) Slot becomes available and TaskExecutor gets a SlotRequest
 */
@Test
public void testSlotsUnavailableRequest() throws Exception {
    final String rmAddress = "/rm1";
    final String jmAddress = "/jm1";
    final JobID jobID = new JobID();
    testRpcService.registerGateway(jmAddress, mock(JobMasterGateway.class));
    final TestingHighAvailabilityServices testingHaServices = new TestingHighAvailabilityServices();
    final UUID rmLeaderID = UUID.randomUUID();
    final UUID jmLeaderID = UUID.randomUUID();
    TestingLeaderElectionService rmLeaderElectionService = configureHA(testingHaServices, jobID, rmAddress, rmLeaderID, jmAddress, jmLeaderID);
    ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.seconds(5L), Time.seconds(5L), Time.minutes(5L));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHaServices, testRpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
    final TestingSlotManagerFactory slotManagerFactory = new TestingSlotManagerFactory();
    SpiedResourceManager resourceManager = new SpiedResourceManager(testRpcService, resourceManagerConfiguration, testingHaServices, slotManagerFactory, mock(MetricRegistry.class), jobLeaderIdService, mock(FatalErrorHandler.class));
    resourceManager.start();
    rmLeaderElectionService.isLeader(rmLeaderID);
    Future<RegistrationResponse> registrationFuture = resourceManager.registerJobManager(rmLeaderID, jmLeaderID, jmAddress, jobID);
    try {
        registrationFuture.get(5, TimeUnit.SECONDS);
    } catch (Exception e) {
        // fail with the original message but keep the cause so the reason is visible in the report
        throw new AssertionError("JobManager registration Future didn't become ready.", e);
    }
    final SlotManager slotManager = slotManagerFactory.slotManager;
    final AllocationID allocationID = new AllocationID();
    final ResourceProfile resourceProfile = new ResourceProfile(1.0, 100);
    SlotRequest slotRequest = new SlotRequest(jobID, allocationID, resourceProfile);
    RMSlotRequestReply slotRequestReply = resourceManager.requestSlot(jmLeaderID, rmLeaderID, slotRequest);
    // 1) SlotRequest is routed to the SlotManager
    verify(slotManager).requestSlot(slotRequest);
    // 2) SlotRequest is confirmed (expected value first, actual value second)
    Assert.assertEquals(allocationID, slotRequestReply.getAllocationID());
    // 3) SlotRequest leads to a container allocation
    Assert.assertEquals(1, resourceManager.startNewWorkerCalled);
    Assert.assertFalse(slotManager.isAllocated(allocationID));
    // slot becomes available
    final String tmAddress = "/tm1";
    TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
    // the mocked gateway answers slot requests with a future that is never completed in this test
    Mockito.when(taskExecutorGateway.requestSlot(any(SlotID.class), any(JobID.class), any(AllocationID.class), any(String.class), any(UUID.class), any(Time.class))).thenReturn(new FlinkCompletableFuture<TMSlotRequestReply>());
    testRpcService.registerGateway(tmAddress, taskExecutorGateway);
    final ResourceID resourceID = ResourceID.generate();
    final SlotID slotID = new SlotID(resourceID, 0);
    final SlotStatus slotStatus = new SlotStatus(slotID, resourceProfile);
    final SlotReport slotReport = new SlotReport(Collections.singletonList(slotStatus));
    // register slot at SlotManager
    slotManager.registerTaskExecutor(resourceID, new TaskExecutorRegistration(taskExecutorGateway), slotReport);
    // 4) Slot becomes available and TaskExecutor gets a SlotRequest
    verify(taskExecutorGateway, timeout(5000)).requestSlot(eq(slotID), eq(jobID), eq(allocationID), any(String.class), any(UUID.class), any(Time.class));
}
Also used : TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TaskExecutorRegistration(org.apache.flink.runtime.resourcemanager.registration.TaskExecutorRegistration) JobLeaderIdService(org.apache.flink.runtime.resourcemanager.JobLeaderIdService) Time(org.apache.flink.api.common.time.Time) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) SlotRequest(org.apache.flink.runtime.resourcemanager.SlotRequest) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) RMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.jobmanager.RMSlotRequestReply) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TestingSlotManager(org.apache.flink.runtime.resourcemanager.TestingSlotManager) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)40 Test (org.junit.Test)35 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)23 CompletableFuture (java.util.concurrent.CompletableFuture)18 UUID (java.util.UUID)14 JobID (org.apache.flink.api.common.JobID)14 ArrayList (java.util.ArrayList)12 FlinkException (org.apache.flink.util.FlinkException)11 Time (org.apache.flink.api.common.time.Time)10 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)10 ResourceProfile (org.apache.flink.runtime.clusterframework.types.ResourceProfile)10 TestingResourceManagerGateway (org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway)9 LocalUnresolvedTaskManagerLocation (org.apache.flink.runtime.taskmanager.LocalUnresolvedTaskManagerLocation)9 ExecutionException (java.util.concurrent.ExecutionException)8 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)8 UnresolvedTaskManagerLocation (org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation)8 TestingFatalErrorHandler (org.apache.flink.runtime.util.TestingFatalErrorHandler)8 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)7 TimeoutException (java.util.concurrent.TimeoutException)7 Configuration (org.apache.flink.configuration.Configuration)7