Search in sources :

Example 16 with TestingHighAvailabilityServices

use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.

the class TaskExecutorTest method testRejectAllocationRequestsForOutOfSyncSlots.

/**
 * Tests that all allocation requests for slots are ignored if the slot has been reported as
 * free by the TaskExecutor but this report hasn't been confirmed by the ResourceManager.
 *
 * <p>This is essential for the correctness of the state of the ResourceManager.
 *
 * <p>The test is currently disabled via {@code @Ignore}. It wires a {@link TaskExecutor} against
 * a mocked {@link ResourceManagerGateway}, announces a ResourceManager leader, and then issues
 * three slot requests whose replies are checked by type.
 *
 * @throws Exception if any step of the test setup or execution fails
 */
@Ignore
@Test
public void testRejectAllocationRequestsForOutOfSyncSlots() throws Exception {
    final ResourceID resourceID = ResourceID.generate();
    final String address1 = "/resource/manager/address/one";
    final UUID leaderId = UUID.randomUUID();
    final JobID jobId = new JobID();
    final String jobManagerAddress = "foobar";
    final TestingSerialRpcService rpc = new TestingSerialRpcService();
    try {
        // register the mock resource manager gateway under the address the leader will announce
        ResourceManagerGateway rmGateway1 = mock(ResourceManagerGateway.class);
        rpc.registerGateway(address1, rmGateway1);
        // the leader retrieval service lets the test decide when a ResourceManager leader appears
        TestingLeaderRetrievalService testLeaderService = new TestingLeaderRetrievalService();
        TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
        haServices.setResourceManagerLeaderRetriever(testLeaderService);
        // the mocked configuration reports a single slot
        TaskManagerConfiguration taskManagerServicesConfiguration = mock(TaskManagerConfiguration.class);
        when(taskManagerServicesConfiguration.getNumberSlots()).thenReturn(1);
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(resourceID);
        final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
        // all remaining collaborators are plain mocks; only the HA services and RPC service are test doubles with behavior
        TaskExecutor taskManager = new TaskExecutor(taskManagerServicesConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), mock(TaskSlotTable.class), mock(JobManagerTable.class), mock(JobLeaderService.class), testingFatalErrorHandler);
        taskManager.start();
        String taskManagerAddress = taskManager.getAddress();
        // no connection initially, since there is no leader
        assertNull(taskManager.getResourceManagerConnection());
        // define a leader and see that a registration happens
        testLeaderService.notifyListener(address1, leaderId);
        verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
        assertNotNull(taskManager.getResourceManagerConnection());
        // test that allocating a slot works
        final SlotID slotID = new SlotID(resourceID, 0);
        TMSlotRequestReply tmSlotRequestReply = taskManager.requestSlot(slotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply instanceof TMSlotRequestRegistered);
        // TODO: Figure out the concrete allocation behaviour between RM and TM. Maybe we don't need the SlotID...
        // test that we can't allocate slots which are blacklisted due to pending confirmation of the RM
        final SlotID unconfirmedFreeSlotID = new SlotID(resourceID, 1);
        TMSlotRequestReply tmSlotRequestReply2 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply2 instanceof TMSlotRequestRejected);
        // re-register
        // NOTE(review): this verify runs BEFORE the second notifyListener call below, so it can only
        // observe the registration triggered by the first notification — confirm whether it was meant
        // to follow the re-notification (and then expect a different invocation count).
        verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
        testLeaderService.notifyListener(address1, leaderId);
        // now we should be successful because the slots status has been synced:
        // the request for the previously rejected slot is expected to be registered this time
        TMSlotRequestReply tmSlotRequestReply3 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
        assertTrue(tmSlotRequestReply3 instanceof TMSlotRequestRegistered);
        // check if a concurrent error occurred
        testingFatalErrorHandler.rethrowError();
    } finally {
        // always shut the RPC service down, even if an assertion failed
        rpc.stopService();
    }
}
Also used : TMSlotRequestReply(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestReply) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) Time(org.apache.flink.api.common.time.Time) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) TMSlotRequestRejected(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRejected) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) BroadcastVariableManager(org.apache.flink.runtime.broadcast.BroadcastVariableManager) TestingSerialRpcService(org.apache.flink.runtime.rpc.TestingSerialRpcService) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) UUID(java.util.UUID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) FileCache(org.apache.flink.runtime.filecache.FileCache) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) TaskSlotTable(org.apache.flink.runtime.taskexecutor.slot.TaskSlotTable) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) JobID(org.apache.flink.api.common.JobID) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 17 with TestingHighAvailabilityServices

use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.

the class JobLeaderIdServiceTest method jobTimeoutAfterLostLeadership.

/**
 * Tests that a timeout gets cancelled once a job leader has been found. Furthermore, it tests
 * that a new timeout is registered after the JobManager has lost leadership.
 *
 * <p>The scheduled executor is mocked so that every {@code schedule} call records the submitted
 * runnable and hands back the next prepared {@link ScheduledFuture} mock. This lets the test run
 * the timeout runnables by hand and check which timeout id the service still accepts.
 *
 * @throws Exception if any step of the test setup or execution fails
 */
@Test(timeout = 10000)
public void jobTimeoutAfterLostLeadership() throws Exception {
    final JobID jobId = new JobID();
    final String address = "foobar";
    final UUID leaderId = UUID.randomUUID();
    TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);
    // one future per expected schedule call: initial timeout, then the timeout after leadership loss
    ScheduledFuture<?> timeout1 = mock(ScheduledFuture.class);
    ScheduledFuture<?> timeout2 = mock(ScheduledFuture.class);
    final Queue<ScheduledFuture<?>> timeoutQueue = new ArrayDeque<>(Arrays.asList(timeout1, timeout2));
    ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
    final AtomicReference<Runnable> lastRunnable = new AtomicReference<>();
    // remember the most recently scheduled runnable and return the next prepared future
    doAnswer(new Answer<ScheduledFuture<?>>() {

        @Override
        public ScheduledFuture<?> answer(InvocationOnMock invocation) throws Throwable {
            lastRunnable.set((Runnable) invocation.getArguments()[0]);
            return timeoutQueue.poll();
        }
    }).when(scheduledExecutor).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
    Time timeout = Time.milliseconds(5000L);
    JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);
    final AtomicReference<UUID> lastTimeoutId = new AtomicReference<>();
    // remember the timeout id passed to the most recent notifyJobTimeout call
    doAnswer(new Answer<Void>() {

        @Override
        public Void answer(InvocationOnMock invocation) throws Throwable {
            lastTimeoutId.set((UUID) invocation.getArguments()[1]);
            return null;
        }
    }).when(jobLeaderIdActions).notifyJobTimeout(eq(jobId), any(UUID.class));
    JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
    jobLeaderIdService.start(jobLeaderIdActions);
    jobLeaderIdService.addJob(jobId);
    Future<UUID> leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
    // notify the leader id service about the new leader
    leaderRetrievalService.notifyListener(address, leaderId);
    assertEquals(leaderId, leaderIdFuture.get());
    assertTrue(jobLeaderIdService.containsJob(jobId));
    // check that the first timeout got cancelled
    verify(timeout1, times(1)).cancel(anyBoolean());
    verify(scheduledExecutor, times(1)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
    // initial timeout runnable which should no longer have an effect
    Runnable runnable = lastRunnable.get();
    assertNotNull(runnable);
    runnable.run();
    verify(jobLeaderIdActions, times(1)).notifyJobTimeout(eq(jobId), any(UUID.class));
    // the timeout should no longer be valid
    assertFalse(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));
    // lose leadership
    leaderRetrievalService.notifyListener("", null);
    verify(scheduledExecutor, times(2)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
    // the second runnable should be the new timeout
    runnable = lastRunnable.get();
    assertNotNull(runnable);
    runnable.run();
    verify(jobLeaderIdActions, times(2)).notifyJobTimeout(eq(jobId), any(UUID.class));
    // the new timeout should be valid
    assertTrue(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));
}
Also used : TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) AtomicReference(java.util.concurrent.atomic.AtomicReference) Time(org.apache.flink.api.common.time.Time) ArrayDeque(java.util.ArrayDeque) ScheduledFuture(java.util.concurrent.ScheduledFuture) ScheduledExecutor(org.apache.flink.runtime.concurrent.ScheduledExecutor) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Answer(org.mockito.stubbing.Answer) Mockito.doAnswer(org.mockito.Mockito.doAnswer) InvocationOnMock(org.mockito.invocation.InvocationOnMock) TimeUnit(java.util.concurrent.TimeUnit) UUID(java.util.UUID) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 18 with TestingHighAvailabilityServices

use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.

the class JobLeaderIdServiceTest method testInitialJobTimeout.

/**
 * Verifies that adding a job schedules a timeout runnable and that executing this runnable
 * calls {@link JobLeaderIdActions#notifyJobTimeout(JobID, UUID)} with a timeout id which the
 * service reports as valid.
 *
 * @throws Exception if any step of the test setup or execution fails
 */
@Test
public void testInitialJobTimeout() throws Exception {
    final JobID jobId = new JobID();

    // high availability services handing out a controllable leader retriever for the job
    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    final TestingLeaderRetrievalService retrievalService = new TestingLeaderRetrievalService();
    haServices.setJobMasterLeaderRetriever(jobId, retrievalService);

    final ScheduledExecutor executor = mock(ScheduledExecutor.class);
    final Time jobTimeout = Time.milliseconds(5000L);
    final JobLeaderIdActions actions = mock(JobLeaderIdActions.class);

    final JobLeaderIdService service = new JobLeaderIdService(haServices, executor, jobTimeout);
    service.start(actions);
    service.addJob(jobId);
    assertTrue(service.containsJob(jobId));

    // capture the runnable handed to the executor and run it by hand
    final ArgumentCaptor<Runnable> scheduledCaptor = ArgumentCaptor.forClass(Runnable.class);
    verify(executor).schedule(scheduledCaptor.capture(), anyLong(), any(TimeUnit.class));
    scheduledCaptor.getValue().run();

    // the timeout id reported to the actions must be accepted by the service
    final ArgumentCaptor<UUID> timeoutIdCaptor = ArgumentCaptor.forClass(UUID.class);
    verify(actions, times(1)).notifyJobTimeout(eq(jobId), timeoutIdCaptor.capture());
    final UUID reportedTimeoutId = timeoutIdCaptor.getValue();
    assertTrue(service.isValidTimeout(jobId, reportedTimeoutId));
}
Also used : TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) TimeUnit(java.util.concurrent.TimeUnit) Time(org.apache.flink.api.common.time.Time) UUID(java.util.UUID) JobID(org.apache.flink.api.common.JobID) ScheduledExecutor(org.apache.flink.runtime.concurrent.ScheduledExecutor) Test(org.junit.Test)

Example 19 with TestingHighAvailabilityServices

use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.

the class JobLeaderIdServiceTest method testRemovingJob.

/**
 * Verifies that a leader id future obtained for a job fails with an
 * {@link ExecutionException} once the job is removed before any leader was announced.
 *
 * @throws Exception if any step of the test setup or execution fails
 */
@Test(timeout = 10000)
public void testRemovingJob() throws Exception {
    final JobID jobId = new JobID();

    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    final TestingLeaderRetrievalService retrievalService = new TestingLeaderRetrievalService();
    haServices.setJobMasterLeaderRetriever(jobId, retrievalService);

    final ScheduledExecutor executor = mock(ScheduledExecutor.class);
    final Time jobTimeout = Time.milliseconds(5000L);
    final JobLeaderIdActions actions = mock(JobLeaderIdActions.class);

    final JobLeaderIdService service = new JobLeaderIdService(haServices, executor, jobTimeout);
    service.start(actions);
    service.addJob(jobId);

    final Future<UUID> pendingLeaderId = service.getLeaderId(jobId);

    // drop the job while the leader id is still unknown
    service.removeJob(jobId);
    assertFalse(service.containsJob(jobId));

    boolean failedAsExpected = false;
    try {
        pendingLeaderId.get();
    } catch (ExecutionException expected) {
        // expected exception
        failedAsExpected = true;
    }

    if (!failedAsExpected) {
        fail("The leader id future should be completed exceptionally.");
    }
}
Also used : TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) TestingLeaderRetrievalService(org.apache.flink.runtime.leaderelection.TestingLeaderRetrievalService) Time(org.apache.flink.api.common.time.Time) UUID(java.util.UUID) ExecutionException(java.util.concurrent.ExecutionException) JobID(org.apache.flink.api.common.JobID) ScheduledExecutor(org.apache.flink.runtime.concurrent.ScheduledExecutor) Test(org.junit.Test)

Example 20 with TestingHighAvailabilityServices

use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.

the class ResourceManagerJobMasterTest method createAndStartResourceManager.

/**
 * Builds a {@link StandaloneResourceManager} wired to testing high availability services and
 * starts it before returning.
 *
 * @param resourceManagerLeaderElectionService leader election service registered for the ResourceManager
 * @param jobID job whose JobMaster leader retriever is registered
 * @param jobMasterLeaderRetrievalService leader retrieval service registered for {@code jobID}
 * @param fatalErrorHandler handler passed through to the ResourceManager
 * @return the started ResourceManager
 * @throws Exception if creating or starting the ResourceManager fails
 */
private ResourceManager createAndStartResourceManager(TestingLeaderElectionService resourceManagerLeaderElectionService, JobID jobID, TestingLeaderRetrievalService jobMasterLeaderRetrievalService, FatalErrorHandler fatalErrorHandler) throws Exception {
    final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
    haServices.setResourceManagerLeaderElectionService(resourceManagerLeaderElectionService);
    haServices.setJobMasterLeaderRetriever(jobID, jobMasterLeaderRetrievalService);

    final ResourceManagerConfiguration rmConfig = new ResourceManagerConfiguration(
        Time.seconds(5L),
        Time.seconds(5L),
        Time.minutes(5L));
    final SlotManagerFactory slotManagers = new TestingSlotManagerFactory();
    final MetricRegistry metrics = mock(MetricRegistry.class);

    // the job leader id service shares the RPC service's scheduled executor
    final JobLeaderIdService leaderIdService = new JobLeaderIdService(
        haServices,
        rpcService.getScheduledExecutor(),
        rmConfig.getJobTimeout());

    final ResourceManager startedResourceManager = new StandaloneResourceManager(
        rpcService,
        rmConfig,
        haServices,
        slotManagers,
        metrics,
        leaderIdService,
        fatalErrorHandler);
    startedResourceManager.start();
    return startedResourceManager;
}
Also used : TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) SlotManagerFactory(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManagerFactory) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry)

Aggregations

TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)31 JobID (org.apache.flink.api.common.JobID)21 Test (org.junit.Test)21 UUID (java.util.UUID)17 SettableLeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService)16 Time (org.apache.flink.api.common.time.Time)15 Configuration (org.apache.flink.configuration.Configuration)8 JobMasterId (org.apache.flink.runtime.jobmaster.JobMasterId)8 TestingFatalErrorHandler (org.apache.flink.runtime.util.TestingFatalErrorHandler)8 CompletableFuture (java.util.concurrent.CompletableFuture)7 TimeUnit (java.util.concurrent.TimeUnit)7 JobMasterGateway (org.apache.flink.runtime.jobmaster.JobMasterGateway)7 Before (org.junit.Before)7 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)6 BlockingQueue (java.util.concurrent.BlockingQueue)6 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)6 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)6 TestingHighAvailabilityServicesBuilder (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServicesBuilder)6 TestingJobMasterGateway (org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway)6 TestingJobMasterGatewayBuilder (org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder)6