use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.
the class TaskExecutorTest method testRejectAllocationRequestsForOutOfSyncSlots.
/**
* Tests that all allocation requests for slots are ignored if the slot has been reported as
* free by the TaskExecutor but this report hasn't been confirmed by the ResourceManager.
*
* This is essential for the correctness of the state of the ResourceManager.
*/
@Ignore
@Test
public void testRejectAllocationRequestsForOutOfSyncSlots() throws Exception {
final ResourceID resourceID = ResourceID.generate();
final String address1 = "/resource/manager/address/one";
final UUID leaderId = UUID.randomUUID();
final JobID jobId = new JobID();
final String jobManagerAddress = "foobar";
final TestingSerialRpcService rpc = new TestingSerialRpcService();
try {
// register the mock resource manager gateways
ResourceManagerGateway rmGateway1 = mock(ResourceManagerGateway.class);
rpc.registerGateway(address1, rmGateway1);
TestingLeaderRetrievalService testLeaderService = new TestingLeaderRetrievalService();
TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
haServices.setResourceManagerLeaderRetriever(testLeaderService);
TaskManagerConfiguration taskManagerServicesConfiguration = mock(TaskManagerConfiguration.class);
when(taskManagerServicesConfiguration.getNumberSlots()).thenReturn(1);
TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
when(taskManagerLocation.getResourceID()).thenReturn(resourceID);
final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
TaskExecutor taskManager = new TaskExecutor(taskManagerServicesConfiguration, taskManagerLocation, rpc, mock(MemoryManager.class), mock(IOManager.class), mock(NetworkEnvironment.class), haServices, mock(HeartbeatServices.class, RETURNS_MOCKS), mock(MetricRegistry.class), mock(TaskManagerMetricGroup.class), mock(BroadcastVariableManager.class), mock(FileCache.class), mock(TaskSlotTable.class), mock(JobManagerTable.class), mock(JobLeaderService.class), testingFatalErrorHandler);
taskManager.start();
String taskManagerAddress = taskManager.getAddress();
// no connection initially, since there is no leader
assertNull(taskManager.getResourceManagerConnection());
// define a leader and see that a registration happens
testLeaderService.notifyListener(address1, leaderId);
verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
assertNotNull(taskManager.getResourceManagerConnection());
// test that allocating a slot works
final SlotID slotID = new SlotID(resourceID, 0);
TMSlotRequestReply tmSlotRequestReply = taskManager.requestSlot(slotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
assertTrue(tmSlotRequestReply instanceof TMSlotRequestRegistered);
// TODO: Figure out the concrete allocation behaviour between RM and TM. Maybe we don't need the SlotID...
// test that we can't allocate slots which are blacklisted due to pending confirmation of the RM
final SlotID unconfirmedFreeSlotID = new SlotID(resourceID, 1);
TMSlotRequestReply tmSlotRequestReply2 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
assertTrue(tmSlotRequestReply2 instanceof TMSlotRequestRejected);
// re-register
verify(rmGateway1).registerTaskExecutor(eq(leaderId), eq(taskManagerAddress), eq(resourceID), any(SlotReport.class), any(Time.class));
testLeaderService.notifyListener(address1, leaderId);
// now we should be successful because the slots status has been synced
// test that we can't allocate slots which are blacklisted due to pending confirmation of the RM
TMSlotRequestReply tmSlotRequestReply3 = taskManager.requestSlot(unconfirmedFreeSlotID, jobId, new AllocationID(), jobManagerAddress, leaderId);
assertTrue(tmSlotRequestReply3 instanceof TMSlotRequestRegistered);
// check if a concurrent error occurred
testingFatalErrorHandler.rethrowError();
} finally {
rpc.stopService();
}
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.
the class JobLeaderIdServiceTest method jobTimeoutAfterLostLeadership.
/**
* Tests that a timeout get cancelled once a job leader has been found. Furthermore, it tests
* that a new timeout is registered after the jobmanager has lost leadership.
*/
@Test(timeout = 10000)
public void jobTimeoutAfterLostLeadership() throws Exception {
final JobID jobId = new JobID();
final String address = "foobar";
final UUID leaderId = UUID.randomUUID();
TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);
ScheduledFuture<?> timeout1 = mock(ScheduledFuture.class);
ScheduledFuture<?> timeout2 = mock(ScheduledFuture.class);
final Queue<ScheduledFuture<?>> timeoutQueue = new ArrayDeque<>(Arrays.asList(timeout1, timeout2));
ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
final AtomicReference<Runnable> lastRunnable = new AtomicReference<>();
doAnswer(new Answer() {
@Override
public Object answer(InvocationOnMock invocation) throws Throwable {
lastRunnable.set((Runnable) invocation.getArguments()[0]);
return timeoutQueue.poll();
}
}).when(scheduledExecutor).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
Time timeout = Time.milliseconds(5000L);
JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);
final AtomicReference<UUID> lastTimeoutId = new AtomicReference<>();
doAnswer(new Answer() {
@Override
public Object answer(InvocationOnMock invocation) throws Throwable {
lastTimeoutId.set((UUID) invocation.getArguments()[1]);
return null;
}
}).when(jobLeaderIdActions).notifyJobTimeout(eq(jobId), any(UUID.class));
JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
jobLeaderIdService.start(jobLeaderIdActions);
jobLeaderIdService.addJob(jobId);
Future<UUID> leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
// notify the leader id service about the new leader
leaderRetrievalService.notifyListener(address, leaderId);
assertEquals(leaderId, leaderIdFuture.get());
assertTrue(jobLeaderIdService.containsJob(jobId));
// check that the first timeout got cancelled
verify(timeout1, times(1)).cancel(anyBoolean());
verify(scheduledExecutor, times(1)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
// initial timeout runnable which should no longer have an effect
Runnable runnable = lastRunnable.get();
assertNotNull(runnable);
runnable.run();
verify(jobLeaderIdActions, times(1)).notifyJobTimeout(eq(jobId), any(UUID.class));
// the timeout should no longer be valid
assertFalse(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));
// lose leadership
leaderRetrievalService.notifyListener("", null);
verify(scheduledExecutor, times(2)).schedule(any(Runnable.class), anyLong(), any(TimeUnit.class));
// the second runnable should be the new timeout
runnable = lastRunnable.get();
assertNotNull(runnable);
runnable.run();
verify(jobLeaderIdActions, times(2)).notifyJobTimeout(eq(jobId), any(UUID.class));
// the new timeout should be valid
assertTrue(jobLeaderIdService.isValidTimeout(jobId, lastTimeoutId.get()));
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.
the class JobLeaderIdServiceTest method testInitialJobTimeout.
/**
* Tests that the initial job registration registers a timeout which will call
* {@link JobLeaderIdActions#notifyJobTimeout(JobID, UUID)} when executed.
*/
@Test
public void testInitialJobTimeout() throws Exception {
final JobID jobId = new JobID();
TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);
ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
Time timeout = Time.milliseconds(5000L);
JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);
JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
jobLeaderIdService.start(jobLeaderIdActions);
jobLeaderIdService.addJob(jobId);
assertTrue(jobLeaderIdService.containsJob(jobId));
ArgumentCaptor<Runnable> runnableArgumentCaptor = ArgumentCaptor.forClass(Runnable.class);
verify(scheduledExecutor).schedule(runnableArgumentCaptor.capture(), anyLong(), any(TimeUnit.class));
Runnable timeoutRunnable = runnableArgumentCaptor.getValue();
timeoutRunnable.run();
ArgumentCaptor<UUID> timeoutIdArgumentCaptor = ArgumentCaptor.forClass(UUID.class);
verify(jobLeaderIdActions, times(1)).notifyJobTimeout(eq(jobId), timeoutIdArgumentCaptor.capture());
assertTrue(jobLeaderIdService.isValidTimeout(jobId, timeoutIdArgumentCaptor.getValue()));
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.
the class JobLeaderIdServiceTest method testRemovingJob.
/**
* Tests that removing a job completes the job leader id future exceptionally
*/
@Test(timeout = 10000)
public void testRemovingJob() throws Exception {
final JobID jobId = new JobID();
TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
highAvailabilityServices.setJobMasterLeaderRetriever(jobId, leaderRetrievalService);
ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
Time timeout = Time.milliseconds(5000L);
JobLeaderIdActions jobLeaderIdActions = mock(JobLeaderIdActions.class);
JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, scheduledExecutor, timeout);
jobLeaderIdService.start(jobLeaderIdActions);
jobLeaderIdService.addJob(jobId);
Future<UUID> leaderIdFuture = jobLeaderIdService.getLeaderId(jobId);
// remove the job before we could find a leader
jobLeaderIdService.removeJob(jobId);
assertFalse(jobLeaderIdService.containsJob(jobId));
try {
leaderIdFuture.get();
fail("The leader id future should be completed exceptionally.");
} catch (ExecutionException ignored) {
// expected exception
}
}
use of org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices in project flink by apache.
the class ResourceManagerJobMasterTest method createAndStartResourceManager.
private ResourceManager createAndStartResourceManager(TestingLeaderElectionService resourceManagerLeaderElectionService, JobID jobID, TestingLeaderRetrievalService jobMasterLeaderRetrievalService, FatalErrorHandler fatalErrorHandler) throws Exception {
TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
highAvailabilityServices.setResourceManagerLeaderElectionService(resourceManagerLeaderElectionService);
highAvailabilityServices.setJobMasterLeaderRetriever(jobID, jobMasterLeaderRetrievalService);
ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.seconds(5L), Time.seconds(5L), Time.minutes(5L));
SlotManagerFactory slotManagerFactory = new TestingSlotManagerFactory();
MetricRegistry metricRegistry = mock(MetricRegistry.class);
JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(highAvailabilityServices, rpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
ResourceManager resourceManager = new StandaloneResourceManager(rpcService, resourceManagerConfiguration, highAvailabilityServices, slotManagerFactory, metricRegistry, jobLeaderIdService, fatalErrorHandler);
resourceManager.start();
return resourceManager;
}
Aggregations