use of org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway in project flink by apache.
the class TaskExecutorTest method testRemoveJobFromJobLeaderService.
/**
 * Verifies that the JobLeaderService no longer contains a job once the only slot the
 * TaskExecutor had allocated for that job has been freed.
 *
 * <p>See FLINK-8504
 */
@Test
public void testRemoveJobFromJobLeaderService() throws Exception {
    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();
    final TestingTaskExecutor taskExecutor = createTestingTaskExecutor(services);

    try {
        // ResourceManager gateway that signals when the first slot report has been sent
        final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
        final CompletableFuture<Void> firstSlotReportSent = new CompletableFuture<>();
        rmGateway.setSendSlotReportFunction(
                ignoredSlotReport -> {
                    firstSlotReportSent.complete(null);
                    return CompletableFuture.completedFuture(Acknowledge.get());
                });
        final ResourceManagerId rmId = rmGateway.getFencingToken();

        rpc.registerGateway(rmGateway.getAddress(), rmGateway);
        resourceManagerLeaderRetriever.notifyListener(rmGateway.getAddress(), rmId.toUUID());

        // leader retrieval service for jobId that reports its start and stop through futures
        final CompletableFuture<LeaderRetrievalListener> retrieverStarted =
                new CompletableFuture<>();
        final CompletableFuture<Void> retrieverStopped = new CompletableFuture<>();
        haServices.setJobMasterLeaderRetriever(
                jobId,
                new StartStopNotifyingLeaderRetrievalService(retrieverStarted, retrieverStopped));

        taskExecutor.start();
        taskExecutor.waitUntilStarted();

        final TaskExecutorGateway tmGateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        // before any slot is requested the job must be unknown to the JobLeaderService
        assertThat(retrieverStarted.isDone(), is(false));
        final JobLeaderService leaderService = services.getJobLeaderService();
        assertThat(leaderService.containsJob(jobId), is(false));

        // wait for the initial slot report
        firstSlotReportSent.get();

        final SlotID slotId = buildSlotID(0);
        final AllocationID allocationId = new AllocationID();
        requestSlot(
                tmGateway, jobId, allocationId, slotId, ResourceProfile.ZERO, "foobar", rmId);

        // wait until the job leader retrieval service for jobId has been started
        retrieverStarted.get();
        assertThat(leaderService.containsJob(jobId), is(true));

        tmGateway.freeSlot(allocationId, new FlinkException("Test exception"), timeout).get();

        // wait until the job leader retrieval service for jobId has been stopped, because the
        // job should get removed
        retrieverStopped.get();
        assertThat(leaderService.containsJob(jobId), is(false));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
use of org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway in project flink by apache.
the class TaskExecutorTest method testReconnectionAttemptIfExplicitlyDisconnected.
/**
 * Tests that the TaskExecutor tries to reconnect to a ResourceManager from which it was
 * explicitly disconnected.
 *
 * <p>The testing ResourceManager gateway records the resource ID of every registration attempt
 * in a queue. The test expects one attempt after the leader notification and a second one after
 * {@code disconnectResourceManager} has been called on the TaskExecutor.
 */
@Test
public void testReconnectionAttemptIfExplicitlyDisconnected() throws Exception {
    final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(1);
    final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation =
            new LocalUnresolvedTaskManagerLocation();
    final TaskExecutor taskExecutor =
            createTaskExecutor(
                    new TaskManagerServicesBuilder()
                            .setTaskSlotTable(taskSlotTable)
                            .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                            .build());

    try {
        // Start inside the try block so that the endpoint is terminated in the finally block
        // even if start-up itself fails.
        taskExecutor.start();

        final TestingResourceManagerGateway testingResourceManagerGateway =
                new TestingResourceManagerGateway();
        final ClusterInformation clusterInformation = new ClusterInformation("foobar", 1234);
        final CompletableFuture<RegistrationResponse> registrationResponseFuture =
                CompletableFuture.completedFuture(
                        new TaskExecutorRegistrationSuccess(
                                new InstanceID(), ResourceID.generate(), clusterInformation));

        // every registration attempt is answered successfully and recorded in this queue
        final BlockingQueue<ResourceID> registrationQueue = new ArrayBlockingQueue<>(1);
        testingResourceManagerGateway.setRegisterTaskExecutorFunction(
                taskExecutorRegistration -> {
                    registrationQueue.offer(taskExecutorRegistration.getResourceId());
                    return registrationResponseFuture;
                });

        rpc.registerGateway(
                testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
        resourceManagerLeaderRetriever.notifyListener(
                testingResourceManagerGateway.getAddress(),
                testingResourceManagerGateway.getFencingToken().toUUID());

        // first registration attempt, triggered by the leader notification
        final ResourceID firstRegistrationAttempt = registrationQueue.take();
        assertThat(
                firstRegistrationAttempt,
                equalTo(unresolvedTaskManagerLocation.getResourceID()));

        final TaskExecutorGateway taskExecutorGateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        // no further attempt may be pending before the explicit disconnect
        assertThat(registrationQueue, is(empty()));

        taskExecutorGateway.disconnectResourceManager(new FlinkException("Test exception"));

        // second registration attempt, expected after the explicit disconnect
        final ResourceID secondRegistrationAttempt = registrationQueue.take();
        assertThat(
                secondRegistrationAttempt,
                equalTo(unresolvedTaskManagerLocation.getResourceID()));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
use of org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway in project flink by apache.
the class TaskExecutorTest method testSlotOfferCounterIsSeparatedByJob.
/**
 * Requests one slot for each of two jobs, lets both slot offers arrive before either of them is
 * answered, and then checks that each job ends up with exactly its own allocation active in the
 * task slot table.
 */
@Test
public void testSlotOfferCounterIsSeparatedByJob() throws Exception {
    final OneShotLatch registeredAtRm = new OneShotLatch();
    final TestingResourceManagerGateway rmGateway =
            createRmWithTmRegisterAndNotifySlotHooks(
                    new InstanceID(), registeredAtRm, new CompletableFuture<>());

    // responses handed out, in order, to the slot offers as they arrive
    final CompletableFuture<Collection<SlotOffer>> firstOfferResponse =
            new CompletableFuture<>();
    final CompletableFuture<Collection<SlotOffer>> secondOfferResponse =
            new CompletableFuture<>();
    final Queue<CompletableFuture<Collection<SlotOffer>>> pendingOfferResponses =
            new ArrayDeque<>();
    pendingOfferResponses.add(firstOfferResponse);
    pendingOfferResponses.add(secondOfferResponse);

    final MultiShotLatch offerArrived = new MultiShotLatch();

    final TestingJobMasterGateway firstJobMaster =
            new TestingJobMasterGatewayBuilder()
                    .setAddress("jm1")
                    .setOfferSlotsFunction(
                            (ignoredResourceId, ignoredOffers) -> {
                                offerArrived.trigger();
                                return pendingOfferResponses.remove();
                            })
                    .build();
    final TestingJobMasterGateway secondJobMaster =
            new TestingJobMasterGatewayBuilder()
                    .setAddress("jm2")
                    .setOfferSlotsFunction(
                            (ignoredResourceId, ignoredOffers) -> {
                                offerArrived.trigger();
                                return pendingOfferResponses.remove();
                            })
                    .build();

    rpc.registerGateway(rmGateway.getAddress(), rmGateway);
    rpc.registerGateway(firstJobMaster.getAddress(), firstJobMaster);
    rpc.registerGateway(secondJobMaster.getAddress(), secondJobMaster);

    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(2);
    final TaskManagerServices services = createTaskManagerServicesWithTaskSlotTable(slotTable);
    final TestingTaskExecutor taskExecutor = createTestingTaskExecutor(services);
    final ThreadSafeTaskSlotTable<Task> safeSlotTable =
            new ThreadSafeTaskSlotTable<>(
                    slotTable, taskExecutor.getMainThreadExecutableForTesting());

    final SlotOffer offerForFirstJob = new SlotOffer(new AllocationID(), 0, ResourceProfile.ANY);
    final SlotOffer offerForSecondJob =
            new SlotOffer(new AllocationID(), 1, ResourceProfile.ANY);

    try {
        taskExecutor.start();
        taskExecutor.waitUntilStarted();

        final TaskExecutorGateway tmGateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        // block until the task executor has registered at the RM
        registeredAtRm.await();

        // announce both job leaders so that slot offering can start
        jobManagerLeaderRetriever.notifyListener(
                firstJobMaster.getAddress(), firstJobMaster.getFencingToken().toUUID());
        jobManagerLeaderRetriever2.notifyListener(
                secondJobMaster.getAddress(), secondJobMaster.getFencingToken().toUUID());

        // request a slot for the first job
        requestSlot(
                tmGateway,
                jobId,
                offerForFirstJob.getAllocationId(),
                buildSlotID(offerForFirstJob.getSlotIndex()),
                ResourceProfile.UNKNOWN,
                firstJobMaster.getAddress(),
                rmGateway.getFencingToken());

        // wait until the first slot offer has arrived
        offerArrived.await();

        // request a slot for the second job
        requestSlot(
                tmGateway,
                jobId2,
                offerForSecondJob.getAllocationId(),
                buildSlotID(offerForSecondJob.getSlotIndex()),
                ResourceProfile.UNKNOWN,
                secondJobMaster.getAddress(),
                rmGateway.getFencingToken());

        // wait until the second slot offer has arrived
        offerArrived.await();

        // answer both offers only now that both are outstanding
        firstOfferResponse.complete(Collections.singletonList(offerForFirstJob));
        secondOfferResponse.complete(Collections.singletonList(offerForSecondJob));

        assertThat(
                safeSlotTable.getActiveTaskSlotAllocationIdsPerJob(jobId),
                contains(offerForFirstJob.getAllocationId()));
        assertThat(
                safeSlotTable.getActiveTaskSlotAllocationIdsPerJob(jobId2),
                contains(offerForSecondJob.getAllocationId()));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
use of org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway in project flink by apache.
the class TaskExecutorTest method testRMHeartbeatStopWhenLeadershipRevoked.
/**
 * Checks that the TaskExecutor stops heartbeating to the ResourceManager after it has been
 * notified that the ResourceManager lost leadership.
 *
 * <p>See FLINK-8462
 */
@Test
public void testRMHeartbeatStopWhenLeadershipRevoked() throws Exception {
    final long heartbeatInterval = 1L;
    final long heartbeatTimeout = 10000L;
    final long pollTimeoutMillis = 1000L;
    final RecordingHeartbeatServices recordingHeartbeats =
            new RecordingHeartbeatServices(heartbeatInterval, heartbeatTimeout);

    // ResourceManager side: a testing gateway registered under a fixed address
    final ResourceID rmResourceId = ResourceID.generate();
    final String rmAddress = "rm";
    final TestingResourceManagerGateway rmGateway =
            new TestingResourceManagerGateway(
                    ResourceManagerId.generate(), rmResourceId, rmAddress, rmAddress);
    rpc.registerGateway(rmAddress, rmGateway);

    // TaskExecutor side
    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();
    final TaskExecutor taskExecutor = createTaskExecutor(services, recordingHeartbeats);

    try {
        taskExecutor.start();

        final BlockingQueue<ResourceID> unmonitored = recordingHeartbeats.getUnmonitoredTargets();
        final BlockingQueue<ResourceID> monitored = recordingHeartbeats.getMonitoredTargets();

        resourceManagerLeaderRetriever.notifyListener(
                rmAddress, rmGateway.getFencingToken().toUUID());

        // the TM has registered once the RM shows up as a monitored heartbeat target
        final ResourceID monitoredTarget =
                monitored.poll(pollTimeoutMillis, TimeUnit.MILLISECONDS);
        assertThat(monitoredTarget, equalTo(rmResourceId));

        // revoke the RM's leadership
        resourceManagerLeaderRetriever.notifyListener(null, null);

        // the heartbeat timeout is far larger than the poll timeout, so the RM being
        // unmonitored here is not caused by a heartbeat timeout
        final ResourceID unmonitoredTarget =
                unmonitored.poll(pollTimeoutMillis, TimeUnit.MILLISECONDS);
        assertThat(unmonitoredTarget, equalTo(rmResourceId));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
use of org.apache.flink.runtime.resourcemanager.utils.TestingResourceManagerGateway in project flink by apache.
the class TaskExecutorTest method testSyncSlotsWithJobMasterByHeartbeat.
/**
 * Tests that the TaskExecutor syncs its slots view with the JobMaster's view via the
 * AllocatedSlotReport reported by the heartbeat (See FLINK-11059).
 *
 * <p>Two slots are requested on the TaskExecutor ({@code allocationIdInBoth} and {@code
 * allocationIdOnlyInTM}), while the JobMaster heartbeat reports {@code allocationIdInBoth} and
 * {@code allocationIdOnlyInJM}. The test expects the JobMaster to be told that {@code
 * allocationIdOnlyInJM} failed, the ResourceManager to be told that {@code allocationIdOnlyInTM}
 * is available again, and no further notification for either side.
 */
@Test
public void testSyncSlotsWithJobMasterByHeartbeat() throws Exception {
    final CountDownLatch activeSlots = new CountDownLatch(2);
    final TaskSlotTable<Task> taskSlotTable =
            new ActivateSlotNotifyingTaskSlotTable(2, activeSlots);
    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder().setTaskSlotTable(taskSlotTable).build();
    final TaskExecutor taskExecutor = createTaskExecutor(taskManagerServices);

    // ResourceManager gateway: signals the initial slot report and records freed allocations
    final TestingResourceManagerGateway testingResourceManagerGateway =
            new TestingResourceManagerGateway();
    final BlockingQueue<AllocationID> allocationsNotifiedFree = new ArrayBlockingQueue<>(2);
    final OneShotLatch initialSlotReporting = new OneShotLatch();
    testingResourceManagerGateway.setSendSlotReportFunction(
            resourceIDInstanceIDSlotReportTuple3 -> {
                initialSlotReporting.trigger();
                return CompletableFuture.completedFuture(Acknowledge.get());
            });
    testingResourceManagerGateway.setNotifySlotAvailableConsumer(
            instanceIDSlotIDAllocationIDTuple3 ->
                    allocationsNotifiedFree.offer(instanceIDSlotIDAllocationIDTuple3.f2));
    rpc.registerGateway(
            testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
    resourceManagerLeaderRetriever.notifyListener(
            testingResourceManagerGateway.getAddress(),
            testingResourceManagerGateway.getFencingToken().toUUID());

    // JobMaster gateway: accepts every offered slot and records allocations reported as failed
    final BlockingQueue<AllocationID> failedSlotFutures = new ArrayBlockingQueue<>(2);
    final ResourceID jobManagerResourceId = ResourceID.generate();
    final TestingJobMasterGateway jobMasterGateway =
            new TestingJobMasterGatewayBuilder()
                    .setFailSlotConsumer(
                            (resourceID, allocationID, throwable) ->
                                    failedSlotFutures.offer(allocationID))
                    .setOfferSlotsFunction(
                            (resourceID, slotOffers) ->
                                    CompletableFuture.completedFuture(
                                            new ArrayList<>(slotOffers)))
                    .setRegisterTaskManagerFunction(
                            (ignoredJobId, ignoredTaskManagerRegistrationInformation) ->
                                    CompletableFuture.completedFuture(
                                            new JMTMRegistrationSuccess(jobManagerResourceId)))
                    .build();
    final String jobManagerAddress = jobMasterGateway.getAddress();
    rpc.registerGateway(jobManagerAddress, jobMasterGateway);
    jobManagerLeaderRetriever.notifyListener(
            jobManagerAddress, jobMasterGateway.getFencingToken().toUUID());

    try {
        // Start inside the try block so that the endpoint is terminated in the finally block
        // even if start-up itself fails.
        taskExecutor.start();

        final TaskExecutorGateway taskExecutorGateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        initialSlotReporting.await();

        final AllocationID allocationIdInBoth = new AllocationID();
        final AllocationID allocationIdOnlyInJM = new AllocationID();
        final AllocationID allocationIdOnlyInTM = new AllocationID();

        taskExecutorGateway.requestSlot(
                new SlotID(taskExecutor.getResourceID(), 0),
                jobId,
                allocationIdInBoth,
                ResourceProfile.ZERO,
                "foobar",
                testingResourceManagerGateway.getFencingToken(),
                timeout);
        taskExecutorGateway.requestSlot(
                new SlotID(taskExecutor.getResourceID(), 1),
                jobId,
                allocationIdOnlyInTM,
                ResourceProfile.ZERO,
                "foobar",
                testingResourceManagerGateway.getFencingToken(),
                timeout);

        // the latch is shared with the slot table and counts down from two
        activeSlots.await();

        // the JobMaster's view: one allocation shared with the TM, one unknown to the TM
        List<AllocatedSlotInfo> allocatedSlotInfos =
                Arrays.asList(
                        new AllocatedSlotInfo(0, allocationIdInBoth),
                        new AllocatedSlotInfo(1, allocationIdOnlyInJM));
        AllocatedSlotReport allocatedSlotReport =
                new AllocatedSlotReport(jobId, allocatedSlotInfos);
        taskExecutorGateway.heartbeatFromJobManager(jobManagerResourceId, allocatedSlotReport);

        assertThat(failedSlotFutures.take(), is(allocationIdOnlyInJM));
        assertThat(allocationsNotifiedFree.take(), is(allocationIdOnlyInTM));

        // nothing else may have been failed or freed
        assertThat(failedSlotFutures.poll(5L, TimeUnit.MILLISECONDS), nullValue());
        assertThat(allocationsNotifiedFree.poll(5L, TimeUnit.MILLISECONDS), nullValue());
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
Aggregations