use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class TaskExecutorTest method testRemoveJobFromJobLeaderService.
/**
 * Verifies that the JobLeaderService drops a job as soon as the TaskExecutor no longer holds
 * any slot allocated to that job.
 *
 * <p>See FLINK-8504
 */
@Test
public void testRemoveJobFromJobLeaderService() throws Exception {
    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();

    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();

    final TestingTaskExecutor executorUnderTest = createTestingTaskExecutor(services);

    try {
        // Resource manager stub that signals when the first slot report arrives.
        final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
        final CompletableFuture<Void> slotReportReceived = new CompletableFuture<>();
        rmGateway.setSendSlotReportFunction(
                ignoredReport -> {
                    slotReportReceived.complete(null);
                    return CompletableFuture.completedFuture(Acknowledge.get());
                });

        final ResourceManagerId rmId = rmGateway.getFencingToken();
        final String rmAddress = rmGateway.getAddress();
        rpc.registerGateway(rmAddress, rmGateway);
        resourceManagerLeaderRetriever.notifyListener(rmGateway.getAddress(), rmId.toUUID());

        // Leader retrieval service for the job that reports its own start and stop.
        final CompletableFuture<LeaderRetrievalListener> retrieverStarted =
                new CompletableFuture<>();
        final CompletableFuture<Void> retrieverStopped = new CompletableFuture<>();
        final StartStopNotifyingLeaderRetrievalService notifyingRetriever =
                new StartStopNotifyingLeaderRetrievalService(retrieverStarted, retrieverStopped);
        haServices.setJobMasterLeaderRetriever(jobId, notifyingRetriever);

        executorUnderTest.start();
        executorUnderTest.waitUntilStarted();

        final TaskExecutorGateway executorGateway =
                executorUnderTest.getSelfGateway(TaskExecutorGateway.class);

        // Before any slot is requested, the job must be unknown to the JobLeaderService.
        assertThat(retrieverStarted.isDone(), is(false));
        final JobLeaderService leaderService = services.getJobLeaderService();
        assertThat(leaderService.containsJob(jobId), is(false));

        // block until the initial slot report has been sent
        slotReportReceived.get();

        final SlotID requestedSlot = buildSlotID(0);
        final AllocationID allocation = new AllocationID();
        requestSlot(
                executorGateway,
                jobId,
                allocation,
                requestedSlot,
                ResourceProfile.ZERO,
                "foobar",
                rmId);

        // block until the leader retrieval service for jobId has been started
        retrieverStarted.get();
        assertThat(leaderService.containsJob(jobId), is(true));

        executorGateway
                .freeSlot(allocation, new FlinkException("Test exception"), timeout)
                .get();

        // block until the leader retrieval service for jobId has been stopped, because the
        // job should get removed once its last slot is freed
        retrieverStopped.get();
        assertThat(leaderService.containsJob(jobId), is(false));
    } finally {
        RpcUtils.terminateRpcEndpoint(executorUnderTest, timeout);
    }
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class TaskExecutorTest method testDynamicSlotAllocation.
/**
 * Requests a dynamic slot whose profile is the default profile merged with an extra 0.1 CPU
 * cores, then checks that the slot report lists the two default slots plus the newly
 * allocated dynamic slot.
 */
@Test
public void testDynamicSlotAllocation() throws Exception {
    final AllocationID dynamicAllocation = new AllocationID();

    try (TaskExecutorTestingContext context = createTaskExecutorTestingContext(2)) {
        context.start();

        // Register a resource manager and wait for the first slot report.
        final CompletableFuture<Tuple3<ResourceID, InstanceID, SlotReport>> firstSlotReport =
                new CompletableFuture<>();
        final ResourceManagerId rmId = createAndRegisterResourceManager(firstSlotReport);
        firstSlotReport.get();

        final ResourceProfile extraCpu = ResourceProfile.newBuilder().setCpuCores(0.1).build();
        final ResourceProfile dynamicProfile = DEFAULT_RESOURCE_PROFILE.merge(extraCpu);

        final TaskExecutorGateway executorGateway =
                context.taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final SlotID dynamicSlotId = SlotID.getDynamicSlotID(ResourceID.generate());
        requestSlot(
                executorGateway,
                jobId,
                dynamicAllocation,
                dynamicSlotId,
                dynamicProfile,
                context.jobMasterGateway.getAddress(),
                rmId);

        final ResourceID reportOwner = ResourceID.generate();
        final SlotReport report = context.taskSlotTable.createSlotReport(reportOwner);

        final SlotStatus firstStaticSlot =
                new SlotStatus(new SlotID(reportOwner, 0), DEFAULT_RESOURCE_PROFILE);
        final SlotStatus secondStaticSlot =
                new SlotStatus(new SlotID(reportOwner, 1), DEFAULT_RESOURCE_PROFILE);
        final SlotStatus allocatedDynamicSlot =
                new SlotStatus(
                        new SlotID(reportOwner, 2), dynamicProfile, jobId, dynamicAllocation);

        assertThat(
                report,
                containsInAnyOrder(firstStaticSlot, secondStaticSlot, allocatedDynamicSlot));
    }
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class TaskExecutorTest method runResourceManagerHeartbeatTest.
/**
 * Shared driver for the resource manager heartbeat tests.
 *
 * <p>Registers a {@link TestingResourceManagerGateway} that answers the first registration
 * attempt with a success and the second one with a never-completing future, starts a
 * TaskExecutor, announces the resource manager as leader, and then runs the given heartbeat
 * action. Afterwards it asserts that the TaskExecutor was disconnected from the resource
 * manager and that a second registration attempt was made.
 *
 * @param heartbeatServices heartbeat services handed to the TaskExecutor under test
 * @param setupResourceManagerGateway hook to customize the testing resource manager gateway
 *     before it is registered with the RPC service
 * @param heartbeatAction action invoked with the TaskExecutor gateway, the resource manager's
 *     resource id and the future that is completed on disconnect
 * @throws Exception if any step of the test fails
 */
private void runResourceManagerHeartbeatTest(
        HeartbeatServices heartbeatServices,
        Consumer<TestingResourceManagerGateway> setupResourceManagerGateway,
        TriConsumerWithException<
                        TaskExecutorGateway, ResourceID, CompletableFuture<ResourceID>, Exception>
                heartbeatAction)
        throws Exception {
    final String rmAddress = "rm";
    final ResourceID rmResourceId = new ResourceID(rmAddress);
    final ResourceManagerId rmLeaderId = ResourceManagerId.generate();

    TestingResourceManagerGateway rmGateway =
            new TestingResourceManagerGateway(rmLeaderId, rmResourceId, rmAddress, rmAddress);

    final TaskExecutorRegistrationSuccess registrationResponse =
            new TaskExecutorRegistrationSuccess(
                    new InstanceID(), rmResourceId, new ClusterInformation("localhost", 1234));

    // First attempt succeeds; the second (the reconnect) is left pending.
    final Queue<CompletableFuture<RegistrationResponse>> registrationResponses =
            new ArrayDeque<>(2);
    registrationResponses.add(CompletableFuture.completedFuture(registrationResponse));
    registrationResponses.add(new CompletableFuture<>());

    final CompletableFuture<ResourceID> taskExecutorRegistrationFuture =
            new CompletableFuture<>();
    final CountDownLatch registrationAttempts = new CountDownLatch(2);
    rmGateway.setRegisterTaskExecutorFunction(
            registration -> {
                taskExecutorRegistrationFuture.complete(registration.getResourceId());
                registrationAttempts.countDown();
                return registrationResponses.poll();
            });

    setupResourceManagerGateway.accept(rmGateway);

    final CompletableFuture<ResourceID> taskExecutorDisconnectFuture =
            new CompletableFuture<>();
    rmGateway.setDisconnectTaskExecutorConsumer(
            disconnectInfo -> taskExecutorDisconnectFuture.complete(disconnectInfo.f0));

    rpc.registerGateway(rmAddress, rmGateway);

    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .build();

    final TaskExecutor taskManager = createTaskExecutor(taskManagerServices, heartbeatServices);

    try {
        taskManager.start();
        final TaskExecutorGateway taskExecutorGateway =
                taskManager.getSelfGateway(TaskExecutorGateway.class);

        // define a leader and see that a registration happens
        resourceManagerLeaderRetriever.notifyListener(rmAddress, rmLeaderId.toUUID());

        // register resource manager success will trigger monitoring heartbeat target between tm
        // and rm
        assertThat(
                taskExecutorRegistrationFuture.get(),
                equalTo(unresolvedTaskManagerLocation.getResourceID()));

        heartbeatAction.accept(
                taskExecutorGateway, rmGateway.getOwnResourceId(), taskExecutorDisconnectFuture);

        // heartbeat timeout should trigger disconnect TaskManager from ResourceManager
        assertThat(
                taskExecutorDisconnectFuture.get(
                        timeout.toMilliseconds(), TimeUnit.MILLISECONDS),
                equalTo(unresolvedTaskManagerLocation.getResourceID()));

        // The timeout value is in milliseconds, so the unit must be MILLISECONDS; pairing it
        // with SECONDS would stretch the wait by a factor of 1000 when no reconnect happens.
        assertTrue(
                "The TaskExecutor should try to reconnect to the RM",
                registrationAttempts.await(timeout.toMilliseconds(), TimeUnit.MILLISECONDS));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskManager, timeout);
    }
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class JobMasterTest method testResourceManagerBecomesUnreachableTriggersDisconnect.
/**
 * Checks that the JobMaster disconnects from a resource manager whose heartbeat requests fail
 * with a {@link RecipientUnreachableException}, and that it then attempts to register again.
 */
@Test
public void testResourceManagerBecomesUnreachableTriggersDisconnect() throws Exception {
    final String rmAddress = "rm";
    final ResourceManagerId rmId = ResourceManagerId.generate();
    final ResourceID rmResourceId = new ResourceID(rmAddress);

    final TestingResourceManagerGateway rmGateway =
            new TestingResourceManagerGateway(rmId, rmResourceId, rmAddress, "localhost");

    final CompletableFuture<JobID> disconnectedJob = new CompletableFuture<>();
    final CountDownLatch registrationLatch = new CountDownLatch(2);

    // The first registration succeeds, the second one never completes.
    final Queue<CompletableFuture<RegistrationResponse>> pendingResponses =
            new ArrayDeque<>(2);
    pendingResponses.add(
            CompletableFuture.completedFuture(rmGateway.getJobMasterRegistrationSuccess()));
    pendingResponses.add(new CompletableFuture<>());

    rmGateway.setRegisterJobManagerFunction(
            (ignoredJobMasterId, ignoredResourceId, ignoredAddress, ignoredJobId) -> {
                registrationLatch.countDown();
                return pendingResponses.poll();
            });
    rmGateway.setDisconnectJobManagerConsumer(
            disconnectInfo -> disconnectedJob.complete(disconnectInfo.f0));
    // Every heartbeat towards the resource manager fails as "unreachable".
    rmGateway.setJobMasterHeartbeatFunction(
            ignored ->
                    FutureUtils.completedExceptionally(
                            new RecipientUnreachableException(
                                    "sender", "recipient", "resource manager is unreachable")));

    rpcService.registerGateway(rmAddress, rmGateway);

    final JobMaster jobMaster =
            new JobMasterBuilder(jobGraph, rpcService)
                    .withJobMasterId(jobMasterId)
                    .withResourceId(jmResourceId)
                    .withConfiguration(configuration)
                    .withHighAvailabilityServices(haServices)
                    .withHeartbeatServices(heartbeatServices)
                    .createJobMaster();

    jobMaster.start();

    try {
        // announce the leader so that the JobMaster registers at the resource manager
        rmLeaderRetrievalService.notifyListener(rmAddress, rmId.toUUID());

        final JobMasterGateway selfGateway = jobMaster.getSelfGateway(JobMasterGateway.class);

        // keep sending heartbeats until the disconnect has been observed
        CommonTestUtils.waitUntilCondition(
                () -> {
                    selfGateway.heartbeatFromResourceManager(rmResourceId);
                    return disconnectedJob.isDone();
                },
                Deadline.fromNow(TimeUtils.toDuration(testingTimeout)),
                50L);

        // the unreachable resource manager should trigger a disconnect of the JobManager
        assertThat(disconnectedJob.join(), equalTo(jobGraph.getJobID()));

        // the JobMaster should try to reconnect to the RM
        registrationLatch.await();
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class AbstractFineGrainedSlotManagerITCase method testRequirementDeclaration.
/**
 * Declares a requirement for one default-profile slot and registers a task executor, in the
 * order selected by {@code scenario}, then checks that a matching slot request reaches the
 * task executor and that the slot is tracked under the requested allocation id.
 *
 * @param scenario whether the task executor registers before or after the requirement
 *     declaration
 * @throws Exception if the test fails
 */
private void testRequirementDeclaration(RequirementDeclarationScenario scenario)
        throws Exception {
    final ResourceID taskExecutorId = ResourceID.generate();
    final JobID jobId = new JobID();
    final SlotID expectedSlotId = SlotID.getDynamicSlotID(taskExecutorId);
    final String targetAddress = "localhost";

    final ResourceRequirement singleDefaultSlot =
            ResourceRequirement.create(DEFAULT_SLOT_RESOURCE_PROFILE, 1);
    final ResourceRequirements requirements =
            ResourceRequirements.create(
                    jobId, targetAddress, Collections.singleton(singleDefaultSlot));

    final CompletableFuture<
                    Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>>
            slotRequestFuture = new CompletableFuture<>();

    // task executor stub that records and acknowledges an incoming slot request
    final TaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setRequestSlotFunction(
                            slotRequest -> {
                                slotRequestFuture.complete(slotRequest);
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            })
                    .createTestingTaskExecutorGateway();
    final TaskExecutorConnection taskExecutorConnection =
            new TaskExecutorConnection(taskExecutorId, taskExecutorGateway);

    new Context() {
        {
            runTest(
                    () -> {
                        if (scenario
                                == RequirementDeclarationScenario
                                        .TASK_EXECUTOR_REGISTRATION_BEFORE_REQUIREMENT_DECLARATION) {
                            runInMainThread(
                                    () ->
                                            getSlotManager()
                                                    .registerTaskManager(
                                                            taskExecutorConnection,
                                                            new SlotReport(),
                                                            DEFAULT_TOTAL_RESOURCE_PROFILE,
                                                            DEFAULT_SLOT_RESOURCE_PROFILE));
                        }

                        runInMainThread(
                                () -> getSlotManager().processResourceRequirements(requirements));

                        if (scenario
                                == RequirementDeclarationScenario
                                        .TASK_EXECUTOR_REGISTRATION_AFTER_REQUIREMENT_DECLARATION) {
                            runInMainThread(
                                    () ->
                                            getSlotManager()
                                                    .registerTaskManager(
                                                            taskExecutorConnection,
                                                            new SlotReport(),
                                                            DEFAULT_TOTAL_RESOURCE_PROFILE,
                                                            DEFAULT_SLOT_RESOURCE_PROFILE));
                        }

                        // The allocation id is chosen by the slot manager, so it is taken
                        // from the received request rather than predicted.
                        final Tuple6<
                                        SlotID,
                                        JobID,
                                        AllocationID,
                                        ResourceProfile,
                                        String,
                                        ResourceManagerId>
                                receivedRequest = assertFutureCompleteAndReturn(slotRequestFuture);
                        final AllocationID requestedAllocationId = receivedRequest.f2;

                        assertThat(
                                receivedRequest,
                                is(
                                        equalTo(
                                                Tuple6.of(
                                                        expectedSlotId,
                                                        jobId,
                                                        requestedAllocationId,
                                                        DEFAULT_SLOT_RESOURCE_PROFILE,
                                                        targetAddress,
                                                        getResourceManagerId()))));

                        final TaskManagerSlotInformation slot =
                                getTaskManagerTracker()
                                        .getAllocatedOrPendingSlot(requestedAllocationId)
                                        .get();

                        assertEquals(
                                "The slot has not been allocated to the expected allocation id.",
                                requestedAllocationId,
                                slot.getAllocationId());
                    });
        }
    };
}
Aggregations