use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method testJobLeaderDetection.
/**
 * Verifies that the TaskExecutor discovers the leader of a job for which it holds a reserved
 * slot and, once the leader is known, offers that slot to the JobManager.
 */
@Test
public void testJobLeaderDetection() throws Exception {
    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final JobLeaderService leaderService =
            new DefaultJobLeaderService(
                    unresolvedTaskManagerLocation,
                    RetryingRegistrationConfiguration.defaultConfiguration());

    // Resource manager stub that signals as soon as a slot report reaches it.
    final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
    final CompletableFuture<Void> firstSlotReport = new CompletableFuture<>();
    rmGateway.setSendSlotReportFunction(
            ignoredReport -> {
                firstSlotReport.complete(null);
                return CompletableFuture.completedFuture(Acknowledge.get());
            });

    // Job master stub that captures a copy of the offered slots and accepts all of them.
    final CompletableFuture<Collection<SlotOffer>> capturedOffers = new CompletableFuture<>();
    final TestingJobMasterGateway jmGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction(
                            (ignoredResourceId, offers) -> {
                                capturedOffers.complete(new ArrayList<>(offers));
                                return CompletableFuture.completedFuture(offers);
                            })
                    .build();

    rpc.registerGateway(rmGateway.getAddress(), rmGateway);
    rpc.registerGateway(jmGateway.getAddress(), jmGateway);

    final AllocationID expectedAllocationId = new AllocationID();

    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setJobLeaderService(leaderService)
                    .setTaskStateManager(stateStoresManager)
                    .build();

    final TaskExecutor taskExecutor = createTaskExecutor(services);

    try {
        taskExecutor.start();

        final TaskExecutorGateway selfGateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        // announce the resource manager leader and wait until the slot report was sent
        resourceManagerLeaderRetriever.notifyListener(
                rmGateway.getAddress(), rmGateway.getFencingToken().toUUID());
        firstSlotReport.get();

        // reserve slot 0 for the job
        requestSlot(
                selfGateway,
                jobId,
                expectedAllocationId,
                buildSlotID(0),
                ResourceProfile.ZERO,
                jmGateway.getAddress(),
                rmGateway.getFencingToken());

        // announce the job leader; the reserved slot is expected to be offered to it
        jobManagerLeaderRetriever.notifyListener(
                jmGateway.getAddress(), jmGateway.getFencingToken().toUUID());

        final Collection<AllocationID> offeredAllocationIds = new ArrayList<>();
        for (SlotOffer offer : capturedOffers.get()) {
            offeredAllocationIds.add(offer.getAllocationId());
        }

        assertThat(offeredAllocationIds, containsInAnyOrder(expectedAllocationId));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method testRemoveJobFromJobLeaderService.
/**
 * Verifies that the JobLeaderService no longer tracks a job after the TaskExecutor has freed
 * the only slot that was allocated for it.
 *
 * <p>See FLINK-8504
 */
@Test
public void testRemoveJobFromJobLeaderService() throws Exception {
    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();

    final TestingTaskExecutor executor = createTestingTaskExecutor(services);

    try {
        // Resource manager stub that signals as soon as a slot report reaches it.
        final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
        final CompletableFuture<Void> firstSlotReport = new CompletableFuture<>();
        rmGateway.setSendSlotReportFunction(
                ignoredReport -> {
                    firstSlotReport.complete(null);
                    return CompletableFuture.completedFuture(Acknowledge.get());
                });

        final ResourceManagerId rmId = rmGateway.getFencingToken();
        rpc.registerGateway(rmGateway.getAddress(), rmGateway);
        resourceManagerLeaderRetriever.notifyListener(rmGateway.getAddress(), rmId.toUUID());

        // Leader retrieval service for the job that reports its own start and stop.
        final CompletableFuture<LeaderRetrievalListener> retrieverStarted =
                new CompletableFuture<>();
        final CompletableFuture<Void> retrieverStopped = new CompletableFuture<>();
        final StartStopNotifyingLeaderRetrievalService notifyingRetriever =
                new StartStopNotifyingLeaderRetrievalService(retrieverStarted, retrieverStopped);
        haServices.setJobMasterLeaderRetriever(jobId, notifyingRetriever);

        executor.start();
        executor.waitUntilStarted();

        final TaskExecutorGateway gateway = executor.getSelfGateway(TaskExecutorGateway.class);
        final SlotID slot = buildSlotID(0);
        final AllocationID allocation = new AllocationID();

        // nothing has been requested for the job yet
        assertThat(retrieverStarted.isDone(), is(false));
        final JobLeaderService leaderService = services.getJobLeaderService();
        assertThat(leaderService.containsJob(jobId), is(false));

        // the slot is requested only after the initial slot report went out
        firstSlotReport.get();

        requestSlot(gateway, jobId, allocation, slot, ResourceProfile.ZERO, "foobar", rmId);

        // block until the leader retrieval service for jobId has been started
        retrieverStarted.get();
        assertThat(leaderService.containsJob(jobId), is(true));

        gateway.freeSlot(allocation, new FlinkException("Test exception"), timeout).get();

        // block until the leader retrieval service for jobId has been stopped, which is
        // expected because the job should be removed
        retrieverStopped.get();
        assertThat(leaderService.containsJob(jobId), is(false));
    } finally {
        RpcUtils.terminateRpcEndpoint(executor, timeout);
    }
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method testRMHeartbeatStopWhenLeadershipRevoked.
/**
 * Verifies that the TaskExecutor stops heartbeating with the ResourceManager once it learns
 * that the ResourceManager has lost leadership.
 *
 * <p>See FLINK-8462
 */
@Test
public void testRMHeartbeatStopWhenLeadershipRevoked() throws Exception {
    final long heartbeatInterval = 1L;
    final long heartbeatTimeout = 10000L;
    final long pollTimeout = 1000L;

    final RecordingHeartbeatServices recordingHeartbeats =
            new RecordingHeartbeatServices(heartbeatInterval, heartbeatTimeout);
    final ResourceID rmResourceID = ResourceID.generate();
    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);

    final String rmAddress = "rm";
    final TestingResourceManagerGateway rmGateway =
            new TestingResourceManagerGateway(
                    ResourceManagerId.generate(), rmResourceID, rmAddress, rmAddress);
    rpc.registerGateway(rmAddress, rmGateway);

    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();

    final TaskExecutor executor = createTaskExecutor(services, recordingHeartbeats);

    try {
        executor.start();

        final BlockingQueue<ResourceID> unmonitored = recordingHeartbeats.getUnmonitoredTargets();
        final BlockingQueue<ResourceID> monitored = recordingHeartbeats.getMonitoredTargets();

        resourceManagerLeaderRetriever.notifyListener(
                rmAddress, rmGateway.getFencingToken().toUUID());

        // registration is observed indirectly: the RM must show up as a monitored target
        final ResourceID monitoredTarget = monitored.poll(pollTimeout, TimeUnit.MILLISECONDS);
        assertThat(monitoredTarget, equalTo(rmResourceID));

        // revoke the RM's leadership
        resourceManagerLeaderRetriever.notifyListener(null, null);

        // the heartbeat timeout is much larger than the poll timeout, so it should not
        // have fired by the time the RM is reported as unmonitored
        final ResourceID unmonitoredTarget =
                unmonitored.poll(pollTimeout, TimeUnit.MILLISECONDS);
        assertThat(unmonitoredTarget, equalTo(rmResourceID));
    } finally {
        RpcUtils.terminateRpcEndpoint(executor, timeout);
    }
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method runJobManagerHeartbeatTest.
/**
 * Shared driver for the JobManager heartbeat tests.
 *
 * <p>Starts a TaskExecutor with a single slot, connects it to a testing ResourceManager,
 * allocates the slot for {@code jobId}, announces the JobMaster leader and waits until the
 * slot has been offered. It then runs {@code heartbeatAction} and expects the TaskExecutor to
 * call {@code disconnectTaskManager} on the JobMaster with its own resource ID and to attempt
 * a second registration at the JobMaster.
 *
 * @param jmResourceId resource ID the JobMaster stub reports in its registration response
 * @param heartbeatServices heartbeat services handed to the TaskExecutor under test
 * @param jobMasterGatewayBuilderConsumer hook to customize the JobMaster gateway builder
 *     before the gateway is built
 * @param heartbeatAction action invoked with the TaskExecutor's resource ID, its gateway and
 *     the allocation ID once the slot has been offered; it is expected to lead to the
 *     disconnect from the JobManager
 */
private void runJobManagerHeartbeatTest(ResourceID jmResourceId, HeartbeatServices heartbeatServices, Consumer<TestingJobMasterGatewayBuilder> jobMasterGatewayBuilderConsumer, TriConsumer<ResourceID, TaskExecutorGateway, AllocationID> heartbeatAction) throws IOException, InterruptedException, ExecutionException, TimeoutException {
    final JobLeaderService jobLeaderService = new DefaultJobLeaderService(unresolvedTaskManagerLocation, RetryingRegistrationConfiguration.defaultConfiguration());
    final String jobMasterAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();
    // two registrations are expected: the initial one and the reconnect after the disconnect
    final CountDownLatch registrationAttempts = new CountDownLatch(2);
    final OneShotLatch slotOfferedLatch = new OneShotLatch();
    final CompletableFuture<ResourceID> disconnectTaskManagerFuture = new CompletableFuture<>();
    final TestingJobMasterGatewayBuilder testingJobMasterGatewayBuilder = new TestingJobMasterGatewayBuilder().setRegisterTaskManagerFunction((ignoredJobId, ignoredTaskManagerRegistrationInformation) -> {
        registrationAttempts.countDown();
        return CompletableFuture.completedFuture(new JMTMRegistrationSuccess(jmResourceId));
    }).setDisconnectTaskManagerFunction(resourceID -> {
        disconnectTaskManagerFuture.complete(resourceID);
        return CompletableFuture.completedFuture(Acknowledge.get());
    }).setOfferSlotsFunction((resourceID, slotOffers) -> {
        slotOfferedLatch.trigger();
        return CompletableFuture.completedFuture(slotOffers);
    });
    // let the caller adjust the gateway before it is built
    jobMasterGatewayBuilderConsumer.accept(testingJobMasterGatewayBuilder);
    final TestingJobMasterGateway jobMasterGateway = testingJobMasterGatewayBuilder.build();
    final TaskExecutorLocalStateStoresManager localStateStoresManager = createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1)).setJobLeaderService(jobLeaderService).setTaskStateManager(localStateStoresManager).build();
    final TestingTaskExecutor taskManager = createTestingTaskExecutor(taskManagerServices, heartbeatServices);
    final OneShotLatch slotReportReceived = new OneShotLatch();
    final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
    testingResourceManagerGateway.setSendSlotReportFunction(ignored -> {
        slotReportReceived.trigger();
        return CompletableFuture.completedFuture(Acknowledge.get());
    });
    // the first RM registration succeeds immediately; the second is answered with a future
    // that this method never completes
    final Queue<CompletableFuture<RegistrationResponse>> registrationResponses = new ArrayDeque<>();
    registrationResponses.add(CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), testingResourceManagerGateway.getOwnResourceId(), new ClusterInformation("foobar", 1234))));
    registrationResponses.add(new CompletableFuture<>());
    testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> registrationResponses.poll());
    rpc.registerGateway(jobMasterAddress, jobMasterGateway);
    rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
    try {
        taskManager.start();
        taskManager.waitUntilStarted();
        final TaskExecutorGateway taskExecutorGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);
        resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
        slotReportReceived.await();
        final AllocationID allocationId = new AllocationID();
        requestSlot(taskExecutorGateway, jobId, allocationId, buildSlotID(0), ResourceProfile.UNKNOWN, jobMasterAddress, testingResourceManagerGateway.getFencingToken());
        // now inform the task manager about the new job leader
        jobManagerLeaderRetriever.notifyListener(jobMasterAddress, jmLeaderId);
        // register task manager success will trigger monitoring heartbeat target between tm and
        // jm
        slotOfferedLatch.await();
        heartbeatAction.accept(unresolvedTaskManagerLocation.getResourceID(), taskExecutorGateway, allocationId);
        // the timeout should trigger disconnecting from the JobManager
        final ResourceID resourceID = disconnectTaskManagerFuture.get();
        assertThat(resourceID, equalTo(unresolvedTaskManagerLocation.getResourceID()));
        // the value comes from toMilliseconds(), so the unit must be MILLISECONDS; with SECONDS
        // a missing reconnect would block 1000x longer than intended instead of failing
        assertTrue("The TaskExecutor should try to reconnect to the JM", registrationAttempts.await(timeout.toMilliseconds(), TimeUnit.MILLISECONDS));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskManager, timeout);
    }
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method testShouldShutDownTaskManagerServicesInPostStop.
/**
 * Verifies that terminating the TaskExecutor leaves the task slot table closed, the shuffle
 * environment closed and the KvState service shut down.
 */
@Test
public void testShouldShutDownTaskManagerServicesInPostStop() throws Exception {
    final TaskSlotTableImpl<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);

    final JobLeaderService leaderService =
            new DefaultJobLeaderService(
                    unresolvedTaskManagerLocation,
                    RetryingRegistrationConfiguration.defaultConfiguration());

    final IOManager asyncIoManager = new IOManagerAsync(tmp.newFolder().getAbsolutePath());

    final TaskExecutorLocalStateStoresManager stateStoresManager =
            new TaskExecutorLocalStateStoresManager(
                    false,
                    Reference.borrowed(asyncIoManager.getSpillingDirectories()),
                    Executors.directExecutor());

    nettyShuffleEnvironment.start();

    final KvStateService stateService = new KvStateService(new KvStateRegistry(), null, null);
    stateService.start();

    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setIoManager(asyncIoManager)
                    .setShuffleEnvironment(nettyShuffleEnvironment)
                    .setKvStateService(stateService)
                    .setTaskSlotTable(slotTable)
                    .setJobLeaderService(leaderService)
                    .setTaskStateManager(stateStoresManager)
                    .build();

    final TaskExecutor executor = createTaskExecutor(services);

    // start the endpoint and terminate it right away; only the shutdown path matters here
    try {
        executor.start();
    } finally {
        RpcUtils.terminateRpcEndpoint(executor, timeout);
    }

    assertThat(slotTable.isClosed(), is(true));
    assertThat(nettyShuffleEnvironment.isClosed(), is(true));
    assertThat(stateService.isShutdown(), is(true));
}
Aggregations