use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class JobMasterTest method testReleasingTaskExecutorIfNoMoreSlotsRegistered.
/**
 * Verifies that the JobMaster lets go of a TaskExecutor once the single slot it offered has been
 * failed: the slot must be freed on the TaskExecutor and the JobMaster must then disconnect from
 * it.
 */
@Test
public void testReleasingTaskExecutorIfNoMoreSlotsRegistered() throws Exception {
    final JobGraph jobGraph = createSingleVertexJobWithRestartStrategy();
    final JobID expectedJobId = jobGraph.getJobID();

    final JobMaster jobMaster =
            new JobMasterBuilder(jobGraph, rpcService)
                    .withConfiguration(configuration)
                    .withHighAvailabilityServices(haServices)
                    .withHeartbeatServices(heartbeatServices)
                    .createJobMaster();

    // Completed by the testing gateway below when the JobMaster calls back into it.
    final CompletableFuture<JobID> disconnectedFromJob = new CompletableFuture<>();
    final CompletableFuture<AllocationID> releasedAllocation = new CompletableFuture<>();

    final TestingTaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setFreeSlotFunction(
                            (freedAllocationId, cause) -> {
                                releasedAllocation.complete(freedAllocationId);
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            })
                    .setDisconnectJobManagerConsumer(
                            (disconnectedJobId, cause) ->
                                    disconnectedFromJob.complete(disconnectedJobId))
                    .createTestingTaskExecutorGateway();

    final LocalUnresolvedTaskManagerLocation taskManagerLocation =
            new LocalUnresolvedTaskManagerLocation();

    try {
        jobMaster.start();

        final JobMasterGateway jobMasterGateway =
                jobMaster.getSelfGateway(JobMasterGateway.class);

        final Collection<SlotOffer> acceptedOffers =
                registerSlotsAtJobMaster(
                        1,
                        jobMasterGateway,
                        expectedJobId,
                        taskExecutorGateway,
                        taskManagerLocation);

        // exactly the one offered slot must have been accepted
        assertThat(acceptedOffers, hasSize(1));

        final SlotOffer acceptedOffer = acceptedOffers.iterator().next();
        final AllocationID allocationId = acceptedOffer.getAllocationId();

        // fail the only allocation; this should make the JobMaster drop the TaskExecutor
        jobMasterGateway.failSlot(
                taskManagerLocation.getResourceID(),
                allocationId,
                new FlinkException("Fail allocation test exception"));

        // first the slot is freed, then the connection is closed because no slot of this
        // TaskExecutor is in use any longer
        assertThat(releasedAllocation.get(), equalTo(allocationId));
        assertThat(disconnectedFromJob.get(), equalTo(expectedJobId));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class DispatcherTest method testRetrieveJobResultAfterSubmissionOfFailedJob.
/**
 * Submits an already-failed job to the dispatcher and checks that the archived execution graph
 * subsequently returned for it carries the submitted job ID, job name, the FAILED status and the
 * original exception.
 */
@Test
public void testRetrieveJobResultAfterSubmissionOfFailedJob() throws Exception {
    dispatcher =
            createAndStartDispatcher(
                    heartbeatServices,
                    haServices,
                    new ExpectedJobIdJobManagerRunnerFactory(jobId, createdJobManagerRunnerLatch));
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);

    final JobID failedJobId = new JobID();
    final String failedJobName = "test";

    // Block until the dispatcher has acknowledged the failed-job submission.
    gateway.submitFailedJob(failedJobId, failedJobName, new RuntimeException("Test exception."))
            .get();

    final ArchivedExecutionGraph archivedGraph = gateway.requestJob(failedJobId, TIMEOUT).get();

    Assertions.assertThat(archivedGraph.getJobID()).isEqualTo(failedJobId);
    Assertions.assertThat(archivedGraph.getJobName()).isEqualTo(failedJobName);
    Assertions.assertThat(archivedGraph.getState()).isEqualTo(JobStatus.FAILED);

    // The failure info holds the exception in serialized form; deserialize it with the
    // current thread's context class loader before inspecting type and message.
    final ClassLoader contextClassLoader = Thread.currentThread().getContextClassLoader();
    Assertions.assertThat(archivedGraph.getFailureInfo())
            .isNotNull()
            .extracting(ErrorInfo::getException)
            .extracting(serialized -> serialized.deserializeError(contextClassLoader))
            .satisfies(
                    cause ->
                            Assertions.assertThat(cause)
                                    .isInstanceOf(RuntimeException.class)
                                    .hasMessage("Test exception."));
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class TaskExecutorTest method runJobManagerHeartbeatTest.
/**
 * Shared driver for the JobManager heartbeat tests of the TaskExecutor.
 *
 * <p>Starts a testing TaskExecutor, registers it at a testing ResourceManager, requests one slot
 * for {@code jobId}, announces the JobMaster leader and waits until the slot has been offered.
 * It then runs {@code heartbeatAction} and asserts that the TaskExecutor disconnects from the
 * JobMaster and afterwards attempts to register again.
 *
 * @param jmResourceId resource ID reported by the testing JobMaster on successful registration
 * @param heartbeatServices heartbeat services the TaskExecutor under test is created with
 * @param jobMasterGatewayBuilderConsumer hook for customizing the testing JobMaster gateway
 *     builder before the gateway is built
 * @param heartbeatAction action invoked with the TaskExecutor's resource ID, its gateway and the
 *     allocation ID of the requested slot, after the slot has been offered
 * @throws IOException declared by the helpers used for setup
 * @throws InterruptedException if the calling thread is interrupted while waiting
 * @throws ExecutionException if the disconnect future completes exceptionally
 * @throws TimeoutException declared by the helpers used for shutdown
 */
private void runJobManagerHeartbeatTest(ResourceID jmResourceId, HeartbeatServices heartbeatServices, Consumer<TestingJobMasterGatewayBuilder> jobMasterGatewayBuilderConsumer, TriConsumer<ResourceID, TaskExecutorGateway, AllocationID> heartbeatAction) throws IOException, InterruptedException, ExecutionException, TimeoutException {
    final JobLeaderService jobLeaderService =
            new DefaultJobLeaderService(
                    unresolvedTaskManagerLocation,
                    RetryingRegistrationConfiguration.defaultConfiguration());

    final String jobMasterAddress = "jm";
    final UUID jmLeaderId = UUID.randomUUID();

    // Two registrations are expected: the initial one and the reconnect after the disconnect.
    final CountDownLatch registrationAttempts = new CountDownLatch(2);
    final OneShotLatch slotOfferedLatch = new OneShotLatch();
    final CompletableFuture<ResourceID> disconnectTaskManagerFuture = new CompletableFuture<>();

    final TestingJobMasterGatewayBuilder testingJobMasterGatewayBuilder =
            new TestingJobMasterGatewayBuilder()
                    .setRegisterTaskManagerFunction(
                            (ignoredJobId, ignoredTaskManagerRegistrationInformation) -> {
                                registrationAttempts.countDown();
                                return CompletableFuture.completedFuture(
                                        new JMTMRegistrationSuccess(jmResourceId));
                            })
                    .setDisconnectTaskManagerFunction(
                            resourceID -> {
                                disconnectTaskManagerFuture.complete(resourceID);
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            })
                    .setOfferSlotsFunction(
                            (resourceID, slotOffers) -> {
                                slotOfferedLatch.trigger();
                                return CompletableFuture.completedFuture(slotOffers);
                            });

    // Let the caller adjust the gateway before it is built.
    jobMasterGatewayBuilderConsumer.accept(testingJobMasterGatewayBuilder);
    final TestingJobMasterGateway jobMasterGateway = testingJobMasterGatewayBuilder.build();

    final TaskExecutorLocalStateStoresManager localStateStoresManager =
            createTaskExecutorLocalStateStoresManager();

    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1))
                    .setJobLeaderService(jobLeaderService)
                    .setTaskStateManager(localStateStoresManager)
                    .build();

    final TestingTaskExecutor taskManager =
            createTestingTaskExecutor(taskManagerServices, heartbeatServices);

    final OneShotLatch slotReportReceived = new OneShotLatch();
    final TestingResourceManagerGateway testingResourceManagerGateway =
            new TestingResourceManagerGateway();
    testingResourceManagerGateway.setSendSlotReportFunction(
            ignored -> {
                slotReportReceived.trigger();
                return CompletableFuture.completedFuture(Acknowledge.get());
            });

    // The first registration at the ResourceManager succeeds immediately; the second one is
    // answered with a future that this method never completes.
    final Queue<CompletableFuture<RegistrationResponse>> registrationResponses =
            new ArrayDeque<>();
    registrationResponses.add(
            CompletableFuture.completedFuture(
                    new TaskExecutorRegistrationSuccess(
                            new InstanceID(),
                            testingResourceManagerGateway.getOwnResourceId(),
                            new ClusterInformation("foobar", 1234))));
    registrationResponses.add(new CompletableFuture<>());
    testingResourceManagerGateway.setRegisterTaskExecutorFunction(
            taskExecutorRegistration -> registrationResponses.poll());

    rpc.registerGateway(jobMasterAddress, jobMasterGateway);
    rpc.registerGateway(
            testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);

    try {
        taskManager.start();
        taskManager.waitUntilStarted();

        final TaskExecutorGateway taskExecutorGateway =
                taskManager.getSelfGateway(TaskExecutorGateway.class);

        resourceManagerLeaderRetriever.notifyListener(
                testingResourceManagerGateway.getAddress(),
                testingResourceManagerGateway.getFencingToken().toUUID());

        slotReportReceived.await();

        final AllocationID allocationId = new AllocationID();
        requestSlot(
                taskExecutorGateway,
                jobId,
                allocationId,
                buildSlotID(0),
                ResourceProfile.UNKNOWN,
                jobMasterAddress,
                testingResourceManagerGateway.getFencingToken());

        // now inform the task manager about the new job leader
        jobManagerLeaderRetriever.notifyListener(jobMasterAddress, jmLeaderId);

        // register task manager success will trigger monitoring heartbeat target between tm and
        // jm
        slotOfferedLatch.await();

        heartbeatAction.accept(
                unresolvedTaskManagerLocation.getResourceID(), taskExecutorGateway, allocationId);

        // the timeout should trigger disconnecting from the JobManager
        final ResourceID resourceID = disconnectTaskManagerFuture.get();
        assertThat(resourceID, equalTo(unresolvedTaskManagerLocation.getResourceID()));

        // The wait duration is obtained in milliseconds, so it must be paired with
        // TimeUnit.MILLISECONDS; pairing it with SECONDS would stretch the wait 1000-fold and
        // turn a missing reconnect into a hang instead of an assertion failure.
        assertTrue(
                "The TaskExecutor should try to reconnect to the JM",
                registrationAttempts.await(timeout.toMilliseconds(), TimeUnit.MILLISECONDS));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskManager, timeout);
    }
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class AbstractDispatcherTest method setUp.
/**
 * Prepares the fixtures shared by the dispatcher tests: heartbeat services, testing
 * high-availability services, an empty configuration and a blob server.
 */
@Before
public void setUp() throws Exception {
    heartbeatServices = new HeartbeatServices(1000L, 10000L);

    // Wire the testing HA services with standalone / embedded component implementations
    // before publishing them to the field.
    final TestingHighAvailabilityServices testingHaServices =
            new TestingHighAvailabilityServices();
    testingHaServices.setCheckpointRecoveryFactory(new StandaloneCheckpointRecoveryFactory());
    testingHaServices.setResourceManagerLeaderRetriever(new SettableLeaderRetrievalService());
    testingHaServices.setJobGraphStore(new StandaloneJobGraphStore());
    testingHaServices.setJobResultStore(new EmbeddedJobResultStore());
    haServices = testingHaServices;

    configuration = new Configuration();

    // The blob server stores its data in a fresh folder under the temporary-folder rule.
    blobServer =
            new BlobServer(configuration, temporaryFolder.newFolder(), new VoidBlobStore());
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class JobMasterTest method setupClass.
/**
 * Creates the class-wide fixtures: the testing RPC service and two heartbeat service
 * configurations, a regular one and a "fast" one built from the fast interval/timeout constants.
 */
@BeforeClass
public static void setupClass() {
    rpcService = new TestingRpcService();

    // NOTE(review): the third constructor argument differs between the two instances (1 vs.
    // -1); its meaning is not visible here — confirm against HeartbeatServices.
    heartbeatServices = new HeartbeatServices(heartbeatInterval, heartbeatTimeout, 1);
    fastHeartbeatServices =
            new HeartbeatServices(fastHeartbeatInterval, fastHeartbeatTimeout, -1);
}
Aggregations