use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.
the class TaskExecutorTest method testInitialSlotReportFailure.
/**
* Tests that the {@link TaskExecutor} tries to reconnect if the initial slot report fails.
*/
@Test
public void testInitialSlotReportFailure() throws Exception {
final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(1);
final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();
final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setTaskSlotTable(taskSlotTable).setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).build();
final TaskExecutor taskExecutor = createTaskExecutor(taskManagerServices);
taskExecutor.start();
try {
final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
final BlockingQueue<CompletableFuture<Acknowledge>> responseQueue = new ArrayBlockingQueue<>(2);
testingResourceManagerGateway.setSendSlotReportFunction(resourceIDInstanceIDSlotReportTuple3 -> {
try {
return responseQueue.take();
} catch (InterruptedException e) {
return FutureUtils.completedExceptionally(e);
}
});
final CompletableFuture<RegistrationResponse> registrationResponse = CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), testingResourceManagerGateway.getOwnResourceId(), new ClusterInformation("foobar", 1234)));
final CountDownLatch numberRegistrations = new CountDownLatch(2);
testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> {
numberRegistrations.countDown();
return registrationResponse;
});
responseQueue.offer(FutureUtils.completedExceptionally(new FlinkException("Test exception")));
responseQueue.offer(CompletableFuture.completedFuture(Acknowledge.get()));
rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
// wait for the second registration attempt
numberRegistrations.await();
} finally {
RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
}
}
use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.
the class TaskExecutorTest method createRmWithTmRegisterAndNotifySlotHooks.
private TestingResourceManagerGateway createRmWithTmRegisterAndNotifySlotHooks(InstanceID registrationId, OneShotLatch taskExecutorIsRegistered, CompletableFuture<Tuple3<InstanceID, SlotID, AllocationID>> availableSlotFuture) {
final TestingResourceManagerGateway resourceManagerGateway = new TestingResourceManagerGateway();
resourceManagerLeaderRetriever.notifyListener(resourceManagerGateway.getAddress(), resourceManagerGateway.getFencingToken().toUUID());
resourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(registrationId, resourceManagerGateway.getOwnResourceId(), new ClusterInformation("localhost", 1234))));
resourceManagerGateway.setNotifySlotAvailableConsumer(availableSlotFuture::complete);
resourceManagerGateway.setSendSlotReportFunction(ignored -> {
taskExecutorIsRegistered.trigger();
return CompletableFuture.completedFuture(Acknowledge.get());
});
return resourceManagerGateway;
}
use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.
the class TaskExecutorTest method testImmediatelyRegistersIfLeaderIsKnown.
@Test
public void testImmediatelyRegistersIfLeaderIsKnown() throws Exception {
final String resourceManagerAddress = "/resource/manager/address/one";
final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
final CountDownLatch taskManagerRegisteredLatch = new CountDownLatch(1);
testingResourceManagerGateway.setRegisterTaskExecutorFunction(FunctionUtils.uncheckedFunction(ignored -> {
taskManagerRegisteredLatch.countDown();
return CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), new ResourceID(resourceManagerAddress), new ClusterInformation("localhost", 1234)));
}));
rpc.registerGateway(resourceManagerAddress, testingResourceManagerGateway);
final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1)).setTaskStateManager(createTaskExecutorLocalStateStoresManager()).build();
final TaskExecutor taskManager = createTaskExecutor(taskManagerServices);
try {
taskManager.start();
resourceManagerLeaderRetriever.notifyListener(resourceManagerAddress, UUID.randomUUID());
assertTrue(taskManagerRegisteredLatch.await(timeout.toMilliseconds(), TimeUnit.MILLISECONDS));
} finally {
RpcUtils.terminateRpcEndpoint(taskManager, timeout);
}
}
use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.
the class TaskExecutorTest method testReconnectionAttemptIfExplicitlyDisconnected.
/**
* Tests that the TaskExecutor tries to reconnect to a ResourceManager from which it was
* explicitly disconnected.
*/
@Test
public void testReconnectionAttemptIfExplicitlyDisconnected() throws Exception {
final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(1);
final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();
final TaskExecutor taskExecutor = createTaskExecutor(new TaskManagerServicesBuilder().setTaskSlotTable(taskSlotTable).setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).build());
taskExecutor.start();
try {
final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
final ClusterInformation clusterInformation = new ClusterInformation("foobar", 1234);
final CompletableFuture<RegistrationResponse> registrationResponseFuture = CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), ResourceID.generate(), clusterInformation));
final BlockingQueue<ResourceID> registrationQueue = new ArrayBlockingQueue<>(1);
testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> {
registrationQueue.offer(taskExecutorRegistration.getResourceId());
return registrationResponseFuture;
});
rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
final ResourceID firstRegistrationAttempt = registrationQueue.take();
assertThat(firstRegistrationAttempt, equalTo(unresolvedTaskManagerLocation.getResourceID()));
final TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
assertThat(registrationQueue, is(empty()));
taskExecutorGateway.disconnectResourceManager(new FlinkException("Test exception"));
final ResourceID secondRegistrationAttempt = registrationQueue.take();
assertThat(secondRegistrationAttempt, equalTo(unresolvedTaskManagerLocation.getResourceID()));
} finally {
RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
}
}
use of org.apache.flink.runtime.entrypoint.ClusterInformation in project flink by apache.
the class TaskExecutorTest method runJobManagerHeartbeatTest.
private void runJobManagerHeartbeatTest(ResourceID jmResourceId, HeartbeatServices heartbeatServices, Consumer<TestingJobMasterGatewayBuilder> jobMasterGatewayBuilderConsumer, TriConsumer<ResourceID, TaskExecutorGateway, AllocationID> heartbeatAction) throws IOException, InterruptedException, ExecutionException, TimeoutException {
final JobLeaderService jobLeaderService = new DefaultJobLeaderService(unresolvedTaskManagerLocation, RetryingRegistrationConfiguration.defaultConfiguration());
final String jobMasterAddress = "jm";
final UUID jmLeaderId = UUID.randomUUID();
final CountDownLatch registrationAttempts = new CountDownLatch(2);
final OneShotLatch slotOfferedLatch = new OneShotLatch();
final CompletableFuture<ResourceID> disconnectTaskManagerFuture = new CompletableFuture<>();
final TestingJobMasterGatewayBuilder testingJobMasterGatewayBuilder = new TestingJobMasterGatewayBuilder().setRegisterTaskManagerFunction((ignoredJobId, ignoredTaskManagerRegistrationInformation) -> {
registrationAttempts.countDown();
return CompletableFuture.completedFuture(new JMTMRegistrationSuccess(jmResourceId));
}).setDisconnectTaskManagerFunction(resourceID -> {
disconnectTaskManagerFuture.complete(resourceID);
return CompletableFuture.completedFuture(Acknowledge.get());
}).setOfferSlotsFunction((resourceID, slotOffers) -> {
slotOfferedLatch.trigger();
return CompletableFuture.completedFuture(slotOffers);
});
jobMasterGatewayBuilderConsumer.accept(testingJobMasterGatewayBuilder);
final TestingJobMasterGateway jobMasterGateway = testingJobMasterGatewayBuilder.build();
final TaskExecutorLocalStateStoresManager localStateStoresManager = createTaskExecutorLocalStateStoresManager();
final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1)).setJobLeaderService(jobLeaderService).setTaskStateManager(localStateStoresManager).build();
final TestingTaskExecutor taskManager = createTestingTaskExecutor(taskManagerServices, heartbeatServices);
final OneShotLatch slotReportReceived = new OneShotLatch();
final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
testingResourceManagerGateway.setSendSlotReportFunction(ignored -> {
slotReportReceived.trigger();
return CompletableFuture.completedFuture(Acknowledge.get());
});
final Queue<CompletableFuture<RegistrationResponse>> registrationResponses = new ArrayDeque<>();
registrationResponses.add(CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), testingResourceManagerGateway.getOwnResourceId(), new ClusterInformation("foobar", 1234))));
registrationResponses.add(new CompletableFuture<>());
testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> registrationResponses.poll());
rpc.registerGateway(jobMasterAddress, jobMasterGateway);
rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
try {
taskManager.start();
taskManager.waitUntilStarted();
final TaskExecutorGateway taskExecutorGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);
resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
slotReportReceived.await();
final AllocationID allocationId = new AllocationID();
requestSlot(taskExecutorGateway, jobId, allocationId, buildSlotID(0), ResourceProfile.UNKNOWN, jobMasterAddress, testingResourceManagerGateway.getFencingToken());
// now inform the task manager about the new job leader
jobManagerLeaderRetriever.notifyListener(jobMasterAddress, jmLeaderId);
// register task manager success will trigger monitoring heartbeat target between tm and
// jm
slotOfferedLatch.await();
heartbeatAction.accept(unresolvedTaskManagerLocation.getResourceID(), taskExecutorGateway, allocationId);
// the timeout should trigger disconnecting from the JobManager
final ResourceID resourceID = disconnectTaskManagerFuture.get();
assertThat(resourceID, equalTo(unresolvedTaskManagerLocation.getResourceID()));
assertTrue("The TaskExecutor should try to reconnect to the JM", registrationAttempts.await(timeout.toMilliseconds(), TimeUnit.SECONDS));
} finally {
RpcUtils.terminateRpcEndpoint(taskManager, timeout);
}
}
Aggregations