use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.
the class ResourceManagerTaskExecutorTest method testDelayedRegisterTaskExecutor.
/**
 * Tests that a delayed task executor registration does not invalidate a later, successful one.
 *
 * <p>The delay is introduced while the resource manager connects to the registering task
 * executor: the first attempt blocks inside the gateway future until the test releases it and
 * is therefore expected to fail with a timeout. A second attempt without the delay is expected
 * to succeed, and it must still be registered after the first attempt is allowed to finish.
 */
@Test
public void testDelayedRegisterTaskExecutor() throws Exception {
    final Time fastTimeout = Time.milliseconds(1L);
    try {
        final OneShotLatch startConnection = new OneShotLatch();
        final OneShotLatch finishConnection = new OneShotLatch();

        // first registration is with blocking connection
        rpcService.setRpcGatewayFutureFunction(rpcGateway -> CompletableFuture.supplyAsync(() -> {
            startConnection.trigger();
            try {
                finishConnection.await();
            } catch (InterruptedException e) {
                // Do not swallow the interruption: restore the flag so that the thread
                // running this supplier can still observe it after we return.
                Thread.currentThread().interrupt();
            }
            return rpcGateway;
        }, TestingUtils.defaultExecutor()));

        TaskExecutorRegistration taskExecutorRegistration = new TaskExecutorRegistration(taskExecutorGateway.getAddress(), taskExecutorResourceID, dataPort, jmxPort, hardwareDescription, new TaskExecutorMemoryConfiguration(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), DEFAULT_SLOT_PROFILE, DEFAULT_SLOT_PROFILE);

        CompletableFuture<RegistrationResponse> firstFuture = rmGateway.registerTaskExecutor(taskExecutorRegistration, fastTimeout);
        try {
            firstFuture.get();
            fail("Should have failed because connection to taskmanager is delayed beyond timeout");
        } catch (Exception e) {
            final Throwable cause = ExceptionUtils.stripExecutionException(e);
            assertThat(cause, instanceOf(TimeoutException.class));
            assertThat(cause.getMessage(), containsString("ResourceManagerGateway.registerTaskExecutor"));
        }

        // make sure the blocking connection attempt has actually been started
        startConnection.await();

        // second registration after timeout is with no delay, expecting it to be succeeded
        rpcService.resetRpcGatewayFutureFunction();
        CompletableFuture<RegistrationResponse> secondFuture = rmGateway.registerTaskExecutor(taskExecutorRegistration, TIMEOUT);
        RegistrationResponse response = secondFuture.get();
        assertTrue(response instanceof TaskExecutorRegistrationSuccess);

        // on success, send slot report for taskmanager registration
        final SlotReport slotReport = new SlotReport(new SlotStatus(new SlotID(taskExecutorResourceID, 0), ResourceProfile.ANY));
        rmGateway.sendSlotReport(taskExecutorResourceID, ((TaskExecutorRegistrationSuccess) response).getRegistrationId(), slotReport, TIMEOUT).get();

        // let the remaining part of the first registration proceed
        finishConnection.trigger();
        // NOTE(review): presumably gives the released first registration a chance to be
        // processed before the verification below - confirm that 1 ms is sufficient.
        Thread.sleep(1L);

        // verify that the latest registration is valid not being unregistered by the delayed
        // one
        final TaskManagerInfoWithSlots taskManagerInfoWithSlots = rmGateway.requestTaskManagerDetailsInfo(taskExecutorResourceID, TIMEOUT).get();
        assertThat(taskManagerInfoWithSlots.getTaskManagerInfo().getResourceId(), equalTo(taskExecutorResourceID));
        assertThat(taskManagerInfoWithSlots.getTaskManagerInfo().getNumberSlots(), equalTo(1));
    } finally {
        // always restore the default gateway behaviour, even if the test failed
        rpcService.resetRpcGatewayFutureFunction();
    }
}
use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.
the class ResourceManagerTaskExecutorTest method testRegisterTaskExecutor.
/**
 * Verifies a regular task executor registration and a subsequent duplicate registration of the
 * same task executor.
 */
@Test
public void testRegisterTaskExecutor() throws Exception {
    // initial registration must be answered with a success response
    final RegistrationResponse firstResponse =
            registerTaskExecutor(rmGateway, taskExecutorGateway.getAddress())
                    .get(TIMEOUT.toMilliseconds(), TimeUnit.MILLISECONDS);
    assertTrue(firstResponse instanceof TaskExecutorRegistrationSuccess);

    // the task executor must now be known under its resource id
    final TaskManagerInfoWithSlots registeredTaskManager =
            rmGateway.requestTaskManagerDetailsInfo(taskExecutorResourceID, TIMEOUT).get();
    assertThat(
            registeredTaskManager.getTaskManagerInfo().getResourceId(),
            equalTo(taskExecutorResourceID));

    // registering the same task executor again must succeed as well ...
    final RegistrationResponse secondResponse =
            registerTaskExecutor(rmGateway, taskExecutorGateway.getAddress()).get();
    assertTrue(secondResponse instanceof TaskExecutorRegistrationSuccess);

    // ... but with a registration id that differs from the first one
    final TaskExecutorRegistrationSuccess firstSuccess =
            (TaskExecutorRegistrationSuccess) firstResponse;
    final TaskExecutorRegistrationSuccess secondSuccess =
            (TaskExecutorRegistrationSuccess) secondResponse;
    assertNotEquals(firstSuccess.getRegistrationId(), secondSuccess.getRegistrationId());

    // the duplicate registration must not be counted as an additional task manager
    assertThat(rmGateway.requestResourceOverview(TIMEOUT).get().getNumberTaskManagers(), is(1));
}
use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.
the class ResourceManagerTaskExecutorTest method testDisconnectTaskExecutor.
/**
 * Tests that a TaskExecutor can disconnect from the {@link ResourceManager}: after the
 * disconnect neither the task manager nor its slots are reported in the resource overview.
 */
@Test
public void testDisconnectTaskExecutor() throws Exception {
    final int numberSlots = 10;

    // register a task executor whose total resources cover all slots
    final TaskExecutorRegistration registration =
            new TaskExecutorRegistration(
                    taskExecutorGateway.getAddress(),
                    taskExecutorResourceID,
                    dataPort,
                    jmxPort,
                    hardwareDescription,
                    new TaskExecutorMemoryConfiguration(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L),
                    DEFAULT_SLOT_PROFILE,
                    DEFAULT_SLOT_PROFILE.multiply(numberSlots));
    final RegistrationResponse response =
            rmGateway.registerTaskExecutor(registration, TIMEOUT).get();
    assertThat(response, instanceOf(TaskExecutorRegistrationSuccess.class));

    // report the slots under the registration id handed out by the resource manager
    final Collection<SlotStatus> slotStatuses = createSlots(numberSlots);
    rmGateway
            .sendSlotReport(
                    taskExecutorResourceID,
                    ((TaskExecutorRegistrationSuccess) response).getRegistrationId(),
                    new SlotReport(slotStatuses),
                    TIMEOUT)
            .get();

    // before the disconnect: one task manager with all of its slots
    final ResourceOverview overviewBefore = rmGateway.requestResourceOverview(TIMEOUT).get();
    assertThat(overviewBefore.getNumberTaskManagers(), is(1));
    assertThat(overviewBefore.getNumberRegisteredSlots(), is(numberSlots));

    rmGateway.disconnectTaskManager(
            taskExecutorResourceID, new FlinkException("testDisconnectTaskExecutor"));

    // after the disconnect: nothing is left
    final ResourceOverview overviewAfter = rmGateway.requestResourceOverview(TIMEOUT).get();
    assertThat(overviewAfter.getNumberTaskManagers(), is(0));
    assertThat(overviewAfter.getNumberRegisteredSlots(), is(0));
}
use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.
the class ResourceManager method registerTaskExecutorInternal.
/**
 * Registers a new TaskExecutor, replacing a registration that may already exist for the same
 * resource id.
 *
 * @param taskExecutorGateway gateway of the TaskExecutor that is being registered
 * @param taskExecutorRegistration task executor registration parameters
 * @return a {@link TaskExecutorRegistrationSuccess} if the worker is recognized, otherwise a
 *     {@link TaskExecutorRegistrationRejection}
 */
private RegistrationResponse registerTaskExecutorInternal(TaskExecutorGateway taskExecutorGateway, TaskExecutorRegistration taskExecutorRegistration) {
    final ResourceID executorId = taskExecutorRegistration.getResourceId();

    // drop a previous registration of the same TaskExecutor, if there is one
    final WorkerRegistration<WorkerType> previous = taskExecutors.remove(executorId);
    if (previous != null) {
        // TODO :: suggest old taskExecutor to stop itself
        log.debug("Replacing old registration of TaskExecutor {}.", executorId.getStringWithMetadata());

        // remove old task manager registration from slot manager
        final ResourceManagerException reconnectCause =
                new ResourceManagerException(
                        String.format(
                                "TaskExecutor %s re-connected to the ResourceManager.",
                                executorId.getStringWithMetadata()));
        slotManager.unregisterTaskManager(previous.getInstanceID(), reconnectCause);
    }

    final WorkerType worker = workerStarted(executorId);
    final String address = taskExecutorRegistration.getTaskExecutorAddress();

    // a null worker means the framework does not know this TaskExecutor: reject it
    if (worker == null) {
        log.warn(
                "Discard registration from TaskExecutor {} at ({}) because the framework did not recognize it",
                executorId.getStringWithMetadata(),
                address);
        return new TaskExecutorRegistrationRejection("The ResourceManager does not recognize this TaskExecutor.");
    }

    final WorkerRegistration<WorkerType> current =
            new WorkerRegistration<>(
                    taskExecutorGateway,
                    worker,
                    taskExecutorRegistration.getDataPort(),
                    taskExecutorRegistration.getJmxPort(),
                    taskExecutorRegistration.getHardwareDescription(),
                    taskExecutorRegistration.getMemoryConfiguration(),
                    taskExecutorRegistration.getTotalResourceProfile(),
                    taskExecutorRegistration.getDefaultSlotResourceProfile());

    log.info(
            "Registering TaskManager with ResourceID {} ({}) at ResourceManager",
            executorId.getStringWithMetadata(),
            address);

    // remember the registration and start heartbeating the TaskExecutor
    taskExecutors.put(executorId, current);
    taskManagerHeartbeatManager.monitorTarget(executorId, new TaskExecutorHeartbeatSender(taskExecutorGateway));

    return new TaskExecutorRegistrationSuccess(current.getInstanceID(), resourceId, clusterInformation);
}
Aggregations