Search in sources :

Example 1 with TaskExecutorRegistrationSuccess

use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.

the class ResourceManagerTaskExecutorTest method testDelayedRegisterTaskExecutor.

/**
 * Test delayed registration of task executor where the delay is introduced during connection
 * from resource manager to the registering task executor.
 *
 * <p>Sequence exercised by this test:
 * <ol>
 *   <li>A first registration is issued with a 1 ms timeout while the gateway future is blocked
 *       on a latch, and is expected to fail with a {@link TimeoutException}.</li>
 *   <li>A second registration is issued without the artificial delay and is expected to
 *       succeed; a slot report with a single slot is then sent for it.</li>
 *   <li>The blocked gateway future of the first registration is released, and the test checks
 *       that the task manager is still reported with the expected resource id and one slot.</li>
 * </ol>
 */
@Test
public void testDelayedRegisterTaskExecutor() throws Exception {
    // Deliberately tiny timeout so that the first (blocked) registration attempt times out.
    final Time fastTimeout = Time.milliseconds(1L);
    try {
        // Triggered once the delayed gateway future has started running.
        final OneShotLatch startConnection = new OneShotLatch();
        // Triggered by the test to let the delayed gateway future complete.
        final OneShotLatch finishConnection = new OneShotLatch();
        // first registration is with blocking connection
        rpcService.setRpcGatewayFutureFunction(rpcGateway -> CompletableFuture.supplyAsync(() -> {
            startConnection.trigger();
            try {
                finishConnection.await();
            } catch (InterruptedException ignored) {
                // NOTE(review): the interruption is swallowed and the gateway is returned
                // regardless; the interrupt flag is not restored on this executor thread.
            }
            return rpcGateway;
        }, TestingUtils.defaultExecutor()));
        TaskExecutorRegistration taskExecutorRegistration = new TaskExecutorRegistration(taskExecutorGateway.getAddress(), taskExecutorResourceID, dataPort, jmxPort, hardwareDescription, new TaskExecutorMemoryConfiguration(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L), DEFAULT_SLOT_PROFILE, DEFAULT_SLOT_PROFILE);
        CompletableFuture<RegistrationResponse> firstFuture = rmGateway.registerTaskExecutor(taskExecutorRegistration, fastTimeout);
        try {
            firstFuture.get();
            // fail() is not swallowed by the catch below as long as it throws an Error
            // (e.g. AssertionError) rather than an Exception.
            fail("Should have failed because connection to taskmanager is delayed beyond timeout");
        } catch (Exception e) {
            // Unwrap the ExecutionException layer(s) before inspecting the real cause.
            final Throwable cause = ExceptionUtils.stripExecutionException(e);
            assertThat(cause, instanceOf(TimeoutException.class));
            assertThat(cause.getMessage(), containsString("ResourceManagerGateway.registerTaskExecutor"));
        }
        // Make sure the delayed gateway future is actually in flight before continuing.
        startConnection.await();
        // second registration after timeout is with no delay, expecting it to be succeeded
        rpcService.resetRpcGatewayFutureFunction();
        CompletableFuture<RegistrationResponse> secondFuture = rmGateway.registerTaskExecutor(taskExecutorRegistration, TIMEOUT);
        RegistrationResponse response = secondFuture.get();
        assertTrue(response instanceof TaskExecutorRegistrationSuccess);
        // on success, send slot report for taskmanager registration
        final SlotReport slotReport = new SlotReport(new SlotStatus(new SlotID(taskExecutorResourceID, 0), ResourceProfile.ANY));
        rmGateway.sendSlotReport(taskExecutorResourceID, ((TaskExecutorRegistrationSuccess) response).getRegistrationId(), slotReport, TIMEOUT).get();
        // let the remaining part of the first registration proceed
        finishConnection.trigger();
        // NOTE(review): a 1 ms sleep gives the delayed registration a chance to run but does
        // not guarantee it has been processed before the checks below — confirm whether a
        // deterministic signal is available.
        Thread.sleep(1L);
        // verify that the latest registration is still valid, i.e. it has not been
        // unregistered by the delayed one
        final TaskManagerInfoWithSlots taskManagerInfoWithSlots = rmGateway.requestTaskManagerDetailsInfo(taskExecutorResourceID, TIMEOUT).get();
        assertThat(taskManagerInfoWithSlots.getTaskManagerInfo().getResourceId(), equalTo(taskExecutorResourceID));
        assertThat(taskManagerInfoWithSlots.getTaskManagerInfo().getNumberSlots(), equalTo(1));
    } finally {
        // Always restore the default gateway future function so later tests are unaffected.
        rpcService.resetRpcGatewayFutureFunction();
    }
}
Also used : SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) TaskExecutorMemoryConfiguration(org.apache.flink.runtime.taskexecutor.TaskExecutorMemoryConfiguration) TaskExecutorRegistrationSuccess(org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess) Time(org.apache.flink.api.common.time.Time) FlinkException(org.apache.flink.util.FlinkException) FencingTokenException(org.apache.flink.runtime.rpc.exceptions.FencingTokenException) TimeoutException(java.util.concurrent.TimeoutException) ExecutionException(java.util.concurrent.ExecutionException) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) Test(org.junit.Test)

Example 2 with TaskExecutorRegistrationSuccess

use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.

the class ResourceManagerTaskExecutorTest method testRegisterTaskExecutor.

/**
 * Verifies that a task executor can register at the resource manager, and that registering the
 * same task executor a second time also succeeds but is answered with a different registration
 * id while only one task manager remains registered.
 */
@Test
public void testRegisterTaskExecutor() throws Exception {
    final long timeoutMillis = TIMEOUT.toMilliseconds();

    // --- initial registration: must succeed within the timeout ---
    final CompletableFuture<RegistrationResponse> initialAttempt =
            registerTaskExecutor(rmGateway, taskExecutorGateway.getAddress());
    final RegistrationResponse initialResponse = initialAttempt.get(timeoutMillis, TimeUnit.MILLISECONDS);
    assertTrue(initialResponse instanceof TaskExecutorRegistrationSuccess);
    final TaskExecutorRegistrationSuccess initialSuccess = (TaskExecutorRegistrationSuccess) initialResponse;

    // the resource manager must now report the task manager under its resource id
    final TaskManagerInfoWithSlots registeredInfo =
            rmGateway.requestTaskManagerDetailsInfo(taskExecutorResourceID, TIMEOUT).get();
    assertThat(registeredInfo.getTaskManagerInfo().getResourceId(), equalTo(taskExecutorResourceID));

    // --- duplicate registration: succeeds again, but with a fresh registration id ---
    final CompletableFuture<RegistrationResponse> repeatedAttempt =
            registerTaskExecutor(rmGateway, taskExecutorGateway.getAddress());
    final RegistrationResponse repeatedResponse = repeatedAttempt.get();
    assertTrue(repeatedResponse instanceof TaskExecutorRegistrationSuccess);
    final TaskExecutorRegistrationSuccess repeatedSuccess = (TaskExecutorRegistrationSuccess) repeatedResponse;

    assertNotEquals(initialSuccess.getRegistrationId(), repeatedSuccess.getRegistrationId());

    // the duplicate must replace, not add to, the set of registered task managers
    assertThat(rmGateway.requestResourceOverview(TIMEOUT).get().getNumberTaskManagers(), is(1));
}
Also used : TaskExecutorRegistrationSuccess(org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) Test(org.junit.Test)

Example 3 with TaskExecutorRegistrationSuccess

use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.

the class ResourceManagerTaskExecutorTest method testDisconnectTaskExecutor.

/**
 * Checks that disconnecting a registered TaskExecutor removes it, together with all of its
 * reported slots, from the {@link ResourceManager}'s resource overview.
 */
@Test
public void testDisconnectTaskExecutor() throws Exception {
    final int numberSlots = 10;

    // register a task executor whose total resources cover numberSlots default slots
    final TaskExecutorMemoryConfiguration memoryConfiguration =
            new TaskExecutorMemoryConfiguration(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L);
    final TaskExecutorRegistration registration = new TaskExecutorRegistration(
            taskExecutorGateway.getAddress(),
            taskExecutorResourceID,
            dataPort,
            jmxPort,
            hardwareDescription,
            memoryConfiguration,
            DEFAULT_SLOT_PROFILE,
            DEFAULT_SLOT_PROFILE.multiply(numberSlots));
    final RegistrationResponse response = rmGateway.registerTaskExecutor(registration, TIMEOUT).get();
    assertThat(response, instanceOf(TaskExecutorRegistrationSuccess.class));

    // report the slots under the registration id handed out by the resource manager
    final InstanceID registrationId = ((TaskExecutorRegistrationSuccess) response).getRegistrationId();
    final SlotReport slotReport = new SlotReport(createSlots(numberSlots));
    rmGateway.sendSlotReport(taskExecutorResourceID, registrationId, slotReport, TIMEOUT).get();

    // before disconnect: one task manager with all of its slots
    final ResourceOverview overviewBefore = rmGateway.requestResourceOverview(TIMEOUT).get();
    assertThat(overviewBefore.getNumberTaskManagers(), is(1));
    assertThat(overviewBefore.getNumberRegisteredSlots(), is(numberSlots));

    rmGateway.disconnectTaskManager(taskExecutorResourceID, new FlinkException("testDisconnectTaskExecutor"));

    // after disconnect: neither the task manager nor its slots are reported any more
    final ResourceOverview overviewAfter = rmGateway.requestResourceOverview(TIMEOUT).get();
    assertThat(overviewAfter.getNumberTaskManagers(), is(0));
    assertThat(overviewAfter.getNumberRegisteredSlots(), is(0));
}
Also used : InstanceID(org.apache.flink.runtime.instance.InstanceID) SlotStatus(org.apache.flink.runtime.taskexecutor.SlotStatus) SlotReport(org.apache.flink.runtime.taskexecutor.SlotReport) TaskExecutorMemoryConfiguration(org.apache.flink.runtime.taskexecutor.TaskExecutorMemoryConfiguration) TaskExecutorRegistrationSuccess(org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) FlinkException(org.apache.flink.util.FlinkException) Test(org.junit.Test)

Example 4 with TaskExecutorRegistrationSuccess

use of org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess in project flink by apache.

the class ResourceManager method registerTaskExecutorInternal.

/**
 * Registers a TaskExecutor, replacing any registration that already exists for the same
 * resource id.
 *
 * <p>An existing registration is removed from {@code taskExecutors} and unregistered from the
 * slot manager first. The worker is then looked up via {@code workerStarted}; if that yields
 * {@code null} the registration is rejected, otherwise a new {@link WorkerRegistration} is
 * stored and the TaskExecutor is added as a heartbeat target.
 *
 * @param taskExecutorGateway gateway used for the new registration and its heartbeat sender
 * @param taskExecutorRegistration task executor registration parameters
 * @return a {@link TaskExecutorRegistrationRejection} if the worker is not recognized,
 *     otherwise a {@link TaskExecutorRegistrationSuccess} carrying the new instance id
 */
private RegistrationResponse registerTaskExecutorInternal(TaskExecutorGateway taskExecutorGateway, TaskExecutorRegistration taskExecutorRegistration) {
    final ResourceID executorId = taskExecutorRegistration.getResourceId();

    final WorkerRegistration<WorkerType> previous = taskExecutors.remove(executorId);
    if (previous != null) {
        // TODO :: suggest old taskExecutor to stop itself
        log.debug("Replacing old registration of TaskExecutor {}.", executorId.getStringWithMetadata());
        // the slot manager must forget the superseded registration as well
        slotManager.unregisterTaskManager(
                previous.getInstanceID(),
                new ResourceManagerException(
                        String.format("TaskExecutor %s re-connected to the ResourceManager.", executorId.getStringWithMetadata())));
    }

    final WorkerType worker = workerStarted(executorId);
    final String executorAddress = taskExecutorRegistration.getTaskExecutorAddress();

    // guard: a worker unknown to the framework is rejected outright
    if (worker == null) {
        log.warn("Discard registration from TaskExecutor {} at ({}) because the framework did " + "not recognize it", executorId.getStringWithMetadata(), executorAddress);
        return new TaskExecutorRegistrationRejection("The ResourceManager does not recognize this TaskExecutor.");
    }

    final WorkerRegistration<WorkerType> current = new WorkerRegistration<>(
            taskExecutorGateway,
            worker,
            taskExecutorRegistration.getDataPort(),
            taskExecutorRegistration.getJmxPort(),
            taskExecutorRegistration.getHardwareDescription(),
            taskExecutorRegistration.getMemoryConfiguration(),
            taskExecutorRegistration.getTotalResourceProfile(),
            taskExecutorRegistration.getDefaultSlotResourceProfile());
    log.info("Registering TaskManager with ResourceID {} ({}) at ResourceManager", executorId.getStringWithMetadata(), executorAddress);

    taskExecutors.put(executorId, current);
    taskManagerHeartbeatManager.monitorTarget(executorId, new TaskExecutorHeartbeatSender(taskExecutorGateway));

    return new TaskExecutorRegistrationSuccess(current.getInstanceID(), resourceId, clusterInformation);
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskExecutorRegistrationRejection(org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationRejection) TaskExecutorRegistrationSuccess(org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess) WorkerRegistration(org.apache.flink.runtime.resourcemanager.registration.WorkerRegistration) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException)

Aggregations

TaskExecutorRegistrationSuccess (org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationSuccess)4 RegistrationResponse (org.apache.flink.runtime.registration.RegistrationResponse)3 Test (org.junit.Test)3 SlotReport (org.apache.flink.runtime.taskexecutor.SlotReport)2 SlotStatus (org.apache.flink.runtime.taskexecutor.SlotStatus)2 TaskExecutorMemoryConfiguration (org.apache.flink.runtime.taskexecutor.TaskExecutorMemoryConfiguration)2 FlinkException (org.apache.flink.util.FlinkException)2 ExecutionException (java.util.concurrent.ExecutionException)1 TimeoutException (java.util.concurrent.TimeoutException)1 Time (org.apache.flink.api.common.time.Time)1 OneShotLatch (org.apache.flink.core.testutils.OneShotLatch)1 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)1 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)1 InstanceID (org.apache.flink.runtime.instance.InstanceID)1 ResourceManagerException (org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException)1 WorkerRegistration (org.apache.flink.runtime.resourcemanager.registration.WorkerRegistration)1 FencingTokenException (org.apache.flink.runtime.rpc.exceptions.FencingTokenException)1 TaskExecutorRegistrationRejection (org.apache.flink.runtime.taskexecutor.TaskExecutorRegistrationRejection)1