Search in sources :

Example 11 with RpcService

use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.

the class TaskManagerRunnerConfigurationTest method testTaskManagerRpcServiceShouldBindToHostnameAddress.

/**
 * Creates a task manager {@link RpcService} from a configuration built with
 * {@code HostBindPolicy.NAME} and asserts that the address it reports is neither
 * {@code null} nor empty.
 *
 * <p>The RPC service (if it was created) and the high-availability services are
 * released in the {@code finally} block regardless of the assertion outcome.
 */
@Test
public void testTaskManagerRpcServiceShouldBindToHostnameAddress() throws Exception {
    final Configuration flinkConfig = createFlinkConfigWithHostBindPolicy(HostBindPolicy.NAME);
    final HighAvailabilityServices haServices = createHighAvailabilityServices(flinkConfig);

    // Stays null when creation fails, so the cleanup helper has to cope with that.
    RpcService rpcServiceUnderTest = null;
    try {
        rpcServiceUnderTest = TaskManagerRunner.createRpcService(flinkConfig, haServices, RPC_SYSTEM);
        final String reportedAddress = rpcServiceUnderTest.getAddress();
        assertThat(reportedAddress, not(isEmptyOrNullString()));
    } finally {
        maybeCloseRpcService(rpcServiceUnderTest);
        haServices.closeAndCleanupAllData();
    }
}
Also used : UnmodifiableConfiguration(org.apache.flink.configuration.UnmodifiableConfiguration) Configuration(org.apache.flink.configuration.Configuration) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) RpcService(org.apache.flink.runtime.rpc.RpcService) Test(org.junit.Test)

Example 12 with RpcService

use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.

the class JobManagerHAProcessFailureRecoveryITCase method testDispatcherProcessFailure.

/**
 * End-to-end recovery test: starts one dispatcher process and two in-JVM task manager
 * runners against a ZooKeeper HA configuration, launches the test program in a
 * separate thread, destroys the first dispatcher process once all tasks have signalled
 * readiness, starts a second dispatcher process, and then checks that the program
 * finishes without error.
 *
 * <p>Coordination with the running program goes through marker files in a temporary
 * directory (ready / proceed / finish markers). All waits are bounded by a single
 * {@code Deadline} derived from {@code TEST_TIMEOUT}.
 *
 * <p>On any failure the throwable is printed first, followed by the logs of every
 * started dispatcher process, and then rethrown. The {@code finally} block tears down
 * task manager runners, the leader retrieval service, dispatcher processes, the HA
 * services, the RPC service and the coordination directory, in that order.
 *
 * @throws Exception if any step of the set-up, execution or verification fails
 */
@Test
public void testDispatcherProcessFailure() throws Exception {
    // Only used as the shutdown timeout of the local RPC service in the finally block
    final Time timeout = Time.seconds(30L);
    final File zookeeperStoragePath = temporaryFolder.newFolder();
    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    // The marker-file waits below count PARALLELISM files, so the slot total must match
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);
    // Job managers
    final DispatcherProcess[] dispatcherProcesses = new DispatcherProcess[numberOfJobManagers];
    // Task managers
    TaskManagerRunner[] taskManagerRunners = new TaskManagerRunner[numberOfTaskManagers];
    // Declared outside the try block so that the finally block can release them
    HighAvailabilityServices highAvailabilityServices = null;
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;
    // Cluster config
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeper.getConnectString(), zookeeperStoragePath.getPath());
    // Task manager configuration
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
    // NOTE(review): literal 2 duplicates numberOfSlotsPerTaskManager above — keep them in sync
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    TaskExecutorResourceUtils.adjustForLocalExecution(config);
    // RPC service used by this test to connect to the leading dispatcher
    final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
    try {
        // Single time budget shared by every blocking wait in this test
        final Deadline deadline = Deadline.fromNow(TEST_TIMEOUT);
        // Coordination directory
        coordinateTempDir = temporaryFolder.newFolder();
        // Start first process
        dispatcherProcesses[0] = new DispatcherProcess(0, config);
        dispatcherProcesses[0].startProcess();
        highAvailabilityServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config, TestingUtils.defaultExecutor(), NoOpFatalErrorHandler.INSTANCE);
        final PluginManager pluginManager = PluginUtils.createPluginManagerFromRootFolder(config);
        // Start the task manager process
        for (int i = 0; i < numberOfTaskManagers; i++) {
            taskManagerRunners[i] = new TaskManagerRunner(config, pluginManager, TaskManagerRunner::createTaskExecutorService);
            taskManagerRunners[i].start();
        }
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
        leaderRetrievalService.start(leaderListener);
        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture = rpcService.connect(leaderAddress, DispatcherId.fromUuid(leaderId), DispatcherGateway.class);
        final DispatcherGateway dispatcherGateway = dispatcherGatewayFuture.get();
        // Wait for all task managers to connect to the leading job manager
        waitForTaskManagers(numberOfTaskManagers, dispatcherGateway, deadline.timeLeft());
        // Final copy so the anonymous thread class below can capture the directory
        final File coordinateDirClosure = coordinateTempDir;
        // One-element array used as a mutable holder for an error raised in the trigger thread
        final Throwable[] errorRef = new Throwable[1];
        // we trigger program execution in a separate thread
        Thread programTrigger = new Thread("Program Trigger") {

            @Override
            public void run() {
                try {
                    testJobManagerFailure(zooKeeper.getConnectString(), coordinateDirClosure, zookeeperStoragePath);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef[0] = t;
                }
            }
        };
        // start the test program
        programTrigger.start();
        // wait until all marker files are in place, indicating that all tasks have started
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
        // Kill one of the job managers and trigger recovery
        dispatcherProcesses[0].destroy();
        dispatcherProcesses[1] = new DispatcherProcess(1, config);
        dispatcherProcesses[1].startProcess();
        // we create the marker file which signals the program functions tasks that they can
        // complete
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(new File(coordinateTempDir, PROCEED_MARKER_FILE));
        programTrigger.join(deadline.timeLeft().toMillis());
        // NOTE(review): the comment below says the trigger thread is not waited for, yet the
        // join() above does wait for it (bounded by the deadline) — confirm which is intended.
        // We wait for the finish marker file. We don't wait for the program trigger, because
        // we submit in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
        // check that the program really finished
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // check whether the program encountered an error
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        for (DispatcherProcess p : dispatcherProcesses) {
            if (p != null) {
                p.printProcessLog();
            }
        }
        throw t;
    } finally {
        // Teardown; each resource is null-checked because set-up may have failed part-way
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (taskManagerRunners[i] != null) {
                taskManagerRunners[i].close();
            }
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        for (DispatcherProcess dispatcherProcess : dispatcherProcesses) {
            if (dispatcherProcess != null) {
                dispatcherProcess.destroy();
            }
        }
        if (highAvailabilityServices != null) {
            highAvailabilityServices.closeAndCleanupAllData();
        }
        RpcUtils.terminateRpcService(rpcService, timeout);
        // Delete coordination directory
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // Best-effort cleanup: a failed delete must not mask the test result
            }
        }
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) Deadline(org.apache.flink.api.common.time.Deadline) Time(org.apache.flink.api.common.time.Time) DispatcherProcess(org.apache.flink.runtime.testutils.DispatcherProcess) DispatcherGateway(org.apache.flink.runtime.dispatcher.DispatcherGateway) PluginManager(org.apache.flink.core.plugin.PluginManager) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) TaskManagerRunner(org.apache.flink.runtime.taskexecutor.TaskManagerRunner) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) RpcService(org.apache.flink.runtime.rpc.RpcService) UUID(java.util.UUID) File(java.io.File) Test(org.junit.Test)

Example 13 with RpcService

use of org.apache.flink.runtime.rpc.RpcService in project beam by apache.

the class RemoteMiniClusterImpl method createLocalRpcService.

/**
 * Builds and starts a remote-capable {@link RpcService} bound to
 * {@code jobManagerBindAddress} on port {@code 0}, and records the port reported by the
 * started service in {@code this.port}.
 *
 * @param configuration configuration handed to the RPC service builder
 * @return the started RPC service
 * @throws Exception if the service cannot be created or started
 */
@Override
protected RpcService createLocalRpcService(Configuration configuration) throws Exception {
    // Enable remote connections to the mini cluster which are disabled by default
    final RpcService remoteRpcService =
        AkkaRpcServiceUtils.remoteServiceBuilder(configuration, jobManagerBindAddress, String.valueOf(0))
            .withBindAddress(jobManagerBindAddress)
            .withBindPort(0)
            .withCustomConfig(AkkaUtils.testDispatcherConfig())
            .createAndStart();

    // Remember the port the started service reports
    this.port = remoteRpcService.getPort();
    return remoteRpcService;
}
Also used : RpcService(org.apache.flink.runtime.rpc.RpcService)

Example 14 with RpcService

use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.

the class DefaultDispatcherGatewayServiceFactory method create.

/**
 * Creates a {@link Dispatcher} through the configured dispatcher factory, starts it, and
 * wraps it in a {@code DefaultDispatcherGatewayService}.
 *
 * @param fencingToken fencing token passed to the dispatcher factory
 * @param recoveredJobs job graphs passed to the dispatcher factory
 * @param recoveredDirtyJobResults job results passed to the dispatcher factory
 * @param jobGraphWriter writer combined with the partial dispatcher services
 * @param jobResultStore store combined with the partial dispatcher services
 * @return gateway service wrapping the started dispatcher
 * @throws FlinkRuntimeException if the dispatcher cannot be created
 */
@Override
public AbstractDispatcherLeaderProcess.DispatcherGatewayService create(DispatcherId fencingToken, Collection<JobGraph> recoveredJobs, Collection<JobResult> recoveredDirtyJobResults, JobGraphWriter jobGraphWriter, JobResultStore jobResultStore) {
    final Dispatcher createdDispatcher;
    try {
        // Built inside the try block so that a failure here is wrapped like any other
        final PartialDispatcherServicesWithJobPersistenceComponents dispatcherServices =
            PartialDispatcherServicesWithJobPersistenceComponents.from(partialDispatcherServices, jobGraphWriter, jobResultStore);

        createdDispatcher =
            dispatcherFactory.createDispatcher(
                rpcService,
                fencingToken,
                recoveredJobs,
                recoveredDirtyJobResults,
                (gateway, executor, fatalErrorHandler) -> new NoOpDispatcherBootstrap(),
                dispatcherServices);
    } catch (Exception cause) {
        throw new FlinkRuntimeException("Could not create the Dispatcher rpc endpoint.", cause);
    }

    // Started outside the try block: start-up failures are not wrapped
    createdDispatcher.start();
    return DefaultDispatcherGatewayService.from(createdDispatcher);
}
Also used : DispatcherId(org.apache.flink.runtime.dispatcher.DispatcherId) Dispatcher(org.apache.flink.runtime.dispatcher.Dispatcher) PartialDispatcherServices(org.apache.flink.runtime.dispatcher.PartialDispatcherServices) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) PartialDispatcherServicesWithJobPersistenceComponents(org.apache.flink.runtime.dispatcher.PartialDispatcherServicesWithJobPersistenceComponents) Collection(java.util.Collection) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobResult(org.apache.flink.runtime.jobmaster.JobResult) RpcService(org.apache.flink.runtime.rpc.RpcService) NoOpDispatcherBootstrap(org.apache.flink.runtime.dispatcher.NoOpDispatcherBootstrap) DispatcherFactory(org.apache.flink.runtime.dispatcher.DispatcherFactory) JobResultStore(org.apache.flink.runtime.highavailability.JobResultStore) JobGraphWriter(org.apache.flink.runtime.jobmanager.JobGraphWriter)

Example 15 with RpcService

use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.

the class RetryingRegistrationTest method testRetryOnError.

/**
 * Registers a gateway whose first registration attempt fails and whose second attempt
 * succeeds, then asserts that the registration completes with the expected correlation
 * id and that at least {@code TestRetryingRegistration.DELAY_ON_ERROR} milliseconds
 * elapsed between starting the registration and obtaining its result.
 */
@Test
@SuppressWarnings("unchecked")
public void testRetryOnError() throws Exception {
    final String expectedCorrelationId = "Petit a petit, l'oiseau fait son nid";
    final String gatewayAddress = "<test-address>";
    final UUID leaderSessionId = UUID.randomUUID();

    // Scripted replies handed out one per registration call: a failure, then a success
    final Queue<CompletableFuture<RegistrationResponse>> scriptedReplies = new ArrayDeque<>(2);
    scriptedReplies.add(FutureUtils.completedExceptionally(new Exception("test exception")));
    scriptedReplies.add(CompletableFuture.completedFuture(new TestRegistrationSuccess(expectedCorrelationId)));

    TestRegistrationGateway scriptedGateway =
        DefaultTestRegistrationGateway.newBuilder()
            .setRegistrationFunction((ignoredLeaderId, ignoredTimeout) -> scriptedReplies.poll())
            .build();
    rpcService.registerGateway(gatewayAddress, scriptedGateway);

    TestRetryingRegistration retryingRegistration = new TestRetryingRegistration(rpcService, gatewayAddress, leaderSessionId);

    final long startNanos = System.nanoTime();
    retryingRegistration.startRegistration();
    CompletableFuture<RetryingRegistration.RetryingRegistrationResult<TestRegistrationGateway, TestRegistrationSuccess, TestRegistrationRejection>> resultFuture = retryingRegistration.getFuture();
    RetryingRegistration.RetryingRegistrationResult<TestRegistrationGateway, TestRegistrationSuccess, TestRegistrationRejection> registrationResult = resultFuture.get(10, TimeUnit.SECONDS);
    final long endNanos = System.nanoTime();

    // Whole milliseconds between start and completion (fractional part truncated)
    final long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(endNanos - startNanos);

    assertEquals(expectedCorrelationId, registrationResult.getSuccess().getCorrelationId());
    // validate that some retry-delay / back-off behavior happened
    assertTrue("retries did not properly back off", elapsedMillis >= TestRetryingRegistration.DELAY_ON_ERROR);
}
Also used : FlinkException(org.apache.flink.util.FlinkException) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) LoggerFactory(org.slf4j.LoggerFactory) ScheduledExecutorServiceAdapter(org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter) TimeoutException(java.util.concurrent.TimeoutException) CompletableFuture(java.util.concurrent.CompletableFuture) Assert.assertThat(org.junit.Assert.assertThat) InvocationOnMock(org.mockito.invocation.InvocationOnMock) RpcService(org.apache.flink.runtime.rpc.RpcService) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) After(org.junit.After) TestLogger(org.apache.flink.util.TestLogger) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Assert.fail(org.junit.Assert.fail) Mockito.anyLong(org.mockito.Mockito.anyLong) Mockito.anyString(org.mockito.Mockito.anyString) Before(org.junit.Before) Matchers.lessThanOrEqualTo(org.hamcrest.Matchers.lessThanOrEqualTo) Assert.assertNotNull(org.junit.Assert.assertNotNull) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) UUID(java.util.UUID) Mockito.when(org.mockito.Mockito.when) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) TestingUtils(org.apache.flink.testutils.TestingUtils) Matchers.is(org.hamcrest.Matchers.is) Queue(java.util.Queue) ArrayDeque(java.util.ArrayDeque) Mockito.any(org.mockito.Mockito.any) Assert.assertEquals(org.junit.Assert.assertEquals) Mockito.mock(org.mockito.Mockito.mock)

Aggregations

RpcService (org.apache.flink.runtime.rpc.RpcService)25 Test (org.junit.Test)15 Configuration (org.apache.flink.configuration.Configuration)13 HighAvailabilityServices (org.apache.flink.runtime.highavailability.HighAvailabilityServices)9 ExecutionException (java.util.concurrent.ExecutionException)8 TestingRpcService (org.apache.flink.runtime.rpc.TestingRpcService)7 UUID (java.util.UUID)6 CompletableFuture (java.util.concurrent.CompletableFuture)6 Before (org.junit.Before)6 FlinkException (org.apache.flink.util.FlinkException)5 TestLogger (org.apache.flink.util.TestLogger)5 After (org.junit.After)5 Assert.assertThat (org.junit.Assert.assertThat)5 Assert.fail (org.junit.Assert.fail)5 LoggerFactory (org.slf4j.LoggerFactory)5 IOException (java.io.IOException)4 TimeUnit (java.util.concurrent.TimeUnit)4 HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)4 Mockito.anyString (org.mockito.Mockito.anyString)4 InetAddress (java.net.InetAddress)3