Use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.
From class TaskManagerRunnerConfigurationTest, method testTaskManagerRpcServiceShouldBindToHostnameAddress.
@Test
public void testTaskManagerRpcServiceShouldBindToHostnameAddress() throws Exception {
    final Configuration config = createFlinkConfigWithHostBindPolicy(HostBindPolicy.NAME);
    final HighAvailabilityServices highAvailabilityServices = createHighAvailabilityServices(config);

    RpcService taskManagerRpcService = null;
    try {
        taskManagerRpcService = TaskManagerRunner.createRpcService(config, highAvailabilityServices, RPC_SYSTEM);
        assertThat(taskManagerRpcService.getAddress(), not(isEmptyOrNullString()));
    } finally {
        maybeCloseRpcService(taskManagerRpcService);
        highAvailabilityServices.closeAndCleanupAllData();
    }
}
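The maybeCloseRpcService helper used in the finally block is not shown on this page. A minimal sketch of what such a helper could look like, assuming RpcService.stopService() returns a CompletableFuture<Void> (it does in Flink); the 30-second timeout is illustrative:

// Sketch only: the name matches the test above, but the body is an assumption.
// Requires javax.annotation.Nullable and java.util.concurrent.TimeUnit.
private static void maybeCloseRpcService(@Nullable RpcService rpcService) throws Exception {
    if (rpcService != null) {
        // Bound the wait so a hanging shutdown fails the test instead of blocking forever.
        rpcService.stopService().get(30L, TimeUnit.SECONDS);
    }
}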
Use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.
From class JobManagerHAProcessFailureRecoveryITCase, method testDispatcherProcessFailure.
@Test
public void testDispatcherProcessFailure() throws Exception {
    final Time timeout = Time.seconds(30L);
    final File zookeeperStoragePath = temporaryFolder.newFolder();

    // Config
    final int numberOfJobManagers = 2;
    final int numberOfTaskManagers = 2;
    final int numberOfSlotsPerTaskManager = 2;
    assertEquals(PARALLELISM, numberOfTaskManagers * numberOfSlotsPerTaskManager);

    // Job managers
    final DispatcherProcess[] dispatcherProcesses = new DispatcherProcess[numberOfJobManagers];
    // Task managers
    TaskManagerRunner[] taskManagerRunners = new TaskManagerRunner[numberOfTaskManagers];
    HighAvailabilityServices highAvailabilityServices = null;
    LeaderRetrievalService leaderRetrievalService = null;
    // Coordination between the processes goes through a directory
    File coordinateTempDir = null;

    // Cluster config
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
            zooKeeper.getConnectString(), zookeeperStoragePath.getPath());
    // Task manager configuration
    config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
    config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
    config.set(NettyShuffleEnvironmentOptions.NETWORK_SORT_SHUFFLE_MIN_BUFFERS, 16);
    config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
    config.set(TaskManagerOptions.CPU_CORES, 1.0);
    TaskExecutorResourceUtils.adjustForLocalExecution(config);

    final RpcService rpcService =
            RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
    try {
        final Deadline deadline = Deadline.fromNow(TEST_TIMEOUT);
        // Coordination directory
        coordinateTempDir = temporaryFolder.newFolder();
        // Start first process
        dispatcherProcesses[0] = new DispatcherProcess(0, config);
        dispatcherProcesses[0].startProcess();
        highAvailabilityServices =
                HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(
                        config, TestingUtils.defaultExecutor(), NoOpFatalErrorHandler.INSTANCE);
        final PluginManager pluginManager = PluginUtils.createPluginManagerFromRootFolder(config);
        // Start the task manager processes
        for (int i = 0; i < numberOfTaskManagers; i++) {
            taskManagerRunners[i] = new TaskManagerRunner(
                    config, pluginManager, TaskManagerRunner::createTaskExecutorService);
            taskManagerRunners[i].start();
        }
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
        leaderRetrievalService.start(leaderListener);
        // Initial submission
        leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        final CompletableFuture<DispatcherGateway> dispatcherGatewayFuture =
                rpcService.connect(leaderAddress, DispatcherId.fromUuid(leaderId), DispatcherGateway.class);
        final DispatcherGateway dispatcherGateway = dispatcherGatewayFuture.get();
        // Wait for all task managers to connect to the leading job manager
        waitForTaskManagers(numberOfTaskManagers, dispatcherGateway, deadline.timeLeft());
        final File coordinateDirClosure = coordinateTempDir;
        final Throwable[] errorRef = new Throwable[1];
        // We trigger program execution in a separate thread.
        Thread programTrigger = new Thread("Program Trigger") {
            @Override
            public void run() {
                try {
                    testJobManagerFailure(zooKeeper.getConnectString(), coordinateDirClosure, zookeeperStoragePath);
                } catch (Throwable t) {
                    t.printStackTrace();
                    errorRef[0] = t;
                }
            }
        };
        // Start the test program.
        programTrigger.start();
        // Wait until all marker files are in place, indicating that all tasks have started.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
                coordinateTempDir, READY_MARKER_FILE_PREFIX, PARALLELISM, deadline.timeLeft().toMillis());
        // Kill one of the job managers and trigger recovery.
        dispatcherProcesses[0].destroy();
        dispatcherProcesses[1] = new DispatcherProcess(1, config);
        dispatcherProcesses[1].startProcess();
        // Create the marker file that signals the program's tasks that they can complete.
        AbstractTaskManagerProcessFailureRecoveryTest.touchFile(
                new File(coordinateTempDir, PROCEED_MARKER_FILE));
        programTrigger.join(deadline.timeLeft().toMillis());
        // We wait for the finish marker file rather than for the program trigger,
        // because the job is submitted in detached mode.
        AbstractTaskManagerProcessFailureRecoveryTest.waitForMarkerFiles(
                coordinateTempDir, FINISH_MARKER_FILE_PREFIX, 1, deadline.timeLeft().toMillis());
        // Check that the program really finished.
        assertFalse("The program did not finish in time", programTrigger.isAlive());
        // Check whether the program encountered an error.
        if (errorRef[0] != null) {
            Throwable error = errorRef[0];
            error.printStackTrace();
            fail("The program encountered a " + error.getClass().getSimpleName() + " : " + error.getMessage());
        }
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown).
        t.printStackTrace();
        for (DispatcherProcess p : dispatcherProcesses) {
            if (p != null) {
                p.printProcessLog();
            }
        }
        throw t;
    } finally {
        for (int i = 0; i < numberOfTaskManagers; i++) {
            if (taskManagerRunners[i] != null) {
                taskManagerRunners[i].close();
            }
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        for (DispatcherProcess dispatcherProcess : dispatcherProcesses) {
            if (dispatcherProcess != null) {
                dispatcherProcess.destroy();
            }
        }
        if (highAvailabilityServices != null) {
            highAvailabilityServices.closeAndCleanupAllData();
        }
        RpcUtils.terminateRpcService(rpcService, timeout);
        // Delete coordination directory.
        if (coordinateTempDir != null) {
            try {
                FileUtils.deleteDirectory(coordinateTempDir);
            } catch (Throwable ignored) {
                // Best effort; the temporary folder rule cleans up anyway.
            }
        }
    }
}
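The waitForTaskManagers helper referenced above is likewise not shown. A plausible sketch, assuming it polls DispatcherGateway.requestClusterOverview (a real Flink gateway method) until enough task managers have registered; the polling interval and exact structure are assumptions, not the project's code:

// Hypothetical polling loop in the spirit of the test above.
private static void waitForTaskManagers(
        int expectedCount, DispatcherGateway dispatcherGateway, Duration timeLeft)
        throws Exception {
    final Deadline deadline = Deadline.fromNow(timeLeft);
    while (deadline.hasTimeLeft()) {
        final ClusterOverview overview =
                dispatcherGateway.requestClusterOverview(Time.seconds(10L)).get();
        if (overview.getNumTaskManagersConnected() >= expectedCount) {
            return;
        }
        // Not all registered yet; back off briefly before asking again.
        Thread.sleep(50L);
    }
    throw new TimeoutException("Not all task managers registered within the deadline.");
}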
Use of org.apache.flink.runtime.rpc.RpcService in project beam by apache.
From class RemoteMiniClusterImpl, method createLocalRpcService.
@Override
protected RpcService createLocalRpcService(Configuration configuration) throws Exception {
    // Enable remote connections to the mini cluster, which are disabled by default.
    final RpcService rpcService =
            AkkaRpcServiceUtils.remoteServiceBuilder(
                            configuration, jobManagerBindAddress, String.valueOf(0))
                    .withBindAddress(jobManagerBindAddress)
                    .withBindPort(0)
                    .withCustomConfig(AkkaUtils.testDispatcherConfig())
                    .createAndStart();
    this.port = rpcService.getPort();
    return rpcService;
}
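Binding to port 0 delegates port selection to the OS, which is why the chosen port must be read back with rpcService.getPort() after createAndStart(). For contrast, a hedged sketch of the same builder with a fixed port; the host and port values are illustrative, and the API calls are only the ones already used above:

// Same builder API as above, but with a concrete port: no need to read the
// port back after startup, since it is known up front.
final RpcService fixedPortService =
        AkkaRpcServiceUtils.remoteServiceBuilder(configuration, "0.0.0.0", "6123")
                .withBindAddress("0.0.0.0")
                .withBindPort(6123)
                .createAndStart();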
Use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.
From class DefaultDispatcherGatewayServiceFactory, method create.
@Override
public AbstractDispatcherLeaderProcess.DispatcherGatewayService create(
        DispatcherId fencingToken,
        Collection<JobGraph> recoveredJobs,
        Collection<JobResult> recoveredDirtyJobResults,
        JobGraphWriter jobGraphWriter,
        JobResultStore jobResultStore) {
    final Dispatcher dispatcher;
    try {
        dispatcher = dispatcherFactory.createDispatcher(
                rpcService,
                fencingToken,
                recoveredJobs,
                recoveredDirtyJobResults,
                (dispatcherGateway, scheduledExecutor, errorHandler) -> new NoOpDispatcherBootstrap(),
                PartialDispatcherServicesWithJobPersistenceComponents.from(
                        partialDispatcherServices, jobGraphWriter, jobResultStore));
    } catch (Exception e) {
        throw new FlinkRuntimeException("Could not create the Dispatcher rpc endpoint.", e);
    }
    dispatcher.start();
    return DefaultDispatcherGatewayService.from(dispatcher);
}
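A short sketch of how a leader process might consume the returned service; DispatcherGatewayService.getGateway() and DispatcherGateway.listJobs(...) exist in Flink's runtime, but the surrounding wiring here is illustrative, not the project's code:

// Illustrative consumer: obtain the gateway from the freshly created
// service and list the jobs the dispatcher now owns.
AbstractDispatcherLeaderProcess.DispatcherGatewayService service =
        factory.create(fencingToken, recoveredJobs, recoveredDirtyJobResults,
                jobGraphWriter, jobResultStore);
service.getGateway()
        .listJobs(Time.seconds(10L))
        .thenAccept(jobIds -> jobIds.forEach(jobId -> System.out.println("recovered job: " + jobId)));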
Use of org.apache.flink.runtime.rpc.RpcService in project flink by apache.
From class RetryingRegistrationTest, method testRetryOnError.
@Test
@SuppressWarnings("unchecked")
public void testRetryOnError() throws Exception {
    final String testId = "Petit a petit, l'oiseau fait son nid";
    final String testEndpointAddress = "<test-address>";
    final UUID leaderId = UUID.randomUUID();

    // Gateway that first responds to calls with a failure, then with a success.
    final Queue<CompletableFuture<RegistrationResponse>> responses = new ArrayDeque<>(2);
    responses.add(FutureUtils.completedExceptionally(new Exception("test exception")));
    responses.add(CompletableFuture.completedFuture(new TestRegistrationSuccess(testId)));
    TestRegistrationGateway testGateway =
            DefaultTestRegistrationGateway.newBuilder()
                    .setRegistrationFunction((uuid, aLong) -> responses.poll())
                    .build();
    rpcService.registerGateway(testEndpointAddress, testGateway);

    TestRetryingRegistration registration =
            new TestRetryingRegistration(rpcService, testEndpointAddress, leaderId);

    long started = System.nanoTime();
    registration.startRegistration();

    CompletableFuture<RetryingRegistration.RetryingRegistrationResult<TestRegistrationGateway, TestRegistrationSuccess, TestRegistrationRejection>> future =
            registration.getFuture();
    RetryingRegistration.RetryingRegistrationResult<TestRegistrationGateway, TestRegistrationSuccess, TestRegistrationRejection> registrationResponse =
            future.get(10, TimeUnit.SECONDS);

    long finished = System.nanoTime();
    long elapsedMillis = (finished - started) / 1_000_000;

    assertEquals(testId, registrationResponse.getSuccess().getCorrelationId());

    // Validate that some retry-delay / back-off behavior happened.
    assertTrue("retries did not properly back off", elapsedMillis >= TestRetryingRegistration.DELAY_ON_ERROR);
}
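The queue of prepared futures is the core of this test: each registration attempt consumes the next scripted response, so the first attempt fails and the retry succeeds. A self-contained illustration of that pattern with plain CompletableFutures (hypothetical names, no Flink dependencies):

import java.util.ArrayDeque;
import java.util.Queue;
import java.util.concurrent.CompletableFuture;
import java.util.function.Supplier;

public final class ScriptedResponsesExample {
    public static void main(String[] args) {
        // Script the responses up front: failure first, then success.
        Queue<CompletableFuture<String>> responses = new ArrayDeque<>(2);
        CompletableFuture<String> failure = new CompletableFuture<>();
        failure.completeExceptionally(new Exception("test exception"));
        responses.add(failure);
        responses.add(CompletableFuture.completedFuture("registration success"));

        // Each "registration attempt" polls the next scripted response.
        Supplier<CompletableFuture<String>> gateway = responses::poll;

        gateway.get().whenComplete((result, error) ->
                System.out.println("attempt 1: " + (error != null ? "failed: " + error.getMessage() : result)));
        gateway.get().whenComplete((result, error) ->
                System.out.println("attempt 2: " + (error != null ? "failed: " + error.getMessage() : result)));
    }
}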