Use of org.apache.flink.runtime.heartbeat.HeartbeatServices in the Apache Flink project.
From the class JobManagerRunnerMockTest, method setUp.
/**
 * Builds the mocked collaborators and the spied {@link JobManagerRunner} exercised by the tests.
 *
 * <p>A PowerMockito constructor stub is registered so that {@code new JobMaster(...)} with any
 * arguments yields the mocked job manager, and the leader election mock reports leadership.
 *
 * @throws Exception if registering the constructor stub or creating the runner's services fails
 */
@Before
public void setUp() throws Exception {
    // RPC service that only needs to answer with an address.
    final RpcService rpcServiceMock = mock(RpcService.class);
    when(rpcServiceMock.getAddress()).thenReturn("localhost");

    // Mocked job master, wired to its gateway and to the RPC service above.
    jobManager = mock(JobMaster.class);
    jobManagerGateway = mock(JobMasterGateway.class);
    when(jobManager.getSelf()).thenReturn(jobManagerGateway);
    when(jobManager.getRpcService()).thenReturn(rpcServiceMock);
    PowerMockito.whenNew(JobMaster.class).withAnyArguments().thenReturn(jobManager);

    jobCompletion = new TestingOnCompletionActions();

    // Leader election always claims leadership.
    leaderElectionService = mock(LeaderElectionService.class);
    when(leaderElectionService.hasLeadership()).thenReturn(true);

    final SubmittedJobGraphStore jobGraphStoreMock = mock(SubmittedJobGraphStore.class);
    blobStore = mock(BlobStore.class);

    // High-availability services hand out the mocks created above.
    final HighAvailabilityServices highAvailabilityMock = mock(HighAvailabilityServices.class);
    when(highAvailabilityMock.getJobManagerLeaderElectionService(any(JobID.class)))
            .thenReturn(leaderElectionService);
    when(highAvailabilityMock.getSubmittedJobGraphStore()).thenReturn(jobGraphStoreMock);
    when(highAvailabilityMock.createBlobStore()).thenReturn(blobStore);
    when(highAvailabilityMock.getRunningJobsRegistry()).thenReturn(runningJobsRegistry);

    final HeartbeatServices heartbeatMock = mock(HeartbeatServices.class);

    // Constructor arguments, created in the order in which they are passed.
    final ResourceID runnerResourceId = ResourceID.generate();
    final JobGraph singleVertexGraph = new JobGraph("test", new JobVertex("vertex"));
    final Configuration configurationMock = mock(Configuration.class);
    final JobManagerServices jobManagerServices =
            JobManagerServices.fromConfiguration(new Configuration(), highAvailabilityMock);
    final MetricRegistry registry =
            new MetricRegistry(MetricRegistryConfiguration.defaultMetricRegistryConfiguration());

    runner =
            PowerMockito.spy(
                    new JobManagerRunner(
                            runnerResourceId,
                            singleVertexGraph,
                            configurationMock,
                            rpcServiceMock,
                            highAvailabilityMock,
                            heartbeatMock,
                            jobManagerServices,
                            registry,
                            jobCompletion,
                            jobCompletion));
}
Use of org.apache.flink.runtime.heartbeat.HeartbeatServices in the Apache Flink project.
From the class YarnTaskExecutorRunner, method runTaskExecutor.
// ------------------------------------------------------------------------
// Core work method
// ------------------------------------------------------------------------
/**
 * Runs the task executor and blocks until its runner terminates. Must run as a privileged
 * action.
 *
 * @param config the configuration used to create the services; the task manager hostname key
 *     is set on it when the node id environment variable is present
 * @return {@code 0} after the runner's termination future has completed and shutdown has run,
 *     or {@code INIT_ERROR_EXIT_CODE} when any {@link Throwable} is raised along the way
 */
protected int runTaskExecutor(Configuration config) {
    try {
        // ---- (1) create common services

        // On YARN the container id serves as the resource id, so it has to be present.
        final String yarnContainerId = ENV.get(YarnFlinkResourceManager.ENV_FLINK_CONTAINER_ID);
        Preconditions.checkArgument(
                yarnContainerId != null,
                "ContainerId variable %s not set",
                YarnFlinkResourceManager.ENV_FLINK_CONTAINER_ID);

        // Prefer the hostname that was handed over through the environment, if there is one.
        final String nodeHostname = ENV.get(YarnResourceManager.ENV_FLINK_NODE_ID);
        if (nodeHostname != null) {
            config.setString(ConfigConstants.TASK_MANAGER_HOSTNAME_KEY, nodeHostname);
        }

        final ResourceID yarnResourceId = new ResourceID(yarnContainerId);
        LOG.info("YARN assigned resource id {} for the task executor.", yarnResourceId.toString());

        haServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config);
        final HeartbeatServices heartbeats = HeartbeatServices.fromConfiguration(config);
        metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));

        // ---- (2) init task manager runner -------
        taskExecutorRpcService = TaskManagerRunner.createRpcService(config, haServices);
        taskManagerRunner =
                new TaskManagerRunner(
                        config,
                        yarnResourceId,
                        taskExecutorRpcService,
                        haServices,
                        heartbeats,
                        metricRegistry);

        // ---- (3) start the task manager runner
        taskManagerRunner.start();
        LOG.debug("YARN task executor started");

        // Block here until the runner is done or the process is killed.
        taskManagerRunner.getTerminationFuture().get();
        LOG.info("YARN task manager runner finished");

        shutdown();
        return 0;
    } catch (Throwable failure) {
        // Log whatever went wrong before cleaning up.
        LOG.error("YARN task executor initialization failed", failure);
        shutdown();
        return INIT_ERROR_EXIT_CODE;
    }
}
Use of org.apache.flink.runtime.heartbeat.HeartbeatServices in the Apache Flink project.
From the class TaskManagerRunner, method startTaskManagerRunnerServices.
/**
 * Creates the services this runner depends on and finally the task executor service itself.
 * The whole sequence runs while holding {@code lock}.
 *
 * <p>The statements build on each other (RPC system, executor, high-availability services,
 * RPC service, resource id, working directory, metrics, blob cache, task executor), so their
 * order must be kept.
 *
 * @throws Exception if any of the invoked factories or start methods fails
 */
private void startTaskManagerRunnerServices() throws Exception {
    synchronized (lock) {
        rpcSystem = RpcSystem.load(configuration);

        executor =
                Executors.newScheduledThreadPool(
                        Hardware.getNumberCPUCores(),
                        new ExecutorThreadFactory("taskmanager-future"));

        highAvailabilityServices =
                HighAvailabilityServicesUtils.createHighAvailabilityServices(
                        configuration,
                        executor,
                        AddressResolution.NO_ADDRESS_RESOLUTION,
                        rpcSystem,
                        this);

        final String jmxServerPort = configuration.getString(JMXServerOptions.JMX_SERVER_PORT);
        JMXService.startInstance(jmxServerPort);

        rpcService = createRpcService(configuration, highAvailabilityServices, rpcSystem);

        // The resource id is derived from the RPC endpoint's address and port.
        resourceId =
                getTaskManagerResourceID(
                        configuration, rpcService.getAddress(), rpcService.getPort());

        workingDirectory =
                ClusterEntrypointUtils.createTaskManagerWorkingDirectory(
                        configuration, resourceId);
        LOG.info("Using working directory: {}", workingDirectory);

        final HeartbeatServices heartbeats = HeartbeatServices.fromConfiguration(configuration);

        final MetricRegistryConfiguration registryConfiguration =
                MetricRegistryConfiguration.fromConfiguration(
                        configuration, rpcSystem.getMaximumMessageSizeInBytes(configuration));
        metricRegistry =
                new MetricRegistryImpl(
                        registryConfiguration,
                        ReporterSetup.fromConfiguration(configuration, pluginManager));

        // The metric query service runs on its own RPC service.
        final String rpcAddress = rpcService.getAddress();
        final String bindHost = configuration.getString(TaskManagerOptions.BIND_HOST);
        final RpcService metricsRpcService =
                MetricUtils.startRemoteMetricsRpcService(
                        configuration, rpcAddress, bindHost, rpcSystem);
        metricRegistry.startQueryService(metricsRpcService, resourceId.unwrap());

        blobCacheService =
                BlobUtils.createBlobCacheService(
                        configuration,
                        Reference.borrowed(workingDirectory.unwrap().getBlobStorageDirectory()),
                        highAvailabilityServices.createBlobStore(),
                        null);

        final ExternalResourceInfoProvider resourceInfoProvider =
                ExternalResourceUtils.createStaticExternalResourceInfoProviderFromConfig(
                        configuration, pluginManager);

        taskExecutorService =
                taskExecutorServiceFactory.createTaskExecutor(
                        configuration,
                        resourceId.unwrap(),
                        rpcService,
                        highAvailabilityServices,
                        heartbeats,
                        metricRegistry,
                        blobCacheService,
                        false,
                        resourceInfoProvider,
                        workingDirectory.unwrap(),
                        this);

        handleUnexpectedTaskExecutorServiceTermination();

        // The termination future is mapped to a value-less future before being passed on.
        MemoryLogger.startIfConfigured(
                LOG, configuration, terminationFuture.thenAccept(ignored -> {}));
    }
}
Use of org.apache.flink.runtime.heartbeat.HeartbeatServices in the Apache Flink project.
From the class JobMasterTest, method runHeartbeatTest.
/**
 * Starts a job master with the given heartbeat services, registers a testing task executor at
 * it and asserts that the task executor gateway gets disconnected for the job under test.
 *
 * @param testingTaskExecutorGatewayBuilder builder for the task executor gateway; a
 *     disconnect-job-manager consumer is installed on it before the gateway is created
 * @param heartbeatServices heartbeat services handed to the job master
 * @throws Exception if starting, registering, waiting or terminating fails
 */
private void runHeartbeatTest(TestingTaskExecutorGatewayBuilder testingTaskExecutorGatewayBuilder, HeartbeatServices heartbeatServices) throws Exception {
    // Completed with the job id once the gateway is asked to disconnect from the job manager.
    final CompletableFuture<JobID> disconnectFuture = new CompletableFuture<>();
    final UnresolvedTaskManagerLocation tmLocation = new LocalUnresolvedTaskManagerLocation();

    final TestingTaskExecutorGateway taskExecutorGateway =
            testingTaskExecutorGatewayBuilder
                    .setDisconnectJobManagerConsumer(
                            (jobId, cause) -> disconnectFuture.complete(jobId))
                    .createTestingTaskExecutorGateway();
    rpcService.registerGateway(taskExecutorGateway.getAddress(), taskExecutorGateway);

    final JobMaster jobMaster =
            new JobMasterBuilder(jobGraph, rpcService)
                    .withResourceId(jmResourceId)
                    .withConfiguration(configuration)
                    .withHighAvailabilityServices(haServices)
                    .withHeartbeatServices(heartbeatServices)
                    .createJobMaster();
    jobMaster.start();

    try {
        final JobMasterGateway gateway = jobMaster.getSelfGateway(JobMasterGateway.class);

        // Registering the task manager makes the job master monitor it as a heartbeat target
        // and schedule heartbeat requests at the configured interval.
        final JobID registeredJobId = jobGraph.getJobID();
        final TaskManagerRegistrationInformation registrationInfo =
                TaskManagerRegistrationInformation.create(
                        taskExecutorGateway.getAddress(), tmLocation, TestingUtils.zeroUUID());

        // Block until the registration has completed.
        gateway.registerTaskManager(registeredJobId, registrationInfo, testingTimeout).get();

        final JobID disconnectedJobId =
                disconnectFuture.get(testingTimeout.toMilliseconds(), TimeUnit.MILLISECONDS);
        assertThat(disconnectedJobId, equalTo(jobGraph.getJobID()));
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
    }
}
Use of org.apache.flink.runtime.heartbeat.HeartbeatServices in the Apache Flink project.
From the class TaskExecutorTest, method testMaximumRegistrationDurationAfterConnectionLoss.
/**
 * Checks that a {@link RegistrationTimeoutException} reaches the fatal error handler when only
 * the first registration at the resource manager succeeds and every later attempt is rejected,
 * with the registration timeout set to 100 ms.
 *
 * @throws Exception if the task executor cannot be set up or one of the awaited futures fails
 */
@Test
public void testMaximumRegistrationDurationAfterConnectionLoss() throws Exception {
    configuration.set(TaskManagerOptions.REGISTRATION_TIMEOUT, TimeUtils.parseDuration("100 ms"));

    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskManagerServices services =
            new TaskManagerServicesBuilder().setTaskSlotTable(slotTable).build();

    final TaskExecutor taskExecutor =
            createTaskExecutor(services, new HeartbeatServices(10L, 10L));
    taskExecutor.start();

    // Completed by the first registration attempt; the latch fires on any later attempt.
    final CompletableFuture<ResourceID> firstRegistration = new CompletableFuture<>();
    final OneShotLatch laterRegistrationSeen = new OneShotLatch();

    try {
        final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
        rmGateway.setRegisterTaskExecutorFunction(
                registration -> {
                    // complete(...) returns false once the future has already been completed,
                    // i.e. for every attempt after the first one.
                    if (!firstRegistration.complete(registration.getResourceId())) {
                        laterRegistrationSeen.trigger();
                        return CompletableFuture.completedFuture(
                                new Failure(
                                        new FlinkException(
                                                "Only the first registration should succeed.")));
                    }
                    return createRegistrationResponse(rmGateway);
                });

        rpc.registerGateway(rmGateway.getAddress(), rmGateway);
        resourceManagerLeaderRetriever.notifyListener(rmGateway.getAddress(), UUID.randomUUID());

        final ResourceID registeredId = firstRegistration.get();
        assertThat(
                registeredId,
                equalTo(services.getUnresolvedTaskManagerLocation().getResourceID()));

        laterRegistrationSeen.await();

        final Throwable fatalError = testingFatalErrorHandler.getErrorFuture().get();
        assertThat(fatalError, is(notNullValue()));
        assertThat(
                ExceptionUtils.stripExecutionException(fatalError),
                instanceOf(RegistrationTimeoutException.class));

        // The error was expected, so reset the handler.
        testingFatalErrorHandler.clearError();
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
Aggregations