use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class DefaultDispatcherResourceManagerComponentFactory method create.
@Override
public DispatcherResourceManagerComponent create(Configuration configuration, ResourceID resourceId, Executor ioExecutor, RpcService rpcService, HighAvailabilityServices highAvailabilityServices, BlobServer blobServer, HeartbeatServices heartbeatServices, MetricRegistry metricRegistry, ExecutionGraphInfoStore executionGraphInfoStore, MetricQueryServiceRetriever metricQueryServiceRetriever, FatalErrorHandler fatalErrorHandler) throws Exception {
LeaderRetrievalService dispatcherLeaderRetrievalService = null;
LeaderRetrievalService resourceManagerRetrievalService = null;
WebMonitorEndpoint<?> webMonitorEndpoint = null;
ResourceManagerService resourceManagerService = null;
DispatcherRunner dispatcherRunner = null;
try {
dispatcherLeaderRetrievalService = highAvailabilityServices.getDispatcherLeaderRetriever();
resourceManagerRetrievalService = highAvailabilityServices.getResourceManagerLeaderRetriever();
final LeaderGatewayRetriever<DispatcherGateway> dispatcherGatewayRetriever = new RpcGatewayRetriever<>(rpcService, DispatcherGateway.class, DispatcherId::fromUuid, new ExponentialBackoffRetryStrategy(12, Duration.ofMillis(10), Duration.ofMillis(50)));
final LeaderGatewayRetriever<ResourceManagerGateway> resourceManagerGatewayRetriever = new RpcGatewayRetriever<>(rpcService, ResourceManagerGateway.class, ResourceManagerId::fromUuid, new ExponentialBackoffRetryStrategy(12, Duration.ofMillis(10), Duration.ofMillis(50)));
final ScheduledExecutorService executor = WebMonitorEndpoint.createExecutorService(configuration.getInteger(RestOptions.SERVER_NUM_THREADS), configuration.getInteger(RestOptions.SERVER_THREAD_PRIORITY), "DispatcherRestEndpoint");
final long updateInterval = configuration.getLong(MetricOptions.METRIC_FETCHER_UPDATE_INTERVAL);
final MetricFetcher metricFetcher = updateInterval == 0 ? VoidMetricFetcher.INSTANCE : MetricFetcherImpl.fromConfiguration(configuration, metricQueryServiceRetriever, dispatcherGatewayRetriever, executor);
webMonitorEndpoint = restEndpointFactory.createRestEndpoint(configuration, dispatcherGatewayRetriever, resourceManagerGatewayRetriever, blobServer, executor, metricFetcher, highAvailabilityServices.getClusterRestEndpointLeaderElectionService(), fatalErrorHandler);
log.debug("Starting Dispatcher REST endpoint.");
webMonitorEndpoint.start();
final String hostname = RpcUtils.getHostname(rpcService);
resourceManagerService = ResourceManagerServiceImpl.create(resourceManagerFactory, configuration, resourceId, rpcService, highAvailabilityServices, heartbeatServices, fatalErrorHandler, new ClusterInformation(hostname, blobServer.getPort()), webMonitorEndpoint.getRestBaseUrl(), metricRegistry, hostname, ioExecutor);
final HistoryServerArchivist historyServerArchivist = HistoryServerArchivist.createHistoryServerArchivist(configuration, webMonitorEndpoint, ioExecutor);
final DispatcherOperationCaches dispatcherOperationCaches = new DispatcherOperationCaches(configuration.get(RestOptions.ASYNC_OPERATION_STORE_DURATION));
final PartialDispatcherServices partialDispatcherServices = new PartialDispatcherServices(configuration, highAvailabilityServices, resourceManagerGatewayRetriever, blobServer, heartbeatServices, () -> JobManagerMetricGroup.createJobManagerMetricGroup(metricRegistry, hostname), executionGraphInfoStore, fatalErrorHandler, historyServerArchivist, metricRegistry.getMetricQueryServiceGatewayRpcAddress(), ioExecutor, dispatcherOperationCaches);
log.debug("Starting Dispatcher.");
dispatcherRunner = dispatcherRunnerFactory.createDispatcherRunner(highAvailabilityServices.getDispatcherLeaderElectionService(), fatalErrorHandler, new HaServicesJobPersistenceComponentFactory(highAvailabilityServices), ioExecutor, rpcService, partialDispatcherServices);
log.debug("Starting ResourceManagerService.");
resourceManagerService.start();
resourceManagerRetrievalService.start(resourceManagerGatewayRetriever);
dispatcherLeaderRetrievalService.start(dispatcherGatewayRetriever);
return new DispatcherResourceManagerComponent(dispatcherRunner, resourceManagerService, dispatcherLeaderRetrievalService, resourceManagerRetrievalService, webMonitorEndpoint, fatalErrorHandler, dispatcherOperationCaches);
} catch (Exception exception) {
// clean up all started components
if (dispatcherLeaderRetrievalService != null) {
try {
dispatcherLeaderRetrievalService.stop();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
}
if (resourceManagerRetrievalService != null) {
try {
resourceManagerRetrievalService.stop();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
}
final Collection<CompletableFuture<Void>> terminationFutures = new ArrayList<>(3);
if (webMonitorEndpoint != null) {
terminationFutures.add(webMonitorEndpoint.closeAsync());
}
if (resourceManagerService != null) {
terminationFutures.add(resourceManagerService.closeAsync());
}
if (dispatcherRunner != null) {
terminationFutures.add(dispatcherRunner.closeAsync());
}
final FutureUtils.ConjunctFuture<Void> terminationFuture = FutureUtils.completeAll(terminationFutures);
try {
terminationFuture.get();
} catch (Exception e) {
exception = ExceptionUtils.firstOrSuppressed(e, exception);
}
throw new FlinkException("Could not create the DispatcherResourceManagerComponent.", exception);
}
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class JobMasterTest method testHeartbeatTimeoutWithResourceManager.
@Test
public void testHeartbeatTimeoutWithResourceManager() throws Exception {
final String resourceManagerAddress = "rm";
final ResourceManagerId resourceManagerId = ResourceManagerId.generate();
final ResourceID rmResourceId = new ResourceID(resourceManagerAddress);
final TestingResourceManagerGateway resourceManagerGateway = new TestingResourceManagerGateway(resourceManagerId, rmResourceId, resourceManagerAddress, "localhost");
final CompletableFuture<Tuple3<JobMasterId, ResourceID, JobID>> jobManagerRegistrationFuture = new CompletableFuture<>();
final CompletableFuture<JobID> disconnectedJobManagerFuture = new CompletableFuture<>();
final CountDownLatch registrationAttempts = new CountDownLatch(2);
resourceManagerGateway.setRegisterJobManagerFunction((jobMasterId, resourceID, s, jobID) -> {
jobManagerRegistrationFuture.complete(Tuple3.of(jobMasterId, resourceID, jobID));
registrationAttempts.countDown();
return CompletableFuture.completedFuture(resourceManagerGateway.getJobMasterRegistrationSuccess());
});
resourceManagerGateway.setDisconnectJobManagerConsumer(tuple -> disconnectedJobManagerFuture.complete(tuple.f0));
rpcService.registerGateway(resourceManagerAddress, resourceManagerGateway);
final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).withJobMasterId(jobMasterId).withResourceId(jmResourceId).withConfiguration(configuration).withHighAvailabilityServices(haServices).withHeartbeatServices(fastHeartbeatServices).createJobMaster();
jobMaster.start();
try {
// define a leader and see that a registration happens
rmLeaderRetrievalService.notifyListener(resourceManagerAddress, resourceManagerId.toUUID());
// register job manager success will trigger monitor heartbeat target between jm and rm
final Tuple3<JobMasterId, ResourceID, JobID> registrationInformation = jobManagerRegistrationFuture.get(testingTimeout.toMilliseconds(), TimeUnit.MILLISECONDS);
assertThat(registrationInformation.f0, Matchers.equalTo(jobMasterId));
assertThat(registrationInformation.f1, Matchers.equalTo(jmResourceId));
assertThat(registrationInformation.f2, Matchers.equalTo(jobGraph.getJobID()));
final JobID disconnectedJobManager = disconnectedJobManagerFuture.get(testingTimeout.toMilliseconds(), TimeUnit.MILLISECONDS);
// heartbeat timeout should trigger disconnect JobManager from ResourceManager
assertThat(disconnectedJobManager, Matchers.equalTo(jobGraph.getJobID()));
// the JobMaster should try to reconnect to the RM
registrationAttempts.await();
} finally {
RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
}
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class JobMasterTest method testReconnectionAfterDisconnect.
/**
* Tests that we continue reconnecting to the latest known RM after a disconnection message.
*/
@Test
public void testReconnectionAfterDisconnect() throws Exception {
final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).withJobMasterId(jobMasterId).withConfiguration(configuration).withHighAvailabilityServices(haServices).withHeartbeatServices(heartbeatServices).createJobMaster();
jobMaster.start();
final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
try {
final TestingResourceManagerGateway testingResourceManagerGateway = createAndRegisterTestingResourceManagerGateway();
final BlockingQueue<JobMasterId> registrationsQueue = new ArrayBlockingQueue<>(1);
testingResourceManagerGateway.setRegisterJobManagerFunction((jobMasterId, resourceID, s, jobID) -> {
registrationsQueue.offer(jobMasterId);
return CompletableFuture.completedFuture(testingResourceManagerGateway.getJobMasterRegistrationSuccess());
});
final ResourceManagerId resourceManagerId = testingResourceManagerGateway.getFencingToken();
notifyResourceManagerLeaderListeners(testingResourceManagerGateway);
// wait for first registration attempt
final JobMasterId firstRegistrationAttempt = registrationsQueue.take();
assertThat(firstRegistrationAttempt, equalTo(jobMasterId));
assertThat(registrationsQueue.isEmpty(), is(true));
jobMasterGateway.disconnectResourceManager(resourceManagerId, new FlinkException("Test exception"));
// wait for the second registration attempt after the disconnect call
assertThat(registrationsQueue.take(), equalTo(jobMasterId));
} finally {
RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
}
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class DeclarativeSlotManagerTest method testRequirementDeclaration.
private void testRequirementDeclaration(RequirementDeclarationScenario scenario) throws Exception {
final ResourceManagerId resourceManagerId = ResourceManagerId.generate();
final ResourceID resourceID = ResourceID.generate();
final JobID jobId = new JobID();
final SlotID slotId = new SlotID(resourceID, 0);
final String targetAddress = "localhost";
final ResourceProfile resourceProfile = ResourceProfile.fromResources(42.0, 1337);
final CompletableFuture<Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>> requestFuture = new CompletableFuture<>();
// accept an incoming slot request
final TaskExecutorGateway taskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setRequestSlotFunction(tuple6 -> {
requestFuture.complete(Tuple6.of(tuple6.f0, tuple6.f1, tuple6.f2, tuple6.f3, tuple6.f4, tuple6.f5));
return CompletableFuture.completedFuture(Acknowledge.get());
}).createTestingTaskExecutorGateway();
final TaskExecutorConnection taskExecutorConnection = new TaskExecutorConnection(resourceID, taskExecutorGateway);
final SlotStatus slotStatus = new SlotStatus(slotId, resourceProfile);
final SlotReport slotReport = new SlotReport(slotStatus);
final DefaultSlotTracker slotTracker = new DefaultSlotTracker();
try (DeclarativeSlotManager slotManager = createDeclarativeSlotManagerBuilder().setSlotTracker(slotTracker).buildAndStartWithDirectExec(resourceManagerId, new TestingResourceActionsBuilder().build())) {
if (scenario == RequirementDeclarationScenario.TASK_EXECUTOR_REGISTRATION_BEFORE_REQUIREMENT_DECLARATION) {
slotManager.registerTaskManager(taskExecutorConnection, slotReport, ResourceProfile.ANY, ResourceProfile.ANY);
}
final ResourceRequirements requirements = ResourceRequirements.create(jobId, targetAddress, Collections.singleton(ResourceRequirement.create(resourceProfile, 1)));
slotManager.processResourceRequirements(requirements);
if (scenario == RequirementDeclarationScenario.TASK_EXECUTOR_REGISTRATION_AFTER_REQUIREMENT_DECLARATION) {
slotManager.registerTaskManager(taskExecutorConnection, slotReport, ResourceProfile.ANY, ResourceProfile.ANY);
}
assertThat(requestFuture.get(), is(equalTo(Tuple6.of(slotId, jobId, requestFuture.get().f2, resourceProfile, targetAddress, resourceManagerId))));
DeclarativeTaskManagerSlot slot = slotTracker.getSlot(slotId);
assertEquals("The slot has not been allocated to the expected allocation id.", jobId, slot.getJobId());
}
}
use of org.apache.flink.runtime.resourcemanager.ResourceManagerId in project flink by apache.
the class DeclarativeSlotManagerTest method testSlotRequestFailure.
/**
* Tests that the SlotManager retries allocating a slot if the TaskExecutor#requestSlot call
* fails.
*/
@Test
public void testSlotRequestFailure() throws Exception {
final DefaultSlotTracker slotTracker = new DefaultSlotTracker();
try (final DeclarativeSlotManager slotManager = createDeclarativeSlotManagerBuilder().setSlotTracker(slotTracker).buildAndStartWithDirectExec()) {
ResourceRequirements requirements = createResourceRequirementsForSingleSlot();
slotManager.processResourceRequirements(requirements);
final BlockingQueue<Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId>> requestSlotQueue = new ArrayBlockingQueue<>(1);
final BlockingQueue<CompletableFuture<Acknowledge>> responseQueue = new ArrayBlockingQueue<>(2);
final CompletableFuture<Acknowledge> firstManualSlotRequestResponse = new CompletableFuture<>();
responseQueue.offer(firstManualSlotRequestResponse);
final CompletableFuture<Acknowledge> secondManualSlotRequestResponse = new CompletableFuture<>();
responseQueue.offer(secondManualSlotRequestResponse);
final TestingTaskExecutorGateway testingTaskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setRequestSlotFunction(slotIDJobIDAllocationIDStringResourceManagerIdTuple6 -> {
requestSlotQueue.offer(slotIDJobIDAllocationIDStringResourceManagerIdTuple6);
try {
return responseQueue.take();
} catch (InterruptedException ignored) {
return FutureUtils.completedExceptionally(new FlinkException("Response queue was interrupted."));
}
}).createTestingTaskExecutorGateway();
final ResourceID taskExecutorResourceId = ResourceID.generate();
final TaskExecutorConnection taskExecutionConnection = new TaskExecutorConnection(taskExecutorResourceId, testingTaskExecutorGateway);
final SlotReport slotReport = new SlotReport(createFreeSlotStatus(new SlotID(taskExecutorResourceId, 0)));
slotManager.registerTaskManager(taskExecutionConnection, slotReport, ResourceProfile.ANY, ResourceProfile.ANY);
final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> firstRequest = requestSlotQueue.take();
// fail first request
firstManualSlotRequestResponse.completeExceptionally(new SlotAllocationException("Test exception"));
final Tuple6<SlotID, JobID, AllocationID, ResourceProfile, String, ResourceManagerId> secondRequest = requestSlotQueue.take();
assertThat(secondRequest.f1, equalTo(firstRequest.f1));
assertThat(secondRequest.f0, equalTo(firstRequest.f0));
secondManualSlotRequestResponse.complete(Acknowledge.get());
final DeclarativeTaskManagerSlot slot = slotTracker.getSlot(secondRequest.f0);
assertThat(slot.getState(), equalTo(SlotState.ALLOCATED));
assertThat(slot.getJobId(), equalTo(secondRequest.f1));
}
}
Aggregations