use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class JobMasterTest method testAllocatedSlotReportDoesNotContainStaleInformation.
/**
* Tests that the {@link AllocatedSlotReport} contains up to date information and not stale
* information about the allocated slots on the {@link JobMaster}.
*
* <p>This is a probabilistic test case which only fails if executed repeatedly without the fix
* for FLINK-12863.
*/
@Test
public void testAllocatedSlotReportDoesNotContainStaleInformation() throws Exception {
final CompletableFuture<Void> assertionFuture = new CompletableFuture<>();
final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();
final AtomicBoolean terminateHeartbeatVerification = new AtomicBoolean(false);
final OneShotLatch hasReceivedSlotOffers = new OneShotLatch();
final TestingTaskExecutorGateway taskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setHeartbeatJobManagerFunction((taskManagerId, allocatedSlotReport) -> {
try {
if (hasReceivedSlotOffers.isTriggered()) {
assertThat(allocatedSlotReport.getAllocatedSlotInfos(), hasSize(1));
} else {
assertThat(allocatedSlotReport.getAllocatedSlotInfos(), empty());
}
} catch (AssertionError e) {
assertionFuture.completeExceptionally(e);
}
if (terminateHeartbeatVerification.get()) {
assertionFuture.complete(null);
}
return FutureUtils.completedVoidFuture();
}).createTestingTaskExecutorGateway();
rpcService.registerGateway(taskExecutorGateway.getAddress(), taskExecutorGateway);
final JobManagerSharedServices jobManagerSharedServices = new TestingJobManagerSharedServicesBuilder().build();
final JobGraph jobGraph = JobGraphTestUtils.singleNoOpJobGraph();
final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).withHeartbeatServices(new HeartbeatServices(5L, 1000L)).withSlotPoolServiceSchedulerFactory(DefaultSlotPoolServiceSchedulerFactory.create(new TestingSlotPoolFactory(hasReceivedSlotOffers), new DefaultSchedulerFactory())).createJobMaster();
jobMaster.start();
try {
final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
// register task manager will trigger monitor heartbeat target, schedule heartbeat
// request at interval time
CompletableFuture<RegistrationResponse> registrationResponse = jobMasterGateway.registerTaskManager(jobGraph.getJobID(), TaskManagerRegistrationInformation.create(taskExecutorGateway.getAddress(), unresolvedTaskManagerLocation, TestingUtils.zeroUUID()), testingTimeout);
// wait for the completion of the registration
registrationResponse.get();
final SlotOffer slotOffer = new SlotOffer(new AllocationID(), 0, ResourceProfile.ANY);
final CompletableFuture<Collection<SlotOffer>> slotOfferFuture = jobMasterGateway.offerSlots(unresolvedTaskManagerLocation.getResourceID(), Collections.singleton(slotOffer), testingTimeout);
assertThat(slotOfferFuture.get(), containsInAnyOrder(slotOffer));
terminateHeartbeatVerification.set(true);
// make sure that no assertion has been violated
assertionFuture.get();
} finally {
RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
jobManagerSharedServices.shutdown();
}
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class TaskExecutorBuilder method build.
public TaskExecutor build() throws Exception {
final TaskExecutorBlobService resolvedTaskExecutorBlobService;
TaskExecutorResourceUtils.adjustForLocalExecution(configuration);
if (taskExecutorBlobService == null) {
resolvedTaskExecutorBlobService = NoOpTaskExecutorBlobService.INSTANCE;
} else {
resolvedTaskExecutorBlobService = taskExecutorBlobService;
}
final TaskManagerConfiguration resolvedTaskManagerConfiguration;
if (taskManagerConfiguration == null) {
resolvedTaskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration, taskExecutorResourceSpec, rpcService.getAddress(), workingDirectory.getTmpDirectory());
} else {
resolvedTaskManagerConfiguration = taskManagerConfiguration;
}
final TaskManagerServices resolvedTaskManagerServices;
if (taskManagerServices == null) {
final TaskManagerServicesConfiguration taskManagerServicesConfiguration = TaskManagerServicesConfiguration.fromConfiguration(configuration, resourceId, rpcService.getAddress(), true, taskExecutorResourceSpec, workingDirectory);
resolvedTaskManagerServices = TaskManagerServices.fromConfiguration(taskManagerServicesConfiguration, VoidPermanentBlobService.INSTANCE, UnregisteredMetricGroups.createUnregisteredTaskManagerMetricGroup(), Executors.newDirectExecutorService(), throwable -> {
}, workingDirectory);
} else {
resolvedTaskManagerServices = taskManagerServices;
}
return new TaskExecutor(rpcService, resolvedTaskManagerConfiguration, haServices, resolvedTaskManagerServices, externalResourceInfoProvider, heartbeatServices, taskManagerMetricGroup, metricQueryServiceAddress, resolvedTaskExecutorBlobService, fatalErrorHandler, partitionTracker);
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class MiniCluster method setupDispatcherResourceManagerComponents.
@GuardedBy("lock")
private void setupDispatcherResourceManagerComponents(Configuration configuration, RpcServiceFactory dispatcherResourceManagerComponentRpcServiceFactory, MetricQueryServiceRetriever metricQueryServiceRetriever) throws Exception {
dispatcherResourceManagerComponents.addAll(createDispatcherResourceManagerComponents(configuration, dispatcherResourceManagerComponentRpcServiceFactory, haServices, blobServer, heartbeatServices, metricRegistry, metricQueryServiceRetriever, new ShutDownFatalErrorHandler()));
final Collection<CompletableFuture<ApplicationStatus>> shutDownFutures = new ArrayList<>(dispatcherResourceManagerComponents.size());
for (DispatcherResourceManagerComponent dispatcherResourceManagerComponent : dispatcherResourceManagerComponents) {
final CompletableFuture<ApplicationStatus> shutDownFuture = dispatcherResourceManagerComponent.getShutDownFuture();
FutureUtils.assertNoException(shutDownFuture.thenCompose(applicationStatus -> dispatcherResourceManagerComponent.stopApplication(applicationStatus, null)));
shutDownFutures.add(shutDownFuture);
}
FutureUtils.completeAll(shutDownFutures).whenComplete((ignored, exception) -> closeAsync());
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class ProcessFailureCancelingITCase method testCancelingOnProcessFailure.
@Test
public void testCancelingOnProcessFailure() throws Throwable {
Assume.assumeTrue("---- Skipping Process Failure test : Could not find java executable ----", getJavaCommandPath() != null);
TestProcess taskManagerProcess = null;
final TestingFatalErrorHandler fatalErrorHandler = new TestingFatalErrorHandler();
Configuration config = new Configuration();
config.setString(JobManagerOptions.ADDRESS, "localhost");
config.set(AkkaOptions.ASK_TIMEOUT_DURATION, Duration.ofSeconds(100));
config.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
config.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, zooKeeperResource.getConnectString());
config.setString(HighAvailabilityOptions.HA_STORAGE_PATH, temporaryFolder.newFolder().getAbsolutePath());
config.setInteger(TaskManagerOptions.NUM_TASK_SLOTS, 2);
config.set(TaskManagerOptions.MANAGED_MEMORY_SIZE, MemorySize.parse("4m"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MIN, MemorySize.parse("3200k"));
config.set(TaskManagerOptions.NETWORK_MEMORY_MAX, MemorySize.parse("3200k"));
config.set(TaskManagerOptions.TASK_HEAP_MEMORY, MemorySize.parse("128m"));
config.set(TaskManagerOptions.CPU_CORES, 1.0);
config.setInteger(RestOptions.PORT, 0);
final RpcService rpcService = RpcSystem.load().remoteServiceBuilder(config, "localhost", "0").createAndStart();
final int jobManagerPort = rpcService.getPort();
config.setInteger(JobManagerOptions.PORT, jobManagerPort);
final DispatcherResourceManagerComponentFactory resourceManagerComponentFactory = DefaultDispatcherResourceManagerComponentFactory.createSessionComponentFactory(StandaloneResourceManagerFactory.getInstance());
DispatcherResourceManagerComponent dispatcherResourceManagerComponent = null;
final ScheduledExecutorService ioExecutor = TestingUtils.defaultExecutor();
final HighAvailabilityServices haServices = HighAvailabilityServicesUtils.createHighAvailabilityServices(config, ioExecutor, AddressResolution.NO_ADDRESS_RESOLUTION, RpcSystem.load(), NoOpFatalErrorHandler.INSTANCE);
final AtomicReference<Throwable> programException = new AtomicReference<>();
try {
dispatcherResourceManagerComponent = resourceManagerComponentFactory.create(config, ResourceID.generate(), ioExecutor, rpcService, haServices, blobServerResource.getBlobServer(), new HeartbeatServices(100L, 10000L, 2), NoOpMetricRegistry.INSTANCE, new MemoryExecutionGraphInfoStore(), VoidMetricQueryServiceRetriever.INSTANCE, fatalErrorHandler);
TestProcessBuilder taskManagerProcessBuilder = new TestProcessBuilder(TaskExecutorProcessEntryPoint.class.getName());
taskManagerProcessBuilder.addConfigAsMainClassArgs(config);
taskManagerProcess = taskManagerProcessBuilder.start();
// start the test program, which infinitely blocks
Runnable programRunner = new Runnable() {
@Override
public void run() {
try {
ExecutionEnvironment env = ExecutionEnvironment.createRemoteEnvironment("localhost", 1337, config);
env.setParallelism(2);
env.setRestartStrategy(RestartStrategies.noRestart());
env.generateSequence(0, Long.MAX_VALUE).map(new MapFunction<Long, Long>() {
@Override
public Long map(Long value) throws Exception {
synchronized (this) {
System.out.println(TASK_DEPLOYED_MARKER);
wait();
}
return 0L;
}
}).output(new DiscardingOutputFormat<>());
env.execute();
} catch (Throwable t) {
programException.set(t);
}
}
};
Thread programThread = new Thread(programRunner);
programThread.start();
waitUntilAtLeastOneTaskHasBeenDeployed(taskManagerProcess);
// kill the TaskManager after the job started to run
taskManagerProcess.destroy();
taskManagerProcess = null;
// the job should fail within a few seconds due to heartbeat timeouts
// since the CI environment is often slow, we conservatively give it up to 2 minutes
programThread.join(TIMEOUT.toMillis());
assertFalse("The program did not cancel in time", programThread.isAlive());
Throwable error = programException.get();
assertNotNull("The program did not fail properly", error);
assertTrue(error instanceof ProgramInvocationException);
// all seems well :-)
} catch (Exception | Error e) {
if (taskManagerProcess != null) {
printOutput("TaskManager OUT", taskManagerProcess.getProcessOutput().toString());
printOutput("TaskManager ERR", taskManagerProcess.getErrorOutput().toString());
}
throw ExceptionUtils.firstOrSuppressed(e, programException.get());
} finally {
if (taskManagerProcess != null) {
taskManagerProcess.destroy();
}
if (dispatcherResourceManagerComponent != null) {
dispatcherResourceManagerComponent.stopApplication(ApplicationStatus.SUCCEEDED, null);
}
fatalErrorHandler.rethrowError();
RpcUtils.terminateRpcService(rpcService, Time.seconds(100L));
haServices.closeAndCleanupAllData();
}
}
Aggregations