use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class JobMasterTest method testRequestPartitionState.
/**
* Tests the {@link JobMaster#requestPartitionState(IntermediateDataSetID, ResultPartitionID)}
* call for a finished result partition.
*/
@Test
public void testRequestPartitionState() throws Exception {
final JobGraph producerConsumerJobGraph = producerConsumerJobGraph();
final JobMaster jobMaster = new JobMasterBuilder(producerConsumerJobGraph, rpcService).withConfiguration(configuration).withHighAvailabilityServices(haServices).withHeartbeatServices(heartbeatServices).createJobMaster();
jobMaster.start();
try {
final CompletableFuture<TaskDeploymentDescriptor> tddFuture = new CompletableFuture<>();
final TestingTaskExecutorGateway testingTaskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setSubmitTaskConsumer((taskDeploymentDescriptor, jobMasterId) -> {
tddFuture.complete(taskDeploymentDescriptor);
return CompletableFuture.completedFuture(Acknowledge.get());
}).createTestingTaskExecutorGateway();
final LocalUnresolvedTaskManagerLocation taskManagerLocation = new LocalUnresolvedTaskManagerLocation();
final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
final Collection<SlotOffer> slotOffers = registerSlotsAtJobMaster(1, jobMasterGateway, producerConsumerJobGraph.getJobID(), testingTaskExecutorGateway, taskManagerLocation);
assertThat(slotOffers, hasSize(1));
// obtain tdd for the result partition ids
final TaskDeploymentDescriptor tdd = tddFuture.get();
assertThat(tdd.getProducedPartitions(), hasSize(1));
final ResultPartitionDeploymentDescriptor partition = tdd.getProducedPartitions().iterator().next();
final ExecutionAttemptID executionAttemptId = tdd.getExecutionAttemptId();
final ExecutionAttemptID copiedExecutionAttemptId = new ExecutionAttemptID(executionAttemptId);
// finish the producer task
jobMasterGateway.updateTaskExecutionState(new TaskExecutionState(executionAttemptId, ExecutionState.FINISHED)).get();
// request the state of the result partition of the producer
final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), copiedExecutionAttemptId);
CompletableFuture<ExecutionState> partitionStateFuture = jobMasterGateway.requestPartitionState(partition.getResultId(), partitionId);
assertThat(partitionStateFuture.get(), equalTo(ExecutionState.FINISHED));
// ask for unknown result partition
partitionStateFuture = jobMasterGateway.requestPartitionState(partition.getResultId(), new ResultPartitionID());
try {
partitionStateFuture.get();
fail("Expected failure.");
} catch (ExecutionException e) {
assertThat(ExceptionUtils.findThrowable(e, IllegalArgumentException.class).isPresent(), is(true));
}
// ask for wrong intermediate data set id
partitionStateFuture = jobMasterGateway.requestPartitionState(new IntermediateDataSetID(), partitionId);
try {
partitionStateFuture.get();
fail("Expected failure.");
} catch (ExecutionException e) {
assertThat(ExceptionUtils.findThrowable(e, IllegalArgumentException.class).isPresent(), is(true));
}
// ask for "old" execution
partitionStateFuture = jobMasterGateway.requestPartitionState(partition.getResultId(), new ResultPartitionID(partition.getPartitionId(), new ExecutionAttemptID()));
try {
partitionStateFuture.get();
fail("Expected failure.");
} catch (ExecutionException e) {
assertThat(ExceptionUtils.findThrowable(e, PartitionProducerDisposedException.class).isPresent(), is(true));
}
} finally {
RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
}
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class JobMasterTest method testTaskExecutorNotReleasedOnFailedAllocationIfPartitionIsAllocated.
@Test
public void testTaskExecutorNotReleasedOnFailedAllocationIfPartitionIsAllocated() throws Exception {
final JobManagerSharedServices jobManagerSharedServices = new TestingJobManagerSharedServicesBuilder().build();
final JobGraph jobGraph = JobGraphTestUtils.singleNoOpJobGraph();
final LocalUnresolvedTaskManagerLocation taskManagerUnresolvedLocation = new LocalUnresolvedTaskManagerLocation();
final AtomicBoolean isTrackingPartitions = new AtomicBoolean(true);
final TestingJobMasterPartitionTracker partitionTracker = new TestingJobMasterPartitionTracker();
partitionTracker.setIsTrackingPartitionsForFunction(ignored -> isTrackingPartitions.get());
final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).withConfiguration(configuration).withHighAvailabilityServices(haServices).withJobManagerSharedServices(jobManagerSharedServices).withHeartbeatServices(heartbeatServices).withPartitionTrackerFactory(ignored -> partitionTracker).createJobMaster();
final CompletableFuture<JobID> disconnectTaskExecutorFuture = new CompletableFuture<>();
final CompletableFuture<AllocationID> freedSlotFuture = new CompletableFuture<>();
final TestingTaskExecutorGateway testingTaskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setFreeSlotFunction((allocationID, throwable) -> {
freedSlotFuture.complete(allocationID);
return CompletableFuture.completedFuture(Acknowledge.get());
}).setDisconnectJobManagerConsumer((jobID, throwable) -> disconnectTaskExecutorFuture.complete(jobID)).createTestingTaskExecutorGateway();
try {
jobMaster.start();
final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
final Collection<SlotOffer> slotOffers = registerSlotsAtJobMaster(1, jobMasterGateway, jobGraph.getJobID(), testingTaskExecutorGateway, taskManagerUnresolvedLocation);
// check that we accepted the offered slot
assertThat(slotOffers, hasSize(1));
final AllocationID allocationId = slotOffers.iterator().next().getAllocationId();
jobMasterGateway.failSlot(taskManagerUnresolvedLocation.getResourceID(), allocationId, new FlinkException("Fail allocation test exception"));
// we should free the slot, but not disconnect from the TaskExecutor as we still have an
// allocated partition
assertThat(freedSlotFuture.get(), equalTo(allocationId));
// trigger some request to guarantee ensure the slotAllocationFailure processing if
// complete
jobMasterGateway.requestJobStatus(Time.seconds(5)).get();
assertThat(disconnectTaskExecutorFuture.isDone(), is(false));
} finally {
RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
}
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class TaskExecutorITCase method testSlotAllocation.
@Test
public void testSlotAllocation() throws Exception {
TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
TestingHighAvailabilityServices testingHAServices = new TestingHighAvailabilityServices();
final Configuration configuration = new Configuration();
final ScheduledExecutorService scheduledExecutorService = new ScheduledThreadPoolExecutor(1);
final ResourceID taskManagerResourceId = new ResourceID("foobar");
final UUID rmLeaderId = UUID.randomUUID();
final TestingLeaderElectionService rmLeaderElectionService = new TestingLeaderElectionService();
final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
final String rmAddress = "rm";
final String jmAddress = "jm";
final UUID jmLeaderId = UUID.randomUUID();
final JobID jobId = new JobID();
final ResourceProfile resourceProfile = new ResourceProfile(1.0, 1);
testingHAServices.setResourceManagerLeaderElectionService(rmLeaderElectionService);
testingHAServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
testingHAServices.setJobMasterLeaderRetriever(jobId, new TestingLeaderRetrievalService(jmAddress, jmLeaderId));
TestingSerialRpcService rpcService = new TestingSerialRpcService();
ResourceManagerConfiguration resourceManagerConfiguration = new ResourceManagerConfiguration(Time.milliseconds(500L), Time.milliseconds(500L), Time.minutes(5L));
SlotManagerFactory slotManagerFactory = new DefaultSlotManager.Factory();
JobLeaderIdService jobLeaderIdService = new JobLeaderIdService(testingHAServices, rpcService.getScheduledExecutor(), resourceManagerConfiguration.getJobTimeout());
MetricRegistry metricRegistry = mock(MetricRegistry.class);
HeartbeatServices heartbeatServices = mock(HeartbeatServices.class, RETURNS_MOCKS);
final TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(configuration);
final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(taskManagerResourceId, InetAddress.getLocalHost(), 1234);
final MemoryManager memoryManager = mock(MemoryManager.class);
final IOManager ioManager = mock(IOManager.class);
final NetworkEnvironment networkEnvironment = mock(NetworkEnvironment.class);
final TaskManagerMetricGroup taskManagerMetricGroup = mock(TaskManagerMetricGroup.class);
final BroadcastVariableManager broadcastVariableManager = mock(BroadcastVariableManager.class);
final FileCache fileCache = mock(FileCache.class);
final TaskSlotTable taskSlotTable = new TaskSlotTable(Arrays.asList(resourceProfile), new TimerService<AllocationID>(scheduledExecutorService, 100L));
final JobManagerTable jobManagerTable = new JobManagerTable();
final JobLeaderService jobLeaderService = new JobLeaderService(taskManagerLocation);
ResourceManager<ResourceID> resourceManager = new StandaloneResourceManager(rpcService, resourceManagerConfiguration, testingHAServices, slotManagerFactory, metricRegistry, jobLeaderIdService, testingFatalErrorHandler);
TaskExecutor taskExecutor = new TaskExecutor(taskManagerConfiguration, taskManagerLocation, rpcService, memoryManager, ioManager, networkEnvironment, testingHAServices, heartbeatServices, metricRegistry, taskManagerMetricGroup, broadcastVariableManager, fileCache, taskSlotTable, jobManagerTable, jobLeaderService, testingFatalErrorHandler);
JobMasterGateway jmGateway = mock(JobMasterGateway.class);
when(jmGateway.registerTaskManager(any(String.class), any(TaskManagerLocation.class), eq(jmLeaderId), any(Time.class))).thenReturn(FlinkCompletableFuture.<RegistrationResponse>completed(new JMTMRegistrationSuccess(taskManagerResourceId, 1234)));
when(jmGateway.getHostname()).thenReturn(jmAddress);
rpcService.registerGateway(rmAddress, resourceManager.getSelf());
rpcService.registerGateway(jmAddress, jmGateway);
final AllocationID allocationId = new AllocationID();
final SlotRequest slotRequest = new SlotRequest(jobId, allocationId, resourceProfile);
final SlotOffer slotOffer = new SlotOffer(allocationId, 0, resourceProfile);
try {
resourceManager.start();
taskExecutor.start();
// notify the RM that it is the leader
rmLeaderElectionService.isLeader(rmLeaderId);
// notify the TM about the new RM leader
rmLeaderRetrievalService.notifyListener(rmAddress, rmLeaderId);
Future<RegistrationResponse> registrationResponseFuture = resourceManager.registerJobManager(rmLeaderId, jmLeaderId, jmAddress, jobId);
RegistrationResponse registrationResponse = registrationResponseFuture.get();
assertTrue(registrationResponse instanceof JobMasterRegistrationSuccess);
resourceManager.requestSlot(jmLeaderId, rmLeaderId, slotRequest);
verify(jmGateway).offerSlots(eq(taskManagerResourceId), (Iterable<SlotOffer>) argThat(Matchers.contains(slotOffer)), eq(jmLeaderId), any(Time.class));
} finally {
if (testingFatalErrorHandler.hasExceptionOccurred()) {
testingFatalErrorHandler.rethrowError();
}
}
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class JobMasterTest method testHeartbeatTimeoutWithTaskManager.
@Test
public void testHeartbeatTimeoutWithTaskManager() throws Exception {
final TestingHighAvailabilityServices haServices = new TestingHighAvailabilityServices();
final TestingLeaderRetrievalService rmLeaderRetrievalService = new TestingLeaderRetrievalService();
haServices.setResourceManagerLeaderRetriever(rmLeaderRetrievalService);
haServices.setCheckpointRecoveryFactory(mock(CheckpointRecoveryFactory.class));
final TestingFatalErrorHandler testingFatalErrorHandler = new TestingFatalErrorHandler();
final String jobManagerAddress = "jm";
final UUID jmLeaderId = UUID.randomUUID();
final ResourceID jmResourceId = new ResourceID(jobManagerAddress);
final String taskManagerAddress = "tm";
final ResourceID tmResourceId = new ResourceID(taskManagerAddress);
final TaskManagerLocation taskManagerLocation = new TaskManagerLocation(tmResourceId, InetAddress.getLoopbackAddress(), 1234);
final TaskExecutorGateway taskExecutorGateway = mock(TaskExecutorGateway.class);
final TestingSerialRpcService rpc = new TestingSerialRpcService();
rpc.registerGateway(taskManagerAddress, taskExecutorGateway);
final long heartbeatInterval = 1L;
final long heartbeatTimeout = 5L;
final ScheduledExecutor scheduledExecutor = mock(ScheduledExecutor.class);
final HeartbeatServices heartbeatServices = new TestingHeartbeatServices(heartbeatInterval, heartbeatTimeout, scheduledExecutor);
final JobGraph jobGraph = new JobGraph();
try {
final JobMaster jobMaster = new JobMaster(jmResourceId, jobGraph, new Configuration(), rpc, haServices, heartbeatServices, Executors.newScheduledThreadPool(1), mock(BlobLibraryCacheManager.class), mock(RestartStrategyFactory.class), Time.of(10, TimeUnit.SECONDS), null, mock(OnCompletionActions.class), testingFatalErrorHandler, new FlinkUserCodeClassLoader(new URL[0]));
// also start the heartbeat manager in job manager
jobMaster.start(jmLeaderId);
// register task manager will trigger monitoring heartbeat target, schedule heartbeat request in interval time
jobMaster.registerTaskManager(taskManagerAddress, taskManagerLocation, jmLeaderId);
ArgumentCaptor<Runnable> heartbeatRunnableCaptor = ArgumentCaptor.forClass(Runnable.class);
verify(scheduledExecutor, times(1)).scheduleAtFixedRate(heartbeatRunnableCaptor.capture(), eq(0L), eq(heartbeatInterval), eq(TimeUnit.MILLISECONDS));
Runnable heartbeatRunnable = heartbeatRunnableCaptor.getValue();
ArgumentCaptor<Runnable> timeoutRunnableCaptor = ArgumentCaptor.forClass(Runnable.class);
verify(scheduledExecutor).schedule(timeoutRunnableCaptor.capture(), eq(heartbeatTimeout), eq(TimeUnit.MILLISECONDS));
Runnable timeoutRunnable = timeoutRunnableCaptor.getValue();
// run the first heartbeat request
heartbeatRunnable.run();
verify(taskExecutorGateway, times(1)).heartbeatFromJobManager(eq(jmResourceId));
// run the timeout runnable to simulate a heartbeat timeout
timeoutRunnable.run();
verify(taskExecutorGateway).disconnectJobManager(eq(jobGraph.getJobID()), any(TimeoutException.class));
// check if a concurrent error occurred
testingFatalErrorHandler.rethrowError();
} finally {
rpc.stopService();
}
}
use of org.apache.flink.runtime.heartbeat.HeartbeatServices in project flink by apache.
the class JobMasterTest method testTaskManagerRegistrationTriggersHeartbeating.
@Test
public void testTaskManagerRegistrationTriggersHeartbeating() throws Exception {
final CompletableFuture<ResourceID> heartbeatResourceIdFuture = new CompletableFuture<>();
final UnresolvedTaskManagerLocation unresolvedTaskManagerLocation = new LocalUnresolvedTaskManagerLocation();
final TestingTaskExecutorGateway taskExecutorGateway = new TestingTaskExecutorGatewayBuilder().setHeartbeatJobManagerFunction((taskManagerId, ignored) -> {
heartbeatResourceIdFuture.complete(taskManagerId);
return FutureUtils.completedVoidFuture();
}).createTestingTaskExecutorGateway();
rpcService.registerGateway(taskExecutorGateway.getAddress(), taskExecutorGateway);
final JobMaster jobMaster = new JobMasterBuilder(jobGraph, rpcService).withResourceId(jmResourceId).withConfiguration(configuration).withHighAvailabilityServices(haServices).withHeartbeatServices(new HeartbeatServices(1L, 10000L)).createJobMaster();
jobMaster.start();
try {
final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);
// register task manager will trigger monitor heartbeat target, schedule heartbeat
// request at interval time
CompletableFuture<RegistrationResponse> registrationResponse = jobMasterGateway.registerTaskManager(jobGraph.getJobID(), TaskManagerRegistrationInformation.create(taskExecutorGateway.getAddress(), unresolvedTaskManagerLocation, TestingUtils.zeroUUID()), testingTimeout);
// wait for the completion of the registration
registrationResponse.get();
assertThat(heartbeatResourceIdFuture.join(), anyOf(nullValue(), equalTo(jmResourceId)));
} finally {
RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
}
}
Aggregations