use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class TaskExecutorTest method testReleaseOfJobResourcesIfJobMasterIsNotCorrect.
/**
 * Verifies that the TaskExecutor gives up all resources it holds for a job when the JobMaster it
 * registers at is not running that job. See FLINK-21606.
 *
 * <p>The JobMaster gateway used here answers every TaskManager registration with a rejection.
 * Afterwards the test expects the previously requested slot to be freed and the partitions of
 * the job to be released.
 */
@Test
public void testReleaseOfJobResourcesIfJobMasterIsNotCorrect() throws Exception {
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1))
                    .build();

    // partition tracker that pretends to track partitions and completes the future with the
    // job whose partitions it is asked to release
    final CompletableFuture<JobID> releasedPartitionsFuture = new CompletableFuture<>();
    final TestingTaskExecutorPartitionTracker partitionTracker =
            new TestingTaskExecutorPartitionTracker();
    partitionTracker.setIsTrackingPartitionsForFunction(ignored -> true);
    partitionTracker.setStopTrackingAndReleaseAllPartitionsConsumer(
            releasedJobId -> releasedPartitionsFuture.complete(releasedJobId));

    final TaskExecutor executorUnderTest =
            createTaskExecutor(services, HEARTBEAT_SERVICES, partitionTracker);

    // JobMaster gateway that rejects every registration attempt
    final TestingJobMasterGateway rejectingJobMaster =
            new TestingJobMasterGatewayBuilder()
                    .setRegisterTaskManagerFunction(
                            (ignoredJobId, ignoredRegistrationInfo) ->
                                    CompletableFuture.completedFuture(
                                            new JMTMRegistrationRejection("foobar")))
                    .build();
    rpc.registerGateway(rejectingJobMaster.getAddress(), rejectingJobMaster);

    // ResourceManager gateway with hooks for the registration and the slot notification
    final InstanceID rmRegistrationId = new InstanceID();
    final OneShotLatch registeredAtRm = new OneShotLatch();
    final CompletableFuture<Tuple3<InstanceID, SlotID, AllocationID>> freedSlotFuture =
            new CompletableFuture<>();
    final TestingResourceManagerGateway rmGateway =
            createRmWithTmRegisterAndNotifySlotHooks(
                    rmRegistrationId, registeredAtRm, freedSlotFuture);
    rpc.registerGateway(rmGateway.getAddress(), rmGateway);
    resourceManagerLeaderRetriever.notifyListener(
            rmGateway.getAddress(), rmGateway.getFencingToken().toUUID());

    try {
        executorUnderTest.start();
        final TaskExecutorGateway selfGateway =
                executorUnderTest.getSelfGateway(TaskExecutorGateway.class);
        registeredAtRm.await();

        final AllocationID allocation = new AllocationID();
        final SlotID slot = new SlotID(executorUnderTest.getResourceID(), 0);
        requestSlot(
                selfGateway,
                jobId,
                allocation,
                slot,
                ResourceProfile.UNKNOWN,
                rejectingJobMaster.getAddress(),
                rmGateway.getFencingToken());

        // Announcing the JobMaster as leader triggers the registration; its rejection should
        // make the TaskExecutor release all resources of the job.
        jobManagerLeaderRetriever.notifyListener(
                rejectingJobMaster.getAddress(),
                rejectingJobMaster.getFencingToken().toUUID());

        // the slot should be freed
        final Tuple3<InstanceID, SlotID, AllocationID> freedSlot = freedSlotFuture.get();
        assertThat(freedSlot.f1, is(slot));
        assertThat(freedSlot.f2, is(allocation));

        // all job partitions should be released
        assertThat(releasedPartitionsFuture.get(), is(jobId));
    } finally {
        RpcUtils.terminateRpcEndpoint(executorUnderTest, timeout);
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class TaskExecutorSubmissionTest method testCancellingDependentAndStateUpdateFails.
/**
 * Deploys a sender and a receiver task on the same TaskExecutor. The state update that reports
 * the sender as RUNNING is answered exceptionally by the testing JobMaster gateway, and the test
 * expects the sender to end up FAILED. The receiver is then cancelled explicitly and is expected
 * to reach CANCELED.
 */
@Test
public void testCancellingDependentAndStateUpdateFails() throws Exception {
    ResourceID producerLocation = ResourceID.generate();
    NettyShuffleDescriptor sdd =
            createRemoteWithIdAndLocation(new IntermediateResultPartitionID(), producerLocation);

    TaskDeploymentDescriptor tdd1 = createSender(sdd);
    TaskDeploymentDescriptor tdd2 = createReceiver(sdd);
    ExecutionAttemptID eid1 = tdd1.getExecutionAttemptId();
    ExecutionAttemptID eid2 = tdd2.getExecutionAttemptId();

    final CompletableFuture<Void> task1RunningFuture = new CompletableFuture<>();
    final CompletableFuture<Void> task2RunningFuture = new CompletableFuture<>();
    final CompletableFuture<Void> task1FailedFuture = new CompletableFuture<>();
    final CompletableFuture<Void> task2CanceledFuture = new CompletableFuture<>();

    final JobMasterId jobMasterId = JobMasterId.generate();
    TestingJobMasterGateway testingJobMasterGateway =
            new TestingJobMasterGatewayBuilder()
                    .setFencingTokenSupplier(() -> jobMasterId)
                    .setUpdateTaskExecutionStateFunction(
                            taskExecutionState -> {
                                // Only the RUNNING update of the sender (eid1) is rejected;
                                // every other update is acknowledged.
                                if (taskExecutionState != null
                                        && taskExecutionState.getID().equals(eid1)
                                        && taskExecutionState.getExecutionState()
                                                == ExecutionState.RUNNING) {
                                    // NOTE(review): the message names eid2 although the rejected
                                    // update belongs to eid1 - confirm whether this is intended.
                                    return FutureUtils.completedExceptionally(
                                            new ExecutionGraphException(
                                                    "The execution attempt "
                                                            + eid2
                                                            + " was not found."));
                                } else {
                                    return CompletableFuture.completedFuture(Acknowledge.get());
                                }
                            })
                    .build();

    try (TaskSubmissionTestEnvironment env =
            new TaskSubmissionTestEnvironment.Builder(jobId)
                    .setResourceID(producerLocation)
                    .setSlotSize(2)
                    .addTaskManagerActionListener(eid1, ExecutionState.RUNNING, task1RunningFuture)
                    .addTaskManagerActionListener(eid2, ExecutionState.RUNNING, task2RunningFuture)
                    .addTaskManagerActionListener(eid1, ExecutionState.FAILED, task1FailedFuture)
                    .addTaskManagerActionListener(
                            eid2, ExecutionState.CANCELED, task2CanceledFuture)
                    .setJobMasterId(jobMasterId)
                    .setJobMasterGateway(testingJobMasterGateway)
                    .useRealNonMockShuffleEnvironment()
                    .build()) {
        TaskExecutorGateway tmGateway = env.getTaskExecutorGateway();
        TaskSlotTable<Task> taskSlotTable = env.getTaskSlotTable();

        // deploy the sender and wait until it is running
        taskSlotTable.allocateSlot(0, jobId, tdd1.getAllocationId(), Time.seconds(60));
        tmGateway.submitTask(tdd1, jobMasterId, timeout).get();
        task1RunningFuture.get();

        // deploy the receiver and wait until it is running
        taskSlotTable.allocateSlot(1, jobId, tdd2.getAllocationId(), Time.seconds(60));
        tmGateway.submitTask(tdd2, jobMasterId, timeout).get();
        task2RunningFuture.get();

        // the sender is expected to fail after its state update was rejected
        task1FailedFuture.get();
        // assertSame takes (expected, actual); the expected state goes first so that a failure
        // is reported the right way round
        assertSame(ExecutionState.FAILED, taskSlotTable.getTask(eid1).getExecutionState());

        // the receiver is cancelled explicitly
        tmGateway.cancelTask(eid2, timeout);
        task2CanceledFuture.get();
        assertSame(ExecutionState.CANCELED, taskSlotTable.getTask(eid2).getExecutionState());
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class DefaultJobLeaderServiceTest method doesNotReconnectAfterTargetLostLeadership.
/**
 * Checks that the JobLeaderService does not attempt to reconnect to a JobMaster once that
 * JobMaster has lost its leadership. See FLINK-16836.
 */
@Test
public void doesNotReconnectAfterTargetLostLeadership() throws Exception {
    final JobID job = new JobID();

    final SettableLeaderRetrievalService settableLeaderRetriever =
            new SettableLeaderRetrievalService();
    final TestingHighAvailabilityServices highAvailabilityServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setJobMasterLeaderRetrieverFunction(ignored -> settableLeaderRetriever)
                    .build();

    final TestingJobMasterGateway jobMaster = registerJobMaster();

    // triggered as soon as the listener is told that the JobManager gained leadership
    final OneShotLatch leadershipGained = new OneShotLatch();
    final TestingJobLeaderListener leaderListener =
            new TestingJobLeaderListener(ignored -> leadershipGained.trigger());

    final JobLeaderService service =
            createAndStartJobLeaderService(highAvailabilityServices, leaderListener);

    try {
        service.addJob(job, jobMaster.getAddress());

        // grant the leadership and wait until the listener has seen it
        settableLeaderRetriever.notifyListener(jobMaster.getAddress(), UUID.randomUUID());
        leadershipGained.await();

        // revoke the leadership
        settableLeaderRetriever.notifyListener(null, null);
        leaderListener.waitUntilJobManagerLostLeadership();

        // asking for a reconnect after the leadership was lost must go through cleanly
        service.reconnect(job);
    } finally {
        service.stop();
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class DefaultJobLeaderServiceTest method canReconnectToOldLeaderWithSameLeaderAddress.
/**
 * Checks that the JobLeaderService can reconnect to a former leader that appeared to lose its
 * leadership in the meantime and is then announced again with the same address and session id.
 * See FLINK-14316.
 */
@Test
public void canReconnectToOldLeaderWithSameLeaderAddress() throws Exception {
    final JobID job = new JobID();

    final SettableLeaderRetrievalService settableLeaderRetriever =
            new SettableLeaderRetrievalService();
    final TestingHighAvailabilityServices highAvailabilityServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setJobMasterLeaderRetrieverFunction(ignored -> settableLeaderRetriever)
                    .build();

    final TestingJobMasterGateway jobMaster = registerJobMaster();

    // receives the job id every time the listener is told that leadership was gained
    final BlockingQueue<JobID> gainedLeaderships = new ArrayBlockingQueue<>(1);
    final TestingJobLeaderListener leaderListener =
            new TestingJobLeaderListener(gainedJob -> gainedLeaderships.offer(gainedJob));

    final JobLeaderService service =
            createAndStartJobLeaderService(highAvailabilityServices, leaderListener);

    try {
        service.addJob(job, jobMaster.getAddress());

        final UUID sessionId = UUID.randomUUID();
        settableLeaderRetriever.notifyListener(jobMaster.getAddress(), sessionId);

        // wait for the first leadership
        assertThat(gainedLeaderships.take(), is(job));

        // revoke the leadership
        settableLeaderRetriever.notifyListener(null, null);
        leaderListener.waitUntilJobManagerLostLeadership();

        // announce the very same leader (address and session id) once more
        settableLeaderRetriever.notifyListener(jobMaster.getAddress(), sessionId);

        // check that we obtain the leadership a second time
        assertThat(gainedLeaderships.take(), is(job));
    } finally {
        service.stop();
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class DefaultJobLeaderServiceTest method rejectedJobManagerRegistrationCallsJobLeaderListener.
/**
 * Checks that the JobLeaderListener is called with the job id when the JobMaster gateway answers
 * the registration with a {@code JMTMRegistrationRejection}.
 */
@Test
public void rejectedJobManagerRegistrationCallsJobLeaderListener() throws Exception {
    final SettableLeaderRetrievalService settableLeaderRetriever =
            new SettableLeaderRetrievalService();
    final TestingHighAvailabilityServices highAvailabilityServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setJobMasterLeaderRetrieverFunction(ignored -> settableLeaderRetriever)
                    .build();

    final JobID job = new JobID();

    // completed with the job id passed to the listener's second callback
    final CompletableFuture<JobID> rejectedJobFuture = new CompletableFuture<>();
    final TestingJobLeaderListener leaderListener =
            new TestingJobLeaderListener(
                    ignored -> {},
                    rejectedJob -> rejectedJobFuture.complete(rejectedJob));

    final JobLeaderService service =
            createAndStartJobLeaderService(highAvailabilityServices, leaderListener);

    // JobMaster gateway that rejects every TaskManager registration
    final TestingJobMasterGateway rejectingJobMaster =
            new TestingJobMasterGatewayBuilder()
                    .setRegisterTaskManagerFunction(
                            (jobID, taskManagerRegistrationInformation) ->
                                    CompletableFuture.completedFuture(
                                            new JMTMRegistrationRejection("foobar")))
                    .build();
    rpcServiceResource
            .getTestingRpcService()
            .registerGateway(rejectingJobMaster.getAddress(), rejectingJobMaster);

    try {
        service.addJob(job, "foobar");

        settableLeaderRetriever.notifyListener(
                rejectingJobMaster.getAddress(),
                rejectingJobMaster.getFencingToken().toUUID());

        assertThat(rejectedJobFuture.get(), is(job));
    } finally {
        service.stop();
    }
}
Aggregations