use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class TaskExecutorRecoveryTest method testRecoveredTaskExecutorWillRestoreAllocationState.
/**
 * Tests that a TaskExecutor which is restarted with the same resource id and the same working
 * directory restores its slot allocation state: the slot that was allocated before the restart
 * is reported as allocated (same job id and allocation id) in the slot report of the recovered
 * instance, and it is offered to the JobMaster again.
 *
 * <p>Both TaskExecutor instances are managed with try-with-resources so that they are closed
 * even if an assertion or a blocking call fails.
 *
 * @param tempDir temporary directory backing the working directory shared by both instances
 */
@Test
public void testRecoveredTaskExecutorWillRestoreAllocationState(@TempDir File tempDir) throws Exception {
    final ResourceID resourceId = ResourceID.generate();
    final Configuration configuration = new Configuration();
    configuration.set(TaskManagerOptions.NUM_TASK_SLOTS, 2);
    configuration.set(CheckpointingOptions.LOCAL_RECOVERY, true);

    final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
    // captures the slot reports that the TaskExecutor instances send to the ResourceManager
    final ArrayBlockingQueue<TaskExecutorSlotReport> queue = new ArrayBlockingQueue<>(2);
    testingResourceManagerGateway.setSendSlotReportFunction(slotReportInformation -> {
        queue.offer(TaskExecutorSlotReport.create(slotReportInformation.f0, slotReportInformation.f2));
        return CompletableFuture.completedFuture(Acknowledge.get());
    });

    final TestingRpcService rpcService = rpcServiceExtension.getTestingRpcService();
    rpcService.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);

    final JobID jobId = new JobID();
    final TestingHighAvailabilityServices highAvailabilityServices = new TestingHighAvailabilityServices();
    highAvailabilityServices.setResourceManagerLeaderRetriever(
            new SettableLeaderRetrievalService(
                    testingResourceManagerGateway.getAddress(),
                    testingResourceManagerGateway.getFencingToken().toUUID()));
    final SettableLeaderRetrievalService jobMasterLeaderRetriever = new SettableLeaderRetrievalService();
    highAvailabilityServices.setJobMasterLeaderRetriever(jobId, jobMasterLeaderRetriever);

    final WorkingDirectory workingDirectory = WorkingDirectory.create(tempDir);

    final SlotID allocatedSlotID;
    final AllocationID allocationId = new AllocationID();

    // first incarnation: allocate one of the two slots, then shut the TaskExecutor down
    try (TaskExecutor taskExecutor =
            TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory)
                    .setConfiguration(configuration)
                    .setResourceId(resourceId)
                    .build()) {
        taskExecutor.start();
        final TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        final TaskExecutorSlotReport taskExecutorSlotReport = queue.take();
        final SlotReport slotReport = taskExecutorSlotReport.getSlotReport();
        assertThat(slotReport.getNumSlotStatus(), is(2));

        final SlotStatus slotStatus = slotReport.iterator().next();
        allocatedSlotID = slotStatus.getSlotID();
        taskExecutorGateway
                .requestSlot(
                        allocatedSlotID,
                        jobId,
                        allocationId,
                        slotStatus.getResourceProfile(),
                        "localhost",
                        testingResourceManagerGateway.getFencingToken(),
                        Time.seconds(10L))
                .join();
    }

    // the JobMaster only becomes reachable after the first TaskExecutor has been closed
    final BlockingQueue<Collection<SlotOffer>> offeredSlots = new ArrayBlockingQueue<>(1);
    final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder()
            .setOfferSlotsFunction((resourceID, slotOffers) -> {
                offeredSlots.offer(new HashSet<>(slotOffers));
                return CompletableFuture.completedFuture(slotOffers);
            })
            .build();
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
    jobMasterLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());

    // recover the TaskExecutor
    try (TaskExecutor recoveredTaskExecutor =
            TaskExecutorBuilder.newBuilder(rpcService, highAvailabilityServices, workingDirectory)
                    .setConfiguration(configuration)
                    .setResourceId(resourceId)
                    .build()) {
        recoveredTaskExecutor.start();

        final TaskExecutorSlotReport recoveredSlotReport = queue.take();
        for (SlotStatus status : recoveredSlotReport.getSlotReport()) {
            if (status.getSlotID().equals(allocatedSlotID)) {
                assertThat(status.getJobID(), is(jobId));
                assertThat(status.getAllocationID(), is(allocationId));
            } else {
                assertThat(status.getJobID(), is(nullValue()));
            }
        }

        final Collection<SlotOffer> take = offeredSlots.take();
        assertThat(take, hasSize(1));
        final SlotOffer offeredSlot = take.iterator().next();
        assertThat(offeredSlot.getAllocationId(), is(allocationId));
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class TaskExecutorSlotLifetimeTest method testUserCodeClassLoaderIsBoundToSlot.
/**
 * Verifies that the user code class loader lives as long as the slot it belongs to: two tasks
 * submitted one after the other for the same allocation (as happens, for example, across a
 * failover) must observe the very same class loader instance. See FLINK-16408.
 */
@Test
public void testUserCodeClassLoaderIsBoundToSlot() throws Exception {
    final Configuration configuration = new Configuration();
    final TestingRpcService rpcService = TESTING_RPC_SERVICE_RESOURCE.getTestingRpcService();

    // ResourceManager stub: hands the slot report it receives over to the test thread
    final TestingResourceManagerGateway resourceManagerGateway = new TestingResourceManagerGateway();
    final CompletableFuture<SlotReport> firstSlotReportFuture = new CompletableFuture<>();
    resourceManagerGateway.setSendSlotReportFunction(report -> {
        firstSlotReportFuture.complete(report.f2);
        return CompletableFuture.completedFuture(Acknowledge.get());
    });

    // JobMaster stub: accepts every offered slot and records all task state updates
    final BlockingQueue<TaskExecutionState> taskExecutionStates = new ArrayBlockingQueue<>(3);
    final OneShotLatch slotsOfferedLatch = new OneShotLatch();
    final TestingJobMasterGateway jobMasterGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction((resourceID, slotOffers) -> {
                        slotsOfferedLatch.trigger();
                        return CompletableFuture.completedFuture(slotOffers);
                    })
                    .setUpdateTaskExecutionStateFunction(
                            FunctionUtils.uncheckedFunction(stateUpdate -> {
                                taskExecutionStates.put(stateUpdate);
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            }))
                    .build();

    final LeaderRetrievalService resourceManagerLeaderRetriever =
            new SettableLeaderRetrievalService(
                    resourceManagerGateway.getAddress(),
                    resourceManagerGateway.getFencingToken().toUUID());
    final LeaderRetrievalService jobMasterLeaderRetriever =
            new SettableLeaderRetrievalService(
                    jobMasterGateway.getAddress(),
                    jobMasterGateway.getFencingToken().toUUID());
    final TestingHighAvailabilityServices haServices =
            new TestingHighAvailabilityServicesBuilder()
                    .setResourceManagerLeaderRetriever(resourceManagerLeaderRetriever)
                    .setJobMasterLeaderRetrieverFunction(ignored -> jobMasterLeaderRetriever)
                    .build();

    rpcService.registerGateway(resourceManagerGateway.getAddress(), resourceManagerGateway);
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);

    final LocalUnresolvedTaskManagerLocation unresolvedTaskManagerLocation =
            new LocalUnresolvedTaskManagerLocation();

    try (final TaskExecutor taskExecutor =
            createTaskExecutor(configuration, rpcService, haServices, unresolvedTaskManagerLocation)) {
        taskExecutor.start();

        final SlotID firstSlotId = firstSlotReportFuture.join().iterator().next().getSlotID();
        final TaskExecutorGateway gateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        final JobID jobId = new JobID();
        final AllocationID allocationId = new AllocationID();
        gateway.requestSlot(
                        firstSlotId,
                        jobId,
                        allocationId,
                        ResourceProfile.ZERO,
                        jobMasterGateway.getAddress(),
                        resourceManagerGateway.getFencingToken(),
                        RpcUtils.INF_TIMEOUT)
                .join();

        final TaskDeploymentDescriptor tdd =
                TaskDeploymentDescriptorBuilder.newBuilder(jobId, UserClassLoaderExtractingInvokable.class)
                        .setAllocationId(allocationId)
                        .build();

        // the task is only submitted once the slot has been offered to the JobMaster
        slotsOfferedLatch.await();

        gateway.submitTask(tdd, jobMasterGateway.getFencingToken(), RpcUtils.INF_TIMEOUT).join();
        final ClassLoader firstClassLoader = UserClassLoaderExtractingInvokable.take();

        // drain state updates until the first task has reached a terminal state
        TaskExecutionState latestState = taskExecutionStates.take();
        while (!latestState.getExecutionState().isTerminal()) {
            latestState = taskExecutionStates.take();
        }

        // a second task in the same slot has to re-use the same class loader
        gateway.submitTask(tdd, jobMasterGateway.getFencingToken(), RpcUtils.INF_TIMEOUT).join();
        final ClassLoader secondClassLoader = UserClassLoaderExtractingInvokable.take();

        assertThat(firstClassLoader, sameInstance(secondClassLoader));
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class TaskExecutorTest method testFreeingInactiveSlotDoesNotFail.
/**
 * Verifies that a slot which is still inactive (its offer to the JobMaster has not been
 * answered yet) can be freed without an exception being thrown.
 */
@Test
public void testFreeingInactiveSlotDoesNotFail() throws Exception {
    final OneShotLatch taskExecutorIsRegistered = new OneShotLatch();
    final CompletableFuture<Tuple3<InstanceID, SlotID, AllocationID>> availableSlotFuture =
            new CompletableFuture<>();
    final TestingResourceManagerGateway rmGateway =
            createRmWithTmRegisterAndNotifySlotHooks(
                    new InstanceID(), taskExecutorIsRegistered, availableSlotFuture);
    rpc.registerGateway(rmGateway.getAddress(), rmGateway);

    // the JobMaster never answers the slot offer, so the slot is never activated
    final MultiShotLatch offerSlotsLatch = new MultiShotLatch();
    final TestingJobMasterGateway jmGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction((resourceID, slotOffers) -> {
                        offerSlotsLatch.trigger();
                        return new CompletableFuture<>();
                    })
                    .build();
    rpc.registerGateway(jmGateway.getAddress(), jmGateway);

    final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskExecutorLocalStateStoresManager localStateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(taskSlotTable)
                    .setTaskStateManager(localStateStoresManager)
                    .build();
    final TestingTaskExecutor taskExecutor = createTestingTaskExecutor(taskManagerServices);
    final ThreadSafeTaskSlotTable<Task> threadSafeTaskSlotTable =
            new ThreadSafeTaskSlotTable<>(taskSlotTable, taskExecutor.getMainThreadExecutableForTesting());

    try {
        taskExecutor.start();
        taskExecutor.waitUntilStarted();

        final TaskExecutorGateway tmGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        taskExecutorIsRegistered.await();

        jobManagerLeaderRetriever.notifyListener(
                jmGateway.getAddress(), jmGateway.getFencingToken().toUUID());

        final AllocationID allocationId = new AllocationID();
        requestSlot(
                tmGateway,
                jobId,
                allocationId,
                buildSlotID(0),
                ResourceProfile.UNKNOWN,
                jmGateway.getAddress(),
                rmGateway.getFencingToken());

        // wait until the slot has been offered before freeing it
        offerSlotsLatch.await();

        tmGateway.freeSlot(allocationId, new RuntimeException("test exception"), timeout).get();

        assertThat(availableSlotFuture.get().f2, is(allocationId));
        assertThat(threadSafeTaskSlotTable.getAllocationIdsPerJob(jobId), empty());
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class TaskExecutorTest method testDisconnectFromJobMasterWhenNewLeader.
/**
 * Verifies that the TaskExecutor disconnects from the JobMaster once the job leader retriever
 * announces a change of leadership.
 */
@Test
public void testDisconnectFromJobMasterWhenNewLeader() throws Exception {
    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder()
                    .setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1))
                    .build();
    final TaskExecutor taskExecutor = createTaskExecutor(taskManagerServices);

    // JobMaster stub: records the number of offered slots and who disconnected
    final CompletableFuture<Integer> offeredSlotsFuture = new CompletableFuture<>();
    final CompletableFuture<ResourceID> disconnectFuture = new CompletableFuture<>();
    final TestingJobMasterGateway jmGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction((resourceID, slotOffers) -> {
                        offeredSlotsFuture.complete(slotOffers.size());
                        return CompletableFuture.completedFuture(slotOffers);
                    })
                    .setDisconnectTaskManagerFunction(disconnectedId -> {
                        disconnectFuture.complete(disconnectedId);
                        return CompletableFuture.completedFuture(Acknowledge.get());
                    })
                    .build();

    // ResourceManager stub: signals the arrival of the initial slot report
    final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
    final CompletableFuture<Void> initialSlotReportFuture = new CompletableFuture<>();
    rmGateway.setSendSlotReportFunction(report -> {
        initialSlotReportFuture.complete(null);
        return CompletableFuture.completedFuture(Acknowledge.get());
    });

    rpc.registerGateway(rmGateway.getAddress(), rmGateway);
    rpc.registerGateway(jmGateway.getAddress(), jmGateway);

    try {
        taskExecutor.start();
        TaskExecutorGateway tmGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);

        resourceManagerLeaderRetriever.notifyListener(
                rmGateway.getAddress(), rmGateway.getFencingToken().toUUID());
        initialSlotReportFuture.get();

        ResourceID tmResourceId =
                taskManagerServices.getUnresolvedTaskManagerLocation().getResourceID();
        requestSlot(
                tmGateway,
                jobId,
                new AllocationID(),
                new SlotID(tmResourceId, 0),
                ResourceProfile.ZERO,
                "foobar",
                rmGateway.getFencingToken());

        jobManagerLeaderRetriever.notifyListener(jmGateway.getAddress(), UUID.randomUUID());
        assertThat(offeredSlotsFuture.get(), is(1));

        // notify loss of leadership
        jobManagerLeaderRetriever.notifyListener(null, null);

        assertThat(disconnectFuture.get(), is(tmResourceId));
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
}
use of org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway in project flink by apache.
the class TaskExecutorTest method createTaskExecutorTestingContext.
/**
 * Builds a {@link TaskExecutorTestingContext} around the given slot table.
 *
 * <p>A testing JobMaster gateway that accepts every slot offer (and triggers a latch when it
 * is called) is registered with the RPC service and announced as job leader; the TaskExecutor
 * is created with a {@link DefaultJobLeaderService}, a local state stores manager, a changelog
 * storages manager and its own metric group.
 *
 * @param taskSlotTable slot table the created TaskExecutor works on
 * @return context bundling the gateway, slot table, TaskExecutor, changelog storages manager,
 *     metric group and the offer-slots latch
 * @throws IOException declared by the signature; presumably raised while setting up the local
 *     state stores manager (not visible from here)
 */
private TaskExecutorTestingContext createTaskExecutorTestingContext(final TaskSlotTable<Task> taskSlotTable) throws IOException {
    final OneShotLatch offerSlotsLatch = new OneShotLatch();
    final TestingJobMasterGateway jmGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction((resourceID, slotOffers) -> {
                        offerSlotsLatch.trigger();
                        return CompletableFuture.completedFuture(slotOffers);
                    })
                    .build();
    rpc.registerGateway(jmGateway.getAddress(), jmGateway);

    final JobLeaderService jobLeaderService =
            new DefaultJobLeaderService(
                    unresolvedTaskManagerLocation,
                    RetryingRegistrationConfiguration.defaultConfiguration());
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskExecutorStateChangelogStoragesManager changelogStoragesManager =
            new TaskExecutorStateChangelogStoragesManager();
    final TaskManagerMetricGroup metricGroup =
            TaskManagerMetricGroup.createTaskManagerMetricGroup(
                    NoOpMetricRegistry.INSTANCE, "", ResourceID.generate());

    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setTaskSlotTable(taskSlotTable)
                    .setJobLeaderService(jobLeaderService)
                    .setTaskStateManager(stateStoresManager)
                    .setTaskChangelogStoragesManager(changelogStoragesManager)
                    .build();
    final TestingTaskExecutor taskExecutor =
            createTestingTaskExecutor(services, HEARTBEAT_SERVICES, metricGroup);

    // announce the testing JobMaster as leader only after the TaskExecutor has been created
    jobManagerLeaderRetriever.notifyListener(
            jmGateway.getAddress(), jmGateway.getFencingToken().toUUID());

    return new TaskExecutorTestingContext(
            jmGateway, taskSlotTable, taskExecutor, changelogStoragesManager, metricGroup, offerSlotsLatch);
}
Aggregations