use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorPartitionLifecycleTest method internalTestPartitionRelease.
/**
 * Runs a task that produces a single blocking result partition on a freshly started testing
 * task executor, checks that the partition is tracked while the task is still running, and
 * invokes the given test action once the task has reported {@code FINISHED}.
 *
 * <p>After the task executor has been terminated, the shuffle environment is asserted to no
 * longer report any partitions occupying local resources.
 *
 * @param partitionTracker partition tracker handed to the task executor under test
 * @param shuffleEnvironment shuffle environment installed into the task manager services
 * @param startTrackingFuture future whose value is asserted to equal the ID of the task's
 *     result partition while the task is still blocked
 * @param testAction action invoked with the job ID, the partition descriptor, the task executor
 *     and its gateway after the task has finished
 * @throws Exception if any step of the scenario fails or the calling thread is interrupted
 */
private void internalTestPartitionRelease(
        TaskExecutorPartitionTracker partitionTracker,
        ShuffleEnvironment<?, ?> shuffleEnvironment,
        CompletableFuture<ResultPartitionID> startTrackingFuture,
        TestAction testAction)
        throws Exception {
    final ResultPartitionDeploymentDescriptor taskResultPartitionDescriptor =
            PartitionTestUtils.createPartitionDeploymentDescriptor(ResultPartitionType.BLOCKING);
    final ExecutionAttemptID eid1 =
            taskResultPartitionDescriptor
                    .getShuffleDescriptor()
                    .getResultPartitionID()
                    .getProducerId();
    final TaskDeploymentDescriptor taskDeploymentDescriptor =
            TaskExecutorSubmissionTest.createTaskDeploymentDescriptor(
                    jobId,
                    "job",
                    eid1,
                    new SerializedValue<>(new ExecutionConfig()),
                    "Sender",
                    1,
                    0,
                    1,
                    0,
                    new Configuration(),
                    new Configuration(),
                    TestingInvokable.class.getName(),
                    Collections.singletonList(taskResultPartitionDescriptor),
                    Collections.emptyList(),
                    Collections.emptyList(),
                    Collections.emptyList());
    final TaskSlotTable<Task> taskSlotTable = createTaskSlotTable();
    final TaskExecutorLocalStateStoresManager localStateStoresManager =
            new TaskExecutorLocalStateStoresManager(
                    false,
                    Reference.owned(new File[] {tmp.newFolder()}),
                    Executors.directExecutor());
    final TaskManagerServices taskManagerServices =
            new TaskManagerServicesBuilder()
                    .setTaskSlotTable(taskSlotTable)
                    .setTaskStateManager(localStateStoresManager)
                    .setShuffleEnvironment(shuffleEnvironment)
                    .build();
    final CompletableFuture<Void> taskFinishedFuture = new CompletableFuture<>();
    final OneShotLatch slotOfferedLatch = new OneShotLatch();
    final TestingJobMasterGateway jobMasterGateway =
            new TestingJobMasterGatewayBuilder()
                    .setRegisterTaskManagerFunction(
                            (ignoredJobId, ignoredTaskManagerRegistrationInformation) ->
                                    CompletableFuture.completedFuture(
                                            new JMTMRegistrationSuccess(ResourceID.generate())))
                    .setOfferSlotsFunction(
                            (resourceID, slotOffers) -> {
                                slotOfferedLatch.trigger();
                                return CompletableFuture.completedFuture(slotOffers);
                            })
                    .setUpdateTaskExecutionStateFunction(
                            taskExecutionState -> {
                                if (taskExecutionState.getExecutionState()
                                        == ExecutionState.FINISHED) {
                                    taskFinishedFuture.complete(null);
                                }
                                return CompletableFuture.completedFuture(Acknowledge.get());
                            })
                    .build();
    final TestingTaskExecutor taskExecutor =
            createTestingTaskExecutor(taskManagerServices, partitionTracker);
    final CompletableFuture<SlotReport> initialSlotReportFuture = new CompletableFuture<>();
    final TestingResourceManagerGateway testingResourceManagerGateway =
            new TestingResourceManagerGateway();
    testingResourceManagerGateway.setSendSlotReportFunction(
            resourceIDInstanceIDSlotReportTuple3 -> {
                initialSlotReportFuture.complete(resourceIDInstanceIDSlotReportTuple3.f2);
                return CompletableFuture.completedFuture(Acknowledge.get());
            });
    testingResourceManagerGateway.setRegisterTaskExecutorFunction(
            input ->
                    CompletableFuture.completedFuture(
                            new TaskExecutorRegistrationSuccess(
                                    new InstanceID(),
                                    testingResourceManagerGateway.getOwnResourceId(),
                                    new ClusterInformation("blobServerHost", 55555))));
    try {
        taskExecutor.start();
        taskExecutor.waitUntilStarted();
        final TaskExecutorGateway taskExecutorGateway =
                taskExecutor.getSelfGateway(TaskExecutorGateway.class);
        final String jobMasterAddress = "jm";
        rpc.registerGateway(jobMasterAddress, jobMasterGateway);
        rpc.registerGateway(
                testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
        // inform the task manager about the job leader
        taskManagerServices.getJobLeaderService().addJob(jobId, jobMasterAddress);
        jobManagerLeaderRetriever.notifyListener(jobMasterAddress, UUID.randomUUID());
        resourceManagerLeaderRetriever.notifyListener(
                testingResourceManagerGateway.getAddress(),
                testingResourceManagerGateway.getFencingToken().toUUID());
        final Optional<SlotStatus> slotStatusOptional =
                StreamSupport.stream(initialSlotReportFuture.get().spliterator(), false)
                        .findAny();
        assertTrue(slotStatusOptional.isPresent());
        final SlotStatus slotStatus = slotStatusOptional.get();
        while (true) {
            try {
                taskExecutorGateway
                        .requestSlot(
                                slotStatus.getSlotID(),
                                jobId,
                                taskDeploymentDescriptor.getAllocationId(),
                                ResourceProfile.ZERO,
                                jobMasterAddress,
                                testingResourceManagerGateway.getFencingToken(),
                                timeout)
                        .get();
                break;
            } catch (InterruptedException e) {
                // An interrupt is not a failed slot request: propagate it instead of
                // retrying, otherwise this polling loop could never be aborted.
                throw e;
            } catch (Exception e) {
                // the proper establishment of the RM connection is tracked
                // asynchronously, so we have to poll here until it went through
                // until then, slot requests will fail with an exception
                Thread.sleep(50);
            }
        }
        TestingInvokable.sync = new BlockerSync();
        // Wait till the slot has been successfully offered before submitting the task.
        // This ensures TM has been successfully registered to JM.
        slotOfferedLatch.await();
        taskExecutorGateway
                .submitTask(taskDeploymentDescriptor, jobMasterGateway.getFencingToken(), timeout)
                .get();
        TestingInvokable.sync.awaitBlocker();
        // the task is still running => the partition is in progress and should be tracked
        assertThat(
                startTrackingFuture.get(),
                equalTo(
                        taskResultPartitionDescriptor
                                .getShuffleDescriptor()
                                .getResultPartitionID()));
        TestingInvokable.sync.releaseBlocker();
        taskFinishedFuture.get(timeout.getSize(), timeout.getUnit());
        testAction.accept(jobId, taskResultPartitionDescriptor, taskExecutor, taskExecutorGateway);
    } finally {
        RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
    }
    // the shutdown of the backing shuffle environment releases all partitions
    // the book-keeping is not aware of this
    assertTrue(shuffleEnvironment.getPartitionsOccupyingLocalResources().isEmpty());
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method testFreeingInactiveSlotDoesNotFail.
/**
 * Verifies that a slot can be freed while it is still inactive, without the free request
 * failing.
 *
 * <p>The job master's offer-slots callback hands back a future that is never completed, so the
 * slot offer is still unanswered at the moment the slot is freed.
 */
@Test
public void testFreeingInactiveSlotDoesNotFail() throws Exception {
    final OneShotLatch tmRegisteredLatch = new OneShotLatch();
    final CompletableFuture<Tuple3<InstanceID, SlotID, AllocationID>> slotAvailableFuture =
            new CompletableFuture<>();
    final TestingResourceManagerGateway rmGateway =
            createRmWithTmRegisterAndNotifySlotHooks(
                    new InstanceID(), tmRegisteredLatch, slotAvailableFuture);
    rpc.registerGateway(rmGateway.getAddress(), rmGateway);

    final MultiShotLatch slotsOfferedLatch = new MultiShotLatch();
    final TestingJobMasterGateway jmGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction(
                            (ignoredResourceId, ignoredOffers) -> {
                                slotsOfferedLatch.trigger();
                                // deliberately left incomplete: the offer stays unanswered
                                return new CompletableFuture<>();
                            })
                    .build();
    rpc.registerGateway(jmGateway.getAddress(), jmGateway);

    final TaskSlotTable<Task> slotTable = TaskSlotUtils.createTaskSlotTable(1);
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();
    final TestingTaskExecutor executor = createTestingTaskExecutor(services);
    final ThreadSafeTaskSlotTable<Task> guardedSlotTable =
            new ThreadSafeTaskSlotTable<>(slotTable, executor.getMainThreadExecutableForTesting());

    try {
        executor.start();
        executor.waitUntilStarted();
        final TaskExecutorGateway gateway = executor.getSelfGateway(TaskExecutorGateway.class);

        tmRegisteredLatch.await();
        jobManagerLeaderRetriever.notifyListener(
                jmGateway.getAddress(), jmGateway.getFencingToken().toUUID());

        // allocate the slot and wait until it has been offered to the job master
        final AllocationID allocation = new AllocationID();
        requestSlot(
                gateway,
                jobId,
                allocation,
                buildSlotID(0),
                ResourceProfile.UNKNOWN,
                jmGateway.getAddress(),
                rmGateway.getFencingToken());
        slotsOfferedLatch.await();

        gateway.freeSlot(allocation, new RuntimeException("test exception"), timeout).get();

        // the resource manager is told about the freed slot, and the table no longer holds it
        final AllocationID reportedAllocation = slotAvailableFuture.get().f2;
        assertThat(reportedAllocation, is(allocation));
        assertThat(guardedSlotTable.getAllocationIdsPerJob(jobId), empty());
    } finally {
        RpcUtils.terminateRpcEndpoint(executor, timeout);
    }
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method testHeartbeatReporting.
/**
 * Checks that the heartbeat response sent to the resource manager carries the expected slot
 * report and cluster partition report.
 *
 * <p>The slot table hands out two prepared slot reports in order: the first is expected as the
 * initial report after registration, the second as part of the heartbeat payload.
 */
@Test
public void testHeartbeatReporting() throws Exception {
    final String resourceManagerAddress = "rm";
    final UUID resourceManagerLeaderId = UUID.randomUUID();

    // set up the resource manager gateway and capture what the task executor sends to it
    final TestingResourceManagerGateway resourceManager = new TestingResourceManagerGateway();
    final CompletableFuture<ResourceID> registeredResourceId = new CompletableFuture<>();
    final ResourceID resourceManagerResourceId = resourceManager.getOwnResourceId();
    final CompletableFuture<RegistrationResponse> registrationSuccess =
            CompletableFuture.completedFuture(
                    new TaskExecutorRegistrationSuccess(
                            new InstanceID(),
                            resourceManagerResourceId,
                            new ClusterInformation("localhost", 1234)));
    resourceManager.setRegisterTaskExecutorFunction(
            registration -> {
                registeredResourceId.complete(registration.getResourceId());
                return registrationSuccess;
            });

    final CompletableFuture<SlotReport> firstSlotReport = new CompletableFuture<>();
    resourceManager.setSendSlotReportFunction(
            reportTuple -> {
                firstSlotReport.complete(reportTuple.f2);
                return CompletableFuture.completedFuture(Acknowledge.get());
            });

    final CompletableFuture<TaskExecutorHeartbeatPayload> heartbeatPayloadFuture =
            new CompletableFuture<>();
    resourceManager.setTaskExecutorHeartbeatFunction(
            (ignoredResourceId, payload) -> {
                heartbeatPayloadFuture.complete(payload);
                return FutureUtils.completedVoidFuture();
            });
    rpc.registerGateway(resourceManagerAddress, resourceManager);

    // two distinct reports for the same slot, served one after the other
    final SlotID slot = buildSlotID(0);
    final ResourceProfile profile = ResourceProfile.fromResources(1.0, 1);
    final SlotReport initialReport = new SlotReport(new SlotStatus(slot, profile));
    final SlotReport heartbeatReport =
            new SlotReport(new SlotStatus(slot, profile, new JobID(), new AllocationID()));
    final Queue<SlotReport> pendingReports =
            new ArrayDeque<>(Arrays.asList(initialReport, heartbeatReport));

    final TaskSlotTable<Task> slotTable =
            TestingTaskSlotTable.<Task>newBuilder()
                    .createSlotReportSupplier(pendingReports::poll)
                    .closeAsyncReturns(CompletableFuture.completedFuture(null))
                    .build();
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();
    final TaskExecutorPartitionTracker tracker =
            createPartitionTrackerWithFixedPartitionReport(services.getShuffleEnvironment());
    final TaskExecutor executor = createTaskExecutor(services, HEARTBEAT_SERVICES, tracker);

    try {
        executor.start();

        // announcing a leader makes the task executor register at the resource manager
        resourceManagerLeaderRetriever.notifyListener(
                resourceManagerAddress, resourceManagerLeaderId);

        // a successful registration starts heartbeat monitoring between task executor and
        // resource manager
        assertThat(
                registeredResourceId.get(),
                equalTo(unresolvedTaskManagerLocation.getResourceID()));
        assertThat(firstSlotReport.get(), equalTo(initialReport));

        final TaskExecutorGateway gateway = executor.getSelfGateway(TaskExecutorGateway.class);

        // request a heartbeat asynchronously and wait for the payload of the response
        gateway.heartbeatFromResourceManager(resourceManagerResourceId);
        final TaskExecutorHeartbeatPayload receivedPayload = heartbeatPayloadFuture.get();

        // the heartbeat must carry the second slot report ...
        assertEquals(heartbeatReport, receivedPayload.getSlotReport());

        // ... and the partition report produced by the tracker
        final ClusterPartitionReport receivedPartitionReport =
                receivedPayload.getClusterPartitionReport();
        assertEquals(tracker.createClusterPartitionReport(), receivedPartitionReport);
    } finally {
        RpcUtils.terminateRpcEndpoint(executor, timeout);
    }
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method testTriggerRegistrationOnLeaderChange.
/**
 * Checks that the task executor registers at a resource manager once a leader is announced,
 * and registers again at the new resource manager after the leader has changed.
 */
@Test
public void testTriggerRegistrationOnLeaderChange() throws Exception {
    final UUID firstLeaderId = UUID.randomUUID();
    final UUID secondLeaderId = UUID.randomUUID();

    // two resource manager gateways, each capturing the registration it receives
    final CompletableFuture<TaskExecutorRegistration> firstRegistrationFuture =
            new CompletableFuture<>();
    TestingResourceManagerGateway firstResourceManager = new TestingResourceManagerGateway();
    firstResourceManager.setRegisterTaskExecutorFunction(
            registration -> {
                firstRegistrationFuture.complete(registration);
                return createRegistrationResponse(firstResourceManager);
            });

    final CompletableFuture<TaskExecutorRegistration> secondRegistrationFuture =
            new CompletableFuture<>();
    TestingResourceManagerGateway secondResourceManager = new TestingResourceManagerGateway();
    secondResourceManager.setRegisterTaskExecutorFunction(
            registration -> {
                secondRegistrationFuture.complete(registration);
                return createRegistrationResponse(secondResourceManager);
            });

    rpc.registerGateway(firstResourceManager.getAddress(), firstResourceManager);
    rpc.registerGateway(secondResourceManager.getAddress(), secondResourceManager);

    final TaskSlotTable<Task> slotTable =
            TestingTaskSlotTable.<Task>newBuilder()
                    .createSlotReportSupplier(SlotReport::new)
                    .closeAsyncReturns(CompletableFuture.completedFuture(null))
                    .build();
    final TaskExecutorLocalStateStoresManager stateStoresManager =
            createTaskExecutorLocalStateStoresManager();
    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation)
                    .setTaskSlotTable(slotTable)
                    .setTaskStateManager(stateStoresManager)
                    .build();
    TaskExecutor executor = createTaskExecutor(services);

    try {
        executor.start();
        String executorAddress = executor.getAddress();

        // without a leader there is no resource manager connection yet
        assertNull(executor.getResourceManagerConnection());

        // announce the first leader and expect a registration at it
        resourceManagerLeaderRetriever.notifyListener(
                firstResourceManager.getAddress(), firstLeaderId);
        final TaskExecutorRegistration firstRegistration = firstRegistrationFuture.join();
        assertThat(firstRegistration.getTaskExecutorAddress(), is(executorAddress));
        assertThat(
                firstRegistration.getResourceId(),
                is(unresolvedTaskManagerLocation.getResourceID()));
        assertNotNull(executor.getResourceManagerConnection());

        // revoke the leader
        resourceManagerLeaderRetriever.notifyListener(null, null);

        // announce the second leader and expect a registration at it as well
        resourceManagerLeaderRetriever.notifyListener(
                secondResourceManager.getAddress(), secondLeaderId);
        final TaskExecutorRegistration secondRegistration = secondRegistrationFuture.join();
        assertThat(secondRegistration.getTaskExecutorAddress(), is(executorAddress));
        assertThat(
                secondRegistration.getResourceId(),
                is(unresolvedTaskManagerLocation.getResourceID()));
        assertNotNull(executor.getResourceManagerConnection());
    } finally {
        RpcUtils.terminateRpcEndpoint(executor, timeout);
    }
}
use of org.apache.flink.runtime.state.TaskExecutorLocalStateStoresManager in project flink by apache.
the class TaskExecutorTest method createTaskExecutorTestingContext.
/**
 * Builds a {@code TaskExecutorTestingContext} around the given slot table.
 *
 * <p>A job master gateway that acknowledges every slot offer is registered with the RPC service
 * and announced as job manager leader; a latch is triggered whenever slots are offered to it.
 *
 * @param taskSlotTable slot table installed into the task executor and handed to the context
 * @return the context bundling the gateway, slot table, task executor, changelog storages
 *     manager, metric group and offer-slots latch
 * @throws IOException declared by this helper; within this block it can presumably only
 *     originate from creating the local state stores manager (TODO confirm)
 */
private TaskExecutorTestingContext createTaskExecutorTestingContext(final TaskSlotTable<Task> taskSlotTable) throws IOException {
    final OneShotLatch slotsOfferedLatch = new OneShotLatch();
    final TestingJobMasterGateway jmGateway =
            new TestingJobMasterGatewayBuilder()
                    .setOfferSlotsFunction(
                            (ignoredResourceId, offers) -> {
                                slotsOfferedLatch.trigger();
                                // accept every offered slot
                                return CompletableFuture.completedFuture(offers);
                            })
                    .build();
    rpc.registerGateway(jmGateway.getAddress(), jmGateway);

    final JobLeaderService leaderService =
            new DefaultJobLeaderService(
                    unresolvedTaskManagerLocation,
                    RetryingRegistrationConfiguration.defaultConfiguration());
    final TaskExecutorLocalStateStoresManager localStateStores =
            createTaskExecutorLocalStateStoresManager();
    final TaskExecutorStateChangelogStoragesManager changelogStorages =
            new TaskExecutorStateChangelogStoragesManager();
    final TaskManagerMetricGroup taskManagerMetrics =
            TaskManagerMetricGroup.createTaskManagerMetricGroup(
                    NoOpMetricRegistry.INSTANCE, "", ResourceID.generate());

    final TaskManagerServices services =
            new TaskManagerServicesBuilder()
                    .setTaskSlotTable(taskSlotTable)
                    .setJobLeaderService(leaderService)
                    .setTaskStateManager(localStateStores)
                    .setTaskChangelogStoragesManager(changelogStorages)
                    .build();
    final TestingTaskExecutor executor =
            createTestingTaskExecutor(services, HEARTBEAT_SERVICES, taskManagerMetrics);

    jobManagerLeaderRetriever.notifyListener(
            jmGateway.getAddress(), jmGateway.getFencingToken().toUUID());

    return new TaskExecutorTestingContext(
            jmGateway,
            taskSlotTable,
            executor,
            changelogStorages,
            taskManagerMetrics,
            slotsOfferedLatch);
}
Aggregations