use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class TaskExecutorTest method testFreeingInactiveSlotDoesNotFail.
/**
* Tests that freeing an inactive slot is a legal operation that does not throw an exception.
*/
@Test
public void testFreeingInactiveSlotDoesNotFail() throws Exception {
final OneShotLatch taskExecutorIsRegistered = new OneShotLatch();
final CompletableFuture<Tuple3<InstanceID, SlotID, AllocationID>> availableSlotFuture = new CompletableFuture<>();
final TestingResourceManagerGateway resourceManagerGateway = createRmWithTmRegisterAndNotifySlotHooks(new InstanceID(), taskExecutorIsRegistered, availableSlotFuture);
rpc.registerGateway(resourceManagerGateway.getAddress(), resourceManagerGateway);
final MultiShotLatch offerSlotsLatch = new MultiShotLatch();
final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setOfferSlotsFunction((resourceID, slotOffers) -> {
offerSlotsLatch.trigger();
return new CompletableFuture<>();
}).build();
rpc.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(1);
final TaskExecutorLocalStateStoresManager localStateStoresManager = createTaskExecutorLocalStateStoresManager();
final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).setTaskSlotTable(taskSlotTable).setTaskStateManager(localStateStoresManager).build();
final TestingTaskExecutor taskExecutor = createTestingTaskExecutor(taskManagerServices);
final ThreadSafeTaskSlotTable<Task> threadSafeTaskSlotTable = new ThreadSafeTaskSlotTable<>(taskSlotTable, taskExecutor.getMainThreadExecutableForTesting());
try {
taskExecutor.start();
taskExecutor.waitUntilStarted();
final TaskExecutorGateway tmGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
taskExecutorIsRegistered.await();
jobManagerLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());
final AllocationID allocationId = new AllocationID();
requestSlot(tmGateway, jobId, allocationId, buildSlotID(0), ResourceProfile.UNKNOWN, jobMasterGateway.getAddress(), resourceManagerGateway.getFencingToken());
offerSlotsLatch.await();
tmGateway.freeSlot(allocationId, new RuntimeException("test exception"), timeout).get();
assertThat(availableSlotFuture.get().f2, is(allocationId));
assertThat(threadSafeTaskSlotTable.getAllocationIdsPerJob(jobId), empty());
} finally {
RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
}
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class TaskExecutorTest method testHeartbeatReporting.
/**
* Tests that the correct partition/slot report is sent as part of the heartbeat response.
*/
@Test
public void testHeartbeatReporting() throws Exception {
final String rmAddress = "rm";
final UUID rmLeaderId = UUID.randomUUID();
// register the mock resource manager gateway
final TestingResourceManagerGateway rmGateway = new TestingResourceManagerGateway();
final CompletableFuture<ResourceID> taskExecutorRegistrationFuture = new CompletableFuture<>();
final ResourceID rmResourceId = rmGateway.getOwnResourceId();
final CompletableFuture<RegistrationResponse> registrationResponse = CompletableFuture.completedFuture(new TaskExecutorRegistrationSuccess(new InstanceID(), rmResourceId, new ClusterInformation("localhost", 1234)));
rmGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> {
taskExecutorRegistrationFuture.complete(taskExecutorRegistration.getResourceId());
return registrationResponse;
});
final CompletableFuture<SlotReport> initialSlotReportFuture = new CompletableFuture<>();
rmGateway.setSendSlotReportFunction(resourceIDInstanceIDSlotReportTuple3 -> {
initialSlotReportFuture.complete(resourceIDInstanceIDSlotReportTuple3.f2);
return CompletableFuture.completedFuture(Acknowledge.get());
});
final CompletableFuture<TaskExecutorHeartbeatPayload> heartbeatPayloadCompletableFuture = new CompletableFuture<>();
rmGateway.setTaskExecutorHeartbeatFunction((resourceID, heartbeatPayload) -> {
heartbeatPayloadCompletableFuture.complete(heartbeatPayload);
return FutureUtils.completedVoidFuture();
});
rpc.registerGateway(rmAddress, rmGateway);
final SlotID slotId = buildSlotID(0);
final ResourceProfile resourceProfile = ResourceProfile.fromResources(1.0, 1);
final SlotReport slotReport1 = new SlotReport(new SlotStatus(slotId, resourceProfile));
final SlotReport slotReport2 = new SlotReport(new SlotStatus(slotId, resourceProfile, new JobID(), new AllocationID()));
final Queue<SlotReport> reports = new ArrayDeque<>(Arrays.asList(slotReport1, slotReport2));
final TaskSlotTable<Task> taskSlotTable = TestingTaskSlotTable.<Task>newBuilder().createSlotReportSupplier(reports::poll).closeAsyncReturns(CompletableFuture.completedFuture(null)).build();
final TaskExecutorLocalStateStoresManager localStateStoresManager = createTaskExecutorLocalStateStoresManager();
final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setUnresolvedTaskManagerLocation(unresolvedTaskManagerLocation).setTaskSlotTable(taskSlotTable).setTaskStateManager(localStateStoresManager).build();
final TaskExecutorPartitionTracker partitionTracker = createPartitionTrackerWithFixedPartitionReport(taskManagerServices.getShuffleEnvironment());
final TaskExecutor taskManager = createTaskExecutor(taskManagerServices, HEARTBEAT_SERVICES, partitionTracker);
try {
taskManager.start();
// define a leader and see that a registration happens
resourceManagerLeaderRetriever.notifyListener(rmAddress, rmLeaderId);
// register resource manager success will trigger monitoring heartbeat target between tm
// and rm
assertThat(taskExecutorRegistrationFuture.get(), equalTo(unresolvedTaskManagerLocation.getResourceID()));
assertThat(initialSlotReportFuture.get(), equalTo(slotReport1));
TaskExecutorGateway taskExecutorGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);
// trigger the heartbeat asynchronously
taskExecutorGateway.heartbeatFromResourceManager(rmResourceId);
// wait for heartbeat response
SlotReport actualSlotReport = heartbeatPayloadCompletableFuture.get().getSlotReport();
// the new slot report should be reported
assertEquals(slotReport2, actualSlotReport);
ClusterPartitionReport actualClusterPartitionReport = heartbeatPayloadCompletableFuture.get().getClusterPartitionReport();
assertEquals(partitionTracker.createClusterPartitionReport(), actualClusterPartitionReport);
} finally {
RpcUtils.terminateRpcEndpoint(taskManager, timeout);
}
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class TaskExecutorTest method testIgnoringSlotRequestsIfNotRegistered.
/**
* Tests that we ignore slot requests if the TaskExecutor is not registered at a
* ResourceManager.
*/
@Test
public void testIgnoringSlotRequestsIfNotRegistered() throws Exception {
final TaskExecutor taskExecutor = createTaskExecutor(1);
taskExecutor.start();
try {
final TestingResourceManagerGateway testingResourceManagerGateway = new TestingResourceManagerGateway();
final CompletableFuture<RegistrationResponse> registrationFuture = new CompletableFuture<>();
final CompletableFuture<ResourceID> taskExecutorResourceIdFuture = new CompletableFuture<>();
testingResourceManagerGateway.setRegisterTaskExecutorFunction(taskExecutorRegistration -> {
taskExecutorResourceIdFuture.complete(taskExecutorRegistration.getResourceId());
return registrationFuture;
});
rpc.registerGateway(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway);
resourceManagerLeaderRetriever.notifyListener(testingResourceManagerGateway.getAddress(), testingResourceManagerGateway.getFencingToken().toUUID());
final TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
final ResourceID resourceId = taskExecutorResourceIdFuture.get();
final CompletableFuture<Acknowledge> slotRequestResponse = taskExecutorGateway.requestSlot(new SlotID(resourceId, 0), jobId, new AllocationID(), ResourceProfile.ZERO, "foobar", testingResourceManagerGateway.getFencingToken(), timeout);
try {
slotRequestResponse.get();
fail("We should not be able to request slots before the TaskExecutor is registered at the ResourceManager.");
} catch (ExecutionException ee) {
assertThat(ExceptionUtils.stripExecutionException(ee), instanceOf(TaskManagerException.class));
}
} finally {
RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
}
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class TaskExecutorTest method testDisconnectFromJobMasterWhenNewLeader.
/**
* Tests that the TaskExecutor disconnects from the JobMaster if a new leader is detected.
*/
@Test
public void testDisconnectFromJobMasterWhenNewLeader() throws Exception {
final TaskManagerServices taskManagerServices = new TaskManagerServicesBuilder().setTaskSlotTable(TaskSlotUtils.createTaskSlotTable(1)).build();
final TaskExecutor taskExecutor = createTaskExecutor(taskManagerServices);
final CompletableFuture<Integer> offeredSlotsFuture = new CompletableFuture<>();
final CompletableFuture<ResourceID> disconnectFuture = new CompletableFuture<>();
final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setOfferSlotsFunction((resourceID, slotOffers) -> {
offeredSlotsFuture.complete(slotOffers.size());
return CompletableFuture.completedFuture(slotOffers);
}).setDisconnectTaskManagerFunction(resourceID -> {
disconnectFuture.complete(resourceID);
return CompletableFuture.completedFuture(Acknowledge.get());
}).build();
final TestingResourceManagerGateway resourceManagerGateway = new TestingResourceManagerGateway();
final CompletableFuture<Void> initialSlotReportFuture = new CompletableFuture<>();
resourceManagerGateway.setSendSlotReportFunction(resourceIDInstanceIDSlotReportTuple3 -> {
initialSlotReportFuture.complete(null);
return CompletableFuture.completedFuture(Acknowledge.get());
});
rpc.registerGateway(resourceManagerGateway.getAddress(), resourceManagerGateway);
rpc.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
try {
taskExecutor.start();
TaskExecutorGateway taskExecutorGateway = taskExecutor.getSelfGateway(TaskExecutorGateway.class);
resourceManagerLeaderRetriever.notifyListener(resourceManagerGateway.getAddress(), resourceManagerGateway.getFencingToken().toUUID());
initialSlotReportFuture.get();
ResourceID resourceID = taskManagerServices.getUnresolvedTaskManagerLocation().getResourceID();
requestSlot(taskExecutorGateway, jobId, new AllocationID(), new SlotID(resourceID, 0), ResourceProfile.ZERO, "foobar", resourceManagerGateway.getFencingToken());
jobManagerLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), UUID.randomUUID());
assertThat(offeredSlotsFuture.get(), is(1));
// notify loss of leadership
jobManagerLeaderRetriever.notifyListener(null, null);
assertThat(disconnectFuture.get(), is(resourceID));
} finally {
RpcUtils.terminateRpcEndpoint(taskExecutor, timeout);
}
}
use of org.apache.flink.runtime.clusterframework.types.AllocationID in project flink by apache.
the class TaskExecutorTest method testSubmitTaskBeforeAcceptSlot.
/**
* This tests task executor receive SubmitTask before OfferSlot response.
*/
@Test
public void testSubmitTaskBeforeAcceptSlot() throws Exception {
final InstanceID registrationId = new InstanceID();
final OneShotLatch taskExecutorIsRegistered = new OneShotLatch();
final CompletableFuture<Tuple3<InstanceID, SlotID, AllocationID>> availableSlotFuture = new CompletableFuture<>();
final TestingResourceManagerGateway resourceManagerGateway = createRmWithTmRegisterAndNotifySlotHooks(registrationId, taskExecutorIsRegistered, availableSlotFuture);
final AllocationID allocationId1 = new AllocationID();
final AllocationID allocationId2 = new AllocationID();
final SlotOffer offer1 = new SlotOffer(allocationId1, 0, ResourceProfile.ANY);
final OneShotLatch offerSlotsLatch = new OneShotLatch();
final OneShotLatch taskInTerminalState = new OneShotLatch();
final CompletableFuture<Collection<SlotOffer>> offerResultFuture = new CompletableFuture<>();
final TestingJobMasterGateway jobMasterGateway = createJobMasterWithSlotOfferAndTaskTerminationHooks(offerSlotsLatch, taskInTerminalState, offerResultFuture);
rpc.registerGateway(resourceManagerGateway.getAddress(), resourceManagerGateway);
rpc.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
final TaskSlotTable<Task> taskSlotTable = TaskSlotUtils.createTaskSlotTable(2);
final TaskManagerServices taskManagerServices = createTaskManagerServicesWithTaskSlotTable(taskSlotTable);
final TestingTaskExecutor taskManager = createTestingTaskExecutor(taskManagerServices);
try {
taskManager.start();
taskManager.waitUntilStarted();
final TaskExecutorGateway tmGateway = taskManager.getSelfGateway(TaskExecutorGateway.class);
// wait until registered at the RM
taskExecutorIsRegistered.await();
// request 2 slots for the given allocation ids
AllocationID[] allocationIds = new AllocationID[] { allocationId1, allocationId2 };
for (int i = 0; i < allocationIds.length; i++) {
requestSlot(tmGateway, jobId, allocationIds[i], buildSlotID(i), ResourceProfile.UNKNOWN, jobMasterGateway.getAddress(), resourceManagerGateway.getFencingToken());
}
// notify job leader to start slot offering
jobManagerLeaderRetriever.notifyListener(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID());
// wait until slots have been offered
offerSlotsLatch.await();
submit(allocationId1, jobMasterGateway, tmGateway, NoOpInvokable.class);
// acknowledge the offered slots
offerResultFuture.complete(Collections.singleton(offer1));
// check that the rejected slot will be made available again
final Tuple3<InstanceID, SlotID, AllocationID> instanceIDSlotIDAllocationIDTuple3 = availableSlotFuture.get();
assertThat(instanceIDSlotIDAllocationIDTuple3.f2, equalTo(allocationId2));
// wait for the task completion
taskInTerminalState.await();
} finally {
RpcUtils.terminateRpcEndpoint(taskManager, timeout);
}
}
Aggregations