Use of org.apache.flink.runtime.taskexecutor.TaskExecutorGateway in the Apache Flink project.
From class ResourceManagerTest, method testTaskExecutorBecomesUnreachableTriggersDisconnect.
/**
 * Verifies that a task executor whose heartbeat requests fail with a {@link
 * RecipientUnreachableException} gets disconnected with a {@link ResourceManagerException} and
 * that the stop-worker function is invoked for that task executor's resource id.
 */
@Test
public void testTaskExecutorBecomesUnreachableTriggersDisconnect() throws Exception {
    final ResourceID unreachableExecutorId = ResourceID.generate();
    final CompletableFuture<Exception> disconnectCause = new CompletableFuture<>();
    final CompletableFuture<ResourceID> stoppedWorker = new CompletableFuture<>();

    // Every heartbeat request sent to this gateway completes exceptionally, as if the
    // recipient could not be reached.
    final TaskExecutorGateway unreachableGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setAddress(UUID.randomUUID().toString())
                    .setDisconnectResourceManagerConsumer(cause -> disconnectCause.complete(cause))
                    .setHeartbeatResourceManagerFunction(
                            ignored ->
                                    FutureUtils.completedExceptionally(
                                            new RecipientUnreachableException(
                                                    "sender",
                                                    "recipient",
                                                    "task executor is unreachable")))
                    .createTestingTaskExecutorGateway();
    rpcService.registerGateway(unreachableGateway.getAddress(), unreachableGateway);

    runHeartbeatTargetBecomesUnreachableTest(
            // capture the worker handed to the stop-worker function
            builder ->
                    builder.withStopWorkerFunction(
                            (worker) -> {
                                stoppedWorker.complete(worker);
                                return true;
                            }),
            // register the unreachable task executor at the resource manager
            resourceManagerGateway ->
                    registerTaskExecutor(
                            resourceManagerGateway,
                            unreachableExecutorId,
                            unreachableGateway.getAddress()),
            // verify both the disconnect and the worker stop
            resourceManagerResourceId -> {
                assertThat(disconnectCause.get(), instanceOf(ResourceManagerException.class));
                assertThat(stoppedWorker.get(), is(unreachableExecutorId));
            });
}
Use of org.apache.flink.runtime.taskexecutor.TaskExecutorGateway in the Apache Flink project.
From class DeclarativeSlotManager, method allocateSlot.
/**
 * Requests the given slot from its task executor on behalf of a job and processes the outcome.
 *
 * <p>Before the RPC is sent, the slot tracker is notified about the allocation start, the task
 * manager is marked as used and a fresh allocation id is recorded as pending for the slot. The
 * response is handled on {@code mainThreadExecutor}; it is ignored if the slot's pending
 * allocation is no longer the one created by this call.
 *
 * @param taskManagerSlot slot to allocate
 * @param jobId job for which the slot should be allocated
 * @param targetAddress address of the job master
 * @param resourceProfile resource profile for the requirement for which the slot is used
 * @throws IllegalStateException if the task manager owning the slot is not registered
 */
private void allocateSlot(TaskManagerSlotInformation taskManagerSlot, JobID jobId, String targetAddress, ResourceProfile resourceProfile) {
    final SlotID slotId = taskManagerSlot.getSlotId();
    LOG.debug("Starting allocation of slot {} for job {} with resource profile {}.", slotId, jobId, resourceProfile);

    final InstanceID instanceId = taskManagerSlot.getInstanceId();
    if (!taskExecutorManager.isTaskManagerRegistered(instanceId)) {
        throw new IllegalStateException("Could not find a registered task manager for instance id " + instanceId + '.');
    }

    final TaskExecutorGateway taskExecutorGateway =
            taskManagerSlot.getTaskManagerConnection().getTaskExecutorGateway();
    final AllocationID allocationId = new AllocationID();

    // bookkeeping that has to be in place before the request goes out
    slotTracker.notifyAllocationStart(slotId, jobId);
    taskExecutorManager.markUsed(instanceId);
    pendingSlotAllocations.put(slotId, allocationId);

    // RPC call to the task manager
    final CompletableFuture<Acknowledge> slotRequestResponse =
            taskExecutorGateway.requestSlot(
                    slotId,
                    jobId,
                    allocationId,
                    resourceProfile,
                    targetAddress,
                    resourceManagerId,
                    taskManagerRequestTimeout);

    final CompletableFuture<Void> responseHandling =
            slotRequestResponse.handleAsync(
                    (Acknowledge ack, Throwable failure) -> {
                        final AllocationID pendingAllocation = pendingSlotAllocations.get(slotId);
                        final boolean stillPending =
                                pendingAllocation != null && pendingAllocation.equals(allocationId);
                        if (!stillPending) {
                            LOG.debug("Ignoring slot allocation update from task executor {} for slot {} and job {}, because the allocation was already completed or cancelled.", instanceId, slotId, jobId);
                            return null;
                        }

                        // success path: the task executor acknowledged the request
                        if (ack != null) {
                            LOG.trace("Completed allocation of slot {} for job {}.", slotId, jobId);
                            slotTracker.notifyAllocationComplete(slotId, jobId);
                            return null;
                        }

                        // failure path
                        if (failure instanceof SlotOccupiedException) {
                            final SlotOccupiedException occupied = (SlotOccupiedException) failure;
                            LOG.debug("Tried allocating slot {} for job {}, but it was already allocated for job {}.", slotId, jobId, occupied.getJobId());
                            // report as a slot status to force the state transition
                            // this could be a problem if we ever assume that the task
                            // executor always reports about all slots
                            final SlotStatus occupiedStatus =
                                    new SlotStatus(
                                            slotId,
                                            taskManagerSlot.getResourceProfile(),
                                            occupied.getJobId(),
                                            occupied.getAllocationId());
                            slotTracker.notifySlotStatus(Collections.singleton(occupiedStatus));
                        } else {
                            LOG.warn("Slot allocation for slot {} for job {} failed.", slotId, jobId, failure);
                            slotTracker.notifyFree(slotId);
                        }
                        // after any failed allocation the requirements are checked again
                        checkResourceRequirements();
                        return null;
                    },
                    mainThreadExecutor);

    FutureUtils.assertNoException(responseHandling);
}
Use of org.apache.flink.runtime.taskexecutor.TaskExecutorGateway in the Apache Flink project.
From class ResourceManager, method requestTaskManagerMetricQueryServiceAddresses.
/**
 * Asks every entry in {@code taskExecutors} for its metric query service address and combines
 * the answers.
 *
 * @param timeout timeout passed to each task executor request
 * @return future with the (resource id, address) pairs of all task executors whose response
 *     contained an address; responses without an address are dropped
 */
@Override
public CompletableFuture<Collection<Tuple2<ResourceID, String>>> requestTaskManagerMetricQueryServiceAddresses(Time timeout) {
    final Collection<CompletableFuture<Optional<Tuple2<ResourceID, String>>>> pendingAddressLookups =
            new ArrayList<>(taskExecutors.size());

    taskExecutors.forEach(
            (resourceId, registration) -> {
                final TaskExecutorGateway gateway = registration.getTaskExecutorGateway();
                // pair each reported address with the id of the task executor it belongs to
                final CompletableFuture<Optional<Tuple2<ResourceID, String>>> addressLookup =
                        gateway.requestMetricQueryServiceAddress(timeout)
                                .thenApply(
                                        response ->
                                                response.toOptional()
                                                        .map(address -> Tuple2.of(resourceId, address)));
                pendingAddressLookups.add(addressLookup);
            });

    return FutureUtils.combineAll(pendingAddressLookups)
            .thenApply(
                    lookups ->
                            lookups.stream()
                                    .filter(Optional::isPresent)
                                    .map(Optional::get)
                                    .collect(Collectors.toList()));
}
Use of org.apache.flink.runtime.taskexecutor.TaskExecutorGateway in the Apache Flink project.
From class JobMasterTest, method testAllocatedSlotReportDoesNotContainStaleInformation.
/**
 * Checks that every {@link AllocatedSlotReport} sent by the {@link JobMaster} in a heartbeat
 * reflects the current slot allocations rather than stale ones: empty before slot offers were
 * received, exactly one entry afterwards.
 *
 * <p>This is a probabilistic test case which only fails if executed repeatedly without the fix
 * for FLINK-12863.
 */
@Test
public void testAllocatedSlotReportDoesNotContainStaleInformation() throws Exception {
    final CompletableFuture<Void> heartbeatVerdict = new CompletableFuture<>();
    final UnresolvedTaskManagerLocation taskManagerLocation = new LocalUnresolvedTaskManagerLocation();
    final AtomicBoolean stopVerifying = new AtomicBoolean(false);
    final OneShotLatch slotOffersReceived = new OneShotLatch();

    final TestingTaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setHeartbeatJobManagerFunction(
                            (taskManagerId, allocatedSlotReport) -> {
                                // Assertion failures are routed into the future so that
                                // the test thread can observe them.
                                try {
                                    if (slotOffersReceived.isTriggered()) {
                                        assertThat(allocatedSlotReport.getAllocatedSlotInfos(), hasSize(1));
                                    } else {
                                        assertThat(allocatedSlotReport.getAllocatedSlotInfos(), empty());
                                    }
                                } catch (AssertionError violation) {
                                    heartbeatVerdict.completeExceptionally(violation);
                                }
                                // No effect if the future was already completed exceptionally.
                                if (stopVerifying.get()) {
                                    heartbeatVerdict.complete(null);
                                }
                                return FutureUtils.completedVoidFuture();
                            })
                    .createTestingTaskExecutorGateway();
    rpcService.registerGateway(taskExecutorGateway.getAddress(), taskExecutorGateway);

    final JobManagerSharedServices jobManagerSharedServices =
            new TestingJobManagerSharedServicesBuilder().build();
    final JobGraph jobGraph = JobGraphTestUtils.singleNoOpJobGraph();
    final JobMaster jobMaster =
            new JobMasterBuilder(jobGraph, rpcService)
                    .withHeartbeatServices(new HeartbeatServices(5L, 1000L))
                    .withSlotPoolServiceSchedulerFactory(
                            DefaultSlotPoolServiceSchedulerFactory.create(
                                    new TestingSlotPoolFactory(slotOffersReceived),
                                    new DefaultSchedulerFactory()))
                    .createJobMaster();
    jobMaster.start();

    try {
        final JobMasterGateway jobMasterGateway = jobMaster.getSelfGateway(JobMasterGateway.class);

        // register task manager will trigger monitor heartbeat target, schedule heartbeat
        // request at interval time; block until the registration has completed
        jobMasterGateway
                .registerTaskManager(
                        jobGraph.getJobID(),
                        TaskManagerRegistrationInformation.create(
                                taskExecutorGateway.getAddress(),
                                taskManagerLocation,
                                TestingUtils.zeroUUID()),
                        testingTimeout)
                .get();

        final SlotOffer offeredSlot = new SlotOffer(new AllocationID(), 0, ResourceProfile.ANY);
        final CompletableFuture<Collection<SlotOffer>> acceptedOffers =
                jobMasterGateway.offerSlots(
                        taskManagerLocation.getResourceID(),
                        Collections.singleton(offeredSlot),
                        testingTimeout);
        assertThat(acceptedOffers.get(), containsInAnyOrder(offeredSlot));

        stopVerifying.set(true);
        // make sure that no assertion has been violated
        heartbeatVerdict.get();
    } finally {
        RpcUtils.terminateRpcEndpoint(jobMaster, testingTimeout);
        jobManagerSharedServices.shutdown();
    }
}
Use of org.apache.flink.runtime.taskexecutor.TaskExecutorGateway in the Apache Flink project.
From class AbstractFineGrainedSlotManagerITCase, method testAllocationUpdatesIgnoredIfTaskExecutorUnregistered.
// ---------------------------------------------------------------------------------------------
// Allocation update
// ---------------------------------------------------------------------------------------------
/**
 * Verify that the ack of a slot request from an unregistered task manager will not cause a
 * system breakdown.
 *
 * <p>The test completes the pending slot request only after the task manager has been
 * unregistered and then checks that no {@code System.exit} call was tracked.
 */
@Test
public void testAllocationUpdatesIgnoredIfTaskExecutorUnregistered() throws Exception {
    final CompletableFuture<Acknowledge> slotRequestFuture = new CompletableFuture<>();
    final CompletableFuture<Void> slotRequestCallFuture = new CompletableFuture<>();
    final TestingTaskExecutorGateway taskExecutorGateway =
            new TestingTaskExecutorGatewayBuilder()
                    .setRequestSlotFunction(
                            ignored -> {
                                // signal that the slot request arrived, but keep it unanswered
                                slotRequestCallFuture.complete(null);
                                return slotRequestFuture;
                            })
                    .createTestingTaskExecutorGateway();

    // The fatal error handler will exit the system if there is any exceptions in handling the
    // ack of request slot. We need the security manager to verify that would not happen.
    final SystemExitTrackingSecurityManager trackingSecurityManager = new SystemExitTrackingSecurityManager();
    // The security manager is JVM-wide state: remember the current one and restore it in a
    // finally block so a failing test cannot leak the tracking manager into other tests.
    final SecurityManager previousSecurityManager = System.getSecurityManager();
    System.setSecurityManager(trackingSecurityManager);
    try {
        final JobID jobId = new JobID();
        final ResourceID taskExecutorResourceId = ResourceID.generate();
        final TaskExecutorConnection taskExecutionConnection = new TaskExecutorConnection(taskExecutorResourceId, taskExecutorGateway);
        final SlotReport slotReport = new SlotReport();
        new Context() {
            {
                runTest(() -> {
                    runInMainThread(() -> {
                        getSlotManager().processResourceRequirements(createResourceRequirements(jobId, 1));
                        getSlotManager().registerTaskManager(taskExecutionConnection, slotReport, DEFAULT_TOTAL_RESOURCE_PROFILE, DEFAULT_SLOT_RESOURCE_PROFILE);
                    });
                    // wait until the slot request has reached the task executor gateway
                    assertFutureCompleteAndReturn(slotRequestCallFuture);
                    runInMainThread(() -> {
                        // unregister first, then deliver the ack for the outstanding request
                        getSlotManager().unregisterTaskManager(taskExecutionConnection.getInstanceID(), TEST_EXCEPTION);
                        slotRequestFuture.complete(Acknowledge.get());
                    });
                    assertThat(trackingSecurityManager.getSystemExitFuture().isDone(), is(false));
                });
            }
        };
    } finally {
        System.setSecurityManager(previousSecurityManager);
    }
}
Aggregations