use of org.apache.flink.util.FlinkException in project flink by apache.
the class StopWithSavepointTerminationHandlerImpl method terminateExceptionallyWithGlobalFailover.
/**
 * Terminates the stop-with-savepoint operation exceptionally and triggers a global job
 * fail-over.
 *
 * <p>A {@link FlinkException} naming the given execution states and the job id is created,
 * logged as a warning together with the savepoint path, passed to the scheduler as a global
 * failure, and finally used to complete {@code result} exceptionally.
 *
 * @param unfinishedExecutionStates the unfinished states that caused the failure; they are
 *     listed in the exception message.
 * @param savepointPath the path to the successfully created savepoint; only used for logging.
 */
private void terminateExceptionallyWithGlobalFailover(Iterable<ExecutionState> unfinishedExecutionStates, String savepointPath) {
    final String joinedStates = StringUtils.join(unfinishedExecutionStates, ", ");
    final FlinkException failure =
            new FlinkException(
                    String.format(
                            "Inconsistent execution state after stopping with savepoint. At least one execution is still in one of the following states: %s. A global fail-over is triggered to recover the job %s.",
                            joinedStates,
                            jobId));

    // The exception is passed as the trailing argument so that the logger records it as the cause.
    log.warn(
            "A savepoint was created at {} but the corresponding job {} didn't terminate successfully.",
            savepointPath,
            jobId,
            failure);

    // Same exception instance goes to the scheduler and to the result future.
    scheduler.handleGlobalFailure(failure);
    result.completeExceptionally(failure);
}
use of org.apache.flink.util.FlinkException in project flink by apache.
the class SlotSharingExecutionSlotAllocator method releaseSharedSlot.
/**
 * Removes the shared slot registered for the given slot sharing group and cancels its
 * physical slot request.
 *
 * <p>The precondition checks fail if no shared slot is registered for the group, or if the
 * removed shared slot is not empty.
 *
 * @param executionSlotSharingGroup the group whose shared slot is released.
 */
private void releaseSharedSlot(ExecutionSlotSharingGroup executionSlotSharingGroup) {
    // The entry is removed from the map before any validation happens.
    final SharedSlot removedSlot =
            Preconditions.checkNotNull(sharedSlots.remove(executionSlotSharingGroup));

    Preconditions.checkState(
            removedSlot.isEmpty(),
            "Trying to remove a shared slot with physical request id %s which has assigned logical slots",
            removedSlot.getPhysicalSlotRequestId());

    slotProvider.cancelSlotRequest(
            removedSlot.getPhysicalSlotRequestId(),
            new FlinkException("Slot is being returned from SlotSharingExecutionSlotAllocator."));
}
use of org.apache.flink.util.FlinkException in project flink by apache.
the class KubernetesResourceManagerDriver method onPodTerminated.
/**
 * Reacts to the termination of a TaskManager pod.
 *
 * <p>Fails a still-pending resource request for the pod (if there is one), notifies the
 * resource event handler that the worker terminated, and then stops the pod.
 *
 * @param pod the terminated pod.
 */
private void onPodTerminated(KubernetesPod pod) {
    final String terminatedPodName = pod.getName();
    log.debug("TaskManager pod {} is terminated.", terminatedPodName);

    // Safety net: onModified/onDeleted/onError may be received before onAdded, in which
    // case the request future for this pod is still registered and has to be failed here.
    final CompletableFuture<KubernetesWorkerNode> pendingRequest =
            requestResourceFutures.remove(terminatedPodName);
    if (pendingRequest != null) {
        log.warn("Pod {} is terminated before being scheduled.", terminatedPodName);
        pendingRequest.completeExceptionally(new FlinkException("Pod is terminated."));
    }

    getResourceEventHandler()
            .onWorkerTerminated(
                    new ResourceID(terminatedPodName), pod.getTerminatedDiagnostics());
    stopPod(terminatedPodName);
}
use of org.apache.flink.util.FlinkException in project flink by apache.
the class DefaultDeclarativeSlotPool method releaseIdleSlots.
/**
 * Returns free slots to their owners when they have been idle for at least the idle slot
 * timeout and are covered by excess resources.
 *
 * <p>Excess resources are the fulfilled resource requirements minus the total resource
 * requirements. Iteration over the free slots stops as soon as no excess is left.
 *
 * @param currentTimeMillis the current time in milliseconds, compared against each slot's
 *     free-since timestamp plus the idle slot timeout.
 * @throws IllegalStateException if a slot selected for release cannot be removed from the
 *     slot pool.
 */
@Override
public void releaseIdleSlots(long currentTimeMillis) {
    final Collection<AllocatedSlotPool.FreeSlotInfo> freeSlots = slotPool.getFreeSlotsInformation();
    ResourceCounter remainingExcess = fulfilledResourceRequirements.subtract(totalResourceRequirements);
    final Collection<AllocatedSlot> expiredSlots = new ArrayList<>();

    // The excess check comes first so that no further slot is fetched once it is used up.
    for (Iterator<AllocatedSlotPool.FreeSlotInfo> candidates = freeSlots.iterator();
            !remainingExcess.isEmpty() && candidates.hasNext(); ) {
        final AllocatedSlotPool.FreeSlotInfo candidate = candidates.next();

        final boolean idleLongEnough =
                currentTimeMillis >= candidate.getFreeSince() + idleSlotTimeout.toMilliseconds();
        if (!idleLongEnough) {
            continue;
        }

        final ResourceProfile profile = getMatchingResourceProfile(candidate.getAllocationId());
        if (!remainingExcess.containsResource(profile)) {
            // This slot's profile is not part of the excess; keep it.
            continue;
        }

        remainingExcess = remainingExcess.subtract(profile, 1);
        final AllocatedSlot removed =
                slotPool.removeSlot(candidate.getAllocationId())
                        .orElseThrow(
                                () ->
                                        new IllegalStateException(
                                                String.format(
                                                        "Could not find slot for allocation id %s.",
                                                        candidate.getAllocationId())));
        expiredSlots.add(removed);
    }

    releaseSlots(expiredSlots, new FlinkException("Returning idle slots to their owners."));
    LOG.debug("Idle slots have been returned; new total acquired resources: {}", fulfilledResourceRequirements);
}
use of org.apache.flink.util.FlinkException in project flink by apache.
the class JobMaster method registerTaskManager.
/**
 * Handles a TaskManager's registration attempt at this JobMaster.
 *
 * <p>The method proceeds as follows:
 *
 * <ol>
 *   <li>A registration for a job id other than the one of this JobMaster's job graph is
 *       answered with a {@code JMTMRegistrationRejection}.
 *   <li>If the TaskManager location cannot be resolved, a {@code RegistrationResponse.Failure}
 *       is returned.
 *   <li>If the TaskManager is already registered with the same session id, the attempt is
 *       ignored and a success response is returned. If the session id differs, the old
 *       TaskManager is disconnected first.
 *   <li>Otherwise a connection to the TaskManager's RPC address is established. On success
 *       the TaskManager is registered at the slot pool service, recorded in
 *       {@code registeredTaskManagers} and monitored by the heartbeat manager. This
 *       continuation runs on the executor returned by {@code getMainThreadExecutor()}.
 * </ol>
 *
 * @param jobId the job the TaskManager wants to register for.
 * @param taskManagerRegistrationInformation the TaskManager's location, session id and RPC
 *     address.
 * @param timeout the RPC timeout; not read within this method body.
 * @return a future holding the registration response (success, rejection or failure).
 */
@Override
public CompletableFuture<RegistrationResponse> registerTaskManager(final JobID jobId, final TaskManagerRegistrationInformation taskManagerRegistrationInformation, final Time timeout) {
    final boolean responsibleForJob = jobGraph.getJobID().equals(jobId);
    if (!responsibleForJob) {
        log.debug("Rejecting TaskManager registration attempt because of wrong job id {}.", jobId);
        return CompletableFuture.completedFuture(
                new JMTMRegistrationRejection(
                        String.format(
                                "The JobManager is not responsible for job %s. Maybe the TaskManager used outdated connection information.",
                                jobId)));
    }

    final TaskManagerLocation taskManagerLocation;
    try {
        taskManagerLocation =
                resolveTaskManagerLocation(
                        taskManagerRegistrationInformation.getUnresolvedTaskManagerLocation());
    } catch (FlinkException resolutionFailure) {
        log.error("Could not accept TaskManager registration.", resolutionFailure);
        return CompletableFuture.completedFuture(
                new RegistrationResponse.Failure(resolutionFailure));
    }

    final ResourceID taskManagerId = taskManagerLocation.getResourceID();
    final UUID sessionId = taskManagerRegistrationInformation.getTaskManagerSession();

    final TaskManagerRegistration existingRegistration = registeredTaskManagers.get(taskManagerId);
    if (existingRegistration != null) {
        final boolean sameSession = existingRegistration.getSessionId().equals(sessionId);
        if (!sameSession) {
            // A different session id for a known TaskManager: drop the stale connection,
            // then continue with a fresh registration below.
            disconnectTaskManager(
                    taskManagerId,
                    new FlinkException(
                            "A registered TaskManager re-registered with a new session id. This indicates a restart of the TaskManager. Closing the old connection."));
        } else {
            log.debug("Ignoring registration attempt of TaskManager {} with the same session id {}.", taskManagerId, sessionId);
            final RegistrationResponse duplicateResponse = new JMTMRegistrationSuccess(resourceId);
            return CompletableFuture.completedFuture(duplicateResponse);
        }
    }

    return getRpcService()
            .connect(
                    taskManagerRegistrationInformation.getTaskManagerRpcAddress(),
                    TaskExecutorGateway.class)
            .handleAsync(
                    (TaskExecutorGateway gateway, Throwable connectFailure) -> {
                        if (connectFailure != null) {
                            return new RegistrationResponse.Failure(connectFailure);
                        }

                        slotPoolService.registerTaskManager(taskManagerId);
                        registeredTaskManagers.put(
                                taskManagerId,
                                TaskManagerRegistration.create(
                                        taskManagerLocation, gateway, sessionId));

                        // monitor the task manager as heartbeat target
                        taskManagerHeartbeatManager.monitorTarget(
                                taskManagerId, new TaskExecutorHeartbeatSender(gateway));

                        return new JMTMRegistrationSuccess(resourceId);
                    },
                    getMainThreadExecutor());
}
Aggregations