Search in sources :

Example 56 with FlinkException

use of org.apache.flink.util.FlinkException in project flink by apache.

the class StopWithSavepointTerminationHandlerImpl method terminateExceptionallyWithGlobalFailover.

/**
 * Fails the stop-with-savepoint operation because at least one execution did not reach its
 * expected final state, even though a savepoint was written.
 *
 * <p>A single {@link FlinkException} listing the unfinished states is created, logged as a
 * warning, passed to the scheduler as a global failure and finally used to complete
 * {@code result} exceptionally.
 *
 * @param unfinishedExecutionStates the execution states that are reported in the error message.
 * @param savepointPath the path of the created savepoint; only used for the warning log.
 */
private void terminateExceptionallyWithGlobalFailover(Iterable<ExecutionState> unfinishedExecutionStates, String savepointPath) {
    final String joinedStates = StringUtils.join(unfinishedExecutionStates, ", ");
    final FlinkException failure =
            new FlinkException(
                    String.format(
                            "Inconsistent execution state after stopping with savepoint. At least one execution is still in one of the following states: %s. A global fail-over is triggered to recover the job %s.",
                            joinedStates,
                            jobId));

    // The exception is the trailing argument so the logger records it together with the message.
    log.warn(
            "A savepoint was created at {} but the corresponding job {} didn't terminate successfully.",
            savepointPath,
            jobId,
            failure);

    // Trigger the fail-over first, then report the failure to whoever waits on the result.
    scheduler.handleGlobalFailure(failure);
    result.completeExceptionally(failure);
}
Also used : FlinkException(org.apache.flink.util.FlinkException)

Example 57 with FlinkException

use of org.apache.flink.util.FlinkException in project flink by apache.

the class SlotSharingExecutionSlotAllocator method releaseSharedSlot.

/**
 * Removes the shared slot registered for the given slot sharing group and cancels its
 * physical slot request.
 *
 * <p>The removed slot must exist and must be empty; otherwise the precondition checks fail.
 *
 * @param executionSlotSharingGroup the group whose shared slot is released.
 */
private void releaseSharedSlot(ExecutionSlotSharingGroup executionSlotSharingGroup) {
    final SharedSlot releasedSlot = sharedSlots.remove(executionSlotSharingGroup);

    // Validate before touching the slot provider: the slot has to be known and unused.
    Preconditions.checkNotNull(releasedSlot);
    Preconditions.checkState(
            releasedSlot.isEmpty(),
            "Trying to remove a shared slot with physical request id %s which has assigned logical slots",
            releasedSlot.getPhysicalSlotRequestId());

    final FlinkException cancellationCause =
            new FlinkException("Slot is being returned from SlotSharingExecutionSlotAllocator.");
    slotProvider.cancelSlotRequest(releasedSlot.getPhysicalSlotRequestId(), cancellationCause);
}
Also used : FlinkException(org.apache.flink.util.FlinkException)

Example 58 with FlinkException

use of org.apache.flink.util.FlinkException in project flink by apache.

the class KubernetesResourceManagerDriver method onPodTerminated.

/**
 * Reacts to a terminated TaskManager pod: fails a still-pending resource request for it (if
 * any), notifies the resource event handler and stops the pod.
 *
 * @param pod the pod that has been terminated.
 */
private void onPodTerminated(KubernetesPod pod) {
    final String terminatedPodName = pod.getName();
    log.debug("TaskManager pod {} is terminated.", terminatedPodName);

    // Safety net for the case that onModified/onDeleted/onError arrives before onAdded:
    // a request future may still be registered under this pod name.
    final CompletableFuture<KubernetesWorkerNode> pendingRequest =
            requestResourceFutures.remove(terminatedPodName);
    if (pendingRequest != null) {
        log.warn("Pod {} is terminated before being scheduled.", terminatedPodName);
        pendingRequest.completeExceptionally(new FlinkException("Pod is terminated."));
    }

    final ResourceID terminatedWorkerId = new ResourceID(terminatedPodName);
    getResourceEventHandler().onWorkerTerminated(terminatedWorkerId, pod.getTerminatedDiagnostics());
    stopPod(terminatedPodName);
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) FlinkException(org.apache.flink.util.FlinkException)

Example 59 with FlinkException

use of org.apache.flink.util.FlinkException in project flink by apache.

the class DefaultDeclarativeSlotPool method releaseIdleSlots.

/**
 * Returns free slots that have been idle for at least the idle slot timeout to their owners,
 * as long as the pool holds more resources than are required.
 *
 * @param currentTimeMillis the current time, compared against each slot's free-since timestamp.
 */
@Override
public void releaseIdleSlots(long currentTimeMillis) {
    final Collection<AllocatedSlotPool.FreeSlotInfo> freeSlots = slotPool.getFreeSlotsInformation();
    // Resources held beyond the total requirements; shrinks as slots are picked for release.
    ResourceCounter remainingExcess = fulfilledResourceRequirements.subtract(totalResourceRequirements);
    final Iterator<AllocatedSlotPool.FreeSlotInfo> candidates = freeSlots.iterator();
    final Collection<AllocatedSlot> slotsToReturnToOwner = new ArrayList<>();

    while (!remainingExcess.isEmpty() && candidates.hasNext()) {
        final AllocatedSlotPool.FreeSlotInfo candidate = candidates.next();

        final boolean idleTimeoutReached =
                currentTimeMillis >= candidate.getFreeSince() + idleSlotTimeout.toMilliseconds();
        if (!idleTimeoutReached) {
            continue;
        }

        final ResourceProfile candidateProfile = getMatchingResourceProfile(candidate.getAllocationId());
        if (!remainingExcess.containsResource(candidateProfile)) {
            // Releasing this slot would drop a resource that is not in excess.
            continue;
        }

        remainingExcess = remainingExcess.subtract(candidateProfile, 1);
        final AllocatedSlot releasedSlot =
                slotPool.removeSlot(candidate.getAllocationId())
                        .orElseThrow(
                                () ->
                                        new IllegalStateException(
                                                String.format(
                                                        "Could not find slot for allocation id %s.",
                                                        candidate.getAllocationId())));
        slotsToReturnToOwner.add(releasedSlot);
    }

    releaseSlots(slotsToReturnToOwner, new FlinkException("Returning idle slots to their owners."));
    LOG.debug("Idle slots have been returned; new total acquired resources: {}", fulfilledResourceRequirements);
}
Also used : ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) ArrayList(java.util.ArrayList) ResourceCounter(org.apache.flink.runtime.util.ResourceCounter) FlinkException(org.apache.flink.util.FlinkException)

Example 60 with FlinkException

use of org.apache.flink.util.FlinkException in project flink by apache.

the class JobMaster method registerTaskManager.

/**
 * Handles a TaskManager registration attempt for the given job.
 *
 * <p>The attempt is rejected if the job id does not match this JobMaster's job graph, and
 * answered with a failure if the TaskManager location cannot be resolved. A TaskManager that
 * is already registered with the same session id gets a success response right away; one
 * registered under a different session id is disconnected before the new registration
 * proceeds. Otherwise a connection to the TaskManager's RPC address is established and, on
 * success, the TaskManager is registered and monitored as a heartbeat target.
 *
 * @param jobId the job the TaskManager wants to register for.
 * @param taskManagerRegistrationInformation the location, session and RPC address of the TaskManager.
 * @param timeout the RPC timeout; not read in this method body.
 * @return a future carrying the registration response.
 */
@Override
public CompletableFuture<RegistrationResponse> registerTaskManager(final JobID jobId, final TaskManagerRegistrationInformation taskManagerRegistrationInformation, final Time timeout) {
    final boolean responsibleForJob = jobGraph.getJobID().equals(jobId);
    if (!responsibleForJob) {
        log.debug("Rejecting TaskManager registration attempt because of wrong job id {}.", jobId);
        return CompletableFuture.completedFuture(
                new JMTMRegistrationRejection(
                        String.format(
                                "The JobManager is not responsible for job %s. Maybe the TaskManager used outdated connection information.",
                                jobId)));
    }

    final TaskManagerLocation taskManagerLocation;
    try {
        taskManagerLocation =
                resolveTaskManagerLocation(
                        taskManagerRegistrationInformation.getUnresolvedTaskManagerLocation());
    } catch (FlinkException resolutionFailure) {
        log.error("Could not accept TaskManager registration.", resolutionFailure);
        return CompletableFuture.completedFuture(new RegistrationResponse.Failure(resolutionFailure));
    }

    final ResourceID taskManagerId = taskManagerLocation.getResourceID();
    final UUID sessionId = taskManagerRegistrationInformation.getTaskManagerSession();

    final TaskManagerRegistration existingRegistration = registeredTaskManagers.get(taskManagerId);
    if (existingRegistration != null && existingRegistration.getSessionId().equals(sessionId)) {
        // Same TaskManager, same session: nothing to redo, just acknowledge.
        log.debug("Ignoring registration attempt of TaskManager {} with the same session id {}.", taskManagerId, sessionId);
        return CompletableFuture.completedFuture(new JMTMRegistrationSuccess(resourceId));
    }
    if (existingRegistration != null) {
        // Known TaskManager id but a different session id: drop the stale connection first.
        disconnectTaskManager(
                taskManagerId,
                new FlinkException(
                        "A registered TaskManager re-registered with a new session id. This indicates a restart of the TaskManager. Closing the old connection."));
    }

    return getRpcService()
            .connect(taskManagerRegistrationInformation.getTaskManagerRpcAddress(), TaskExecutorGateway.class)
            .handleAsync(
                    (TaskExecutorGateway gateway, Throwable connectFailure) -> {
                        if (connectFailure != null) {
                            return new RegistrationResponse.Failure(connectFailure);
                        }

                        slotPoolService.registerTaskManager(taskManagerId);
                        registeredTaskManagers.put(
                                taskManagerId,
                                TaskManagerRegistration.create(taskManagerLocation, gateway, sessionId));

                        // monitor the task manager as heartbeat target
                        taskManagerHeartbeatManager.monitorTarget(
                                taskManagerId, new TaskExecutorHeartbeatSender(gateway));

                        return new JMTMRegistrationSuccess(resourceId);
                    },
                    getMainThreadExecutor());
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) UUID(java.util.UUID) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) FlinkException(org.apache.flink.util.FlinkException)

Aggregations

FlinkException (org.apache.flink.util.FlinkException): 197, Test (org.junit.Test): 91, CompletableFuture (java.util.concurrent.CompletableFuture): 59, IOException (java.io.IOException): 38, ExecutionException (java.util.concurrent.ExecutionException): 26, ArrayList (java.util.ArrayList): 25, JobID (org.apache.flink.api.common.JobID): 24, Collection (java.util.Collection): 22, CompletionException (java.util.concurrent.CompletionException): 22, Configuration (org.apache.flink.configuration.Configuration): 21, TimeoutException (java.util.concurrent.TimeoutException): 19, FutureUtils (org.apache.flink.util.concurrent.FutureUtils): 19, Time (org.apache.flink.api.common.time.Time): 16, OneShotLatch (org.apache.flink.core.testutils.OneShotLatch): 16, ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID): 16, JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 15, AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID): 14, Collections (java.util.Collections): 13, List (java.util.List): 13, ExecutorService (java.util.concurrent.ExecutorService): 13